diff --git a/CHANGES.txt b/CHANGES.txt index 89eb2db..ce85777 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -4,7 +4,9 @@ Release notes 1.5.1 (unreleased) ------------------ -- Nothing changed yet. +- Fix guest agent revamp: 0xFF handling did not work as advertised. (PL-132809) + +- Implement automatic (root) volume migration between different RBD pool.s () 1.5.0 (2024-10-11) diff --git a/HACKING.txt b/HACKING.txt index 7f14c74..d36e13e 100644 --- a/HACKING.txt +++ b/HACKING.txt @@ -16,6 +16,23 @@ To run only some tests you can pass arguments to pytest: hydra01 ~/fc-nixos $ nix-build --arg useCheckout true --argstr testOpts "-k test_agent_check" tests/kvm_host_ceph-nautilus.nix +To interact with the tests using PDB you can add breakpoints and/or cause the tests to fail and then: + +Keep the build outputs: + + hydra01 ~/fc-nixos $ nix-build --arg useCheckout true --argstr testOpts "-k test_agent_check" tests/kvm_host_ceph-nautilus.nix -K + +Get a live pdb session: + + sudo -i + cd /tmp/nix-build-python3.8-fc.qemu-dev.drv-50/build/fc.qemu + grep + source ../env-vars + + + + + Real-world testing on FCIO DEV network -------------------------------------- diff --git a/pytest.ini b/pytest.ini index 2abff92..c5a7255 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,11 +1,11 @@ [pytest] -addopts = --tb=native --timeout=20 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml +addopts = --tb=native --timeout=20 --cov=fc.qemu --cov-report=html --junitxml=/tmp/fc.qemu-report.xml markers = slow: mark test as slow. live: requires a full live environment capable of running qemu and ceph unit: can be run without any further environment -testpaths = src +testpaths = tests cache_dir = /tmp/fc.qemu-pytest-cache/ diff --git a/src/fc/qemu/agent.py b/src/fc/qemu/agent.py index 640e415..d2c0317 100644 --- a/src/fc/qemu/agent.py +++ b/src/fc/qemu/agent.py @@ -16,6 +16,7 @@ import typing from codecs import decode from multiprocessing.pool import ThreadPool +from pathlib import Path import colorama import consulate @@ -183,16 +184,16 @@ def locked(blocking=True): # and then unlock when we're back to zero. def lock_decorator(f): def locked_func(self, *args, **kw): - # New lockfile behaviour: lock with a global file that is really + # New lock file behaviour: lock with a global file that is really # only used for this purpose and is never replaced. - self.log.debug("acquire-lock", target=self.lockfile) - if not self._lockfile_fd: - if not os.path.exists(self.lockfile): - open(self.lockfile, "a+").close() - self._lockfile_fd = os.open(self.lockfile, os.O_RDONLY) + self.log.debug("acquire-lock", target=str(self.lock_file)) + if not self._lock_file_fd: + if not os.path.exists(self.lock_file): + open(self.lock_file, "a+").close() + self._lock_file_fd = os.open(self.lock_file, os.O_RDONLY) mode = fcntl.LOCK_EX | (fcntl.LOCK_NB if not blocking else 0) try: - fcntl.flock(self._lockfile_fd, mode) + fcntl.flock(self._lock_file_fd, mode) except IOError: # This happens in nonblocking mode and we just give up as # that's what's expected to speed up things. 
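The renamed lock handling above reduces to a re-entrant flock pattern: open a dedicated lock file once, hold fcntl.flock() on its descriptor, count nested acquisitions, and only unlock and close when the count returns to zero. A minimal standalone sketch of that pattern (the class and file names here are illustrative, not part of fc.qemu):

    import fcntl
    import os
    from pathlib import Path

    class FlockGuard:
        """Re-entrant advisory lock on a dedicated lock file."""

        def __init__(self, path: Path):
            self.path = path
            self._fd = None
            self._count = 0

        def acquire(self, blocking=True):
            if self._fd is None:
                self.path.touch(exist_ok=True)
                self._fd = os.open(self.path, os.O_RDONLY)
            mode = fcntl.LOCK_EX | (0 if blocking else fcntl.LOCK_NB)
            # In non-blocking mode this raises BlockingIOError if another
            # process already holds the lock.
            fcntl.flock(self._fd, mode)
            self._count += 1

        def release(self):
            self._count -= 1
            if self._count == 0:
                fcntl.flock(self._fd, fcntl.LOCK_UN)
                os.close(self._fd)
                self._fd = None
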
@@ -206,7 +207,7 @@ def locked_func(self, *args, **kw): self._lock_count += 1 self.log.debug( "acquire-lock", - target=self.lockfile, + target=str(self.lock_file), result="locked", count=self._lock_count, ) @@ -216,26 +217,26 @@ def locked_func(self, *args, **kw): self._lock_count -= 1 self.log.debug( "release-lock", - target=self.lockfile, + target=self.lock_file, count=self._lock_count, ) if self._lock_count == 0: try: - fcntl.flock(self._lockfile_fd, fcntl.LOCK_UN) + fcntl.flock(self._lock_file_fd, fcntl.LOCK_UN) self.log.debug( "release-lock", - target=self.lockfile, + target=self.lock_file, result="unlocked", ) except: # noqa self.log.debug( "release-lock", exc_info=True, - target=self.lockfile, + target=self.lock_file, result="error", ) - os.close(self._lockfile_fd) - self._lockfile_fd = None + os.close(self._lock_file_fd) + self._lock_file_fd = None return locked_func @@ -261,7 +262,7 @@ class Agent(object): # from the sysconfig module. See this class' __init__. The defaults # are here to support testing. - prefix = "/" + prefix = Path("/") this_host = "" migration_ctl_address = None accelerator = "" @@ -269,10 +270,11 @@ class Agent(object): vhost = False ceph_id = "admin" timeout_graceful = 10 - vm_config_template_path = [ - "/etc/qemu/qemu.vm.cfg.in", - pkg_resources.resource_filename(__name__, "qemu.vm.cfg.in"), - ] + + system_config_template = Path("etc/qemu/qemu.vm.cfg.in") + builtin_config_template = Path( + pkg_resources.resource_filename(__name__, "qemu.vm.cfg.in") + ) consul_token = None consul_generation = -1 @@ -286,15 +288,15 @@ class Agent(object): binary_generation = 0 # For upgrade-purposes we're running an old and a new locking mechanism - # in step-lock. We used to lock the configfile but we're using rename to + # in step-lock. We used to lock the config file but we're using rename to # update it atomically. That's not compatible and will result in a consul # event replacing the file and then getting a lock on the new file while # there still is another process running. This will result in problems # accessing the QMP socket as that only accepts a single connection and # will then time out. _lock_count = 0 - _lockfile_fd = None - _configfile_fd = None + _lock_file_fd = None + _config_file_fd = None def __init__(self, name, enc=None): # Update configuration values from system or test config. @@ -304,6 +306,8 @@ def __init__(self, name, enc=None): self.name = name + self.system_config_template = self.prefix / self.system_config_template + # The ENC data is the parsed configuration from the config file, or # from consul (or from wherever). 
This is intended to stay true to # the config file format (not adding computed values or manipulating @@ -318,29 +322,23 @@ def __init__(self, name, enc=None): self.contexts = () @property - def configfile(self): - return os.path.join( - self.prefix, "etc/qemu/vm", "{}.cfg".format(self.name) - ) + def config_file(self): + return self.prefix / "etc/qemu/vm" / f"{self.name}.cfg" @property - def configfile_staging(self): - return os.path.join( - self.prefix, "etc/qemu/vm", ".{}.cfg.staging".format(self.name) - ) + def config_file_staging(self): + return self.prefix / "etc/qemu/vm" / f".{self.name}.cfg.staging" @property - def lockfile(self): - return os.path.join( - self.prefix, "run", "qemu.{}.lock".format(self.name) - ) + def lock_file(self): + return self.prefix / "run" / f"qemu.{self.name}.lock" def _load_enc(self): try: - with open(self.configfile) as f: + with self.config_file.open() as f: return yaml.safe_load(f) except IOError: - if os.path.exists(self.configfile_staging): + if self.config_file_staging.exists(): # The VM has been freshly created. Set up a boilerplate # configuration and let the actual config be established # through the 'ensure' command. @@ -348,7 +346,7 @@ def _load_enc(self): else: self.log.error("unknown-vm") raise VMConfigNotFound( - "Could not load {}".format(self.configfile) + "Could not load {}".format(self.config_file) ) @classmethod @@ -371,9 +369,8 @@ def handle_consul_event(cls, input: typing.TextIO = sys.stdin): @classmethod def _vm_agents_for_host(cls): - for candidate in sorted(glob.glob(cls.prefix + "run/qemu.*.pid")): - name = candidate.replace(cls.prefix + "run/qemu.", "") - name = name.replace(".pid", "") + for candidate in sorted((cls.prefix / "run").glob("qemu.*.pid")): + name = candidate.name.replace("qemu.", "").replace(".pid", "") try: agent = Agent(name) yield agent @@ -700,22 +697,21 @@ def stage_new_config(self): """ # Ensure the config directory exists - if not p.isdir(p.dirname(self.configfile)): - os.makedirs(p.dirname(self.configfile)) + self.config_file.parent.mkdir(parents=True, exist_ok=True) # Ensure the staging file exists. - open(self.configfile_staging, "a+").close() - staging_lock = os.open(self.configfile_staging, os.O_RDONLY) + self.config_file_staging.touch() + staging_lock = os.open(self.config_file_staging, os.O_RDONLY) fcntl.flock(staging_lock, fcntl.LOCK_EX) self.log.debug( "acquire-staging-lock", - target=self.configfile_staging, + target=self.config_file_staging, result="locked", ) try: update_staging_config = False # Verify generation of config to protect against lost updates. - with open(self.configfile_staging, "r") as current_staging: + with self.config_file_staging.open("r") as current_staging: try: current_staging_config = yaml.safe_load(current_staging) current_staging_generation = current_staging_config[ @@ -735,14 +731,14 @@ def stage_new_config(self): if update_staging_config: self.log.debug("save-staging-config") # Update the file in place to avoid lock breaking. - with open(self.configfile_staging, "w") as new_staging: + with self.config_file_staging.open("w") as new_staging: yaml.safe_dump(self.enc, new_staging) - os.chmod(self.configfile_staging, 0o644) + self.config_file_staging.chmod(0o644) # Do we need to activate this config? 
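The staging logic above hinges on the "consul-generation" counter: a consul event only overwrites the staged file when its generation is newer, which protects against lost updates when several events race, and the file is rewritten in place so an flock held on it stays valid. A rough, simplified sketch of that check (the function name and error handling are illustrative):

    import yaml
    from pathlib import Path

    def stage_if_newer(staging: Path, new_config: dict) -> bool:
        staging.touch()
        try:
            current = yaml.safe_load(staging.read_text()) or {}
            current_generation = current["consul-generation"]
        except (yaml.YAMLError, KeyError, TypeError):
            # Unreadable or inconsistent staging content: accept the update.
            current_generation = -1
        if new_config["consul-generation"] <= current_generation:
            return False
        # Rewrite in place rather than renaming, so a process holding an
        # flock on this path keeps a valid lock.
        staging.write_text(yaml.safe_dump(new_config))
        staging.chmod(0o644)
        return True
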
activate_staging_config = False try: - with open(self.configfile, "r") as current_active: + with self.config_file.open("r") as current_active: current_active_config = yaml.safe_load(current_active) current_active_generation = current_active_config[ "consul-generation" @@ -762,7 +758,7 @@ def stage_new_config(self): os.close(staging_lock) self.log.debug( "release-staging-lock", - target=self.configfile_staging, + target=self.config_file_staging, result="released", ) @@ -774,20 +770,20 @@ def activate_new_config(self): re-entered to activate the changed configuration. """ - if not os.path.exists(self.configfile_staging): + if not self.config_file_staging.exists(): self.log.debug("check-staging-config", result="none") return False - staging_lock = os.open(self.configfile_staging, os.O_RDONLY) + staging_lock = os.open(self.config_file_staging, os.O_RDONLY) fcntl.flock(staging_lock, fcntl.LOCK_EX) self.log.debug( "acquire-staging-lock", - target=self.configfile_staging, + target=self.config_file_staging, result="locked", ) try: # Verify generation of config to protect against lost updates. - with open(self.configfile_staging, "r") as current_staging: + with self.config_file_staging.open("r") as current_staging: try: current_staging_config = yaml.safe_load(current_staging) staging_generation = current_staging_config[ @@ -797,7 +793,7 @@ def activate_new_config(self): self.log.debug( "update-check", result="inconsistent", action="purge" ) - os.unlink(self.configfile_staging) + self.config_file_staging.unlink(missing_ok=True) return False else: if staging_generation <= self.consul_generation: @@ -825,7 +821,7 @@ def activate_new_config(self): # The config seems consistent and newer, lets update. # We can replace the config file because that one is protected # by the global VM lock. - shutil.copy2(self.configfile_staging, self.configfile) + shutil.copy2(self.config_file_staging, self.config_file) self.enc = self._load_enc() return True finally: @@ -833,25 +829,25 @@ def activate_new_config(self): os.close(staging_lock) self.log.debug( "release-staging-lock", - target=self.configfile_staging, + target=self.config_file_staging, result="released", ) def has_new_config(self): - if not os.path.exists(self.configfile_staging): + if not self.config_file_staging.exists(): self.log.debug("check-staging-config", result="none") return False - staging_lock = os.open(self.configfile_staging, os.O_RDONLY) + staging_lock = os.open(self.config_file_staging, os.O_RDONLY) fcntl.flock(staging_lock, fcntl.LOCK_EX) self.log.debug( "acquire-staging-lock", - target=self.configfile_staging, + target=self.config_file_staging, result="locked", ) try: # Verify generation of config to protect against lost updates. - with open(self.configfile_staging, "r") as current_staging: + with self.config_file_staging.open("r") as current_staging: try: current_staging_config = yaml.safe_load(current_staging) staging_generation = current_staging_config[ @@ -861,7 +857,7 @@ def has_new_config(self): self.log.debug( "update-check", result="inconsistent", action="purge" ) - os.unlink(self.configfile_staging) + self.config_file_staging.unlink(missing_ok=True) return False else: if staging_generation <= self.consul_generation: @@ -891,7 +887,7 @@ def has_new_config(self): os.close(staging_lock) self.log.debug( "release-staging-lock", - target=self.configfile_staging, + target=self.config_file_staging, result="released", ) @@ -902,20 +898,27 @@ def _update_from_enc(self): # reflect back into the config file. 
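Taken together, stage_new_config() and activate_new_config() form a two-phase handoff: consul events only ever write the ".staging" file under its own flock, and the active config is replaced with shutil.copy2 only while the per-VM lock is held. Condensed into a sketch (the function name and the simplified generation bookkeeping are illustrative):

    import shutil
    from pathlib import Path

    import yaml

    def activate(staging: Path, active: Path, running_generation: int) -> bool:
        if not staging.exists():
            return False
        staged = yaml.safe_load(staging.read_text())
        if not staged or staged.get("consul-generation", -1) <= running_generation:
            # Nothing newer than the configuration we are already running.
            return False
        # Safe to overwrite: the caller holds the per-VM lock, so no other
        # agent process reads the active config concurrently.
        shutil.copy2(staging, active)
        return True
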
self.cfg = copy.copy(self.enc["parameters"]) self.cfg["name"] = self.enc["name"] + self.cfg["root_size"] = self.cfg["disk"] * (1024**3) self.cfg["swap_size"] = swap_size(self.cfg["memory"]) self.cfg["tmp_size"] = tmp_size(self.cfg["disk"]) self.cfg["ceph_id"] = self.ceph_id self.cfg["cpu_model"] = self.cfg.get("cpu_model", "qemu64") + self.cfg["binary_generation"] = self.binary_generation self.consul_generation = self.enc["consul-generation"] self.qemu = Qemu(self.cfg) - self.ceph = Ceph(self.cfg) + self.ceph = Ceph(self.cfg, self.enc) self.contexts = [self.qemu, self.ceph] for attr in ["migration_ctl_address"]: setattr(self, attr, getattr(self, attr).format(**self.cfg)) - for cand in self.vm_config_template_path: - if p.exists(cand): - self.vm_config_template = cand + for candidate in [ + self.system_config_template, + self.builtin_config_template, + ]: + if candidate.exists(): + self.vm_config_template = candidate break + else: + raise RuntimeError("Could not find Qemu config file template.") def __enter__(self): # Allow updating our config by exiting/entering after setting new ENC @@ -1079,6 +1082,7 @@ def ensure_online_local(self): self.ensure_online_disk_size() self.ensure_online_disk_throttle() self.ensure_watchdog() + self.ceph.ensure() # Be aggressive/opportunistic about re-acquiring locks in case # they were taken away. self.ceph.lock() @@ -1093,8 +1097,8 @@ def ensure_online_local(self): def cleanup(self): """Removes various run and tmp files.""" self.qemu.clean_run_files() - for tmp in glob.glob(self.configfile + "?*"): - os.unlink(tmp) + for tmp in self.config_file.parent.glob(self.config_file.name + "?*"): + tmp.unlink(missing_ok=True) def ensure_thawed(self): self.log.info("ensure-thawed", volume="root") @@ -1135,19 +1139,20 @@ def mark_qemu_binary_generation(self): def ensure_online_disk_size(self): """Trigger block resize action for the root disk.""" - target_size = self.cfg["disk"] * (1024**3) - if self.ceph.root.size >= target_size: + target_size = self.cfg["root_size"] + current_size = self.ceph.volumes["root"].size + if current_size >= target_size: self.log.info( "check-disk-size", wanted=target_size, - found=self.ceph.root.size, + found=current_size, action="none", ) return self.log.info( "check-disk-size", wanted=target_size, - found=self.ceph.root.size, + found=current_size, action="resize", ) self.qemu.resize_root(target_size) @@ -1222,8 +1227,8 @@ def consul_deregister(self): @locked() @running(False) def start(self): + self.ceph.start() self.generate_config() - self.ceph.start(self.enc, self.binary_generation) self.qemu.start() self.consul_register() self.ensure_online_disk_throttle() @@ -1280,35 +1285,42 @@ def snapshot(self, snapshot, keep=0): if keep: until = util.today() + datetime.timedelta(days=keep) snapshot = snapshot + "-keep-until-" + until.strftime("%Y%m%d") - if snapshot in [x.snapname for x in self.ceph.root.snapshots]: + if snapshot in [ + x.snapname for x in self.ceph.volumes["root"].snapshots + ]: self.log.info("snapshot-exists", snapshot=snapshot) return self.log.info("snapshot-create", name=snapshot) with self.frozen_vm() as frozen: if frozen: - self.ceph.root.snapshots.create(snapshot) + self.ceph.volumes["root"].snapshots.create(snapshot) else: self.log.error("snapshot-ignore", reason="not frozen") raise RuntimeError("VM not frozen, not making snapshot.") + # This must be locked because we're going to use the + # QMP socket and that only supports talking to one person at a time. 
+ # Alternatively we'd had to connect/disconnect and do weird things + # for every single command ... @locked() def status(self): """Determine status of the VM.""" - if self.qemu.is_running(): - status = 0 - self.log.info("vm-status", result="online") - for device in list(self.qemu.block_info().values()): - self.log.info( - "disk-throttle", - device=device["device"], - iops=device["inserted"]["iops"], - ) - else: - status = 1 - self.log.info("vm-status", result="offline") - for volume in self.ceph.volumes: - locker = volume.lock_status() - self.log.info("rbd-status", volume=volume.fullname, locker=locker) + try: + if self.qemu.is_running(): + status = 0 + self.log.info("vm-status", result="online") + for device in list(self.qemu.block_info().values()): + self.log.info( + "disk-throttle", + device=device["device"], + iops=device["inserted"]["iops"], + ) + else: + status = 1 + self.log.info("vm-status", result="offline") + except VMStateInconsistent: + self.log.exception("vm-status", result="inconsistent") + self.ceph.status() consul = locate_live_service(self.consul, "qemu-" + self.name) if consul: self.log.info( @@ -1316,6 +1328,7 @@ def status(self): ) else: self.log.info("consul", service="") + return status def telnet(self): @@ -1486,6 +1499,9 @@ def generate_config(self): """ self.log.debug("generate-config") + # WARNING: those names embedded in double curly braces must stay + # compatible between older and newer fc.qemu versions to allow + # upgrade/downgrade live migrations! self.qemu.args = [ "-nodefaults", "-only-migratable", @@ -1494,7 +1510,7 @@ def generate_config(self): "-name {name},process=kvm.{name}", "-chroot {{chroot}}", "-runas nobody", - "-serial file:/var/log/vm/{name}.log", + "-serial file:{serial_file}", "-display vnc={{vnc}}", "-pidfile {{pidfile}}", "-vga std", @@ -1529,7 +1545,7 @@ def generate_config(self): ) ) - with open(self.vm_config_template) as f: + with self.vm_config_template.open() as f: tpl = f.read() accelerator = ( ' accel = "{}"'.format(self.accelerator) @@ -1541,5 +1557,6 @@ def generate_config(self): accelerator=accelerator, machine_type=machine_type, network="".join(netconfig), + ceph=self.ceph, **self.cfg, ) diff --git a/src/fc/qemu/hazmat/ceph.py b/src/fc/qemu/hazmat/ceph.py index 7e7e238..54aecad 100644 --- a/src/fc/qemu/hazmat/ceph.py +++ b/src/fc/qemu/hazmat/ceph.py @@ -4,14 +4,348 @@ """ import hashlib +import json +from typing import Dict, List, Optional import rados +import rbd from ..sysconfig import sysconfig -from ..util import cmd, log +from ..timeout import TimeoutError +from ..util import cmd, log, parse_export_format from .volume import Volume +def valid_rbd_pool_name(name): + if name == "rbd": + return True + if ".rgw." in name: + return False + if name.startswith("rbd."): + return True + return False + + +# This is an additional abstraction layer on top of generic Ceph volume +# handling. When introducing the ability to migrate root volumes we had to +# start differentiating between the abstract concept of "I need a +# root/tmp/swap" volume and the specific volume objects in Ceph that may or +# may not exist. + + +class VolumeSpecification: + """This is a specification of a disk attached to a VM. + + This will correlate to specific RBD volumes but exists + independent of any specific one to allow easier state + reconciliation and life cycle management. + + """ + + # Used for internal book-keeping, suffixing the rbd image name, and for + # labels on partitions and/or file systems. 
+ suffix: str + + current_pool: Optional[str] + + def __init__(self, ceph): + self.ceph = ceph + + self.name = f"{ceph.cfg['name']}.{self.suffix}" + + self.ceph.specs[self.suffix] = self + self.ceph.volumes.setdefault(self.suffix, None) + + self._log = ceph.log + self.cmd = lambda cmdline, **args: cmd(cmdline, log=self.log, **args) + + @property + def desired_pool(self) -> str: + return self.ceph.cfg["rbd_pool"] + + @desired_pool.setter + def desired_pool(self, value: str): + self.ceph.cfg["rbd_pool"] = value + + @property + def desired_size(self) -> int: + return self.ceph.cfg[f"{self.suffix}_size"] + + @desired_size.setter + def desired_size(self, value: int): + self.ceph.cfg[f"{self.suffix}_size"] = value + + @property + def log(self): + if self.volume: + return self._log.bind(volume=self.volume.fullname) + return self._log.bind(volume=self.name) + + @property + def volume(self): + return self.ceph.volumes.get(self.suffix) + + def exists_in_pools(self): + result = [] + for pool, ioctx in self.ceph.ioctxs.items(): + if self.name in self.ceph.rbd.list(ioctx): + result.append(pool) + return result + + def exists_in_pool(self): + pools = self.exists_in_pools() + if len(pools) > 1: + raise RuntimeError(f"Inconsistent pools: {pools}") + if pools: + return pools[0] + return None + + def pre_start(self): # pragma: no cover + pass + + def start(self): # pragma: no cover + pass + + def status(self): + if self.volume: + locker = self.volume.lock_status() + self.log.info( + "rbd-status", volume=self.volume.fullname, locker=locker + ) + else: + self.log.info( + "rbd-status", + volume=f"{self.desired_pool}/{self.name}", + presence="missing", + ) + + def ensure(self): # pragma: no cover + pass + + def ensure_presence(self): + self.ceph.ensure_volume_presence( + self.name, self.desired_pool, self.desired_size + ) + self.ceph.get_volume(self) + + +class RootSpec(VolumeSpecification): + suffix = "root" + + def start(self): + self.log.info("start-root") + + current_pool = self.exists_in_pool() + self.log.debug("root-found-in", current_pool=current_pool) + + if current_pool == self.desired_pool: + return + + # Image needs migration. + if self.ensure_migration(allow_execute=False): + # Variaton 1: The image that exists is itself being migrated + # currently, so postpone the migration and keep running with + # the existing disk. + # If there was a migration in progress, give it a chance to + # commit. but do not start executing it as that seems to create + # exclusive-locking issues result in VMs stuck booting until + # the execution phase has finished. + self.log.info( + "migrate-vm-root-disk", + action="postpone", + reason="A different migration is currently in progress.", + ) + else: + # Variation 2: The image is ready to be migrated, so let's go. + self.log.info( + "migrate-vm-root-disk", + action="start", + pool_from=current_pool, + pool_to=self.desired_pool, + ) + self.volume.unlock() + self.volume.close() + self.cmd( + f"rbd migration prepare {current_pool}/{self.name} " + f"{self.desired_pool}/{self.name}" + ) + # Ensure we now expose the correct volume. + self.ceph.get_volume(self) + + def ensure_presence(self): + if self.exists_in_pool(): + self.ceph.get_volume(self) + return + self.log.info("create-vm") + # We rely on the image being created in the CREATE_VM script as this + # will perform necessary cloning (or other) operations from whatever + # source it considers best. 
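RootSpec.start() above only issues "rbd migration prepare"; executing and committing the migration happen later via ensure_migration(), so the VM can boot from the prepared image while data moves in the background. The state it acts on comes from "rbd status --format json", roughly like this (the helper name and subprocess call are stand-ins for self.cmd):

    import json
    import subprocess

    def migration_state(image: str):
        """Return the rbd migration status dict for pool/name, or None."""
        out = subprocess.run(
            ["rbd", "status", "--format", "json", image],
            check=True,
            capture_output=True,
            text=True,
        ).stdout
        # e.g. {"state": "prepared", "source_pool_name": "rbd.hdd", ...}
        return json.loads(out).get("migration")
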
+ self.cmd(self.ceph.CREATE_VM.format(**self.ceph.cfg)) + self.ceph.get_volume(self) + self.regen_xfs_uuid() + + def ensure(self): + super().ensure() + self.ensure_migration() + + def ensure_migration(self, allow_execute=True, allow_commit=True): + migration = self.migration_status() + if not migration: + return + elif migration["state"] == "prepared" and allow_execute: + self.log.info("root-migration-execute") + self.cmd( + f"ceph rbd task add migration execute {self.volume.fullname}" + ) + return migration + elif migration["state"] == "executed" and allow_commit: + self.log.info("root-migration-commit") + self.cmd( + f"rbd --no-progress migration commit {self.volume.fullname}", + ) + return + # Indicate that there is a migration pending. + return migration + + def status(self): + super().status() + self.migration_status() + + def migration_status(self): + if not self.volume: + return + output = self.cmd( + f"rbd status --format json {self.volume.fullname}", + encoding="utf-8", + ) + migration = json.loads(output).get("migration", None) + if not migration: + return + self.log.info( + "root-migration-status", + status=migration["state"], + pool_from=migration["source_pool_name"], + pool_to=migration["dest_pool_name"], + progress=migration["state_description"], + ) + return migration + + def regen_xfs_uuid(self): + """Regenerate the UUID of the XFS filesystem on partition 1.""" + with self.volume.mapped(): + try: + self.volume.wait_for_part1dev() + except TimeoutError: + self.log.warn( + "regenerate-xfs-uuid", + status="skipped", + reason="no partition found", + ) + return + partition = self.volume.part1dev + output = self.cmd(f"blkid {partition} -o export") + values = parse_export_format(output) + fs_type = values.get("TYPE") + if fs_type != "xfs": + self.log.info( + "regenerate-xfs-uuid", + device=partition, + status="skipped", + fs_type=fs_type, + reason="filesystem type != xfs", + ) + return + with self.volume.mounted(): + # Mount once to ensure a clean log. + pass + self.log.info("regenerate-xfs-uuid", device=partition) + self.cmd(f"xfs_admin -U generate {partition}") + + +class TmpSpec(VolumeSpecification): + suffix = "tmp" + + ENC_SEED_PARAMETERS = ["cpu_model", "rbd_pool"] + + def pre_start(self): + for pool in self.exists_in_pools(): + if pool != self.desired_pool: + self.log.info("delete-outdated-tmp", pool=pool, image=self.name) + self.ceph.remove_volume(self.name, pool) + + def start(self): + self.log.info("start-tmp") + with self.volume.mapped(): + self.mkfs() + self.seed(self.ceph.enc, self.ceph.cfg["binary_generation"]) + + def mkfs(self): + self.log.debug("create-fs") + device = self.volume.device + assert device, f"volume must be mapped first: {device}" + self.cmd(f'sgdisk -o "{device}"') + self.cmd( + f'sgdisk -a 8192 -n 1:8192:0 -c "1:{self.suffix}" ' + f'-t 1:8300 "{device}"' + ) + self.cmd(f"partprobe {device}") + self.volume.wait_for_part1dev() + options = getattr(self.ceph, "MKFS_XFS") + self.cmd( + f'mkfs.xfs {options} -L "{self.suffix}" {self.volume.part1dev}' + ) + + def seed(self, enc, generation): + self.log.info("seed") + with self.volume.mounted() as target: + target.chmod(0o1777) + fc_data = target / "fc-data" + fc_data.mkdir() + fc_data.chmod(0o750) + enc_json = fc_data / "enc.json" + enc_json.touch(0o640) + with enc_json.open("w") as f: + json.dump(enc, f) + f.write("\n") + # Seed boot-time VM properties which require a reboot to + # change. 
While some of these properties are copied from + # the ENC data, a separate file allows properties which + # are not exposed to guests through ENC to be added in the + # future. + properties = {} + properties["binary_generation"] = generation + for key in self.ENC_SEED_PARAMETERS: + if key in enc["parameters"]: + properties[key] = enc["parameters"][key] + self.log.debug("guest-properties", properties=properties) + guest_properties = fc_data / "qemu-guest-properties-booted" + with guest_properties.open("w") as f: + json.dump(properties, f) + # For backwards compatibility with old fc-agent versions, + # write the Qemu binary generation into a separate file. + self.log.debug("binary-generation", generation=generation) + generation_marker = fc_data / "qemu-binary-generation-booted" + with generation_marker.open("w") as f: + f.write(str(generation) + "\n") + + +class SwapSpec(VolumeSpecification): + suffix = "swap" + + def pre_start(self): + for pool in self.exists_in_pools(): + if pool != self.desired_pool: + self.log.info( + "delete-outdated-swap", pool=pool, image=self.name + ) + self.ceph.remove_volume(self.name, pool) + + def start(self): + self.log.info("start-swap") + with self.volume.mapped(): + self.cmd(f'mkswap -f -L "{self.suffix}" {self.volume.device}') + + class Ceph(object): # Attributes on this class can be overriden in a controlled fashion # from the sysconfig module. See __init__(). The defaults are here to @@ -19,19 +353,34 @@ class Ceph(object): CREATE_VM = None - def __init__(self, cfg): + # Those are two different representations of the disks/volumes we manage. + # The can be treated from client code as well-known structures, so that + # when the context manager is active then the keys 'root', 'tmp','swp' + # etc. always exist. The specs will always carry a proper object, but the + # volumes may be None, as that depends on the bootstrapping of a VM which + # may not have happened. + # Otherwise, this code takes care that the volume + # objects are available to client code as long as a real Ceph volume does + # exist. + specs: Dict[str, VolumeSpecification] + volumes: Dict[str, Optional[Volume]] + + def __init__(self, cfg, enc) -> None: # Update configuration values from system or test config. self.__dict__.update(sysconfig.ceph) self.log = log.bind(subsystem="ceph", machine=cfg["name"]) + # enc `parameters` plus additional configs not included in the enc self.cfg = cfg + # the original enc data + self.enc = enc + self.rados = None - self.ioctx = None - self.root = None - self.swap = None - self.tmp = None - self.volumes = [] - self.pool = self.cfg["rbd_pool"] + self.ioctxs: Dict[str, rados.Ioctx] = {} + self.rbd = rbd.RBD() + + self.specs = {} + self.volumes = {} def __enter__(self): # Not sure whether it makes sense that we configure the client ID @@ -39,75 +388,116 @@ def __enter__(self): # Rados binding does ... :/ self.log.debug("connect-rados") self.rados = rados.Rados( - conffile=self.CEPH_CONF, name="client." + self.CEPH_CLIENT + conffile=self.CEPH_CONF, + name="client." + self.CEPH_CLIENT, ) self.rados.connect() - self.ioctx = self.rados.open_ioctx(self.pool) - - volume_prefix = self.cfg["name"] - self.root = Volume(self, volume_prefix + ".root", "root") - self.swap = Volume(self, volume_prefix + ".swap", "swap") - self.tmp = Volume(self, volume_prefix + ".tmp", "tmp") + # Keep open ioctx handles to all relevant pools. 
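valid_rbd_pool_name() at the top of this file decides which pools get an ioctx here, keeping the agent away from radosgw and other non-RBD pools. Spelled out as a self-contained copy of the predicate plus the cases it is meant to cover (the helper name and pool names other than "rbd" and "rbd.hdd" are examples):

    def _is_rbd_pool(name: str) -> bool:
        if name == "rbd":
            return True
        if ".rgw." in name:
            return False
        return name.startswith("rbd.")

    assert _is_rbd_pool("rbd")
    assert _is_rbd_pool("rbd.hdd")
    assert _is_rbd_pool("rbd.ssd")
    assert not _is_rbd_pool("default.rgw.buckets.data")
    assert not _is_rbd_pool("cephfs_data")
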
+ for pool_name in self.rados.list_pools(): + if not valid_rbd_pool_name(pool_name): + continue + self.ioctxs[pool_name] = self.rados.open_ioctx(pool_name) - self.volumes = [self.root, self.swap, self.tmp] + RootSpec(self) + SwapSpec(self) + TmpSpec(self) + for spec in self.specs.values(): + self.get_volume(spec) def __exit__(self, exc_value, exc_type, exc_tb): - for vol in self.volumes: - if vol._image: - vol._image.close() - self.ioctx.close() + for volume in self.opened_volumes: + volume.close() + self.volumes.clear() + for ioctx in self.ioctxs.values(): + ioctx.close() + self.ioctxs.clear() self.rados.shutdown() - def start(self, enc_data, generation): - self.ensure_root_volume() - self.ensure_tmp_volume(enc_data, generation) - self.ensure_swap_volume() + def start(self): + """Perform Ceph-related tasks before starting a VM.""" + for spec in self.specs.values(): + # The pre-start phase guarantes that volumes are not locked + # and have no watchers, so that they can be deleted if needed. + if spec.volume: + spec.volume.unlock() + spec.volume.close() + self.log.debug("pre-start", volume_spec=spec.suffix) + spec.pre_start() + + self.log.debug("ensure-presence", volume_spec=spec.suffix) + spec.ensure_presence() + + # The start phase guarantees the locks again. + spec.volume.lock() + + self.log.debug("ensure-size", volume_spec=spec.suffix) + spec.volume.ensure_size(spec.desired_size) + + self.log.debug("start", volume_spec=spec.suffix) + spec.start() def stop(self): + """Perform Ceph-related tasks after a VM has been stopped.""" self.unlock() - def ensure_root_volume(self): - self.log.info("ensure-root") - if not self.root.exists(): - self.log.info("create-vm") - cmd(self.CREATE_VM.format(**self.cfg), self.log) - self.root.regen_xfs_uuid() - self.root.lock() - - def ensure_swap_volume(self): - self.log.info("ensure-swap") - self.swap.ensure_presence(self.cfg["swap_size"]) - self.swap.lock() - self.swap.ensure_size(self.cfg["swap_size"]) - with self.swap.mapped(): - self.swap.mkswap() - - def ensure_tmp_volume(self, enc_data, generation): - self.log.info("ensure-tmp") - self.tmp.ensure_presence(self.cfg["tmp_size"]) - self.tmp.lock() - self.tmp.ensure_size(self.cfg["tmp_size"]) - with self.tmp.mapped(): - self.tmp.mkfs() - self.tmp.seed(enc_data, generation) + def ensure(self): + """Perform Ceph-related tasks to maintain a running VM.""" + for spec in self.specs.values(): + spec.ensure() + + def ensure_volume_presence(self, name, pool, size): + for ioctx in self.ioctxs.values(): + if name in self.rbd.list(ioctx): + return + self.rbd.create(self.ioctxs[pool], name, size) + + def remove_volume(self, name, pool): + self.rbd.remove(self.ioctxs[pool], name) + + def get_volume(self, spec): + """(Re-)Attach a volume object for a spec.""" + if volume := self.volumes[spec.suffix]: + volume.close() + current_pool = spec.exists_in_pool() + if not current_pool: + return + self.volumes[spec.suffix] = volume = Volume( + self, self.ioctxs[current_pool], spec.name + ) + return volume + + @property + def opened_volumes(self): + return filter(None, self.volumes.values()) + + def _clean_volume(self, volume): + for key, candidate in self.volumes.items(): + if candidate is volume: + self.volumes[key] = None + + def status(self): + # Report status for CLI usage + for spec in self.specs.values(): + spec.status() def locks(self): - for vol in self.volumes: - status = vol.lock_status() + for volume in self.opened_volumes: + status = volume.lock_status() if not status: continue - yield vol.name, status[1] + yield 
volume.name, status[1] def is_unlocked(self): """Returns True if no volume is locked.""" - return all(not vol.lock_status() for vol in self.volumes) + return all(not volume.lock_status() for volume in self.opened_volumes) def locked_by_me(self): """Returns True if CEPH_LOCK_HOST holds locks for all volumes.""" try: return all( - v.lock_status()[1] == self.CEPH_LOCK_HOST for v in self.volumes + v.lock_status()[1] == self.CEPH_LOCK_HOST + for v in self.opened_volumes ) except TypeError: # status[1] not accessible return False @@ -119,17 +509,17 @@ def locked_by(self): """ lock_owners = set( - v.lock_status()[1] for v in self.volumes if v.lock_status() + v.lock_status()[1] for v in self.opened_volumes if v.lock_status() ) if not lock_owners: return None if len(lock_owners) != 1: - raise ValueError("Multiple lock owners: {}".format(lock_owners)) + raise ValueError(f"Multiple lock owners: {lock_owners}") return lock_owners.pop() def lock(self): - for vol in self.volumes: - vol.lock() + for volume in self.opened_volumes: + volume.lock() def unlock(self): """Remove all of *our* volume locks. @@ -140,11 +530,11 @@ def unlock(self): This leaves other hosts' locks in place. """ exception = False - for vol in self.volumes: + for volume in self.opened_volumes: try: - vol.unlock() + volume.unlock() except Exception: - vol.log.warning("unlock-failed", exc_info=True) + volume.log.warning("unlock-failed", exc_info=True) exception = True if exception: raise RuntimeError( @@ -152,8 +542,8 @@ def unlock(self): ) def force_unlock(self): - for vol in self.volumes: - vol.unlock(force=True) + for volume in self.opened_volumes: + volume.unlock(force=True) def auth_cookie(self): """This is a cookie that can be used to validate that a party @@ -162,7 +552,10 @@ def auth_cookie(self): Used to authenticate migration requests. """ c = hashlib.sha1() - for vol in self.volumes: + for key in ["root", "swap", "tmp"]: + # This order needs to stay stable to support the auth cookie + # between old and new versions of fc.qemu + vol = self.volumes[key] status = [vol.name] lock = vol.lock_status() if lock: diff --git a/src/fc/qemu/hazmat/guestagent.py b/src/fc/qemu/hazmat/guestagent.py index 109e86d..6e8d5ef 100644 --- a/src/fc/qemu/hazmat/guestagent.py +++ b/src/fc/qemu/hazmat/guestagent.py @@ -10,6 +10,9 @@ class ClientError(RuntimeError): pass +SYNC_TIMEOUT = 30 + + class GuestAgent(object): """Wraps qemu guest agent wire protocol.""" @@ -58,6 +61,7 @@ def cmd(self, cmd, timeout=None, fire_and_forget=False, **args): self.connect() message = json.dumps({"execute": cmd, "arguments": args}) message = message.encode("utf-8") + self.log.debug("send", message=message) self.client.send(message) if not fire_and_forget: self.client.settimeout(timeout or self.timeout) @@ -84,28 +88,14 @@ def sync(self): try: while buffer := self.client.recv(4096): self.log.debug("found-buffer-garbage", buffer=buffer) - except (BlockingIOError, socket.timeout): + except socket.timeout: self.log.debug("cleared-buffer") # Phase 3: ensure we see proper agent interactions. To be sure we # test this with two diagnostic calls. The timeout can be higher now # as we expect the agent to actually have to respond to us. - - # \xff is an invalid utf-8 character and recommended to safely - # ensure that the parser of the guest agent at the other end - # is reset to a known state. This is recommended for sync. 
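The block removed here used to push a 0xFF sentinel byte and a guest-ping through the socket before syncing; per the changelog entry (PL-132809) that did not work as advertised, so sync() now relies solely on the guest-sync id round trip with the longer SYNC_TIMEOUT. A minimal sketch of the message it builds, mirroring cmd() above (nothing is actually sent here):

    import json
    import random

    sync_id = random.randint(0, 0xFFFF)
    request = json.dumps({"execute": "guest-sync", "arguments": {"id": sync_id}})
    # The guest agent is expected to answer with {"return": <sync_id>};
    # sync() only succeeds when the returned id matches the one sent.
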
- # http://wiki.qemu-project.org/index.php/Features/GuestAgent#guest-sync - self.log.debug("clear-guest-parser") - self.client.send(b"\xff") - - ping_result = self.cmd("guest-ping") - if ping_result != {}: - raise ClientError( - f"Agent did not respond properly to ping command: {ping_result}" - ) - sync_id = random.randint(0, 0xFFFF) - result = self.cmd("guest-sync", timeout=30, id=sync_id) + result = self.cmd("guest-sync", timeout=SYNC_TIMEOUT, id=sync_id) self.log.debug("sync-response", expected=sync_id, got=result) if result == sync_id: diff --git a/src/fc/qemu/hazmat/qemu.py b/src/fc/qemu/hazmat/qemu.py index 665aba1..4dbe4ab 100644 --- a/src/fc/qemu/hazmat/qemu.py +++ b/src/fc/qemu/hazmat/qemu.py @@ -2,12 +2,12 @@ import datetime import fcntl -import glob -import os.path +import os import socket import subprocess import time from codecs import encode +from pathlib import Path import psutil import yaml @@ -16,7 +16,6 @@ from ..sysconfig import sysconfig from ..timeout import TimeOut from ..util import ControlledRuntimeException, log -from . import supervise from .guestagent import ClientError, GuestAgent from .qmp import QEMUMonitorProtocol as Qmp from .qmp import QMPConnectError @@ -54,23 +53,24 @@ def detect_current_machine_type( def locked_global(f): - LOCK = "/run/fc-qemu.lock" + LOCK = Path("run/fc-qemu.lock") # This is thread-safe *AS LONG* as every thread uses a separate instance # of the agent. Using multiple file descriptors will guarantee that the # lock can only be held once even within a single process. def locked(self, *args, **kw): - self.log.debug("acquire-global-lock", target=LOCK) + lock = self.prefix / LOCK + self.log.debug("acquire-global-lock", target=lock) if not self._global_lock_fd: - if not os.path.exists(LOCK): - open(LOCK, "a+").close() - self._global_lock_fd = os.open(LOCK, os.O_RDONLY) - self.log.debug("global-lock-acquire", target=LOCK, result="locked") + if not lock.exists(): + lock.touch() + self._global_lock_fd = os.open(lock, os.O_RDONLY) + self.log.debug("global-lock-acquire", target=lock, result="locked") fcntl.flock(self._global_lock_fd, fcntl.LOCK_EX) self._global_lock_count += 1 self.log.debug( - "global-lock-status", target=LOCK, count=self._global_lock_count + "global-lock-status", target=lock, count=self._global_lock_count ) try: return f(self, *args, **kw) @@ -78,11 +78,11 @@ def locked(self, *args, **kw): self._global_lock_count -= 1 self.log.debug( "global-lock-status", - target=LOCK, + target=lock, count=self._global_lock_count, ) if self._global_lock_count == 0: - self.log.debug("global-lock-release", target=LOCK) + self.log.debug("global-lock-release", target=lock) fcntl.flock(self._global_lock_fd, fcntl.LOCK_UN) self.log.debug("global-lock-release", result="unlocked") @@ -90,7 +90,7 @@ def locked(self, *args, **kw): class Qemu(object): - prefix = "" + prefix = Path("/") executable = "qemu-system-x86_64" # Attributes on this class can be overriden (in a controlled fashion @@ -119,20 +119,23 @@ class Qemu(object): config = "" # Host-specific qemu configuration - chroot = "/srv/vm/{name}" + chroot = Path("srv/vm/{name}") vnc = "localhost:1" MONITOR_OFFSET = 20000 - pidfile = "/run/qemu.{name}.pid" - configfile = "/run/qemu.{name}.cfg" - argfile = "/run/qemu.{name}.args" - qmp_socket = "/run/qemu.{name}.qmp.sock" + pid_file = Path("run/qemu.{name}.pid") + config_file = Path("run/qemu.{name}.cfg") + config_file_in = Path("run/qemu.{name}.cfg.in") + arg_file = Path("run/qemu.{name}.args") + arg_file_in = Path("run/qemu.{name}.args.in") + 
qmp_socket = Path("run/qemu.{name}.qmp.sock") + serial_file = Path("var/log/vm/{name}.log") _global_lock_fd = None _global_lock_count = 0 - migration_lockfile = "/run/qemu.migration.lock" + migration_lock_file = Path("run/qemu.migration.lock") _migration_lock_fd = None def __init__(self, vm_cfg): @@ -141,14 +144,24 @@ def __init__(self, vm_cfg): self.cfg = vm_cfg # expand template keywords in configuration variables + for f in ["migration_address"]: + setattr(self, f, getattr(self, f).format(**vm_cfg)) + + # expand template keywords in paths and apply prefix for f in [ - "pidfile", - "configfile", - "argfile", - "migration_address", + "pid_file", + "config_file", + "config_file_in", + "arg_file", + "arg_file_in", "qmp_socket", + "migration_lock_file", + "chroot", + "serial_file", ]: - setattr(self, f, getattr(self, f).format(**vm_cfg)) + expanded = self.prefix / str(getattr(self, f)).format(**vm_cfg) + setattr(self, f, expanded) + vm_cfg[f] = expanded # We are running qemu with chroot which causes us to not be able to # resolve names. :( See #13837. @@ -162,19 +175,12 @@ def __init__(self, vm_cfg): self.log = log.bind(machine=self.name, subsystem="qemu") - self.pidfile = self.prefix + self.pidfile - self.configfile = self.prefix + self.configfile - self.argfile = self.prefix + self.argfile - self.qmp_socket = self.prefix + self.qmp_socket - self.migration_lockfile = self.prefix + self.migration_lockfile - self.chroot = self.prefix + self.chroot - __qmp = None @property def qmp(self): if self.__qmp is None: - qmp = Qmp(self.qmp_socket, self.log) + qmp = Qmp(str(self.qmp_socket), self.log) qmp.settimeout(self.qmp_timeout) try: qmp.connect() @@ -202,7 +208,7 @@ def proc(self): not running. """ try: - with open(self.pidfile) as p: + with self.pid_file.open() as p: # pid file may contain trailing lines with garbage for line in p: proc = psutil.Process(int(line)) @@ -215,14 +221,14 @@ def proc(self): pass def prepare_log(self): - if not os.path.exists("/var/log/vm"): + log_dir = self.prefix / Path("/var/log/vm") + if not log_dir.is_dir(): raise RuntimeError("Expected directory /var/log/vm to exist.") - logfile = "/var/log/vm/{}.log".format(self.name) - alternate = "/var/log/vm/{}-{}.log".format( - self.name, datetime.datetime.now().isoformat() - ) - if os.path.exists(logfile): - os.rename(logfile, alternate) + log_file = log_dir / f"{self.name}.log" + if log_file.exists(): + alt_marker = datetime.datetime.now().isoformat() + alternate = log_dir / f"{self.name}-{alt_marker}.log" + log_file.rename(alternate) def _current_vms_booked_memory(self): """Determine the amount of booked memory (MiB) from the @@ -303,11 +309,11 @@ def _verify_memory(self): ) # This lock protects checking the amount of available memory and actually - # starting the VM. This ensure that no other process checks at the same + # starting the VM. This ensures that no other process checks at the same # time and we end up using the free memory twice. @locked_global def _start(self, additional_args=()): - if self.require_kvm and not os.path.exists("/dev/kvm"): + if self.require_kvm and not Path("/dev/kvm").exists(): self.log.error("missing-kvm-support") raise ControlledRuntimeException( "Refusing to start without /dev/kvm support." 
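The __init__ changes above replace string concatenation of self.prefix with pathlib joins: each template such as "run/qemu.{name}.pid" is formatted with the VM configuration and then anchored under self.prefix ("/" in production, a sandbox directory in tests). Reduced to a sketch (the VM name "vm00" is just an example):

    from pathlib import Path

    prefix = Path("/")
    templates = {
        "pid_file": Path("run/qemu.{name}.pid"),
        "config_file": Path("run/qemu.{name}.cfg"),
    }
    cfg = {"name": "vm00"}
    expanded = {
        key: prefix / str(template).format(**cfg)
        for key, template in templates.items()
    }
    assert expanded["pid_file"] == Path("/run/qemu.vm00.pid")
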
@@ -406,8 +412,12 @@ def write_file(self, path, content: bytes): self.log.error("guest-write-file", exc_info=True) def inmigrate(self): - self._start(["-incoming {}".format(self.migration_address)]) - time.sleep(1) + self._start([f"-incoming {self.migration_address}"]) + + timeout = TimeOut(30, 1, raise_on_timeout=True) + while self.qmp is None: + timeout.tick() + status = self.qmp.command("query-status") assert not status["running"], status assert status["status"] == "inmigrate", status @@ -629,59 +639,64 @@ def watchdog_action(self, action): ) def clean_run_files(self): - runfiles = glob.glob("/run/qemu.{}.*".format(self.cfg["name"])) + runfiles = list( + (self.prefix / "run").glob(f"qemu.{self.cfg['name']}.*") + ) if not runfiles: return self.log.debug("clean-run-files") for runfile in runfiles: - if runfile.endswith(".lock"): + if runfile.suffix == ".lock": # Never, ever, remove lock files. Those should be on # partitions that get cleaned out during reboot, but # never otherwise. continue - os.unlink(runfile) + runfile.unlink(missing_ok=True) def prepare_config(self): - chroot = self.chroot.format(**self.cfg) - if not os.path.exists(chroot): - os.makedirs(chroot) + if not self.chroot.exists(): + self.chroot.mkdir(parents=True, exist_ok=True) def format(s): + # These names must stay compatible between + # fc.qemu versions so that VMs can migrate between + # older and newer versions freely. return s.format( - pidfile=self.pidfile, - configfile=self.configfile, monitor_port=self.monitor_port, vnc=self.vnc.format(**self.cfg), - chroot=chroot, + pidfile=self.cfg["pid_file"], + configfile=self.cfg["config_file"], **self.cfg, ) self.local_args = [format(a) for a in self.args] self.local_config = format(self.config) - with open(self.configfile + ".in", "w") as f: + with self.config_file_in.open("w") as f: f.write(self.config) - with open(self.configfile, "w") as f: + + with self.config_file.open("w") as f: f.write(self.local_config) - with open(self.argfile + ".in", "w") as f: + + with self.arg_file_in.open("w") as f: yaml.safe_dump(self.args, f) # Qemu tends to overwrite the pid file incompletely -> truncate - open(self.pidfile, "w").close() + self.pid_file.open("w").close() def get_running_config(self): """Return the host-independent version of the current running config.""" - with open(self.argfile + ".in") as a: + with self.arg_file_in.open() as a: args = yaml.safe_load(a.read()) - with open(self.configfile + ".in") as c: + with self.config_file_in.open() as c: config = c.read() return args, config def acquire_migration_lock(self): assert not self._migration_lock_fd - open(self.migration_lockfile, "a+").close() - self._migration_lock_fd = os.open(self.migration_lockfile, os.O_RDONLY) + open(self.migration_lock_file, "a+").close() + self._migration_lock_fd = os.open(self.migration_lock_file, os.O_RDONLY) try: fcntl.flock(self._migration_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) self.log.debug("acquire-migration-lock", result="success") diff --git a/src/fc/qemu/hazmat/tests/test_ceph.py b/src/fc/qemu/hazmat/tests/test_ceph.py deleted file mode 100644 index 9ca56ef..0000000 --- a/src/fc/qemu/hazmat/tests/test_ceph.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest -import rbd - - -@pytest.fixture -def ceph_with_volumes(ceph_inst): - for vol in ceph_inst.volumes: - vol.ensure_presence() - ceph_inst.lock() - yield ceph_inst - for vol in ceph_inst.volumes: - vol.unlock(force=True) - vol.snapshots.purge() - if vol._image: - vol._image.close() - rbd.RBD().remove(ceph_inst.ioctx, vol.name) - - -def 
test_ceph_stop_should_unlock_all_volumes(ceph_with_volumes): - for vol in ceph_with_volumes.volumes: - assert vol.lock_status() - ceph_with_volumes.stop() - for vol in ceph_with_volumes.volumes: - assert vol.lock_status() is None - - -def test_ceph_stop_remove_only_own_locks(ceph_with_volumes): - """Test case where failed migrations leave inconsistent locking.""" - ceph_with_volumes.root.unlock() - ceph_with_volumes.root.rbdimage.lock_exclusive("someotherhost") - # It unlocks what it can. - ceph_with_volumes.stop() - assert ceph_with_volumes.root.lock_status() - assert ceph_with_volumes.swap.lock_status() is None - assert ceph_with_volumes.tmp.lock_status() is None - - -def test_is_unlocked(ceph_with_volumes): - assert ceph_with_volumes.is_unlocked() is False - ceph_with_volumes.unlock() - assert ceph_with_volumes.is_unlocked() is True diff --git a/src/fc/qemu/hazmat/tests/test_volume.py b/src/fc/qemu/hazmat/tests/test_volume.py deleted file mode 100644 index fd2bbf4..0000000 --- a/src/fc/qemu/hazmat/tests/test_volume.py +++ /dev/null @@ -1,202 +0,0 @@ -import os.path -import time - -import pytest -import rbd - -from ..volume import Volume - - -@pytest.fixture -def volume(ceph_inst): - volume = Volume(ceph_inst, "othervolume", "label") - volume.snapshots.purge() - try: - if volume._image: - volume._image.close() - rbd.RBD().remove(ceph_inst.ioctx, "othervolume") - except rbd.ImageNotFound: - pass - - yield volume - time.sleep(0.2) - - lock = volume.lock_status() - if lock is not None: - volume.rbdimage.break_lock(*lock) - volume.snapshots.purge() - try: - if volume._image: - volume._image.close() - rbd.RBD().remove(ceph_inst.ioctx, "othervolume") - except rbd.ImageNotFound: - pass - - -def test_volume_presence(volume): - assert volume.fullname == "rbd.hdd/othervolume" - assert not volume.exists() - with pytest.raises(rbd.ImageNotFound): - volume.rbdimage - volume.ensure_presence() - assert volume.rbdimage - # Check that ensure_presence is fine with being called multiple times. - volume.ensure_presence() - - -def test_volume_snapshot(volume): - volume.ensure_presence() - volume.snapshots.create("s0") - snaps = list(volume.snapshots) - assert len(snaps) == 1 - snapshot = snaps[0] - assert snapshot.name == "othervolume" - assert snapshot.snapname == "s0" - assert snapshot.size == volume.size - assert snapshot.id == volume.snapshots["s0"].id - - snapshot.remove() - assert [] == list(volume.snapshots) - - -def test_purge_snapshots(volume): - volume.ensure_presence() - for snap in ["s0", "s1"]: - volume.snapshots.create(snap) - assert len(volume.snapshots) == 2 - volume.snapshots.purge() - assert len(volume.snapshots) == 0 - - -def test_snapshot_not_found(volume): - with pytest.raises(KeyError): - volume.snapshots["no-such-key"] - - -def test_volume_size(volume): - volume.ensure_presence() - assert volume.rbdimage - assert volume.size == 1024 - volume.ensure_size(2048) - assert volume.size == 2048 - # Call ensure multiple times to help triggering caching code paths. 
- volume.ensure_size(2048) - assert volume.size == 2048 - - -def test_volume_shared_lock_protection(volume): - volume.ensure_presence() - volume.rbdimage.lock_shared("host1", "a") - volume.rbdimage.lock_shared("remotehost", "a") - with pytest.raises(NotImplementedError): - volume.lock_status() - lockers = volume.rbdimage.list_lockers() - for client, cookie, _ in lockers["lockers"]: - volume.rbdimage.break_lock(client, cookie) - - -def test_volume_locking(volume): - # Non-existing volumes report None as locking status but do not raise - # an exception. - assert not volume.exists() - assert volume.lock_status() is None - volume.ensure_presence() - assert volume.lock_status() is None - volume.lock() - assert volume.lock_status()[1] == "host1" - # We want to smoothen out that some other process has locked the same image - # for the same tag already and assume that this is another incarnation of - # us - for that we have our own lock. - volume.lock() - assert volume.lock_status()[1] == "host1" - volume.unlock() - assert volume.lock_status() is None - # We can call unlock twice if it isn't locked. - volume.unlock() - - volume.rbdimage.lock_exclusive("someotherhost") - with pytest.raises(rbd.ImageBusy): - volume.lock() - - # Can not unlock locks that someone else holds. - volume.unlock() - assert volume.lock_status()[1] == "someotherhost" - - -def test_force_unlock(volume): - volume.ensure_presence() - volume.rbdimage.lock_exclusive("someotherhost") - volume.unlock(force=True) - assert volume.lock_status() is None - - -# increase timeout from the default of 3s -@pytest.mark.timeout(10) -def test_volume_mkswap(volume): - volume.ensure_presence() - volume.ensure_size(5 * 1024**2) - with volume.mapped(): - volume.mkswap() - - -@pytest.mark.timeout(60) -def test_volume_mkfs(volume): - volume.ensure_presence() - volume.ensure_size(40 * 1024**2) - with volume.mapped(): - volume.mkfs() - - -def test_unmapped_volume_should_have_no_part1(volume): - # not something like 'None-part1' - assert volume.part1dev is None - - -def test_volume_map_unmap_is_idempotent(volume): - # This is more of an internal sanity test within our mocking infrastructure. 
- volume.ensure_presence() - volume.map() - device = volume.device - assert os.path.exists(device) - volume.map() - assert os.path.exists(device) - volume.unmap() - assert not os.path.exists(device) - volume.unmap() - assert not os.path.exists(device) - - -def test_map_snapshot(volume): - volume.ensure_presence() - volume.snapshots.create("s0") - with volume.snapshots["s0"].mapped() as device: - assert os.path.exists(device) - - -def test_mount_should_fail_if_not_mapped(volume): - volume.ensure_presence() - with pytest.raises(RuntimeError): - volume.mount() - - -@pytest.mark.timeout(60) -@pytest.mark.live() -def test_mount_snapshot(volume): - volume.ensure_presence() - volume.ensure_size(40 * 1024**2) - with volume.mapped(): - volume.mkfs(fstype="xfs", gptbios=True) - volume.snapshots.create("s0") - snap = volume.snapshots["s0"] - with snap.mounted() as mp: - mountpoint = mp - assert os.path.ismount(mp) - with open("/proc/self/mounts") as mounts: - assert "{} xfs ro".format(mp) in mounts.read() - # test for idempotence - snap.mount() - assert os.path.ismount(mp) - assert not os.path.ismount(mountpoint) - # test for idempotence - snap.umount() - assert not os.path.ismount(mountpoint) diff --git a/src/fc/qemu/hazmat/volume.py b/src/fc/qemu/hazmat/volume.py index 52d081f..95110d8 100644 --- a/src/fc/qemu/hazmat/volume.py +++ b/src/fc/qemu/hazmat/volume.py @@ -6,26 +6,27 @@ """ import contextlib -import json -import os -import os.path as p import time +from pathlib import Path +from typing import Optional import rbd -from ..timeout import TimeOut, TimeoutError -from ..util import cmd, parse_export_format, remove_empty_dirs +from ..timeout import TimeOut +from ..util import cmd, remove_empty_dirs class Image(object): """Abstract base class for all images (volumes and snapshots).""" - device = None - mountpoint = None + device: Optional[Path] = None + mountpoint: Optional[Path] = None - def __init__(self, ceph, name): + _image = None + + def __init__(self, ceph, ioctx, name): self.ceph = ceph - self.ioctx = ceph.ioctx + self.ioctx = ioctx self.name = name self.log = ceph.log.bind(image=self.name) self.rbd = rbd.RBD() @@ -45,24 +46,23 @@ def fullname(self): # pragma: no cover def part1dev(self): if not self.device: return None - return self.device + "-part1" + return self.device.with_name(self.device.name + "-part1") def wait_for_part1dev(self): timeout = TimeOut(5, interval=0.1, raise_on_timeout=True, log=self.log) while timeout.tick(): - if os.path.exists(self.part1dev): + if self.part1dev.exists(): break def map(self): if self.device is not None: return self.cmd( - 'rbd -c "{}" --id "{}" map "{}"'.format( - self.ceph.CEPH_CONF, self.ceph.CEPH_CLIENT, self.fullname - ) + f'rbd -c "{self.ceph.CEPH_CONF}" --id "{self.ceph.CEPH_CLIENT}" ' + f'map "{self.fullname}"' ) - device = "/dev/rbd/" + self.fullname - while not os.path.exists(device): + device = Path("/dev/rbd") / self.fullname + while not device.exists(): time.sleep(0.1) self.device = device @@ -70,9 +70,8 @@ def unmap(self): if self.device is None: return self.cmd( - 'rbd -c "{}" --id "{}" unmap "{}"'.format( - self.ceph.CEPH_CONF, self.ceph.CEPH_CLIENT, self.device - ) + f'rbd -c "{self.ceph.CEPH_CONF}" --id "{self.ceph.CEPH_CLIENT}" ' + f'unmap "{self.device}"' ) self.device = None @@ -97,19 +96,16 @@ def mount(self): raise RuntimeError( "image must be mapped before mounting", self.fullname ) - mountpoint = "/mnt/rbd/{}".format(self.fullname) - try: - os.makedirs(mountpoint) - except OSError: # pragma: no cover - pass + mountpoint = 
Path("/mnt/rbd") / self.fullname + mountpoint.mkdir(parents=True, exist_ok=True) self.wait_for_part1dev() - self.cmd('mount "{}" "{}"'.format(self.part1dev, mountpoint)) + self.cmd(f'mount "{self.part1dev}" "{mountpoint}"') self.mountpoint = mountpoint def umount(self): if self.mountpoint is None: return - self.cmd('umount "{}"'.format(self.mountpoint)) + self.cmd(f'umount "{self.mountpoint}"') remove_empty_dirs(self.mountpoint) self.mountpoint = None @@ -179,7 +175,7 @@ class Snapshot(Image): """Single snapshot of a Volume.""" def __init__(self, volume, snapname, id, snapsize): - super(Snapshot, self).__init__(volume.ceph, volume.name) + super(Snapshot, self).__init__(volume.ceph, volume.ioctx, volume.name) self.vol = volume self.snapname = snapname self.id = id @@ -189,7 +185,9 @@ def __init__(self, volume, snapname, id, snapsize): @property def rbdimage(self): - return rbd.Image(self.ioctx, self.name, self.snapname) + if self._image is None: + self._image = rbd.Image(self.ioctx, self.name, self.snapname) + return self._image @property def fullname(self): @@ -208,28 +206,13 @@ def remove(self): class Volume(Image): - """RBD Volume interface. - - A volume has a name (shown in `rbd ls`) and a filesystem label. The - latter is only used for mkfs/mkswap. A volume may have a set of - snapshots accessible via the `snapshots` container object. - - """ - - MKFS_CMD = { - "xfs": ('mkfs.xfs {options} -L "{label}" "{device}"'), - "ext4": ( - 'mkfs.ext4 {options} -L "{label}" "{device}" ' - '&& tune2fs -e remount-ro "{device}"' - ), - } + """RBD volume interface.""" # ENC parameters which should be seeded at boot-time into the VM ENC_SEED_PARAMETERS = ["cpu_model", "rbd_pool"] - def __init__(self, ceph, name, label): - super(Volume, self).__init__(ceph, name) - self.label = label + def __init__(self, ceph, ioctx, name): + super(Volume, self).__init__(ceph, ioctx, name) self.log = ceph.log.bind(volume=self.fullname) self.cmd = lambda cmdline: cmd(cmdline, log=self.log) self.snapshots = Snapshots(self) @@ -242,6 +225,11 @@ def rbdimage(self): self._image = rbd.Image(self.ioctx, self.name) return self._image + def close(self): + if self._image: + self._image.close() + self.ceph._clean_volume(self) + @property def fullname(self): return self.ioctx.name.decode("ascii") + "/" + self.name @@ -251,14 +239,6 @@ def size(self): """Image size in Bytes.""" return self.rbdimage.size() - def exists(self): - return self.name in self.rbd.list(self.ioctx) - - def ensure_presence(self, size=1024): - if self.name in self.rbd.list(self.ioctx): - return - self.rbd.create(self.ioctx, self.name, size) - def ensure_size(self, size): if self.size == size: return @@ -332,13 +312,9 @@ def _break_lock(): self.log.warn("buggy-lock", action="low-level-delete") # This looks a lot like a broken hammer installation. # Be cruel. - img_id = self.ioctx.read( - "rbd_id.{}".format(self.name) - ).strip() + img_id = self.ioctx.read(f"rbd_id.{self.name}").strip() img_id = img_id[4:] - self.ioctx.rm_xattr( - "rbd_header.{}".format(img_id), "lock.rbd_lock" - ) + self.ioctx.rm_xattr(f"rbd_header.{img_id}", "lock.rbd_lock") if lock_id == self.ceph.CEPH_LOCK_HOST: # This host owns this lock but from a different connection. This @@ -349,93 +325,3 @@ def _break_lock(): elif force: self.log.info("break-lock") _break_lock() - - def mkswap(self): - """Creates a swap partition. 
Requires the volume to be mappped.""" - assert self.device, "volume must be mapped first" - self.cmd('mkswap -f -L "{}" "{}"'.format(self.label, self.device)) - - def mkfs(self, fstype="xfs", gptbios=False): - self.log.debug("create-fs", type=fstype) - assert self.device, "volume must be mapped first" - self.cmd('sgdisk -o "{}"'.format(self.device)) - self.cmd( - 'sgdisk -a 8192 -n 1:8192:0 -c "1:{}" -t 1:8300 ' - '"{}"'.format(self.label, self.device) - ) - if gptbios: - self.cmd( - 'sgdisk -n 2:2048:+1M -c 2:gptbios -t 2:EF02 "{}"'.format( - self.device - ) - ) - self.cmd("partprobe {}".format(self.device)) - self.wait_for_part1dev() - options = getattr(self.ceph, "MKFS_" + fstype.upper()) - self.cmd( - self.MKFS_CMD[fstype].format( - options=options, device=self.part1dev, label=self.label - ) - ) - - def regen_xfs_uuid(self): - """Regenerate the UUID of the XFS filesystem on partition 1.""" - with self.mapped(): - try: - self.wait_for_part1dev() - except TimeoutError: - self.log.warn( - "regenerate-xfs-uuid", - status="skipped", - reason="no partition found", - ) - return - output = self.cmd(f"blkid {self.part1dev} -o export") - values = parse_export_format(output) - fs_type = values.get("TYPE") - if fs_type != "xfs": - self.log.info( - "regenerate-xfs-uuid", - device=self.part1dev, - status="skipped", - fs_type=fs_type, - reason="filesystem type != xfs", - ) - return - with self.mounted(): - # Mount once to ensure a clean log. - pass - self.log.info("regenerate-xfs-uuid", device=self.part1dev) - self.cmd(f"xfs_admin -U generate {self.part1dev}") - - def seed(self, enc, generation): - self.log.info("seed") - with self.mounted() as target: - os.chmod(target, 0o1777) - fc_data = p.join(target, "fc-data") - os.mkdir(fc_data) - os.chmod(fc_data, 0o750) - with open(p.join(fc_data, "enc.json"), "w") as f: - os.fchmod(f.fileno(), 0o640) - json.dump(enc, f) - f.write("\n") - # Seed boot-time VM properties which require a reboot to - # change. While some of these properties are copied from - # the ENC data, a separate file allows properties which - # are not exposed to guests through ENC to be added in the - # future. - properties = {} - properties["binary_generation"] = generation - for key in self.ENC_SEED_PARAMETERS: - if key in enc["parameters"]: - properties[key] = enc["parameters"][key] - self.log.debug("guest-properties", properties=properties) - guest_properties = p.join(fc_data, "qemu-guest-properties-booted") - with open(guest_properties, "w") as f: - json.dump(properties, f) - # For backwards compatibility with old fc-agent versions, - # write the Qemu binary generation into a separate file. 
- self.log.debug("binary-generation", generation=generation) - generation_marker = p.join(fc_data, "qemu-binary-generation-booted") - with open(generation_marker, "w") as f: - f.write(str(generation) + "\n") diff --git a/src/fc/qemu/logging.py b/src/fc/qemu/logging.py index 030e93a..febc088 100644 --- a/src/fc/qemu/logging.py +++ b/src/fc/qemu/logging.py @@ -6,6 +6,7 @@ import os import sys +from pathlib import Path import structlog from six import StringIO @@ -96,6 +97,12 @@ def prefix(prefix, line): return "{}>\t".format(prefix) + line.replace("\n", "\n{}>\t".format(prefix)) +def format_value(value): + if isinstance(value, Path): + value = str(value) + return repr(value) + + class MultiConsoleRenderer(object): """ Render `event_dict` nicely aligned, in colors, and ordered with @@ -181,7 +188,11 @@ def write(line): machine = event_dict.pop("machine", "") if machine: - write(machine.ljust(20) + " ") + write(machine.ljust(10) + " ") + subsystem = event_dict.pop("subsystem", "") + if machine or subsystem: + # Always keep the columns in order if we have a machine. + write(subsystem.rjust(10)[:10] + " ") output = event_dict.pop("output", None) output_line = event_dict.pop("output", None) @@ -204,7 +215,7 @@ def write(line): + RESET_ALL + "=" + MAGENTA - + repr(event_dict[key]) + + format_value(event_dict[key]) + RESET_ALL for key in sorted(event_dict.keys()) ) diff --git a/src/fc/qemu/main.py b/src/fc/qemu/main.py index 8b97e76..30adab5 100644 --- a/src/fc/qemu/main.py +++ b/src/fc/qemu/main.py @@ -230,7 +230,7 @@ def main(): agent = Agent(vm) with agent: sys.exit(getattr(agent, func)(**kwargs) or 0) - except (ControlledRuntimeException): + except ControlledRuntimeException: # Those exceptions are properly logged and indicate an error with a # proper shutdown. log.exception("controlled-exception", exc_info=True) @@ -238,7 +238,6 @@ def main(): except (VMConfigNotFound, InvalidCommand): # Those exceptions are properly logged and don't have to be shown # with their traceback. 
- log.debug("unexpected-exception") sys.exit(69) # EX_UNAVAILABLE except Exception: log.exception("unexpected-exception", exc_info=True) diff --git a/src/fc/qemu/outgoing.py b/src/fc/qemu/outgoing.py index a448ba1..2a7182a 100644 --- a/src/fc/qemu/outgoing.py +++ b/src/fc/qemu/outgoing.py @@ -85,6 +85,7 @@ def __init__(self, agent): def __call__(self): self.cookie = self.agent.ceph.auth_cookie() + self.log.debug("setup-outgoing-migration", cookie=self.cookie) with self: self.connect() self.acquire_migration_locks() diff --git a/src/fc/qemu/qemu.vm.cfg.in b/src/fc/qemu/qemu.vm.cfg.in index 5b60f14..6bd5e5e 100644 --- a/src/fc/qemu/qemu.vm.cfg.in +++ b/src/fc/qemu/qemu.vm.cfg.in @@ -18,7 +18,7 @@ media = "disk" if = "virtio" format = "rbd" - file = "rbd:{rbd_pool}/{name}.root:id={{ceph_id}}" + file = "rbd:{ceph.volumes[root].fullname}:id={{ceph_id}}" aio = "threads" cache = "none" @@ -27,7 +27,7 @@ media = "disk" if = "virtio" format = "rbd" - file = "rbd:{rbd_pool}/{name}.swap:id={{ceph_id}}" + file = "rbd:{ceph.volumes[tmp].fullname}:id={{ceph_id}}" aio = "threads" cache = "none" @@ -36,7 +36,7 @@ media = "disk" if = "virtio" format = "rbd" - file = "rbd:{rbd_pool}/{name}.tmp:id={{ceph_id}}" + file = "rbd:{ceph.volumes[swap].fullname}:id={{ceph_id}}" aio = "threads" cache = "none" diff --git a/src/fc/qemu/sysconfig.py b/src/fc/qemu/sysconfig.py index 08f40ca..6fb3992 100644 --- a/src/fc/qemu/sysconfig.py +++ b/src/fc/qemu/sysconfig.py @@ -85,7 +85,6 @@ def load_system_config(self): self.ceph["CEPH_LOCK_HOST"] = self.cp.get("ceph", "lock_host") self.ceph["CREATE_VM"] = self.cp.get("ceph", "create-vm") self.ceph["MKFS_XFS"] = self.cp.get("ceph", "mkfs-xfs") - self.ceph["MKFS_EXT4"] = self.cp.get("ceph", "mkfs-ext4") sysconfig = SysConfig() diff --git a/src/fc/qemu/tests/qemu_test.py b/src/fc/qemu/tests/qemu_test.py deleted file mode 100644 index 15662cf..0000000 --- a/src/fc/qemu/tests/qemu_test.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import subprocess -import time - -import psutil -import pytest - -from ..hazmat.qemu import Qemu - - -@pytest.fixture -def qemu_with_pidfile(tmpdir): - # The fixture uses a very long name which catches a special case where - # the proc name is limited to 16 bytes and then we failed to match - # the running VM. Parametrizing fixtures doesn't work the way I want - # so I did it this way ... 
- pidfile = str(tmpdir / "run/qemu.testvmwithverylongname.pid") - try: - os.unlink(pidfile) - except OSError: - pass - proc = subprocess.Popen( - [ - "qemu-system-x86_64", - "-name", - "testvmwithverylongname,process=kvm.testvmwithverylongname", - "-nodefaults", - "-pidfile", - pidfile, - ] - ) - while not os.path.exists(pidfile): - time.sleep(0.01) - q = Qemu(dict(name="testvmwithverylongname", id=1234)) - while not q.proc(): - time.sleep(0.01) - try: - yield q - finally: - proc.kill() - - -def test_proc_running(qemu_with_pidfile): - assert isinstance(qemu_with_pidfile.proc(), psutil.Process) - - -def test_proc_not_running(qemu_with_pidfile): - with open(qemu_with_pidfile.pidfile, "w") as p: - p.write("0\n") - assert qemu_with_pidfile.proc() is None - - -def test_proc_wrong_process(qemu_with_pidfile): - with open(qemu_with_pidfile.pidfile, "w") as p: - p.write("1\n") - assert qemu_with_pidfile.proc() is None - - -def test_proc_no_pidfile(qemu_with_pidfile): - os.unlink(qemu_with_pidfile.pidfile) - assert qemu_with_pidfile.proc() is None - - -def test_proc_empty_pidfile(qemu_with_pidfile): - open(qemu_with_pidfile.pidfile, "w").close() - assert qemu_with_pidfile.proc() is None - - -def test_proc_pidfile_with_garbage(qemu_with_pidfile): - """pidfiles are allowed to contain trailing lines with garbage, - process retrieval must still work then.""" - with open(qemu_with_pidfile.pidfile, "a") as f: - f.write("trailing line\n") - assert isinstance(qemu_with_pidfile.proc(), psutil.Process) diff --git a/src/fc/qemu/hazmat/tests/__init__.py b/tests/__init__.py similarity index 100% rename from src/fc/qemu/hazmat/tests/__init__.py rename to tests/__init__.py diff --git a/src/fc/qemu/conftest.py b/tests/conftest.py similarity index 86% rename from src/fc/qemu/conftest.py rename to tests/conftest.py index 3ce2df2..1e52219 100644 --- a/src/fc/qemu/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import sys import time import traceback +from pathlib import Path import pkg_resources import pytest @@ -11,8 +12,8 @@ import fc.qemu.agent import fc.qemu.hazmat.qemu - -from .agent import Agent +from fc.qemu.agent import Agent +from tests.hazmat.conftest import ceph_inst, ceph_mock # noqa def pytest_collectstart(collector): @@ -22,21 +23,21 @@ def pytest_collectstart(collector): @pytest.fixture(autouse=True) -def synthetic_root(request, monkeypatch, tmpdir): +def synthetic_root(request, monkeypatch, tmp_path): is_live = request.node.get_closest_marker("live") if is_live is not None: # This is a live test. Do not mock things. return - monkeypatch.setattr(fc.qemu.hazmat.qemu.Qemu, "prefix", str(tmpdir)) - os.makedirs(str(tmpdir / "run")) - os.makedirs(str(tmpdir / "etc/qemu/vm")) - monkeypatch.setattr(Agent, "prefix", str(tmpdir)) + (tmp_path / "run").mkdir() + (tmp_path / "etc/qemu/vm").mkdir(parents=True) + monkeypatch.setattr(fc.qemu.hazmat.qemu.Qemu, "prefix", tmp_path) + monkeypatch.setattr(Agent, "prefix", tmp_path) monkeypatch.setattr(fc.qemu.agent, "EXECUTABLE", "true") @pytest.fixture(scope="session") def setup_structlog(): - from . import util + from fc.qemu import util # set to True to temporarily get detailed tracebacks log_exceptions = True @@ -104,7 +105,7 @@ def test_logger(logger, method_name, event): @pytest.fixture(autouse=True) def reset_structlog(setup_structlog): - from . 
import util + from fc.qemu import util util.log_data = [] util.test_log_options = {"show_methods": [], "show_events": []} @@ -128,7 +129,8 @@ def Popen_recording_init(self, *args, **kw): binary = args[0][0] if kw.get("shell"): binary = args[0].split()[0] - CALLED_BINARIES.add(binary) + if not binary.startswith("/"): + CALLED_BINARIES.add(binary) return original(self, *args, **kw) monkeypatch.setattr(subprocess.Popen, "__init__", Popen_recording_init) @@ -142,7 +144,8 @@ def recording_subprocess_run(*args, **kw): binary = args[0][0] if kw.get("shell"): binary = args[0].split()[0] - CALLED_BINARIES.add(binary) + if not binary.startswith("/"): + CALLED_BINARIES.add(binary) return subprocess_run_orig(*args, **kw) monkeypatch.setattr(subprocess, "run", recording_subprocess_run) @@ -221,24 +224,27 @@ def vm(clean_environment, monkeypatch, tmpdir): import fc.qemu.hazmat.qemu monkeypatch.setattr(fc.qemu.hazmat.qemu.Qemu, "guestagent_timeout", 0.1) - fixtures = pkg_resources.resource_filename(__name__, "tests/fixtures") + fixtures = pkg_resources.resource_filename(__name__, "fixtures") shutil.copy(fixtures + "/simplevm.yaml", "/etc/qemu/vm/simplevm.cfg") - if os.path.exists("/etc/qemu/vm/.simplevm.cfg.staging"): - os.unlink("/etc/qemu/vm/.simplevm.cfg.staging") + Path("/etc/qemu/vm/.simplevm.cfg.staging").unlink(missing_ok=True) + vm = Agent("simplevm") vm.timeout_graceful = 1 vm.__enter__() vm.qemu.qmp_timeout = 0.1 vm.qemu.vm_expected_overhead = 128 - for snapshot in vm.ceph.root.snapshots: - snapshot.remove() + + for volume in vm.ceph.opened_volumes: + for snapshot in volume.snapshots: + snapshot.remove() vm.qemu.destroy(kill_supervisor=True) vm.unlock() get_log() yield vm - for snapshot in vm.ceph.root.snapshots: - snapshot.remove() + for volume in vm.ceph.opened_volumes: + for snapshot in volume.snapshots: + snapshot.remove() exc_info = sys.exc_info() vm.__exit__(*exc_info) if len(exc_info): diff --git a/src/fc/qemu/ellipsis.py b/tests/ellipsis.py similarity index 100% rename from src/fc/qemu/ellipsis.py rename to tests/ellipsis.py diff --git a/src/fc/qemu/tests/fixtures/conf.d/kvm.testvm b/tests/fixtures/conf.d/kvm.testvm similarity index 100% rename from src/fc/qemu/tests/fixtures/conf.d/kvm.testvm rename to tests/fixtures/conf.d/kvm.testvm diff --git a/src/fc/qemu/tests/fixtures/simplevm.yaml b/tests/fixtures/simplevm.yaml similarity index 100% rename from src/fc/qemu/tests/fixtures/simplevm.yaml rename to tests/fixtures/simplevm.yaml diff --git a/src/fc/qemu/tests/__init__.py b/tests/hazmat/__init__.py similarity index 100% rename from src/fc/qemu/tests/__init__.py rename to tests/hazmat/__init__.py diff --git a/src/fc/qemu/hazmat/conftest.py b/tests/hazmat/conftest.py similarity index 87% rename from src/fc/qemu/hazmat/conftest.py rename to tests/hazmat/conftest.py index f7c05c1..70a63fa 100644 --- a/src/fc/qemu/hazmat/conftest.py +++ b/tests/hazmat/conftest.py @@ -1,5 +1,4 @@ import errno -import os import tempfile import typing @@ -8,9 +7,9 @@ import rados import rbd -from . 
import volume -from .ceph import Ceph -from .guestagent import GuestAgent +from fc.qemu.hazmat import volume +from fc.qemu.hazmat.ceph import Ceph, RootSpec, VolumeSpecification +from fc.qemu.hazmat.guestagent import GuestAgent class RadosMock(object): @@ -33,6 +32,9 @@ def shutdown(self): assert self.__connected__ self.__connected__ = False + def list_pools(self): + return ["rbd", "data", "rbd.ssd", "rbd.hdd", "rbd.rgw.foo"] + class IoctxMock(object): """Mock access to a pool.""" @@ -193,39 +195,43 @@ def close(self): @pytest.fixture -def ceph_mock(request, monkeypatch, tmpdir): +def ceph_mock(request, monkeypatch, tmp_path): is_live = request.node.get_closest_marker("live") if is_live is not None: # This is a live test. Do not mock things. return + def ensure_presence(self): + VolumeSpecification.ensure_presence(self) + def image_map(self): if self.device: return - self.device = str(tmpdir / self.fullname.replace("/", "-")) - - raw = self.device + ".raw" - if not os.path.exists(raw): - with open(raw, "wb") as f: + self.device = tmp_path / self.fullname.replace("/", "-") + raw = self.device.with_name("{self.device.name}.raw") + if not raw.exists(): + with raw.open("wb") as f: f.seek(self.size) f.write(b"\0") f.close() - os.symlink(raw, self.device) + raw.rename(self.device) # create an implicit first partition as we can't really do the # partprobe dance. - raw = self.part1dev + ".raw" - with open(raw, "wb") as f: + raw = self.part1dev.with_name("{self.part1dev.name}.raw") + with raw.open("wb") as f: f.seek(self.size) f.write(b"\0") f.close() - os.symlink(raw, self.part1dev) + raw.rename(self.part1dev) def image_unmap(self): if self.device is None: return - os.unlink(self.device) - os.unlink(self.part1dev) + self.device.rename(self.device.with_name("{self.device.name}.raw")) + self.part1dev.rename( + self.part1dev.with_name("{self.part1dev.name}.raw") + ) self.device = None monkeypatch.setattr(rados, "Rados", RadosMock) @@ -233,6 +239,7 @@ def image_unmap(self): monkeypatch.setattr(rbd, "Image", ImageMock) monkeypatch.setattr(volume.Image, "map", image_map) monkeypatch.setattr(volume.Image, "unmap", image_unmap) + monkeypatch.setattr(RootSpec, "ensure_presence", ensure_presence) @pytest.fixture @@ -242,8 +249,13 @@ def ceph_inst(ceph_mock): "rbd_pool": "rbd.hdd", "name": "simplevm", "disk": 10, + "tmp_size": 1024 * 1024, + "swap_size": 1024 * 1024, + "root_size": 1024 * 1024, + "binary_generation": 2, } - ceph = Ceph(cfg) + enc = {"parameters": {}} + ceph = Ceph(cfg, enc) ceph.CREATE_VM = "echo {name}" ceph.MKFS_XFS = "-q -f -K" ceph.__enter__() diff --git a/tests/hazmat/test_ceph.py b/tests/hazmat/test_ceph.py new file mode 100644 index 0000000..d9a99df --- /dev/null +++ b/tests/hazmat/test_ceph.py @@ -0,0 +1,232 @@ +import pytest +import rbd + +from tests.conftest import get_log + + +@pytest.fixture +def ceph_with_volumes(ceph_inst): + for vol in ceph_inst.specs.values(): + vol.ensure_presence() + ceph_inst.lock() + yield ceph_inst + for volume in ceph_inst.opened_volumes: + volume.unlock(force=True) + volume.snapshots.purge() + volume.close() + rbd.RBD().remove(volume.ioctx, volume.name) + + +def test_ceph_stop_should_unlock_all_volumes(ceph_with_volumes): + for volume in ceph_with_volumes.opened_volumes: + assert volume.lock_status() + ceph_with_volumes.stop() + for volume in ceph_with_volumes.opened_volumes: + assert volume.lock_status() is None + + +def test_ceph_stop_remove_only_own_locks(ceph_with_volumes): + """Test case where failed migrations leave inconsistent locking.""" + 
ceph_with_volumes.volumes["root"].unlock() + ceph_with_volumes.volumes["root"].rbdimage.lock_exclusive("someotherhost") + # It unlocks what it can. + ceph_with_volumes.stop() + assert ceph_with_volumes.volumes["root"].lock_status() + assert ceph_with_volumes.volumes["swap"].lock_status() is None + assert ceph_with_volumes.volumes["tmp"].lock_status() is None + + +def test_is_unlocked(ceph_with_volumes): + assert ceph_with_volumes.is_unlocked() is False + ceph_with_volumes.unlock() + assert ceph_with_volumes.is_unlocked() is True + + +def test_multiple_images_raises_error(ceph_inst): + rbd.RBD().create(ceph_inst.ioctxs["rbd.hdd"], "simplevm.root", 1024) + rbd.RBD().create(ceph_inst.ioctxs["rbd.ssd"], "simplevm.root", 1024) + root_spec = ceph_inst.specs["root"] + assert sorted(root_spec.exists_in_pools()) == ["rbd.hdd", "rbd.ssd"] + with pytest.raises(RuntimeError): + root_spec.exists_in_pool() + + +@pytest.mark.timeout(60) +@pytest.mark.live() +def test_rbd_pool_migration(ceph_inst, patterns): + ceph_inst.cfg["tmp_size"] = 50 * 1024 * 1024 + ceph_inst.cfg["swap_size"] = 50 * 1024 * 1024 + ceph_inst.cfg["root_size"] = 50 * 1024 * 1024 + rbd.RBD().create( + ceph_inst.ioctxs["rbd.ssd"], + "simplevm.root", + ceph_inst.cfg["root_size"], + ) + rbd.RBD().create( + ceph_inst.ioctxs["rbd.ssd"], + "simplevm.tmp", + ceph_inst.cfg["tmp_size"], + ) + rbd.RBD().create( + ceph_inst.ioctxs["rbd.ssd"], + "simplevm.swap", + ceph_inst.cfg["swap_size"], + ) + assert ceph_inst.specs["root"].exists_in_pool() == "rbd.ssd" + assert ceph_inst.specs["swap"].exists_in_pool() == "rbd.ssd" + assert ceph_inst.specs["tmp"].exists_in_pool() == "rbd.ssd" + + ceph_inst.start() + ceph_inst.status() + + first_start = patterns.first_start + first_start.optional( + """ +waiting interval=0 machine=simplevm remaining=4 subsystem=ceph volume=rbd.hdd/simplevm.tmp +""" + ) + first_start.in_order( + """ +connect-rados machine=simplevm subsystem=ceph + +pre-start machine=simplevm subsystem=ceph volume_spec=root +ensure-presence machine=simplevm subsystem=ceph volume_spec=root +lock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +ensure-size machine=simplevm subsystem=ceph volume_spec=root +start machine=simplevm subsystem=ceph volume_spec=root +start-root machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +root-found-in current_pool=rbd.ssd machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd args=status --format json rbd.ssd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd> {"watchers":[{"address":"192.168.4.6:0/...","client":...,"cookie":...}]} +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.root + +migrate-vm-root-disk action=start machine=simplevm pool_from=rbd.ssd pool_to=rbd.hdd subsystem=ceph volume=rbd.ssd/simplevm.root +unlock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd args=migration prepare rbd.ssd/simplevm.root rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=simplevm.root +rbd machine=simplevm returncode=0 subsystem=ceph volume=simplevm.root + +pre-start machine=simplevm subsystem=ceph volume_spec=swap +delete-outdated-swap image=simplevm.swap machine=simplevm pool=rbd.ssd subsystem=ceph volume=simplevm.swap +ensure-presence machine=simplevm subsystem=ceph volume_spec=swap +lock machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +ensure-size machine=simplevm subsystem=ceph volume_spec=swap +start machine=simplevm subsystem=ceph volume_spec=swap +start-swap machine=simplevm 
subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd args=-c "/etc/ceph/ceph.conf" --id "host1" map "rbd.hdd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd> /dev/rbd0 +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.swap +mkswap args=-f -L "swap" /dev/rbd/rbd.hdd/simplevm.swap machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +mkswap> Setting up swapspace version 1, size = 50 MiB (52424704 bytes) +mkswap> LABEL=swap, UUID=...-...-...-...-... +mkswap machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd args=-c "/etc/ceph/ceph.conf" --id "host1" unmap "/dev/rbd/rbd.hdd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.swap + +pre-start machine=simplevm subsystem=ceph volume_spec=tmp +delete-outdated-tmp image=simplevm.tmp machine=simplevm pool=rbd.ssd subsystem=ceph volume=simplevm.tmp +ensure-presence machine=simplevm subsystem=ceph volume_spec=tmp +lock machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +ensure-size machine=simplevm subsystem=ceph volume_spec=tmp +start machine=simplevm subsystem=ceph volume_spec=tmp +start-tmp machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +rbd args=-c "/etc/ceph/ceph.conf" --id "host1" map "rbd.hdd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +rbd> /dev/rbd0 +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +create-fs machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +sgdisk args=-o "/dev/rbd/rbd.hdd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +sgdisk> Creating new GPT entries in memory. +sgdisk> The operation has completed successfully. +sgdisk machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +sgdisk args=-a 8192 -n 1:8192:0 -c "1:tmp" -t 1:8300 "/dev/rbd/rbd.hdd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +sgdisk> Setting name! +sgdisk> partNum is 0 +sgdisk> The operation has completed successfully. 
+sgdisk machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +partprobe args=/dev/rbd/rbd.hdd/simplevm.tmp machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +partprobe machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +mkfs.xfs args=-q -f -K -L "tmp" /dev/rbd/rbd.hdd/simplevm.tmp-part1 machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +mkfs.xfs machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +seed machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +mount args="/dev/rbd/rbd.hdd/simplevm.tmp-part1" "/mnt/rbd/rbd.hdd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +mount machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +guest-properties machine=simplevm properties={'binary_generation': 2} subsystem=ceph volume=rbd.hdd/simplevm.tmp +binary-generation generation=2 machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +umount args="/mnt/rbd/rbd.hdd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +umount machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp +rbd args=-c "/etc/ceph/ceph.conf" --id "host1" unmap "/dev/rbd/rbd.hdd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.tmp + +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd args=status --format json rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd> {"watchers":[{"address":"192.168.4.6:0/...","client":...,"cookie":...}],"migration":{"source_pool_name":"rbd.ssd","source_pool_namespace":"","source_image_name":"simplevm.root","source_image_id":"...","dest_pool_name":"rbd.hdd","dest_pool_namespace":"","dest_image_name":"simplevm.root","dest_image_id":"...","state":"prepared","state_description":""}} +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root +root-migration-status machine=simplevm pool_from=rbd.ssd pool_to=rbd.hdd progress= status=prepared subsystem=ceph volume=rbd.hdd/simplevm.root +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +""" + ) + + assert get_log() == first_start + + assert ceph_inst.specs["root"].exists_in_pool() == "rbd.hdd" + assert ceph_inst.specs["swap"].exists_in_pool() == "rbd.hdd" + assert ceph_inst.specs["tmp"].exists_in_pool() == "rbd.hdd" + + ceph_inst.ensure() + ceph_inst.status() + + first_ensure = patterns.first_ensure + first_ensure.in_order( + """ +rbd args=status --format json rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd> {"watchers":[{"address":"192.168.4.6:0/...","client":...,"cookie":...}],"migration":{"source_pool_name":"rbd.ssd","source_pool_namespace":"","source_image_name":"simplevm.root","source_image_id":"...","dest_pool_name":"rbd.hdd","dest_pool_namespace":"","dest_image_name":"simplevm.root","dest_image_id":"...","state":"prepared","state_description":""}} +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root +root-migration-status machine=simplevm pool_from=rbd.ssd pool_to=rbd.hdd progress= status=prepared subsystem=ceph volume=rbd.hdd/simplevm.root + +root-migration-execute machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +ceph args=rbd task add migration execute 
rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +ceph> {"sequence": 1, "id": "...-...-...-...-...", "message": "Migrating image rbd.ssd/simplevm.root to rbd.ssd/simplevm.root", "refs": {"action": "migrate execute", "pool_name": "rbd.hdd", "pool_namespace": "", "image_name": "simplevm.root", "image_id": "..."}} +ceph machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root + +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd args=status --format json rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd> {"watchers":[{"address":"192.168.4.6:0/...","client":...,"cookie":...}],"migration":{"source_pool_name":"rbd.ssd","source_pool_namespace":"","source_image_name":"simplevm.root","source_image_id":"...","dest_pool_name":"rbd.hdd","dest_pool_namespace":"","dest_image_name":"simplevm.root","dest_image_id":"...","state":"...","state_description":...}} +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root +root-migration-status machine=simplevm pool_from=rbd.ssd pool_to=rbd.hdd progress=...status=... subsystem=ceph volume=rbd.hdd/simplevm.root + +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +""" + ) + + assert get_log() == first_ensure + + while "status=executed" not in (log := get_log()): + ceph_inst.status() + + ceph_inst.ensure() + ceph_inst.status() + + commit_ensure = patterns.commit_ensure + commit_ensure.in_order( + """ +rbd args=status --format json rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd> {"watchers":[{"address":"192.168.4.6:0/...","client":...,"cookie":...}],"migration":{"source_pool_name":"rbd.ssd","source_pool_namespace":"","source_image_name":"simplevm.root","source_image_id":"...","dest_pool_name":"rbd.hdd","dest_pool_namespace":"","dest_image_name":"simplevm.root","dest_image_id":"...","state":"executed","state_description":""}} +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root + +root-migration-status machine=simplevm pool_from=rbd.ssd pool_to=rbd.hdd progress= status=executed subsystem=ceph volume=rbd.hdd/simplevm.root +root-migration-commit machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd args=--no-progress migration commit rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root + +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd args=status --format json rbd.hdd/simplevm.root machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.root +rbd> {"watchers":[{"address":"192.168.4.6:0/...","client":...,"cookie":...}]} +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.hdd/simplevm.root + +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.hdd/simplevm.tmp +""" + ) + + assert get_log() == commit_ensure diff --git a/src/fc/qemu/hazmat/tests/test_cpuscan.py b/tests/hazmat/test_cpuscan.py similarity index 90% rename from src/fc/qemu/hazmat/tests/test_cpuscan.py rename to tests/hazmat/test_cpuscan.py index bd8dc2b..bc3fc5f 100644 --- a/src/fc/qemu/hazmat/tests/test_cpuscan.py +++ 
b/tests/hazmat/test_cpuscan.py @@ -1,6 +1,6 @@ import pytest -from ..cpuscan import AbstractHost, AMDHost, IntelHost, scan_cpus +from fc.qemu.hazmat.cpuscan import AbstractHost, AMDHost, IntelHost, scan_cpus @pytest.mark.timeout(60) diff --git a/src/fc/qemu/hazmat/tests/guestagent_test.py b/tests/hazmat/test_guestagent.py similarity index 84% rename from src/fc/qemu/hazmat/tests/guestagent_test.py rename to tests/hazmat/test_guestagent.py index 7df9e14..68ad949 100644 --- a/src/fc/qemu/hazmat/tests/guestagent_test.py +++ b/tests/hazmat/test_guestagent.py @@ -2,7 +2,7 @@ import pytest -from ..guestagent import ClientError +from fc.qemu.hazmat.guestagent import ClientError def test_ga_read(guest_agent): @@ -18,7 +18,6 @@ def test_ga_read_error(guest_agent): def test_ga_sync_immediate(guest_agent): guest_agent._client_stub.responses = [ - "{}", '{"return": 87643}', ] @@ -30,15 +29,12 @@ def test_ga_sync_immediate(guest_agent): assert guest_agent.client.messages_sent == [ b'{"execute": "guest-fsfreeze-thaw"}', - b"\xff", - b'{"execute": "guest-ping", "arguments": {}}', b'{"execute": "guest-sync", "arguments": {"id": 87643}}', ] def test_ga_sync_wrong_response(guest_agent): guest_agent._client_stub.responses = [ - "{}", '{"return": 1}', ] @@ -47,7 +43,5 @@ def test_ga_sync_wrong_response(guest_agent): assert guest_agent.client.messages_sent == [ b'{"execute": "guest-fsfreeze-thaw"}', - b"\xff", - b'{"execute": "guest-ping", "arguments": {}}', b'{"execute": "guest-sync", "arguments": {"id": 87643}}', ] diff --git a/src/fc/qemu/hazmat/tests/test_qemu.py b/tests/hazmat/test_qemu.py similarity index 93% rename from src/fc/qemu/hazmat/tests/test_qemu.py rename to tests/hazmat/test_qemu.py index c290f66..e92db8f 100644 --- a/src/fc/qemu/hazmat/tests/test_qemu.py +++ b/tests/hazmat/test_qemu.py @@ -17,8 +17,6 @@ def test_write_file_no_error(guest_agent): # the emulated answers of the guest agent: guest_agent._client_stub.responses = [ - # ping - "{}", # sync ID, hard-coded in fixture '{"return": 87643}', # emulated non-empty result of executions: @@ -37,8 +35,6 @@ def test_write_file_no_error(guest_agent): assert guest_agent.client.messages_sent == [ b'{"execute": "guest-fsfreeze-thaw"}', - b"\xff", - b'{"execute": "guest-ping", "arguments": {}}', b'{"execute": "guest-sync", "arguments": {"id": 87643}}', b'{"execute": "guest-file-open", "arguments": {"path": "/tmp/foo", "mode": "w"}}', b'{"execute": "guest-file-write", "arguments": {"handle": "file-handle-1", "buf-b64": "ImFzZGYi\\n"}}', diff --git a/tests/hazmat/test_volume.py b/tests/hazmat/test_volume.py new file mode 100644 index 0000000..ea88f8e --- /dev/null +++ b/tests/hazmat/test_volume.py @@ -0,0 +1,222 @@ +import os.path +import socket +import subprocess +import time + +import pytest +import rbd + + +@pytest.fixture +def tmp_spec(ceph_inst): + for volume in ceph_inst.opened_volumes: + volume.snapshots.purge() + name, ioctx = volume.name, volume.ioctx + volume.close() + rbd.RBD().remove(ioctx, name) + + spec = ceph_inst.specs["tmp"] + + yield spec + time.sleep(0.2) + + try: + for volume in ceph_inst.opened_volumes: + lock = volume.lock_status() + if lock is not None: + volume.rbdimage.break_lock(*lock) + volume.snapshots.purge() + name, ioctx = volume.name, volume.ioctx + volume.close() + rbd.RBD().remove(ceph_inst.ioctxs["rbd.hdd"], "simplevm.root") + except rbd.ImageNotFound: + pass + + +def test_volume_presence(ceph_inst, tmp_spec): + assert not tmp_spec.volume + assert not ceph_inst.volumes["tmp"] + assert not tmp_spec.exists_in_pool() + 
assert tmp_spec.exists_in_pools() == [] + tmp_spec.ensure_presence() + assert tmp_spec.exists_in_pool() == "rbd.hdd" + assert tmp_spec.exists_in_pools() == ["rbd.hdd"] + assert tmp_spec.volume.rbdimage + assert tmp_spec.volume.rbdimage is ceph_inst.volumes["tmp"].rbdimage + # Check that ensure_presence is fine with being called multiple times. + assert tmp_spec.volume.fullname == "rbd.hdd/simplevm.tmp" + tmp_spec.ensure_presence() + assert tmp_spec.exists_in_pool() == "rbd.hdd" + assert tmp_spec.exists_in_pools() == ["rbd.hdd"] + + +def test_volume_snapshot(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + + volume.snapshots.create("s0") + snaps = list(volume.snapshots) + assert len(snaps) == 1 + snapshot = snaps[0] + assert snapshot.name == "simplevm.tmp" + assert snapshot.snapname == "s0" + assert snapshot.size == volume.size + assert snapshot.id == volume.snapshots["s0"].id + + snapshot.remove() + assert [] == list(volume.snapshots) + + +def test_purge_snapshots(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + for snap in ["s0", "s1"]: + volume.snapshots.create(snap) + assert len(volume.snapshots) == 2 + volume.snapshots.purge() + assert len(volume.snapshots) == 0 + + +def test_snapshot_not_found(tmp_spec): + tmp_spec.ensure_presence() + with pytest.raises(KeyError): + tmp_spec.volume.snapshots["no-such-key"] + + +def test_volume_size(tmp_spec): + tmp_spec.desired_size = 1024 + tmp_spec.ensure_presence() + volume = tmp_spec.volume + assert volume.rbdimage + assert volume.size == 1024 + volume.ensure_size(2048) + assert volume.size == 2048 + # Call ensure multiple times to help triggering caching code paths. + volume.ensure_size(2048) + assert volume.size == 2048 + + +def test_volume_shared_lock_protection(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + volume.rbdimage.lock_shared("host1", "a") + volume.rbdimage.lock_shared("remotehost", "a") + with pytest.raises(NotImplementedError): + volume.lock_status() + lockers = volume.rbdimage.list_lockers() + for client, cookie, _ in lockers["lockers"]: + volume.rbdimage.break_lock(client, cookie) + + +def test_volume_locking(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + assert volume.lock_status() is None + volume.lock() + assert volume.lock_status()[1] == socket.gethostname() + # We want to smoothen out that some other process has locked the same image + # for the same tag already and assume that this is another incarnation of + # us - for that we have our own lock. + volume.lock() + assert volume.lock_status()[1] == socket.gethostname() + volume.unlock() + assert volume.lock_status() is None + # We can call unlock twice if it isn't locked. + volume.unlock() + + volume.rbdimage.lock_exclusive("someotherhost") + with pytest.raises(rbd.ImageBusy): + volume.lock() + + # Can not unlock locks that someone else holds. 
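+    # A plain unlock() leaves the foreign lock in place (see the assertions
+    # below); breaking it requires unlock(force=True), as test_force_unlock shows.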
+ volume.unlock() + assert volume.lock_status()[1] == "someotherhost" + + +def test_force_unlock(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + volume = tmp_spec.volume + volume.rbdimage.lock_exclusive("someotherhost") + volume.unlock(force=True) + assert volume.lock_status() is None + + +# increase timeout from the default of 3s +@pytest.mark.timeout(10) +def test_volume_mkswap(ceph_inst): + swap = ceph_inst.specs["swap"] + swap.ensure_presence() + swap.start() + with swap.volume.mapped(): + output = subprocess.check_output(["file", swap.volume.device]) + output = output.decode("ascii") + assert "Linux swap file" in output + + +@pytest.mark.timeout(60) +def test_volume_tmp_mkfs(tmp_spec): + tmp_spec.desired_size = 100 * 1024 * 1024 + tmp_spec.ensure_presence() + with tmp_spec.volume.mapped(): + tmp_spec.mkfs() + + +def test_unmapped_volume_should_have_no_part1(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + # not something like 'None-part1' + assert volume.part1dev is None + + +def test_volume_map_unmap_is_idempotent(tmp_spec): + # This is more of an internal sanity test within our mocking infrastructure. + tmp_spec.ensure_presence() + volume = tmp_spec.volume + volume.map() + device = volume.device + assert os.path.exists(device) + volume.map() + assert os.path.exists(device) + volume.unmap() + assert not os.path.exists(device) + volume.unmap() + assert not os.path.exists(device) + + +def test_map_snapshot(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + volume.snapshots.create("s0") + with volume.snapshots["s0"].mapped() as device: + assert os.path.exists(device) + + +def test_mount_should_fail_if_not_mapped(tmp_spec): + tmp_spec.ensure_presence() + volume = tmp_spec.volume + with pytest.raises(RuntimeError): + volume.mount() + + +@pytest.mark.timeout(60) +@pytest.mark.live() +def test_mount_snapshot(tmp_spec): + tmp_spec.desired_size = 100 * 1024 * 1024 + tmp_spec.ensure_presence() + volume = tmp_spec.volume + with volume.mapped(): + tmp_spec.mkfs() + volume.snapshots.create("s0") + snap = volume.snapshots["s0"] + with snap.mounted() as mountpoint: + assert mountpoint.is_mount(), "not a mountpoint" + with open("/proc/self/mounts") as mounts: + assert f"{mountpoint} xfs ro" in mounts.read() + # test for idempotence + snap.mount() + assert mountpoint.is_mount() + assert not mountpoint.is_mount() + # test for idempotence + snap.umount() + assert not mountpoint.is_mount() diff --git a/src/fc/qemu/tests/agent_test.py b/tests/test_agent.py similarity index 60% rename from src/fc/qemu/tests/agent_test.py rename to tests/test_agent.py index 71bc597..d4ed2f7 100644 --- a/src/fc/qemu/tests/agent_test.py +++ b/tests/test_agent.py @@ -1,55 +1,55 @@ import os import shutil +from pathlib import Path import mock import pkg_resources import psutil import pytest -from ..agent import Agent -from ..exc import VMStateInconsistent +from fc.qemu.agent import Agent +from fc.qemu.exc import VMStateInconsistent +from fc.qemu.hazmat.qemu import Qemu, detect_current_machine_type @pytest.fixture -def cleanup_files(): - files = [] - yield files - for f in files: - if os.path.exists(f): - os.unlink(f) - - -@pytest.fixture -def simplevm_cfg(cleanup_files): - fixtures = pkg_resources.resource_filename(__name__, "fixtures") - shutil.copy(fixtures + "/simplevm.yaml", "/etc/qemu/vm/simplevm.cfg") - cleanup_files.append("/etc/qemu/vm/simplevm.cfg") +def simplevm_cfg(monkeypatch): + fixtures = Path(pkg_resources.resource_filename(__name__, "fixtures")) + 
source = fixtures / "simplevm.yaml" + # The Qemu prefix gets adjusted automatically in the synhetic_root + # auto-use fixture that checks whether this is a live test or not. + dest = Qemu.prefix / "etc/qemu/vm/simplevm.cfg" + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(source, dest) yield "simplevm" + a = Agent("simplevm") + a.system_config_template.unlink(missing_ok=True) @pytest.mark.live -def test_builtin_config_template(simplevm_cfg): +def test_builtin_config_template(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: + a.ceph.start() a.generate_config() - # machine type must match Qemu version in virtualbox - assert 'type = "pc-i440fx-4.1"' in a.qemu.config + # machine type must match Qemu version + current_machine_type = detect_current_machine_type(a.machine_type) + assert current_machine_type.count("-") == 2 + assert f'type = "{current_machine_type}"' in a.qemu.config @pytest.mark.live -def test_userdefined_config_template(simplevm_cfg, cleanup_files): - with open("/etc/qemu/qemu.vm.cfg.in", "w") as f: - f.write("# user defined config template\n") - cleanup_files.append("/etc/qemu/qemu.vm.cfg.in") +def test_userdefined_config_template(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) + with a.system_config_template.open("w") as f: + f.write("# user defined config template\n") with a: + a.ceph.start() a.generate_config() assert "user defined config template" in a.qemu.config -@pytest.mark.live -@pytest.mark.timeout(60) -def test_consistency_vm_running(simplevm_cfg): +def test_consistency_vm_running(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: a.qemu.is_running = mock.Mock(return_value=True) @@ -58,9 +58,7 @@ def test_consistency_vm_running(simplevm_cfg): a.raise_if_inconsistent() -@pytest.mark.live -@pytest.mark.timeout(60) -def test_consistency_vm_not_running(simplevm_cfg): +def test_consistency_vm_not_running(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: a.qemu.is_running = mock.Mock(return_value=False) @@ -69,8 +67,7 @@ def test_consistency_vm_not_running(simplevm_cfg): a.raise_if_inconsistent() -@pytest.mark.live -def test_consistency_process_dead(simplevm_cfg): +def test_consistency_process_dead(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: a.qemu.is_running = mock.Mock(return_value=True) @@ -80,9 +77,7 @@ def test_consistency_process_dead(simplevm_cfg): a.raise_if_inconsistent() -@pytest.mark.live -@pytest.mark.timeout(60) -def test_consistency_pidfile_missing(simplevm_cfg): +def test_consistency_pid_file_missing(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: a.qemu.is_running = mock.Mock(return_value=True) @@ -92,9 +87,7 @@ def test_consistency_pidfile_missing(simplevm_cfg): a.raise_if_inconsistent() -@pytest.mark.live -@pytest.mark.timeout(60) -def test_consistency_ceph_lock_missing(simplevm_cfg): +def test_consistency_ceph_lock_missing(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: a.qemu.is_running = mock.Mock(return_value=True) @@ -104,9 +97,7 @@ def test_consistency_ceph_lock_missing(simplevm_cfg): a.raise_if_inconsistent() -@pytest.mark.live -@pytest.mark.timeout(60) -def test_ensure_inconsistent_state_detected(simplevm_cfg): +def test_ensure_inconsistent_state_detected(simplevm_cfg, ceph_inst): a = Agent(simplevm_cfg) with a: a.qemu.is_running = mock.Mock(return_value=True) diff --git a/src/fc/qemu/tests/test_binenv.py b/tests/test_binenv.py similarity index 72% rename from src/fc/qemu/tests/test_binenv.py rename to tests/test_binenv.py index ac27636..5222908 100644 --- 
a/src/fc/qemu/tests/test_binenv.py +++ b/tests/test_binenv.py @@ -2,11 +2,13 @@ import pytest -import fc.qemu.conftest +import tests.conftest KNOWN_BINARIES = set( [ + "blkid", "mkfs.xfs", + "mkswap", "mount", "partprobe", "pgrep", @@ -16,6 +18,7 @@ "sgdisk", "systemctl", "umount", + "xfs_admin", ] ) @@ -28,7 +31,8 @@ def test_known_binaries_reachable(): @pytest.mark.live def test_no_unexpected_binaries(): - assert fc.qemu.conftest.CALLED_BINARIES == KNOWN_BINARIES + # This needs to be last tu ensure we tracked all subprocess calls. + assert tests.conftest.CALLED_BINARIES == KNOWN_BINARIES @pytest.mark.unit diff --git a/src/fc/qemu/tests/test_consul.py b/tests/test_consul.py similarity index 95% rename from src/fc/qemu/tests/test_consul.py rename to tests/test_consul.py index c87cd3f..2a82e10 100644 --- a/src/fc/qemu/tests/test_consul.py +++ b/tests/test_consul.py @@ -9,9 +9,8 @@ from fc.qemu import util from fc.qemu.agent import Agent from fc.qemu.hazmat.qemu import Qemu - -from ..conftest import get_log -from ..ellipsis import Ellipsis +from tests.conftest import get_log +from tests.ellipsis import Ellipsis @pytest.fixture(autouse=True) @@ -50,7 +49,7 @@ def test_no_key_event(): start-consul-events count=1 handle-key-failed key=None Traceback (most recent call last): - File "/nix/store/...-python3-...-env/lib/python3.8/site-packages/fc/qemu/agent.py", line ..., in handle + File ".../fc/qemu/agent.py", line ..., in handle log.debug("handle-key", key=event["Key"]) KeyError: 'Key' finish-handle-key key=None @@ -61,18 +60,16 @@ def test_no_key_event(): @pytest.fixture -def clean_config_test22(tmpdir): +def clean_config_test22(tmp_path): targets = [ - str(tmpdir / "etc/qemu/vm/test22.cfg"), - str(tmpdir / "/etc/qemu/vm/.test22.cfg.staging"), + tmp_path / "etc/qemu/vm/test22.cfg", + tmp_path / "/etc/qemu/vm/.test22.cfg.staging", ] for target in targets: - if os.path.exists(target): - os.unlink(target) + target.unlink(missing_ok=True) yield for target in targets: - if os.path.exists(target): - os.unlink(target) + target.unlink(missing_ok=True) def prepare_consul_event( @@ -157,11 +154,11 @@ def test_qemu_config_change(clean_config_test22): util.log_data = [] agent = Agent("test22") assert agent.has_new_config() - assert not os.path.exists(agent.configfile) + assert not os.path.exists(agent.config_file) agent.activate_new_config() agent._update_from_enc() assert not agent.has_new_config() - assert os.path.exists(agent.configfile) + assert os.path.exists(agent.config_file) assert util.log_data == [] # Applying the same config again doesn't cause another ensure run. 
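The hunks above exercise the staged-config handling (a hidden `.test22.cfg.staging` file next to `test22.cfg`). A minimal sketch of that pattern, assuming a rename-based activation step; the helper name and behaviour below are illustrative, not the Agent's actual code:

    from pathlib import Path

    def activate_staged_config(config_file: Path) -> bool:
        """Illustrative only: move a staged config into place atomically."""
        staging = config_file.with_name(f".{config_file.name}.staging")
        if not staging.exists():
            return False  # nothing staged, nothing to do
        # rename() within the same filesystem replaces config_file atomically
        staging.rename(config_file)
        return True
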
@@ -305,7 +302,7 @@ def test_snapshot_offline_vm(vm): vm.stage_new_config() vm.activate_new_config() - vm.ceph.ensure_root_volume() + vm.ceph.specs["root"].ensure_presence() vm.ensure_offline() print(get_log()) diff --git a/src/fc/qemu/tests/incoming_test.py b/tests/test_incoming.py similarity index 95% rename from src/fc/qemu/tests/incoming_test.py rename to tests/test_incoming.py index 25bb611..960f3a8 100644 --- a/src/fc/qemu/tests/incoming_test.py +++ b/tests/test_incoming.py @@ -1,8 +1,8 @@ import mock import pytest -from ..exc import MigrationError -from ..incoming import IncomingAPI, IncomingServer +from fc.qemu.exc import MigrationError +from fc.qemu.incoming import IncomingAPI, IncomingServer @pytest.fixture diff --git a/src/fc/qemu/tests/test_logging.py b/tests/test_logging.py similarity index 100% rename from src/fc/qemu/tests/test_logging.py rename to tests/test_logging.py diff --git a/src/fc/qemu/tests/test_migration.py b/tests/test_migration.py similarity index 100% rename from src/fc/qemu/tests/test_migration.py rename to tests/test_migration.py diff --git a/src/fc/qemu/tests/outgoing_test.py b/tests/test_outgoing.py similarity index 94% rename from src/fc/qemu/tests/outgoing_test.py rename to tests/test_outgoing.py index dbb112a..7a86170 100644 --- a/src/fc/qemu/tests/outgoing_test.py +++ b/tests/test_outgoing.py @@ -1,7 +1,7 @@ import mock import pytest -from ..outgoing import Outgoing +from fc.qemu.outgoing import Outgoing @pytest.fixture diff --git a/tests/test_qemu.py b/tests/test_qemu.py new file mode 100644 index 0000000..5743e1a --- /dev/null +++ b/tests/test_qemu.py @@ -0,0 +1,72 @@ +import os +import subprocess +import time + +import psutil +import pytest + +from fc.qemu.hazmat.qemu import Qemu + + +@pytest.fixture +def qemu_with_pid_file(tmp_path): + # The fixture uses a very long name which catches a special case where + # the proc name is limited to 16 bytes and then we failed to match + # the running VM. Parametrizing fixtures doesn't work the way I want + # so I did it this way ... 
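+    # The run/ directory under tmp_path is expected to exist already; the
+    # autouse synthetic_root fixture in tests/conftest.py creates it.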
+ pid_file = tmp_path / "run/qemu.testvmwithverylongname.pid" + pid_file.unlink(missing_ok=True) + proc = subprocess.Popen( + [ + "qemu-system-x86_64", + "-name", + "testvmwithverylongname,process=kvm.testvmwithverylongname", + "-nodefaults", + "-pidfile", + str(pid_file), + ] + ) + while not pid_file.exists(): + time.sleep(0.01) + q = Qemu(dict(name="testvmwithverylongname", id=1234)) + while not q.proc(): + time.sleep(0.01) + try: + yield q + finally: + proc.kill() + + +def test_proc_running(qemu_with_pid_file): + assert isinstance(qemu_with_pid_file.proc(), psutil.Process) + + +def test_proc_not_running(qemu_with_pid_file): + with qemu_with_pid_file.pid_file.open("w") as p: + p.write("0\n") + assert qemu_with_pid_file.proc() is None + + +def test_proc_wrong_process(qemu_with_pid_file): + with qemu_with_pid_file.pid_file.open("w") as p: + p.write("1\n") + assert qemu_with_pid_file.proc() is None + + +def test_proc_no_pid_file(qemu_with_pid_file): + os.unlink(qemu_with_pid_file.pid_file) + assert qemu_with_pid_file.proc() is None + + +def test_proc_empty_pid_file(qemu_with_pid_file): + # Empty out the file + qemu_with_pid_file.pid_file.open("w").close() + assert qemu_with_pid_file.proc() is None + + +def test_proc_pid_file_with_garbage(qemu_with_pid_file): + """pid files are allowed to contain trailing lines with garbage, + process retrieval must still work then.""" + with qemu_with_pid_file.pid_file.open("a") as f: + f.write("trailing line\n") + assert isinstance(qemu_with_pid_file.proc(), psutil.Process) diff --git a/src/fc/qemu/tests/test_timeout.py b/tests/test_timeout.py similarity index 88% rename from src/fc/qemu/tests/test_timeout.py rename to tests/test_timeout.py index f38fa89..57ca041 100644 --- a/src/fc/qemu/tests/test_timeout.py +++ b/tests/test_timeout.py @@ -1,9 +1,8 @@ import pytest +from fc.qemu.timeout import TimeOut, TimeoutError from fc.qemu.util import log -from ..timeout import TimeOut, TimeoutError - def test_timeout_raises(): log_ = log.bind(machine="test") diff --git a/src/fc/qemu/tests/test_util.py b/tests/test_util.py similarity index 100% rename from src/fc/qemu/tests/test_util.py rename to tests/test_util.py diff --git a/src/fc/qemu/tests/vm_test.py b/tests/test_vm.py similarity index 60% rename from src/fc/qemu/tests/vm_test.py rename to tests/test_vm.py index 16f6688..d0c75b7 100644 --- a/src/fc/qemu/tests/vm_test.py +++ b/tests/test_vm.py @@ -11,12 +11,11 @@ import yaml from fc.qemu import util - -from ..agent import Agent, InvalidCommand, swap_size, tmp_size -from ..conftest import get_log -from ..ellipsis import Ellipsis -from ..hazmat import qemu -from ..util import GiB, MiB +from fc.qemu.agent import Agent, InvalidCommand, swap_size, tmp_size +from fc.qemu.hazmat import guestagent, qemu +from fc.qemu.util import GiB, MiB +from tests.conftest import get_log +from tests.ellipsis import Ellipsis def clean_output(output): @@ -35,9 +34,9 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): status.continuous( """ vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.tmp """ ) @@ -57,15 +56,42 @@ def 
test_simple_vm_lifecycle_start_stop(vm, patterns): """ acquire-lock machine=simplevm target=/run/qemu.simplevm.lock acquire-lock count=1 machine=simplevm result=locked target=/run/qemu.simplevm.lock -generate-config machine=simplevm -ensure-root machine=simplevm subsystem=ceph + +pre-start machine=simplevm subsystem=ceph volume_spec=root + +ensure-presence machine=simplevm subsystem=ceph volume_spec=root lock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root -ensure-tmp machine=simplevm subsystem=ceph +ensure-size machine=simplevm subsystem=ceph volume_spec=root +start machine=simplevm subsystem=ceph volume_spec=root +start-root machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +root-found-in current_pool=rbd.ssd machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root + +pre-start machine=simplevm subsystem=ceph volume_spec=swap +ensure-presence machine=simplevm subsystem=ceph volume_spec=swap +lock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +ensure-size machine=simplevm subsystem=ceph volume_spec=swap +start machine=simplevm subsystem=ceph volume_spec=swap +start-swap machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd args=-c "/etc/ceph/ceph.conf" --id "host1" map "rbd.ssd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd> /dev/rbd0 +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.swap +mkswap args=-f -L "swap" /dev/rbd/rbd.ssd/simplevm.swap machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +mkswap> Setting up swapspace version 1, size = 1024 MiB (1073737728 bytes) +mkswap> LABEL=swap, UUID=...-...-...-...-... +mkswap machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd args=-c "/etc/ceph/ceph.conf" --id "host1" unmap "/dev/rbd/rbd.ssd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.swap + +pre-start machine=simplevm subsystem=ceph volume_spec=tmp +ensure-presence machine=simplevm subsystem=ceph volume_spec=tmp lock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp +ensure-size machine=simplevm subsystem=ceph volume_spec=tmp +start machine=simplevm subsystem=ceph volume_spec=tmp +start-tmp machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp rbd args=-c "/etc/ceph/ceph.conf" --id "host1" map "rbd.ssd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp rbd> /dev/rbd0 rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.tmp -create-fs machine=simplevm subsystem=ceph type=xfs volume=rbd.ssd/simplevm.tmp +create-fs machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp sgdisk args=-o "/dev/rbd/rbd.ssd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp sgdisk> The operation has completed successfully. 
sgdisk machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.tmp @@ -76,8 +102,7 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): sgdisk machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.tmp partprobe args=/dev/rbd/rbd.ssd/simplevm.tmp machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp partprobe machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.tmp -waiting interval=0 machine=simplevm remaining=4 subsystem=ceph volume=rbd.ssd/simplevm.tmp -mkfs.xfs args=-q -f -K -m crc=1,finobt=1 -d su=4m,sw=1 -L "tmp" "/dev/rbd/rbd.ssd/simplevm.tmp-part1" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp +mkfs.xfs args=-q -f -K -m crc=1,finobt=1 -d su=4m,sw=1 -L "tmp" /dev/rbd/rbd.ssd/simplevm.tmp-part1 machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp mkfs.xfs> mkfs.xfs: Specified data stripe unit 8192 is not the same as the volume stripe unit 128 mkfs.xfs> mkfs.xfs: Specified data stripe width 8192 is not the same as the volume stripe width 128 mkfs.xfs> log stripe unit (4194304 bytes) is too large (maximum is 256KiB) @@ -92,17 +117,7 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): umount machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.tmp rbd args=-c "/etc/ceph/ceph.conf" --id "host1" unmap "/dev/rbd/rbd.ssd/simplevm.tmp" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.tmp -ensure-swap machine=simplevm subsystem=ceph -lock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap -rbd args=-c "/etc/ceph/ceph.conf" --id "host1" map "rbd.ssd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap -rbd> /dev/rbd0 -rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.swap -mkswap args=-f -L "swap" "/dev/rbd/rbd.ssd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap -mkswap> Setting up swapspace version 1, size = 1024 MiB (1073737728 bytes) -mkswap> LABEL=swap, UUID=...-...-...-...-... -mkswap machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.swap -rbd args=-c "/etc/ceph/ceph.conf" --id "host1" unmap "/dev/rbd/rbd.ssd/simplevm.swap" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap -rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.swap +generate-config machine=simplevm acquire-global-lock machine=simplevm subsystem=qemu target=/run/fc-qemu.lock global-lock-acquire machine=simplevm result=locked subsystem=qemu target=/run/fc-qemu.lock global-lock-status count=1 machine=simplevm subsystem=qemu target=/run/fc-qemu.lock @@ -135,12 +150,13 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): """ sgdisk> Creating new GPT entries in memory. rbd> /dev/rbd0 +waiting interval=0 machine=simplevm remaining=... 
subsystem=ceph volume=rbd.ssd/simplevm.tmp """ ) BOOTSTRAP = """ -create-vm machine=simplevm subsystem=ceph -/nix/store/.../bin/fc-create-vm args=-I simplevm machine=simplevm subsystem=ceph +create-vm machine=simplevm subsystem=ceph volume=simplevm.root +/nix/store/.../bin/fc-create-vm args=-I simplevm machine=simplevm subsystem=ceph volume=simplevm.root fc-create-vm> fc-create-vm> Establishing system identity fc-create-vm> ---------------------------- @@ -152,18 +168,18 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): fc-create-vm> fc-create-vm> Finished fc-create-vm> -------- -/nix/store/.../bin/fc-create-vm machine=simplevm returncode=0 subsystem=ceph +/nix/store/.../bin/fc-create-vm machine=simplevm returncode=0 subsystem=ceph volume=simplevm.root rbd args=-c "/etc/ceph/ceph.conf" --id "host1" map "rbd.ssd/simplevm.root" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root rbd> /dev/rbd0 rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.root waiting interval=0 machine=simplevm remaining=4 subsystem=ceph volume=rbd.ssd/simplevm.root blkid args=/dev/rbd/rbd.ssd/simplevm.root-part1 -o export machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root blkid> DEVNAME=/dev/rbd/rbd.ssd/simplevm.root-part1 -blkid> UUID=... +blkid> UUID=...-...-...-...-... blkid> BLOCK_SIZE=512 blkid> TYPE=xfs blkid> PARTLABEL=ROOT -blkid> PARTUUID=... +blkid> PARTUUID=...-...-...-...-... blkid machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.root mount args="/dev/rbd/rbd.ssd/simplevm.root-part1" "/mnt/rbd/rbd.ssd/simplevm.root" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root mount machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.root @@ -173,7 +189,7 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): xfs_admin args=-U generate /dev/rbd/rbd.ssd/simplevm.root-part1 machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root xfs_admin> Clearing log and setting UUID xfs_admin> writing all SBs -xfs_admin> new UUID = ... +xfs_admin> new UUID = ...-...-...-...-... 
xfs_admin machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.root rbd args=-c "/etc/ceph/ceph.conf" --id "host1" unmap "/dev/rbd/rbd.ssd/simplevm.root" machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root rbd machine=simplevm returncode=0 subsystem=ceph volume=rbd.ssd/simplevm.root @@ -208,9 +224,9 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): disk-throttle device=virtio0 iops=10000 machine=simplevm disk-throttle device=virtio1 iops=10000 machine=simplevm disk-throttle device=virtio2 iops=10000 machine=simplevm -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) vm.stop() @@ -221,9 +237,9 @@ def test_simple_vm_lifecycle_start_stop(vm, patterns): get_log() == """\ vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) # Starting a second time doesn't show the bootstrap code! 
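
As the comment above notes, the BOOTSTRAP block (the create-vm / fc-create-vm sequence) is only expected on the very first start, when the root volume still has to be created; a later start of the same VM goes straight to the regular start path. A rough way to express that expectation, reusing the test module's vm fixture and get_log() helper (an illustrative sketch only, not the actual follow-up code of this patch):

    vm.start()
    # On a restart the bootstrap tooling must not show up in the log again.
    assert "fc-create-vm" not in get_log()
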
@@ -282,9 +298,9 @@ def test_simple_vm_lifecycle_ensure_going_offline(vm, capsys, caplog): get_log() == """\ vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) vm.ensure() @@ -301,9 +317,9 @@ def test_simple_vm_lifecycle_ensure_going_offline(vm, capsys, caplog): disk-throttle device=virtio0 iops=10000 machine=simplevm disk-throttle device=virtio1 iops=10000 machine=simplevm disk-throttle device=virtio2 iops=10000 machine=simplevm -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) vm.enc["parameters"]["online"] = False @@ -324,9 +340,9 @@ def test_simple_vm_lifecycle_ensure_going_offline(vm, capsys, caplog): get_log() == """\ vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) @@ -339,9 +355,9 @@ def test_vm_not_running_here(vm, capsys): get_log() == """\ vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) vm.enc["parameters"]["kvm_host"] = "otherhost" @@ -353,9 +369,9 @@ def test_vm_not_running_here(vm, capsys): assert get_log() == Ellipsis( """\ vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) @@ -375,9 +391,9 @@ def test_crashed_vm_clean_restart(vm): assert get_log() == Ellipsis( """\ vm-status machine=simplevm 
result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status machine=simplevm presence=missing subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) vm.ensure() @@ -391,9 +407,9 @@ def test_crashed_vm_clean_restart(vm): disk-throttle device=virtio0 iops=10000 machine=simplevm disk-throttle device=virtio1 iops=10000 machine=simplevm disk-throttle device=virtio2 iops=10000 machine=simplevm -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) p = vm.qemu.proc() @@ -405,9 +421,9 @@ def test_crashed_vm_clean_restart(vm): assert get_log() == Ellipsis( """\ vm-status machine=simplevm result=offline -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) vm.ensure() @@ -422,9 +438,9 @@ def test_crashed_vm_clean_restart(vm): disk-throttle device=virtio0 iops=10000 machine=simplevm disk-throttle device=virtio1 iops=10000 machine=simplevm disk-throttle device=virtio2 iops=10000 machine=simplevm -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=('client...', 'host1') machine=simplevm volume=rbd.ssd/simplevm.tmp""" +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=('client...', 'host1') machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.tmp""" ) util.test_log_options["show_events"] = [ @@ -454,9 +470,9 @@ def test_crashed_vm_clean_restart(vm): consul-deregister machine=simplevm clean-run-files machine=simplevm subsystem=qemu vm-status machine=simplevm result=offline -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.root -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.swap -rbd-status locker=None machine=simplevm volume=rbd.ssd/simplevm.tmp +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root +rbd-status locker=None machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap +rbd-status locker=None machine=simplevm subsystem=ceph 
volume=rbd.ssd/simplevm.tmp consul machine=simplevm service=""" ) @@ -482,8 +498,8 @@ def test_do_not_clean_up_crashed_vm_that_doesnt_get_restarted(vm): @pytest.mark.live @pytest.mark.timeout(60) def test_vm_snapshot_only_if_running(vm): - assert list(x.fullname for x in vm.ceph.root.snapshots) == [] - vm.ceph.root.ensure_presence() + vm.ceph.specs["root"].ensure_presence() + assert list(x.fullname for x in vm.ceph.volumes["root"].snapshots) == [] with pytest.raises(InvalidCommand): vm.snapshot("asdf") @@ -502,8 +518,10 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): monkeypatch.setattr(util, "today", lambda: datetime.date(2010, 1, 1)) monkeypatch.setattr(qemu, "FREEZE_TIMEOUT", 1) + monkeypatch.setattr(guestagent, "SYNC_TIMEOUT", 1) - assert list(x.fullname for x in vm.ceph.root.snapshots) == [] + vm.ceph.specs["root"].ensure_presence() + assert list(x.fullname for x in vm.ceph.volumes["root"].snapshots) == [] vm.ensure() get_log() @@ -586,26 +604,25 @@ def test_vm_resize_disk(vm, patterns): vm.start() get_log() - # The cloned image is smaller than the initial desired - # disk so we immediately get a resize. + vm.cfg["root_size"] += 1 * 1024**3 vm.ensure_online_disk_size() resize_1 = patterns.resize_1 resize_1.in_order( """ -check-disk-size action=resize found=104857600 machine=simplevm wanted=2147483648 -block_resize arguments={'device': 'virtio0', 'size': 2147483648} id=None machine=simplevm subsystem=qemu/qmp +check-disk-size action=resize found=2147483648 machine=simplevm wanted=3221225472 +block_resize arguments={'device': 'virtio0', 'size': 3221225472} id=None machine=simplevm subsystem=qemu/qmp """ ) assert get_log() == resize_1 # Increasing the desired disk size also triggers a change. - vm.cfg["disk"] *= 2 + vm.cfg["root_size"] *= 2 vm.ensure_online_disk_size() resize_2 = patterns.resize_2 resize_2.in_order( """ -check-disk-size action=resize found=2147483648 machine=simplevm wanted=4294967296 -block_resize arguments={'device': 'virtio0', 'size': 4294967296} id=None machine=simplevm subsystem=qemu/qmp +check-disk-size action=resize found=3221225472 machine=simplevm wanted=6442450944 +block_resize arguments={'device': 'virtio0', 'size': 6442450944} id=None machine=simplevm subsystem=qemu/qmp """ ) assert get_log() == resize_2 @@ -615,7 +632,7 @@ def test_vm_resize_disk(vm, patterns): resize_noop = patterns.resize_noop resize_noop.in_order( """ -check-disk-size action=none found=4294967296 machine=simplevm wanted=4294967296 +check-disk-size action=none found=6442450944 machine=simplevm wanted=6442450944 """ ) @@ -661,72 +678,72 @@ def outmigrate_pattern(patterns): """ /nix/store/.../bin/fc-qemu -v outmigrate simplevm load-system-config -simplevm connect-rados subsystem='ceph' -simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' -simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' -simplevm query-status arguments={} id=None subsystem='qemu/qmp' - -simplevm outmigrate -simplevm consul-register -simplevm locate-inmigration-service -simplevm check-staging-config result='none' -simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:...' 
- -simplevm acquire-migration-locks -simplevm check-staging-config result='none' -simplevm acquire-migration-lock result='success' subsystem='qemu' -simplevm acquire-local-migration-lock result='success' -simplevm acquire-remote-migration-lock -simplevm acquire-remote-migration-lock result='success' - -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.root' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' - -simplevm prepare-remote-environment -simplevm start-migration target='tcp:192.168.4.7:...' -simplevm migrate subsystem='qemu' -simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' -simplevm migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None subsystem='qemu/qmp' -simplevm migrate arguments={'uri': 'tcp:192.168.4.7:...'} id=None subsystem='qemu/qmp' - -simplevm query-migrate-parameters arguments={} id=None subsystem='qemu/qmp' -simplevm migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 subsystem='qemu' throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 - -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps='-' remaining='0' status='setup' - -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=... remaining='...' status='active' - -simplevm migration-status mbps=... remaining='...' status='completed' - -simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm finish-migration - -simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' -simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' -simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' -simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' -simplevm clean-run-files subsystem='qemu' -simplevm finish-remote -simplevm consul-deregister -simplevm outmigrate-finished exitcode=0 -simplevm release-lock count=0 target='/run/qemu.simplevm.lock' -simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' +simplevm ceph connect-rados +simplevm acquire-lock target='/run/qemu.simplevm.lock' +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' +simplevm qemu/qmp qmp_capabilities arguments={} id=None +simplevm qemu/qmp query-status arguments={} id=None + +simplevm outmigrate +simplevm consul-register +simplevm setup-outgoing-migration cookie='...' +simplevm locate-inmigration-service +simplevm check-staging-config result='none' +simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:...' 
+ +simplevm acquire-migration-locks +simplevm check-staging-config result='none' +simplevm qemu acquire-migration-lock result='success' +simplevm acquire-local-migration-lock result='success' +simplevm acquire-remote-migration-lock +simplevm acquire-remote-migration-lock result='success' + +simplevm ceph unlock volume='rbd.ssd/simplevm.root' +simplevm ceph unlock volume='rbd.ssd/simplevm.swap' +simplevm ceph unlock volume='rbd.ssd/simplevm.tmp' + +simplevm prepare-remote-environment +simplevm start-migration target='tcp:192.168.4.7:...' +simplevm qemu migrate +simplevm qemu/qmp migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None +simplevm qemu/qmp migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None +simplevm qemu/qmp migrate arguments={'uri': 'tcp:192.168.4.7:...'} id=None + +simplevm qemu/qmp query-migrate-parameters arguments={} id=None +simplevm qemu migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 + +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps='-' remaining='0' status='setup' + +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=... remaining='...' status='active' + +simplevm migration-status mbps=... remaining='...' status='completed' + +simplevm qemu/qmp query-status arguments={} id=None +simplevm finish-migration + +simplevm qemu vm-destroy-kill-supervisor attempt=1 +simplevm qemu vm-destroy-kill-supervisor attempt=2 +simplevm qemu vm-destroy-kill-vm attempt=1 +simplevm qemu vm-destroy-kill-vm attempt=2 +simplevm qemu clean-run-files +simplevm finish-remote +simplevm consul-deregister +simplevm outmigrate-finished exitcode=0 +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' +simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) # There are a couple of steps in the migration process that may repeat due to # timings,so this may or may not appear more often: outmigrate.optional( """ -simplevm waiting interval=3 remaining=... -simplevm check-staging-config result='none' -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=... remaining='...' status='active' -simplevm vm-destroy-kill-vm attempt=... subsystem='qemu' - """ +simplevm waiting interval=3 remaining=... +simplevm check-staging-config result='none' +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=... remaining='...' status='active' +simplevm qemu vm-destroy-kill-vm attempt=... 
""" ) # Expect debug output that doesn't matter as much patterns.debug.optional("simplevm> ...") @@ -734,17 +751,17 @@ def outmigrate_pattern(patterns): # This part of the heartbeats must show up patterns.heartbeat.in_order( """\ -simplevm heartbeat-initialized -simplevm started-heartbeat-ping -simplevm heartbeat-ping +simplevm heartbeat-initialized +simplevm started-heartbeat-ping +simplevm heartbeat-ping """ ) # The pings may happen more times and sometimes the stopping part # isn't visible because we terminate too fast. patterns.heartbeat.optional( """ -simplevm heartbeat-ping -simplevm stopped-heartbeat-ping +simplevm heartbeat-ping +simplevm stopped-heartbeat-ping """ ) @@ -759,44 +776,45 @@ def test_vm_migration_pattern(outmigrate_pattern): assert ( outmigrate_pattern == """\ -/nix/store/c5kfr1yx6920s7lcswsv9dn61g8dc5xk-python3.8-fc.qemu-dev/bin/fc-qemu -v outmigrate simplevm +/nix/store/.../bin/fc-qemu -v outmigrate simplevm load-system-config -simplevm connect-rados subsystem='ceph' -simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' -simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' -simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm outmigrate -simplevm consul-register -simplevm heartbeat-initialized -simplevm locate-inmigration-service -simplevm check-staging-config result='none' -simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:36573' -simplevm started-heartbeat-ping -simplevm acquire-migration-locks -simplevm heartbeat-ping -simplevm check-staging-config result='none' -simplevm acquire-migration-lock result='success' subsystem='qemu' -simplevm acquire-local-migration-lock result='success' -simplevm acquire-remote-migration-lock -simplevm acquire-remote-migration-lock result='success' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.root' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' -simplevm prepare-remote-environment -simplevm start-migration target='tcp:192.168.4.7:2345' -simplevm migrate subsystem='qemu' -simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' -simplevm migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None subsystem='qemu/qmp' -simplevm migrate arguments={'uri': 'tcp:192.168.4.7:2345'} id=None subsystem='qemu/qmp' -simplevm query-migrate-parameters arguments={} id=None subsystem='qemu/qmp' -simplevm migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 subsystem='qemu' throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps='-' remaining='0' status='setup' -simplevm> {'blocked': False, 'status': 'setup'} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm 
migration-status mbps=0.32976 remaining='285,528,064' status='active' -simplevm> {'blocked': False, +simplevm ceph connect-rados +simplevm acquire-lock target='/run/qemu.simplevm.lock' +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' +simplevm qemu/qmp qmp_capabilities arguments={} id=None +simplevm qemu/qmp query-status arguments={} id=None +simplevm outmigrate +simplevm consul-register +simplevm setup-outgoing-migration cookie='b76481202c5afb5b70feae12350c120b8e881356' +simplevm heartbeat-initialized +simplevm locate-inmigration-service +simplevm check-staging-config result='none' +simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:36573' +simplevm started-heartbeat-ping +simplevm acquire-migration-locks +simplevm heartbeat-ping +simplevm check-staging-config result='none' +simplevm qemu acquire-migration-lock result='success' +simplevm acquire-local-migration-lock result='success' +simplevm acquire-remote-migration-lock +simplevm acquire-remote-migration-lock result='success' +simplevm ceph unlock volume='rbd.ssd/simplevm.root' +simplevm ceph unlock volume='rbd.ssd/simplevm.swap' +simplevm ceph unlock volume='rbd.ssd/simplevm.tmp' +simplevm prepare-remote-environment +simplevm start-migration target='tcp:192.168.4.7:2345' +simplevm qemu migrate +simplevm qemu/qmp migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None +simplevm qemu/qmp migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None +simplevm qemu/qmp migrate arguments={'uri': 'tcp:192.168.4.7:2345'} id=None +simplevm qemu/qmp query-migrate-parameters arguments={} id=None +simplevm qemu migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps='-' remaining='0' status='setup' +simplevm> {'blocked': False, 'status': 'setup'} +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='285,528,064' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -815,9 +833,9 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 1419} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='285,331,456' status='active' -simplevm> {'blocked': False, +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='285,331,456' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -836,9 +854,9 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 
'total-time': 3423} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.18144 remaining='267,878,400' status='active' -simplevm> {'blocked': False, +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='267,878,400' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -857,10 +875,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 6255} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.17964356435643564 remaining='226,918,400' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.17964356435643564 remaining='226,918,400' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -879,9 +897,9 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 10261} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.18144 remaining='169,574,400' status='active' -simplevm> {'blocked': False, +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='169,574,400' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -900,10 +918,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 15925} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.18144 remaining='87,654,400' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='87,654,400' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -922,10 +940,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 23935} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='18,825,216' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='18,825,216' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -944,10 +962,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 35261} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.3264950495049505 remaining='839,680' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm 
migration-status mbps=0.3264950495049505 remaining='839,680' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -966,10 +984,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 46586} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='1,118,208' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='1,118,208' status='active' +simplevm> {'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 4, simplevm> 'dirty-sync-count': 2, @@ -988,10 +1006,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'active', simplevm> 'total-time': 57908} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.34057172924857854 remaining='0' status='completed' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.34057172924857854 remaining='0' status='completed' +simplevm> {'blocked': False, simplevm> 'downtime': 8, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 5, @@ -1010,19 +1028,19 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 3, simplevm> 'status': 'completed', simplevm> 'total-time': 68420} -simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm finish-migration -simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' -simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' -simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' -simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' -simplevm clean-run-files subsystem='qemu' -simplevm finish-remote -simplevm stopped-heartbeat-ping -simplevm consul-deregister -simplevm outmigrate-finished exitcode=0 -simplevm release-lock count=0 target='/run/qemu.simplevm.lock' -simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' +simplevm qemu/qmp query-status arguments={} id=None +simplevm finish-migration +simplevm qemu vm-destroy-kill-supervisor attempt=1 +simplevm qemu vm-destroy-kill-supervisor attempt=2 +simplevm qemu vm-destroy-kill-vm attempt=1 +simplevm qemu vm-destroy-kill-vm attempt=2 +simplevm qemu clean-run-files +simplevm finish-remote +simplevm stopped-heartbeat-ping +simplevm consul-deregister +simplevm outmigrate-finished exitcode=0 +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' +simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) @@ -1034,42 +1052,43 @@ def test_vm_migration_pattern(outmigrate_pattern): == """\ /nix/store/kj63j38ji0c8yk037yvzj9c5f27mzh58-python3.8-fc.qemu-d26a0eae29efd95fe5c328d8a9978eb5a6a4529e/bin/fc-qemu -v outmigrate simplevm load-system-config -simplevm connect-rados subsystem='ceph' -simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' -simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' -simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm outmigrate -simplevm consul-register -simplevm heartbeat-initialized -simplevm 
locate-inmigration-service -simplevm check-staging-config result='none' -simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:35241' -simplevm started-heartbeat-ping -simplevm acquire-migration-locks -simplevm heartbeat-ping -simplevm check-staging-config result='none' -simplevm acquire-migration-lock result='success' subsystem='qemu' -simplevm acquire-local-migration-lock result='success' -simplevm acquire-remote-migration-lock -simplevm acquire-remote-migration-lock result='success' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.root' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' -simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' -simplevm prepare-remote-environment -simplevm start-migration target='tcp:192.168.4.7:2345' -simplevm migrate subsystem='qemu' -simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' -simplevm migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None subsystem='qemu/qmp' -simplevm migrate arguments={'uri': 'tcp:192.168.4.7:2345'} id=None subsystem='qemu/qmp' -simplevm query-migrate-parameters arguments={} id=None subsystem='qemu/qmp' -simplevm migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 subsystem='qemu' throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps='-' remaining='0' status='setup' -simplevm> {'blocked': False, 'status': 'setup'} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='285,528,064' status='active' -simplevm> {'blocked': False, +simplevm ceph connect-rados +simplevm acquire-lock target='/run/qemu.simplevm.lock' +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' +simplevm qemu/qmp qmp_capabilities arguments={} id=None +simplevm qemu/qmp query-status arguments={} id=None +simplevm outmigrate +simplevm consul-register +simplevm setup-outgoing-migration cookie='b76481202c5afb5b70feae12350c120b8e881356' +simplevm heartbeat-initialized +simplevm locate-inmigration-service +simplevm check-staging-config result='none' +simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:35241' +simplevm started-heartbeat-ping +simplevm acquire-migration-locks +simplevm heartbeat-ping +simplevm check-staging-config result='none' +simplevm qemu acquire-migration-lock result='success' +simplevm acquire-local-migration-lock result='success' +simplevm acquire-remote-migration-lock +simplevm acquire-remote-migration-lock result='success' +simplevm ceph unlock volume='rbd.ssd/simplevm.root' +simplevm ceph unlock volume='rbd.ssd/simplevm.swap' +simplevm ceph unlock volume='rbd.ssd/simplevm.tmp' +simplevm prepare-remote-environment +simplevm start-migration target='tcp:192.168.4.7:2345' +simplevm qemu migrate +simplevm qemu/qmp migrate-set-capabilities 
arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None +simplevm qemu/qmp migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None +simplevm qemu/qmp migrate arguments={'uri': 'tcp:192.168.4.7:2345'} id=None +simplevm qemu/qmp query-migrate-parameters arguments={} id=None +simplevm qemu migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps='-' remaining='0' status='setup' +simplevm> { 'blocked': False, 'status': 'setup'} +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='285,528,064' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1088,9 +1107,9 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 1418} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='285,331,456' status='active' -simplevm> {'blocked': False, +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='285,331,456' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1109,9 +1128,9 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 3422} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.18144 remaining='267,878,400' status='active' -simplevm> {'blocked': False, +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='267,878,400' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1130,10 +1149,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 6254} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.18144 remaining='226,918,400' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='226,918,400' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1152,9 +1171,9 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 10259} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' 
-simplevm migration-status mbps=0.18144 remaining='169,574,400' status='active' -simplevm> {'blocked': False, +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='169,574,400' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1173,10 +1192,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 15923} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.18144 remaining='87,654,400' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.18144 remaining='87,654,400' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1195,10 +1214,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 23932} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='18,825,216' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='18,825,216' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1217,10 +1236,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 35258} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='843,776' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='843,776' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 1, @@ -1239,11 +1258,11 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 46582} -simplevm heartbeat-ping -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.32976 remaining='1,118,208' status='active' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status mbps=0.32976 remaining='1,118,208' status='active' +simplevm> { 'blocked': False, simplevm> 'expected-downtime': 4000, simplevm> 'ram': {'dirty-pages-rate': 4, simplevm> 'dirty-sync-count': 2, @@ -1262,10 +1281,10 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'active', simplevm> 'total-time': 57908} -simplevm heartbeat-ping -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=0.3400370667639548 remaining='0' status='completed' -simplevm> {'blocked': False, +simplevm heartbeat-ping +simplevm qemu/qmp query-migrate arguments={} id=None +simplevm migration-status 
mbps=0.3400370667639548 remaining='0' status='completed' +simplevm> { 'blocked': False, simplevm> 'downtime': 11, simplevm> 'ram': {'dirty-pages-rate': 0, simplevm> 'dirty-sync-count': 5, @@ -1284,18 +1303,18 @@ def test_vm_migration_pattern(outmigrate_pattern): simplevm> 'setup-time': 1, simplevm> 'status': 'completed', simplevm> 'total-time': 68526} -simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm finish-migration -simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' -simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' -simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' -simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' -simplevm clean-run-files subsystem='qemu' -simplevm finish-remote -simplevm consul-deregister -simplevm outmigrate-finished exitcode=0 -simplevm release-lock count=0 target='/run/qemu.simplevm.lock' -simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' +simplevm qemu/qmp query-status arguments={} id=None +simplevm finish-migration +simplevm qemu vm-destroy-kill-supervisor attempt=1 +simplevm qemu vm-destroy-kill-supervisor attempt=2 +simplevm qemu vm-destroy-kill-vm attempt=1 +simplevm qemu vm-destroy-kill-vm attempt=2 +simplevm qemu clean-run-files +simplevm finish-remote +simplevm consul-deregister +simplevm outmigrate-finished exitcode=0 +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' +simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) @@ -1333,7 +1352,7 @@ def communicate_progress(p): line = p.stdout.readline() if line: # This ensures we get partial output e.g. during timeouts. - print(line) + print(line.rstrip()) stdout += line else: p.wait() @@ -1350,68 +1369,72 @@ def communicate_progress(p): inmigrate = call("ssh host2 'fc-qemu -v inmigrate simplevm'", wait=False) outmigrate = call("fc-qemu -v outmigrate simplevm", wait=False) - assert outmigrate_pattern == communicate_progress(outmigrate) - assert outmigrate.returncode == 0 - + # Consume both process outputs so in a failing test we see both + # in the test output and can more easily compare what's going on. inmigrate_result = communicate_progress(inmigrate) + outmigrate_result = communicate_progress(outmigrate) + inmigrate_pattern = patterns.inmigrate inmigrate_pattern.in_order( """ /nix/store/.../bin/fc-qemu -v inmigrate simplevm load-system-config -simplevm connect-rados subsystem='ceph' -simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' - -simplevm inmigrate -simplevm start-server type='incoming' url='http://host2.mgm.test.gocept.net:.../' -simplevm setup-incoming-api cookie='...' -simplevm consul-register-inmigrate - -simplevm received-acquire-migration-lock -simplevm acquire-migration-lock result='success' subsystem='qemu' -simplevm received-acquire-ceph-locks -simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.root' -simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.swap' -simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' - -simplevm received-prepare-incoming -simplevm acquire-global-lock subsystem='qemu' target='/run/fc-qemu.lock' -simplevm global-lock-acquire result='locked' subsystem='qemu' target='/run/fc-qemu.lock' -simplevm global-lock-status count=1 subsystem='qemu' target='/run/fc-qemu.lock' -simplevm sufficient-host-memory available_real=... bookable=... 
required=768 subsystem='qemu' -simplevm start-qemu subsystem='qemu' -simplevm qemu-system-x86_64 additional_args=['-incoming tcp:192.168.4.7:...'] local_args=['-nodefaults', '-only-migratable', '-cpu qemu64,enforce', '-name simplevm,process=kvm.simplevm', '-chroot /srv/vm/simplevm', '-runas nobody', '-serial file:/var/log/vm/simplevm.log', '-display vnc=127.0.0.1:2345', '-pidfile /run/qemu.simplevm.pid', '-vga std', '-m 256', '-readconfig /run/qemu.simplevm.cfg'] subsystem='qemu' -simplevm exec cmd='supervised-qemu qemu-system-x86_64 -nodefaults -only-migratable -cpu qemu64,enforce -name simplevm,process=kvm.simplevm -chroot /srv/vm/simplevm -runas nobody -serial file:/var/log/vm/simplevm.log -display vnc=127.0.0.1:2345 -pidfile /run/qemu.simplevm.pid -vga std -m 256 -readconfig /run/qemu.simplevm.cfg -incoming tcp:192.168.4.7:2345 -D /var/log/vm/simplevm.qemu.internal.log simplevm /var/log/vm/simplevm.supervisor.log' subsystem='qemu' -simplevm supervised-qemu-stdout subsystem='qemu' -simplevm supervised-qemu-stderr subsystem='qemu' - -simplevm global-lock-status count=0 subsystem='qemu' target='/run/fc-qemu.lock' -simplevm global-lock-release subsystem='qemu' target='/run/fc-qemu.lock' -simplevm global-lock-release result='unlocked' subsystem='qemu' -simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' -simplevm query-status arguments={} id=None subsystem='qemu/qmp' - -simplevm received-finish-incoming -simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm consul-deregister-inmigrate -simplevm stop-server result='success' type='incoming' -simplevm consul-register -simplevm inmigrate-finished exitcode=0 -simplevm release-lock count=0 target='/run/qemu.simplevm.lock' -simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' +simplevm ceph connect-rados +simplevm acquire-lock target='/run/qemu.simplevm.lock' +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' + +simplevm inmigrate +simplevm start-server type='incoming' url='http://host2.mgm.test.gocept.net:.../' +simplevm setup-incoming-api cookie='...' +simplevm consul-register-inmigrate + +simplevm received-acquire-migration-lock +simplevm qemu acquire-migration-lock result='success' +simplevm received-acquire-ceph-locks +simplevm ceph lock volume='rbd.ssd/simplevm.root' +simplevm ceph lock volume='rbd.ssd/simplevm.swap' +simplevm ceph lock volume='rbd.ssd/simplevm.tmp' + +simplevm received-prepare-incoming +simplevm qemu acquire-global-lock target='/run/fc-qemu.lock' +simplevm qemu global-lock-acquire result='locked' target='/run/fc-qemu.lock' +simplevm qemu global-lock-status count=1 target='/run/fc-qemu.lock' +simplevm qemu sufficient-host-memory available_real=... bookable=... 
required=768 +simplevm qemu start-qemu +simplevm qemu qemu-system-x86_64 additional_args=['-incoming tcp:192.168.4.7:...'] local_args=['-nodefaults', '-only-migratable', '-cpu qemu64,enforce', '-name simplevm,process=kvm.simplevm', '-chroot /srv/vm/simplevm', '-runas nobody', '-serial file:/var/log/vm/simplevm.log', '-display vnc=127.0.0.1:2345', '-pidfile /run/qemu.simplevm.pid', '-vga std', '-m 256', '-readconfig /run/qemu.simplevm.cfg'] +simplevm qemu exec cmd='supervised-qemu qemu-system-x86_64 -nodefaults -only-migratable -cpu qemu64,enforce -name simplevm,process=kvm.simplevm -chroot /srv/vm/simplevm -runas nobody -serial file:/var/log/vm/simplevm.log -display vnc=127.0.0.1:2345 -pidfile /run/qemu.simplevm.pid -vga std -m 256 -readconfig /run/qemu.simplevm.cfg -incoming tcp:192.168.4.7:2345 -D /var/log/vm/simplevm.qemu.internal.log simplevm /var/log/vm/simplevm.supervisor.log' +simplevm qemu supervised-qemu-stdout +simplevm qemu supervised-qemu-stderr + +simplevm qemu global-lock-status count=0 target='/run/fc-qemu.lock' +simplevm qemu global-lock-release target='/run/fc-qemu.lock' +simplevm qemu global-lock-release result='unlocked' +simplevm qemu/qmp qmp_capabilities arguments={} id=None +simplevm qemu/qmp query-status arguments={} id=None + +simplevm received-finish-incoming +simplevm qemu/qmp query-status arguments={} id=None +simplevm consul-deregister-inmigrate +simplevm stop-server result='success' type='incoming' +simplevm consul-register +simplevm inmigrate-finished exitcode=0 +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' +simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) inmigrate_pattern.optional( """ simplevm> -simplevm received-ping timeout=60 -simplevm reset-timeout -simplevm waiting interval=0 remaining=... -simplevm guest-disconnect +simplevm received-ping timeout=60 +simplevm reset-timeout +simplevm waiting interval=0 remaining=... 
+simplevm guest-disconnect """ ) + assert outmigrate_pattern == outmigrate_result + assert outmigrate.returncode == 0 + assert inmigrate_pattern == inmigrate_result assert inmigrate.returncode == 0 @@ -1422,25 +1445,25 @@ def communicate_progress(p): local_status = call("fc-qemu status simplevm") assert local_status == Ellipsis( """\ -simplevm vm-status result='offline' -simplevm rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.root' -simplevm rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.swap' -simplevm rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.tmp' -simplevm consul address='host2' service='qemu-simplevm' +simplevm vm-status result='offline' +simplevm ceph rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.root' +simplevm ceph rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.swap' +simplevm ceph rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.tmp' +simplevm consul address='host2' service='qemu-simplevm' """ ) remote_status = call("ssh host2 'fc-qemu status simplevm'") assert remote_status == Ellipsis( """\ -simplevm vm-status result='online' -simplevm disk-throttle device='virtio0' iops=0 -simplevm disk-throttle device='virtio1' iops=0 -simplevm disk-throttle device='virtio2' iops=0 -simplevm rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.root' -simplevm rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.swap' -simplevm rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.tmp' -simplevm consul address='host2' service='qemu-simplevm' +simplevm vm-status result='online' +simplevm disk-throttle device='virtio0' iops=0 +simplevm disk-throttle device='virtio1' iops=0 +simplevm disk-throttle device='virtio2' iops=0 +simplevm ceph rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.root' +simplevm ceph rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.swap' +simplevm ceph rbd-status locker=('client...', 'host2') volume='rbd.ssd/simplevm.tmp' +simplevm consul address='host2' service='qemu-simplevm' """ ) @@ -1483,9 +1506,10 @@ def test_new_vm(vm): ... ensure-state action=start found=offline machine=simplevm wanted=online ... +ensure-presence machine=simplevm subsystem=ceph volume_spec=root +create-vm machine=simplevm subsystem=ceph volume=simplevm.root +... generate-config machine=simplevm -ensure-root machine=simplevm subsystem=ceph -create-vm machine=simplevm subsystem=ceph ... """ )
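
The "..." placeholders throughout these expected-output blocks act as wildcards against the actual log, and the Ellipsis / patterns helpers consume them. As a minimal sketch of that idea (not the project's actual matching helpers, which additionally support ordered, optional and normalized sections), such a comparison could be implemented along these lines:

    import re

    def log_matches(pattern: str, text: str) -> bool:
        # Treat every "..." in the pattern as a wildcard; everything else
        # must match literally, across line breaks as well.
        regex = ".*".join(re.escape(part) for part in pattern.split("..."))
        return re.fullmatch(regex, text, flags=re.DOTALL) is not None

    assert log_matches(
        "release-lock result='unlocked' target='...'",
        "release-lock result='unlocked' target='/run/qemu.simplevm.lock'",
    )
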