From 02327dfcd6a59e021dd2c7cc09422df08e143e03 Mon Sep 17 00:00:00 2001 From: "kirby@puppetlabs.com" Date: Wed, 16 Nov 2016 16:33:45 -0800 Subject: [PATCH] (POOLER-26) Fix lost VMs getting stuck in pending This commit updates vmpooler to understand how to resolve a situation where a pending VM does not exist. Without this change a pending VM that does not exist in vmware inventory gets stuck in the pending state, preventing the pool from ever reaching its target capacity. As a part of this change the find_vm method is updated to perform a light, then heavy search each time find_vm is called and all usage of find_vm || find_vm_heavy is replaced. This makes find_vm usage consistent across pool_manager. Additionally, open_socket method is updated to resolve an incorrect reference to the host name. --- lib/vmpooler/pool_manager.rb | 30 +++++++++----------- lib/vmpooler/vsphere_helper.rb | 5 ++++ spec/vmpooler/pool_manager_migration_spec.rb | 1 - spec/vmpooler/pool_manager_spec.rb | 1 - 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index 245a9b4b3..939d7bc54 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -52,6 +52,11 @@ def _check_pending_vm(vm, pool, timeout, vsphere) fail_pending_vm(vm, pool, timeout) end + def remove_nonexistent_vm(vm, pool) + $redis.srem("vmpooler__pending__#{pool}", vm) + $logger.log('d', "[!] [#{pool}] '#{vm}' no longer exists. Removing from pending.") + end + def fail_pending_vm(vm, pool, timeout, exists=true) clone_stamp = $redis.hget("vmpooler__vm__#{vm}", 'clone') return if ! clone_stamp @@ -108,8 +113,7 @@ def check_ready_vm(vm, pool, ttl, vsphere) $redis.hset('vmpooler__vm__' + vm, 'check', Time.now) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if host if @@ -288,8 +292,7 @@ def destroy_vm(vm, pool, vsphere) # Auto-expire metadata key $redis.expire('vmpooler__vm__' + vm, ($config[:redis]['data_ttl'].to_i * 60 * 60)) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if host start = Time.now @@ -319,8 +322,7 @@ def create_vm_disk(vm, disk_size, vsphere) end def _create_vm_disk(vm, disk_size, vsphere) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if (host) && ((! disk_size.nil?) && (! disk_size.empty?) && (disk_size.to_i > 0)) $logger.log('s', "[ ] [disk_manager] '#{vm}' is attaching a #{disk_size}gb disk") @@ -360,8 +362,7 @@ def create_vm_snapshot(vm, snapshot_name, vsphere) end def _create_vm_snapshot(vm, snapshot_name, vsphere) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if (host) && ((! snapshot_name.nil?) && (! snapshot_name.empty?)) $logger.log('s', "[ ] [snapshot_manager] '#{vm}' is being snapshotted") @@ -390,8 +391,7 @@ def revert_vm_snapshot(vm, snapshot_name, vsphere) end def _revert_vm_snapshot(vm, snapshot_name, vsphere) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if host snapshot = vsphere.find_snapshot(host, snapshot_name) @@ -473,10 +473,6 @@ def _check_snapshot_queue(vsphere) end end - def find_vsphere_pool_vm(pool, vm, vsphere) - vsphere.find_vm(vm) || vsphere.find_vm_heavy(vm)[vm] - end - def migration_limit(migration_limit) # Returns migration_limit setting when enabled return false if migration_limit == 0 || ! migration_limit @@ -492,7 +488,7 @@ def migrate_vm(vm, pool, vsphere) def _migrate_vm(vm, pool, vsphere) begin $redis.srem('vmpooler__migrating__' + pool, vm) - vm_object = find_vsphere_pool_vm(pool, vm, vsphere) + vm_object = vsphere.find_vm(vm) parent_host, parent_host_name = get_vm_host_info(vm_object) migration_limit = migration_limit $config[:config]['migration_limit'] migration_count = $redis.scard('vmpooler__migration') @@ -607,12 +603,14 @@ def _check_pool(pool, vsphere) # PENDING $redis.smembers("vmpooler__pending__#{pool['name']}").each do |vm| + pool_timeout = pool['timeout'] || $config[:config]['timeout'] || 15 if inventory[vm] begin - pool_timeout = pool['timeout'] || $config[:config]['timeout'] || 15 check_pending_vm(vm, pool['name'], pool_timeout, vsphere) rescue end + else + fail_pending_vm(vm, pool['name'], pool_timeout, false) end end diff --git a/lib/vmpooler/vsphere_helper.rb b/lib/vmpooler/vsphere_helper.rb index c49bce481..5d06b64dd 100644 --- a/lib/vmpooler/vsphere_helper.rb +++ b/lib/vmpooler/vsphere_helper.rb @@ -289,6 +289,11 @@ def find_snapshot(vm, snapshotname) def find_vm(vmname) ensure_connected @connection, $credentials + find_vm_light(vmname) || find_vm_heavy(vmname)[vmname] + end + + def find_vm_light(vmname) + ensure_connected @connection, $credentials @connection.searchIndex.FindByDnsName(vmSearch: true, dnsName: vmname) end diff --git a/spec/vmpooler/pool_manager_migration_spec.rb b/spec/vmpooler/pool_manager_migration_spec.rb index 6eab80e71..9fd491b01 100644 --- a/spec/vmpooler/pool_manager_migration_spec.rb +++ b/spec/vmpooler/pool_manager_migration_spec.rb @@ -40,7 +40,6 @@ create_migrating_vm vm['name'], pool, redis allow(vsphere).to receive(:find_vm).and_return(vm) allow(pooler).to receive(:get_vm_host_info).and_return([{'name' => 'host1'}, 'host1']) - expect(vsphere).to receive(:find_vm).with(vm['name']) end it 'logs VM host when migration is disabled' do diff --git a/spec/vmpooler/pool_manager_spec.rb b/spec/vmpooler/pool_manager_spec.rb index 76f51132d..6bf00bd72 100644 --- a/spec/vmpooler/pool_manager_spec.rb +++ b/spec/vmpooler/pool_manager_spec.rb @@ -26,7 +26,6 @@ it 'calls fail_pending_vm' do allow(vsphere).to receive(:find_vm).and_return(nil) allow(redis).to receive(:hget) - expect(redis).to receive(:hget).with(String, 'clone').once subject._check_pending_vm(vm, pool, timeout, vsphere) end end