From 577033189e68fe9485039edc28c4c488e4630d00 Mon Sep 17 00:00:00 2001 From: "kirby@puppetlabs.com" Date: Wed, 16 Nov 2016 16:33:45 -0800 Subject: [PATCH] (POOLER-26) Fix VMs getting stuck in pending they do not exist This commit updates vmpooler to understand how to resolve a situation where a pending VM does not exist. Without this change a pending VM that does not exist in vmware inventory gets stuck in the pending state, preventing the pool from ever reaching its target capacity. As a part of this change the find_vm method is updated to perform a light, then heavy search each time find_vm is called and all usage of find_vm || find_vm_heavy is replaced. This makes find_vm usage consistent across pool_manager. Additionally, open_socket method is updated to resolve an incorrect reference to the host name. --- lib/vmpooler/pool_manager.rb | 61 +++++++++++--------- lib/vmpooler/vsphere_helper.rb | 5 ++ spec/vmpooler/pool_manager_migration_spec.rb | 1 - spec/vmpooler/pool_manager_spec.rb | 1 - 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index c0cd67c45..a951f3052 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -30,8 +30,7 @@ def open_socket(host, domain=nil, timeout=5, port=22) Timeout.timeout(timeout) do target_host = host target_host = "#{host}.#{domain}" if domain - sock = TCPSocket.new target_host, port - sock + TCPSocket.new target_host, port end end @@ -40,25 +39,38 @@ def _check_pending_vm(vm, pool, timeout, vsphere) if host begin - open_socket vm + socket = open_socket vm + fail_pending_vm(vm, pool, timeout) if ! socket move_pending_vm_to_ready(vm, pool, host) rescue fail_pending_vm(vm, pool, timeout) end else - fail_pending_vm(vm, pool, timeout) + fail_pending_vm(vm, pool, timeout, false) end end - def fail_pending_vm(vm, pool, timeout) - clone_stamp = $redis.hget('vmpooler__vm__' + vm, 'clone') - - if (clone_stamp) && - (((Time.now - Time.parse(clone_stamp)) / 60) > timeout) - - $redis.smove('vmpooler__pending__' + pool, 'vmpooler__completed__' + pool, vm) + def remove_nonexistent_vm(vm, pool) + $redis.srem("vmpooler__pending__#{pool}", vm) + $logger.log('d', "[!] [#{pool}] '#{vm}' no longer exists. Removing from pending.") + end - $logger.log('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes") + def fail_pending_vm(vm, pool, timeout, exists=true) + begin + clone_stamp = $redis.hget("vmpooler__vm__#{vm}", 'clone') + return if ! clone_stamp + + time_since_clone = (Time.now - Time.parse(clone_stamp)) / 60 + if time_since_clone > timeout + if exists + $redis.smove('vmpooler__pending__' + pool, 'vmpooler__completed__' + pool, vm) + $logger.log('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes") + else + remove_nonexistent_vm(vm, pool) + end + end + rescue => err + $logger.log('d', "Fail pending VM failed with an error: #{err}") end end @@ -101,8 +113,7 @@ def check_ready_vm(vm, pool, ttl, vsphere) $redis.hset('vmpooler__vm__' + vm, 'check', Time.now) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if host if @@ -281,8 +292,7 @@ def destroy_vm(vm, pool, vsphere) # Auto-expire metadata key $redis.expire('vmpooler__vm__' + vm, ($config[:redis]['data_ttl'].to_i * 60 * 60)) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if host start = Time.now @@ -312,8 +322,7 @@ def create_vm_disk(vm, disk_size, vsphere) end def _create_vm_disk(vm, disk_size, vsphere) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if (host) && ((! disk_size.nil?) && (! disk_size.empty?) && (disk_size.to_i > 0)) $logger.log('s', "[ ] [disk_manager] '#{vm}' is attaching a #{disk_size}gb disk") @@ -353,8 +362,7 @@ def create_vm_snapshot(vm, snapshot_name, vsphere) end def _create_vm_snapshot(vm, snapshot_name, vsphere) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if (host) && ((! snapshot_name.nil?) && (! snapshot_name.empty?)) $logger.log('s', "[ ] [snapshot_manager] '#{vm}' is being snapshotted") @@ -383,8 +391,7 @@ def revert_vm_snapshot(vm, snapshot_name, vsphere) end def _revert_vm_snapshot(vm, snapshot_name, vsphere) - host = vsphere.find_vm(vm) || - vsphere.find_vm_heavy(vm)[vm] + host = vsphere.find_vm(vm) if host snapshot = vsphere.find_snapshot(host, snapshot_name) @@ -466,10 +473,6 @@ def _check_snapshot_queue(vsphere) end end - def find_vsphere_pool_vm(pool, vm, vsphere) - vsphere.find_vm(vm) || vsphere.find_vm_heavy(vm)[vm] - end - def migration_limit(migration_limit) # Returns migration_limit setting when enabled return false if migration_limit == 0 || ! migration_limit @@ -485,7 +488,7 @@ def migrate_vm(vm, pool, vsphere) def _migrate_vm(vm, pool, vsphere) begin $redis.srem('vmpooler__migrating__' + pool, vm) - vm_object = find_vsphere_pool_vm(pool, vm, vsphere) + vm_object = vsphere.find_vm(vm) parent_host, parent_host_name = get_vm_host_info(vm_object) migration_limit = migration_limit $config[:config]['migration_limit'] migration_count = $redis.scard('vmpooler__migration') @@ -600,12 +603,14 @@ def _check_pool(pool, vsphere) # PENDING $redis.smembers("vmpooler__pending__#{pool['name']}").each do |vm| + pool_timeout = pool['timeout'] || $config[:config]['timeout'] || 15 if inventory[vm] begin - pool_timeout = pool['timeout'] || $config[:config]['timeout'] || 15 check_pending_vm(vm, pool['name'], pool_timeout, vsphere) rescue end + else + fail_pending_vm(vm, pool['name'], pool_timeout, false) end end diff --git a/lib/vmpooler/vsphere_helper.rb b/lib/vmpooler/vsphere_helper.rb index c49bce481..5d06b64dd 100644 --- a/lib/vmpooler/vsphere_helper.rb +++ b/lib/vmpooler/vsphere_helper.rb @@ -289,6 +289,11 @@ def find_snapshot(vm, snapshotname) def find_vm(vmname) ensure_connected @connection, $credentials + find_vm_light(vmname) || find_vm_heavy(vmname)[vmname] + end + + def find_vm_light(vmname) + ensure_connected @connection, $credentials @connection.searchIndex.FindByDnsName(vmSearch: true, dnsName: vmname) end diff --git a/spec/vmpooler/pool_manager_migration_spec.rb b/spec/vmpooler/pool_manager_migration_spec.rb index 6eab80e71..9fd491b01 100644 --- a/spec/vmpooler/pool_manager_migration_spec.rb +++ b/spec/vmpooler/pool_manager_migration_spec.rb @@ -40,7 +40,6 @@ create_migrating_vm vm['name'], pool, redis allow(vsphere).to receive(:find_vm).and_return(vm) allow(pooler).to receive(:get_vm_host_info).and_return([{'name' => 'host1'}, 'host1']) - expect(vsphere).to receive(:find_vm).with(vm['name']) end it 'logs VM host when migration is disabled' do diff --git a/spec/vmpooler/pool_manager_spec.rb b/spec/vmpooler/pool_manager_spec.rb index f5c5386ee..09f9a29e8 100644 --- a/spec/vmpooler/pool_manager_spec.rb +++ b/spec/vmpooler/pool_manager_spec.rb @@ -26,7 +26,6 @@ it 'calls fail_pending_vm' do allow(vsphere).to receive(:find_vm).and_return(nil) allow(redis).to receive(:hget) - expect(redis).to receive(:hget).with(String, 'clone').once subject._check_pending_vm(vm, pool, timeout, vsphere) end end