Skip to content

Commit

Permalink
(POOLER-26) Fix lost VMs getting stuck in pending
Browse files Browse the repository at this point in the history
This commit updates vmpooler to understand how to resolve a situation
where a pending VM does not exist. Without this change a pending VM that
does not exist in vmware inventory gets stuck in the pending state,
preventing the pool from ever reaching its target capacity.

As a part of this change the find_vm method is updated to perform a
light, then heavy search each time find_vm is called and all usage of
find_vm || find_vm_heavy is replaced. This makes find_vm usage
consistent across pool_manager.

Additionally, open_socket method is updated to resolve an incorrect
reference to the host name.
  • Loading branch information
mattkirby committed Nov 17, 2016
1 parent c217d73 commit 8434048
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 30 deletions.
61 changes: 33 additions & 28 deletions lib/vmpooler/pool_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ def open_socket(host, domain=nil, timeout=5, port=22)
Timeout.timeout(timeout) do
target_host = host
target_host = "#{host}.#{domain}" if domain
sock = TCPSocket.new target_host, port
sock
TCPSocket.new target_host, port
end
end

Expand All @@ -40,25 +39,38 @@ def _check_pending_vm(vm, pool, timeout, vsphere)

if host
begin
open_socket vm
socket = open_socket vm
fail_pending_vm(vm, pool, timeout) if ! socket
move_pending_vm_to_ready(vm, pool, host)
rescue
fail_pending_vm(vm, pool, timeout)
end
else
fail_pending_vm(vm, pool, timeout)
fail_pending_vm(vm, pool, timeout, false)
end
end

def fail_pending_vm(vm, pool, timeout)
clone_stamp = $redis.hget('vmpooler__vm__' + vm, 'clone')

if (clone_stamp) &&
(((Time.now - Time.parse(clone_stamp)) / 60) > timeout)

$redis.smove('vmpooler__pending__' + pool, 'vmpooler__completed__' + pool, vm)
def remove_nonexistent_vm(vm, pool)
$redis.srem("vmpooler__pending__#{pool}", vm)
$logger.log('d', "[!] [#{pool}] '#{vm}' no longer exists. Removing from pending.")
end

$logger.log('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes")
def fail_pending_vm(vm, pool, timeout, exists=true)
begin
clone_stamp = $redis.hget("vmpooler__vm__#{vm}", 'clone')
return if ! clone_stamp

time_since_clone = (Time.now - Time.parse(clone_stamp)) / 60
if time_since_clone > timeout
if exists
$redis.smove('vmpooler__pending__' + pool, 'vmpooler__completed__' + pool, vm)
$logger.log('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes")
else
remove_nonexistent_vm(vm, pool)
end
end
rescue => err
$logger.log('d', "Fail pending VM failed with an error: #{err}")
end
end

Expand Down Expand Up @@ -101,8 +113,7 @@ def check_ready_vm(vm, pool, ttl, vsphere)

$redis.hset('vmpooler__vm__' + vm, 'check', Time.now)

host = vsphere.find_vm(vm) ||
vsphere.find_vm_heavy(vm)[vm]
host = vsphere.find_vm(vm)

if host
if
Expand Down Expand Up @@ -281,8 +292,7 @@ def destroy_vm(vm, pool, vsphere)
# Auto-expire metadata key
$redis.expire('vmpooler__vm__' + vm, ($config[:redis]['data_ttl'].to_i * 60 * 60))

host = vsphere.find_vm(vm) ||
vsphere.find_vm_heavy(vm)[vm]
host = vsphere.find_vm(vm)

if host
start = Time.now
Expand Down Expand Up @@ -312,8 +322,7 @@ def create_vm_disk(vm, disk_size, vsphere)
end

def _create_vm_disk(vm, disk_size, vsphere)
host = vsphere.find_vm(vm) ||
vsphere.find_vm_heavy(vm)[vm]
host = vsphere.find_vm(vm)

if (host) && ((! disk_size.nil?) && (! disk_size.empty?) && (disk_size.to_i > 0))
$logger.log('s', "[ ] [disk_manager] '#{vm}' is attaching a #{disk_size}gb disk")
Expand Down Expand Up @@ -353,8 +362,7 @@ def create_vm_snapshot(vm, snapshot_name, vsphere)
end

def _create_vm_snapshot(vm, snapshot_name, vsphere)
host = vsphere.find_vm(vm) ||
vsphere.find_vm_heavy(vm)[vm]
host = vsphere.find_vm(vm)

if (host) && ((! snapshot_name.nil?) && (! snapshot_name.empty?))
$logger.log('s', "[ ] [snapshot_manager] '#{vm}' is being snapshotted")
Expand Down Expand Up @@ -383,8 +391,7 @@ def revert_vm_snapshot(vm, snapshot_name, vsphere)
end

def _revert_vm_snapshot(vm, snapshot_name, vsphere)
host = vsphere.find_vm(vm) ||
vsphere.find_vm_heavy(vm)[vm]
host = vsphere.find_vm(vm)

if host
snapshot = vsphere.find_snapshot(host, snapshot_name)
Expand Down Expand Up @@ -466,10 +473,6 @@ def _check_snapshot_queue(vsphere)
end
end

def find_vsphere_pool_vm(pool, vm, vsphere)
vsphere.find_vm(vm) || vsphere.find_vm_heavy(vm)[vm]
end

def migration_limit(migration_limit)
# Returns migration_limit setting when enabled
return false if migration_limit == 0 || ! migration_limit
Expand All @@ -485,7 +488,7 @@ def migrate_vm(vm, pool, vsphere)
def _migrate_vm(vm, pool, vsphere)
begin
$redis.srem('vmpooler__migrating__' + pool, vm)
vm_object = find_vsphere_pool_vm(pool, vm, vsphere)
vm_object = vsphere.find_vm(vm)
parent_host, parent_host_name = get_vm_host_info(vm_object)
migration_limit = migration_limit $config[:config]['migration_limit']
migration_count = $redis.scard('vmpooler__migration')
Expand Down Expand Up @@ -600,12 +603,14 @@ def _check_pool(pool, vsphere)

# PENDING
$redis.smembers("vmpooler__pending__#{pool['name']}").each do |vm|
pool_timeout = pool['timeout'] || $config[:config]['timeout'] || 15
if inventory[vm]
begin
pool_timeout = pool['timeout'] || $config[:config]['timeout'] || 15
check_pending_vm(vm, pool['name'], pool_timeout, vsphere)
rescue
end
else
fail_pending_vm(vm, pool['name'], pool_timeout, false)
end
end

Expand Down
5 changes: 5 additions & 0 deletions lib/vmpooler/vsphere_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,11 @@ def find_snapshot(vm, snapshotname)

def find_vm(vmname)
ensure_connected @connection, $credentials
find_vm_light(vmname) || find_vm_heavy(vmname)[vmname]
end

def find_vm_light(vmname)
ensure_connected @connection, $credentials

@connection.searchIndex.FindByDnsName(vmSearch: true, dnsName: vmname)
end
Expand Down
1 change: 0 additions & 1 deletion spec/vmpooler/pool_manager_migration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
create_migrating_vm vm['name'], pool, redis
allow(vsphere).to receive(:find_vm).and_return(vm)
allow(pooler).to receive(:get_vm_host_info).and_return([{'name' => 'host1'}, 'host1'])
expect(vsphere).to receive(:find_vm).with(vm['name'])
end

it 'logs VM host when migration is disabled' do
Expand Down
1 change: 0 additions & 1 deletion spec/vmpooler/pool_manager_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
it 'calls fail_pending_vm' do
allow(vsphere).to receive(:find_vm).and_return(nil)
allow(redis).to receive(:hget)
expect(redis).to receive(:hget).with(String, 'clone').once
subject._check_pending_vm(vm, pool, timeout, vsphere)
end
end
Expand Down

0 comments on commit 8434048

Please sign in to comment.