Add accounting for orphans to resource tracker.

Add accounting for orphaned instances to resource tracker.  Orphans
are instances that, for whatever reason, exist on the hypervisor
but are not accounted for in the Nova DB.

Such instances would cause the resource tracker to under-report usage
numbers, resulting in out-of-memory errors during instance builds.

Change-Id: Icc970e34e01ff8c7dfb000889e5ea6e2d0421c77
This commit is contained in:
Brian Elliott 2012-12-10 20:49:06 +00:00 committed by Gerrit Code Review
parent aef9802089
commit 07af4ceea7
7 changed files with 164 additions and 22 deletions

View File

@ -268,6 +268,11 @@ class ResourceTracker(object):
self._update_usage_from_migrations(resources, migrations)
# Detect and account for orphaned instances that may exist on the
# hypervisor, but are not in the DB:
orphans = self._find_orphaned_instances()
self._update_usage_from_orphans(resources, orphans)
self._report_final_resource_view(resources)
self._sync_compute_node(context, resources)
@ -364,8 +369,8 @@ class ResourceTracker(object):
def _update_usage(self, resources, usage, sign=1):
resources['memory_mb_used'] += sign * usage['memory_mb']
resources['local_gb_used'] += sign * usage['root_gb']
resources['local_gb_used'] += sign * usage['ephemeral_gb']
resources['local_gb_used'] += sign * usage.get('root_gb', 0)
resources['local_gb_used'] += sign * usage.get('ephemeral_gb', 0)
# free ram and disk may be negative, depending on policy:
resources['free_ram_mb'] = (resources['memory_mb'] -
@ -501,6 +506,40 @@ class ResourceTracker(object):
for instance in instances:
self._update_usage_from_instance(resources, instance)
def _find_orphaned_instances(self):
"""Given the set of instances and migrations already account for
by resource tracker, sanity check the hypervisor to determine
if there are any "orphaned" instances left hanging around.
Orphans could be consuming memory and should be accounted for in
usage calculations to guard against potential out of memory
errors.
"""
uuids1 = frozenset(self.tracked_instances.keys())
uuids2 = frozenset(self.tracked_migrations.keys())
uuids = uuids1 | uuids2
usage = self.driver.get_per_instance_usage()
vuuids = frozenset(usage.keys())
orphan_uuids = vuuids - uuids
orphans = [usage[uuid] for uuid in orphan_uuids]
return orphans
def _update_usage_from_orphans(self, resources, orphans):
    """Include orphaned instances in usage.

    :param resources: dict of resource totals, updated in place
    :param orphans: list of orphan dicts (with 'uuid' and
                    'memory_mb' keys) as returned by
                    _find_orphaned_instances()
    """
    for orphan in orphans:
        memory_mb = orphan['memory_mb']

        # Fixed: the message previously had an unbalanced "(" and
        # relied on locals() for interpolation, which breaks silently
        # if a local variable is renamed.
        LOG.warn(_("Detected running orphan instance: %(uuid)s "
                   "(consuming %(memory_mb)s MB memory)") %
                 {'uuid': orphan['uuid'], 'memory_mb': memory_mb})

        # Only the memory footprint of an orphan is known, so just
        # record memory usage for it:
        usage = {'memory_mb': memory_mb}
        self._update_usage(resources, usage)
def _verify_resources(self, resources):
resource_keys = ["vcpus", "memory_mb", "local_gb", "cpu_info",
"vcpus_used", "memory_mb_used", "local_gb_used"]

View File

@ -193,17 +193,17 @@ class BaseTestCase(test.TestCase):
# only used in the subsequent notification:
return (instance, instance)
def _tracker(self, host=None, unsupported=False):
def _driver(self):
return FakeVirtDriver()
def _tracker(self, host=None):
if host is None:
host = self.host
node = "fakenode"
if unsupported:
driver = UnsupportedVirtDriver()
else:
driver = FakeVirtDriver()
driver = self._driver()
tracker = resource_tracker.ResourceTracker(host, driver, node)
return tracker
@ -215,10 +215,13 @@ class UnsupportedDriverTestCase(BaseTestCase):
"""
def setUp(self):
super(UnsupportedDriverTestCase, self).setUp()
self.tracker = self._tracker(unsupported=True)
self.tracker = self._tracker()
# seed tracker with data:
self.tracker.update_available_resource(self.context)
def _driver(self):
return UnsupportedVirtDriver()
def test_disabled(self):
# disabled = no compute node stats
self.assertTrue(self.tracker.disabled)
@ -248,7 +251,7 @@ class UnsupportedDriverTestCase(BaseTestCase):
root_gb=10)
self.tracker.update_usage(self.context, instance)
def testDisabledResizeClaim(self):
def test_disabled_resize_claim(self):
instance = self._fake_instance()
instance_type = self._fake_instance_type_create()
claim = self.tracker.resize_claim(self.context, instance,
@ -258,7 +261,7 @@ class UnsupportedDriverTestCase(BaseTestCase):
self.assertEqual(instance_type['id'],
claim.migration['new_instance_type_id'])
def testDisabledResizeContextClaim(self):
def test_disabled_resize_context_claim(self):
instance = self._fake_instance()
instance_type = self._fake_instance_type_create()
with self.tracker.resize_claim(self.context, instance, instance_type) \
@ -327,18 +330,6 @@ class BaseTrackerTestCase(BaseTestCase):
self.tracker.update_available_resource(self.context)
self.limits = self._limits()
self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb')
self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb')
self._assert(FAKE_VIRT_VCPUS, 'vcpus')
self._assert(0, 'memory_mb_used')
self._assert(0, 'local_gb_used')
self._assert(0, 'vcpus_used')
self._assert(0, 'running_vms')
self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb')
self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb')
self.assertFalse(self.tracker.disabled)
self.assertEqual(0, self.tracker.compute_node['current_workload'])
def _fake_service_get_all_compute_by_host(self, ctx, host):
self.compute = self._create_compute_node()
self.service = self._create_service(host, compute=self.compute)
@ -412,6 +403,19 @@ class TrackerTestCase(BaseTrackerTestCase):
self.assertFalse(self.tracker.disabled)
self.assertTrue(self.updated)
def test_init(self):
    # The tracker was seeded via update_available_resource() during
    # setUp; verify the initial resource view matches the fake virt
    # driver's totals, with nothing in use yet.
    self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb')
    self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb')
    self._assert(FAKE_VIRT_VCPUS, 'vcpus')
    self._assert(0, 'memory_mb_used')
    self._assert(0, 'local_gb_used')
    self._assert(0, 'vcpus_used')
    self._assert(0, 'running_vms')
    # With zero usage, free resources equal the totals:
    self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb')
    self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb')
    self.assertFalse(self.tracker.disabled)
    self.assertEqual(0, self.tracker.compute_node['current_workload'])
class InstanceClaimTestCase(BaseTrackerTestCase):
@ -817,3 +821,31 @@ class ResizeClaimTestCase(BaseTrackerTestCase):
self.assertEqual('fakehost', instance['host'])
self.assertEqual('fakehost', instance['launched_on'])
self.assertEqual('fakenode', instance['node'])
class OrphanTestCase(BaseTrackerTestCase):
    """Verify that instances running on the hypervisor but missing
    from the Nova DB (orphans) are detected and included in the
    resource tracker's usage totals.
    """
    # NOTE: the redundant setUp() override that only called
    # super().setUp() was removed; the base class setUp suffices.

    def _driver(self):
        # Fake virt driver reporting two running instances that the
        # resource tracker has no DB record of:
        class OrphanVirtDriver(FakeVirtDriver):
            def get_per_instance_usage(self):
                return {
                    '1-2-3-4-5': {'memory_mb': 4, 'uuid': '1-2-3-4-5'},
                    '2-3-4-5-6': {'memory_mb': 4, 'uuid': '2-3-4-5-6'},
                }

        return OrphanVirtDriver()

    def test_usage(self):
        # 2 orphans at 4 MB each should be counted as used memory:
        self.assertEqual(8, self.tracker.compute_node['memory_mb_used'])

    def test_find(self):
        # Create one legit instance and verify the 2 orphans remain:
        self._fake_instance()
        orphans = self.tracker._find_orphaned_instances()
        self.assertEqual(2, len(orphans))

View File

@ -1016,6 +1016,33 @@ class XenAPIVMTestCase(stubs.XenAPITestBase):
pass
self.assertTrue(was['called'])
def test_per_instance_usage_running(self):
    # A spawned (running) instance should be reported by the driver
    # with its instance type's memory footprint.
    instance = self._create_instance(spawn=True)
    instance_type = instance_types.get_instance_type(3)
    expected = {instance['uuid']: {'memory_mb': instance_type['memory_mb'],
                                   'uuid': instance['uuid']}}

    actual = self.conn.get_per_instance_usage()
    self.assertEqual(expected, actual)

    # Paused instances still consume resources:
    self.conn.pause(instance)
    actual = self.conn.get_per_instance_usage()
    self.assertEqual(expected, actual)
def test_per_instance_usage_suspended(self):
    # Suspended instances do not consume memory, so they should be
    # absent from the usage report:
    instance = self._create_instance(spawn=True)
    self.conn.suspend(instance)
    actual = self.conn.get_per_instance_usage()
    self.assertEqual({}, actual)
def test_per_instance_usage_halted(self):
    # Powered-off instances do not consume memory, so they should be
    # absent from the usage report:
    instance = self._create_instance(spawn=True)
    self.conn.power_off(instance)
    actual = self.conn.get_per_instance_usage()
    self.assertEqual({}, actual)
def _create_instance(self, instance_id=1, spawn=True):
"""Creates and spawns a test instance."""
instance_values = {

View File

@ -767,6 +767,13 @@ class ComputeDriver(object):
stats = [stats]
return [s['hypervisor_hostname'] for s in stats]
def get_per_instance_usage(self):
    """Retrieve per-instance resource consumption from the hypervisor.

    Base implementation: drivers that cannot report usage return an
    empty mapping.

    :returns: dict of nova uuid => dict of usage info
    """
    return {}
def load_compute_driver(virtapi, compute_driver=None):
"""Load a compute driver module.

View File

@ -607,6 +607,14 @@ class XenAPIDriver(driver.ComputeDriver):
"""resume guest state when a host is booted"""
self._vmops.power_on(instance)
def get_per_instance_usage(self):
    """Report per-instance resource consumption for this host.

    Delegates to the vmops layer, which queries XenAPI directly.

    :returns: dict of nova uuid => dict of usage info
    """
    return self._vmops.get_per_instance_usage()
class XenAPISession(object):
"""The session to invoke XenAPI SDK calls"""

View File

@ -635,6 +635,14 @@ class SessionBase(object):
db_ref['power_state'] = 'Halted'
VM_clean_shutdown = VM_hard_shutdown
def VM_suspend(self, session, vm_ref):
    # Mark the fake VM record as suspended so tests can observe the
    # power-state transition through the stubbed XenAPI session.
    db_ref = _db_content['VM'][vm_ref]
    db_ref['power_state'] = 'Suspended'
def VM_pause(self, session, vm_ref):
    # Mark the fake VM record as paused so tests can observe the
    # power-state transition through the stubbed XenAPI session.
    db_ref = _db_content['VM'][vm_ref]
    db_ref['power_state'] = 'Paused'
def pool_eject(self, session, host_ref):
pass

View File

@ -1639,3 +1639,24 @@ class VMOps(object):
with excutils.save_and_reraise_exception():
recover_method(context, instance, destination_hostname,
block_migration)
def get_per_instance_usage(self):
    """Get usage info about each active instance.

    Walks every VM known to the XenAPI session and reports memory
    usage for those that are consuming resources (running or paused)
    and carry a nova uuid in their other_config.

    :returns: dict of nova uuid => {'memory_mb': ..., 'uuid': ...}
    """
    usage = {}
    for vm_ref, vm_rec in vm_utils.list_vms(self._session):
        # Skip VMs not managed by nova (no uuid recorded):
        nova_uuid = vm_rec['other_config'].get('nova_uuid')
        if nova_uuid is None:
            continue
        # Only running/paused VMs consume memory:
        if vm_rec['power_state'].lower() not in ('running', 'paused'):
            continue
        memory_mb = int(vm_rec['memory_static_max']) / 1024 / 1024
        usage[nova_uuid] = {'memory_mb': memory_mb, 'uuid': nova_uuid}
    return usage