Completes fix for LP #928910 - libvirt performance

This patch adds the remainder of the recommended fixes
from the original bug report:

* Modifies methods in the compute manager that relied on
  the DB power state to be in sync with the virt driver to
  instead just query the power state of the instance from the
  virt driver. This enables us to set the periodic tick to 10
  for the problematic compute.manager.Manager._sync_power_states()
  method.
* Modifies the _sync_power_states method in the following ways:
 ** Replace the call to driver.list_instances_detail() to a new,
    driver-overrideable get_num_instances() call
 ** For each instance known by the database, call driver.get_info()
    separately inside the loop instead of calling the expensive
    list_instances_detail() method that can take a very long time
    to complete on hosts with lots of instances
 ** Call greenthread.sleep(0) before each call to update the
    database power state, enabling other periodic tasks to do work

Once again, I left an inefficient default implementation of the
new driver.get_num_instances() method in the base driver class. I
need help from folks who understand the Xen/VMWare drivers to do
an override for get_num_instances() in those drivers that calls
the underlying XenAPI or VMWare API.

Change-Id: I88002689cdda32124423da320f8c542e286be51b
This commit is contained in:
Jay Pipes
2012-02-12 13:34:14 -05:00
parent d8f8bad0f2
commit 1c8ad4553b
4 changed files with 68 additions and 26 deletions

View File

@@ -635,7 +635,9 @@ class ComputeManager(manager.SchedulerDependentManager):
# tear down allocated network structure
self._deallocate_network(context, instance)
if instance['power_state'] == power_state.SHUTOFF:
current_power_state = self._get_power_state(context, instance)
if current_power_state == power_state.SHUTOFF:
self.db.instance_destroy(context, instance_id)
raise exception.Error(_('trying to destroy already destroyed'
' instance: %s') % instance_uuid)
@@ -970,10 +972,11 @@ class ComputeManager(manager.SchedulerDependentManager):
for i in xrange(max_tries):
instance_ref = self.db.instance_get_by_uuid(context, instance_uuid)
instance_id = instance_ref["id"]
instance_state = instance_ref["power_state"]
current_power_state = self._get_power_state(context, instance_ref)
expected_state = power_state.RUNNING
if instance_state != expected_state:
if current_power_state != expected_state:
self._instance_update(context, instance_id, task_state=None)
raise exception.Error(_('Failed to set admin password. '
'Instance %s is not running') %
@@ -1016,11 +1019,12 @@ class ComputeManager(manager.SchedulerDependentManager):
"""Write a file to the specified path in an instance on this host."""
context = context.elevated()
instance_ref = self.db.instance_get_by_uuid(context, instance_uuid)
instance_state = instance_ref['power_state']
current_power_state = self._get_power_state(context, instance_ref)
expected_state = power_state.RUNNING
if instance_state != expected_state:
if current_power_state != expected_state:
LOG.warn(_('trying to inject a file into a non-running '
'instance: %(instance_uuid)s (state: %(instance_state)s '
'instance: %(instance_uuid)s (state: '
'%(current_power_state)s '
'expected: %(expected_state)s)') % locals())
instance_uuid = instance_ref['uuid']
msg = _('instance %(instance_uuid)s: injecting file to %(path)s')
@@ -1034,11 +1038,12 @@ class ComputeManager(manager.SchedulerDependentManager):
"""Update agent running on an instance on this host."""
context = context.elevated()
instance_ref = self.db.instance_get_by_uuid(context, instance_uuid)
instance_state = instance_ref['power_state']
current_power_state = self._get_power_state(context, instance_ref)
expected_state = power_state.RUNNING
if instance_state != expected_state:
if current_power_state != expected_state:
LOG.warn(_('trying to update agent on a non-running '
'instance: %(instance_uuid)s (state: %(instance_state)s '
'instance: %(instance_uuid)s (state: '
'%(current_power_state)s '
'expected: %(expected_state)s)') % locals())
instance_uuid = instance_ref['uuid']
msg = _('instance %(instance_uuid)s: updating agent to %(url)s')
@@ -1423,7 +1428,8 @@ class ComputeManager(manager.SchedulerDependentManager):
def get_diagnostics(self, context, instance_uuid):
"""Retrieve diagnostics for an instance on this host."""
instance_ref = self.db.instance_get_by_uuid(context, instance_uuid)
if instance_ref["power_state"] == power_state.RUNNING:
current_power_state = self._get_power_state(context, instance_ref)
if current_power_state == power_state.RUNNING:
LOG.audit(_("instance %s: retrieving diagnostics"), instance_uuid,
context=context)
return self.driver.get_diagnostics(instance_ref)
@@ -2099,37 +2105,45 @@ class ComputeManager(manager.SchedulerDependentManager):
self.update_service_capabilities(
self.driver.get_host_stats(refresh=True))
@manager.periodic_task
@manager.periodic_task(ticks_between_runs=10)
def _sync_power_states(self, context):
"""Align power states between the database and the hypervisor.
The hypervisor is authoritative for the power_state data, so we
simply loop over all known instances for this host and update the
power_state according to the hypervisor. If the instance is not found
then it will be set to power_state.NOSTATE, because it doesn't exist
on the hypervisor.
The hypervisor is authoritative for the power_state data, but we don't
want to do an expensive call to the virt driver's list_instances_detail
method. Instead, we do a less-expensive call to get the number of
virtual machines known by the hypervisor and if the number matches the
number of virtual machines known by the database, we proceed in a lazy
loop, one database record at a time, checking if the hypervisor has the
same power state as is in the database. We call eventlet.sleep(0) after
each loop to allow the periodic task eventlet to do other work.
If the instance is not found on the hypervisor, but is in the database,
then it will be set to power_state.NOSTATE.
"""
vm_instances = self.driver.list_instances_detail()
vm_instances = dict((vm.name, vm) for vm in vm_instances)
db_instances = self.db.instance_get_all_by_host(context, self.host)
num_vm_instances = len(vm_instances)
num_vm_instances = self.driver.get_num_instances()
num_db_instances = len(db_instances)
if num_vm_instances != num_db_instances:
LOG.info(_("Found %(num_db_instances)s in the database and "
LOG.warn(_("Found %(num_db_instances)s in the database and "
"%(num_vm_instances)s on the hypervisor.") % locals())
for db_instance in db_instances:
# Allow other periodic tasks to do some work...
greenthread.sleep(0)
name = db_instance["name"]
db_power_state = db_instance['power_state']
vm_instance = vm_instances.get(name)
if vm_instance is None:
vm_power_state = power_state.NOSTATE
else:
try:
vm_instance = self.driver.get_info(name)
vm_power_state = vm_instance.state
except exception.InstanceNotFound:
msg = _("Instance %(name)s found in database but "
"not known by hypervisor. Setting power "
"state to NOSTATE") % locals()
LOG.warn(msg)
vm_power_state = power_state.NOSTATE
if vm_power_state == db_power_state:
continue

View File

@@ -552,6 +552,17 @@ class ComputeTestCase(BaseTestCase):
instance = db.instance_get_by_uuid(self.context, instance['uuid'])
self.assertEqual(instance['power_state'], power_state.NOSTATE)
def fake_driver_get_info(self2, _instance):
return {'state': power_state.NOSTATE,
'max_mem': 0,
'mem': 0,
'num_cpu': 2,
'cpu_time': 0}
self.stubs.Set(nova.virt.fake.FakeConnection, 'get_info',
fake_driver_get_info)
self.assertRaises(exception.Error,
self.compute.set_admin_password,
self.context,
@@ -1453,7 +1464,7 @@ class ComputeTestCase(BaseTestCase):
# Force the compute manager to do its periodic poll
ctxt = context.get_admin_context()
self.compute.periodic_tasks(ctxt, raise_on_error=True)
self.compute._sync_power_states(context.get_admin_context())
instances = db.instance_get_all(context.get_admin_context())
LOG.info(_("After force-killing instances: %s"), instances)

View File

@@ -122,6 +122,19 @@ class ComputeDriver(object):
# TODO(Vek): Need to pass context in for access to auth_token
raise NotImplementedError()
def get_num_instances(self):
"""Return the total number of virtual machines.
Return the number of virtual machines that the hypervisor knows
about.
:note This implementation works for all drivers, but it is
not particularly efficient. Maintainers of the virt drivers are
encouraged to override this method with something more
efficient.
"""
return len(self.list_instances())
def instance_exists(self, instance_id):
"""Checks existence of an instance on the host.

View File

@@ -281,6 +281,10 @@ class LibvirtConnection(driver.ComputeDriver):
else:
return libvirt.openAuth(uri, auth, 0)
def get_num_instances(self):
"""Efficient override of base instance_exists method."""
return self._conn.numOfDomains()
def instance_exists(self, instance_id):
"""Efficient override of base instance_exists method."""
try: