
Merge "Error out interrupted builds" into stable/queens

changes/39/693239/1
Zuul (Gerrit Code Review) · 4 months ago
parent commit b69648dd2e
3 changed files with 278 additions and 27 deletions
  1. +104  -9    nova/compute/manager.py
  2. +7    -14   nova/tests/functional/compute/test_init_host.py
  3. +167  -4    nova/tests/unit/compute/test_compute_mgr.py

+104 -9  nova/compute/manager.py

@@ -644,6 +644,10 @@ class ComputeManager(manager.Manager):
         Then allocations are removed from Placement for every instance that is
         evacuated from this host regardless if the instance is reported by the
         hypervisor or not.
 
+        :param context: The request context
+        :return: A dict keyed by instance uuid mapped to Migration objects
+                 for instances that were migrated away from this host
         """
         filters = {
             'source_compute': self.host,
@@ -664,7 +668,7 @@ class ComputeManager(manager.Manager):
         evacuations = objects.MigrationList.get_by_filters(context,
                                                            filters)
         if not evacuations:
-            return
+            return {}
         evacuations = {mig.instance_uuid: mig for mig in evacuations}
 
         # The instances might be deleted in which case we need to avoid
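Returning an empty dict instead of None lets callers treat the result uniformly as a mapping; the new test_destroy_evacuated_instances_no_migrations unit test at the bottom of this change pins that down. A minimal sketch of why the caller cares, mirroring the init_host code added further down in this diff:

    evacuated = self._destroy_evacuated_instances(context)  # {} when none
    # set(dict) iterates the keys, so the union works even when it is empty;
    # with the old None return this would have raised a TypeError.
    already_handled = {inst.uuid for inst in instances}.union(evacuated)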
@@ -851,9 +855,8 @@ class ComputeManager(manager.Manager):
             # instance has already been scheduled to this particular host.
             LOG.debug("Instance failed to spawn correctly, "
                       "setting to ERROR state", instance=instance)
-            instance.task_state = None
-            instance.vm_state = vm_states.ERROR
-            instance.save()
+            self._set_instance_obj_error_state(
+                context, instance, clean_task_state=True)
             return
 
         if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
@@ -864,9 +867,8 @@ class ComputeManager(manager.Manager):
             # spawned so set to ERROR state. This is consistent with BUILD
             LOG.debug("Instance failed to rebuild correctly, "
                       "setting to ERROR state", instance=instance)
-            instance.task_state = None
-            instance.vm_state = vm_states.ERROR
-            instance.save()
+            self._set_instance_obj_error_state(
+                context, instance, clean_task_state=True)
             return
 
         if (instance.vm_state != vm_states.ERROR and
@@ -1204,10 +1206,24 @@ class ComputeManager(manager.Manager):
 
             # Initialise instances on the host that are not evacuating
             for instance in instances:
-                if (not evacuated_instances or
-                        instance.uuid not in evacuated_instances):
+                if instance.uuid not in evacuated_instances:
                     self._init_instance(context, instance)
 
+            # NOTE(gibi): collect all the instance uuids that were already
+            # handled above in some way, either by _init_instance or by
+            # _destroy_evacuated_instances. This way we can limit the scope
+            # of the _error_out_instances_whose_build_was_interrupted call
+            # to look only for instances that have allocations on this node
+            # but were not handled by the above calls.
+            already_handled = {instance.uuid for instance in instances}.union(
+                evacuated_instances)
+            # NOTE(gibi): If ironic and vcenter virt driver slow start time
+            # becomes problematic here then we should consider adding a
+            # config option or a driver flag to tell us if we should thread
+            # this out in the background on startup
+            self._error_out_instances_whose_build_was_interrupted(
+                context, already_handled)
+
         finally:
             if CONF.defer_iptables_apply:
                 self.driver.filter_defer_apply_off()
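The recovery call added above reduces to set algebra over placement consumers. A small self-contained illustration of the bookkeeping (the UUID strings are made up):

    # Instances assigned to this host are handled by _init_instance;
    # instances evacuated away are handled by _destroy_evacuated_instances.
    instances_on_host = {'uuid-1', 'uuid-2'}
    evacuated = {'uuid-3'}
    already_handled = instances_on_host.union(evacuated)

    # Placement consumers for this node; 'uuid-4' is an interrupted build.
    allocations = {'uuid-1': {}, 'uuid-2': {}, 'uuid-3': {}, 'uuid-4': {}}
    leftover = set(allocations) - already_handled
    assert leftover == {'uuid-4'}  # only this one gets the BUILDING check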
@@ -1220,6 +1236,85 @@ class ComputeManager(manager.Manager):
             # _sync_scheduler_instance_info periodic task will.
             self._update_scheduler_instance_info(context, instances)
 
+    def _error_out_instances_whose_build_was_interrupted(
+            self, context, already_handled_instances):
+        """If there are instances in BUILDING state that are not assigned
+        to this host but have allocations in placement towards this
+        compute, then the nova-compute service was restarted while those
+        instances waited for the resource claim to finish and for
+        _set_instance_host_and_node() to update the instance.host field.
+        We need to push them to ERROR state here to prevent keeping them
+        in BUILDING state forever.
+
+        :param context: The request context
+        :param already_handled_instances: The set of instance UUIDs that
+            the host initialization process already handled in some way.
+        """
+
+        # Strategy:
+        # 1) Get the allocations from placement for our compute node(s)
+        # 2) Remove the already handled instances from the consumer list;
+        #    they are either already initialized or need to be skipped.
+        # 3) Check which remaining consumer is an instance in BUILDING
+        #    state and push it to ERROR state.
+
+        LOG.info(
+            "Looking for unclaimed instances stuck in BUILDING status for "
+            "nodes managed by this host")
+
+        try:
+            node_names = self.driver.get_available_nodes()
+        except exception.VirtDriverNotReady:
+            LOG.warning(
+                "Virt driver is not ready. Therefore unable to error out any "
+                "instances stuck in BUILDING state on this node. If this is "
+                "the first time this service is starting on this host, then "
+                "you can ignore this warning.")
+            return
+
+        for node_name in node_names:
+            try:
+                cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
+                    context, self.host, node_name).uuid
+            except exception.ComputeHostNotFound:
+                LOG.warning(
+                    "Compute node %s not found in the database and "
+                    "therefore unable to error out any instances stuck in "
+                    "BUILDING state on this node. If this is the first "
+                    "time this service is starting on this host, then you "
+                    "can ignore this warning.", node_name)
+                continue
+
+            f = self.reportclient.get_allocations_for_resource_provider
+            allocations = f(context, cn_uuid)
+            if not allocations:
+                LOG.error(
+                    "Could not retrieve compute node resource provider %s "
+                    "and therefore unable to error out any instances stuck "
+                    "in BUILDING state.", cn_uuid)
+                continue
+
+            not_handled_consumers = (set(allocations) -
+                                     already_handled_instances)
+
+            if not not_handled_consumers:
+                continue
+
+            filters = {
+                'vm_state': vm_states.BUILDING,
+                'uuid': not_handled_consumers
+            }
+
+            instances = objects.InstanceList.get_by_filters(
+                context, filters, expected_attrs=[])
+
+            for instance in instances:
+                LOG.debug(
+                    "Instance spawn was interrupted before instance_claim, "
+                    "setting instance to ERROR state", instance=instance)
+                self._set_instance_obj_error_state(
+                    context, instance, clean_task_state=True)
+
     def cleanup_host(self):
         self.driver.register_event_listener(None)
         self.instance_events.cancel_all_events()

+7 -14  nova/tests/functional/compute/test_init_host.py

@@ -178,21 +178,14 @@ class TestComputeRestartInstanceStuckInBuild(
         # instance_claim call we can wait for in the test
         fake_notifier.wait_for_versioned_notifications(
             'instance.create.start')
-        self.restart_compute_service(self.compute1)
-
-        # This is bug 1833581 as the server remains in BUILD state after the
-        # compute restart.
-        self._wait_for_state_change(self.admin_api, server, 'BUILD')
-
-        # Not even the periodic task pushes this server to ERROR because the
-        # server host is still None since the instance_claim didn't set it.
-        self.flags(instance_build_timeout=1)
-        self.compute1.manager._check_instance_build_time(
-            nova_context.get_admin_context())
-        server = self.admin_api.get_server(server['id'])
-        self.assertEqual('BUILD', server['status'])
-        self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
+        with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
+            self.restart_compute_service(self.compute1)
 
         # We expect that the instance is pushed to ERROR state during the
         # compute restart.
-        # self._wait_for_state_change(self.admin_api, server, 'ERROR')
+        self._wait_for_state_change(self.admin_api, server, 'ERROR')
+        mock_log.assert_called_with(
+            'Instance spawn was interrupted before instance_claim, setting '
+            'instance to ERROR state',
+            instance=mock.ANY)
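Asserting on the patched logger, as the test now does, ties the observed ERROR state to the new recovery path rather than to some other transition. The same pattern in isolation (logger name and message here are stand-ins, not Nova's):

    import logging
    from unittest import mock

    LOG = logging.getLogger('demo')

    def recovery_path():
        # stand-in for the code that errors out the stuck instance
        LOG.debug('interrupted before instance_claim')

    with mock.patch.object(LOG, 'debug') as mock_log:
        recovery_path()
    mock_log.assert_called_once_with('interrupted before instance_claim')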

+167 -4  nova/tests/unit/compute/test_compute_mgr.py

@@ -622,6 +622,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             return instance_obj._make_instance_list(
                 self.context, objects.InstanceList(), db_list, None)
 
+        @mock.patch.object(manager.ComputeManager,
+                           '_error_out_instances_whose_build_was_interrupted')
         @mock.patch.object(fake_driver.FakeDriver, 'init_host')
         @mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_on')
         @mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_off')
@@ -634,7 +636,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         def _do_mock_calls(mock_update_scheduler, mock_inst_init,
                            mock_destroy, mock_admin_ctxt, mock_host_get,
                            mock_filter_off, mock_filter_on, mock_init_host,
-                           defer_iptables_apply):
+                           mock_error_interrupted, defer_iptables_apply):
             mock_admin_ctxt.return_value = self.context
             inst_list = _make_instance_list(startup_instances)
             mock_host_get.return_value = inst_list
@@ -658,6 +660,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock_update_scheduler.assert_called_once_with(
                 self.context, inst_list)
 
+            mock_error_interrupted.assert_called_once_with(
+                self.context, {inst.uuid for inst in inst_list})
+
         # Test with defer_iptables_apply
         self.flags(defer_iptables_apply=True)
         _do_mock_calls(defer_iptables_apply=True)
@@ -666,6 +671,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.flags(defer_iptables_apply=False)
         _do_mock_calls(defer_iptables_apply=False)
 
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_error_out_instances_whose_build_was_interrupted')
     @mock.patch('nova.objects.InstanceList.get_by_host',
                 return_value=objects.InstanceList())
     @mock.patch('nova.compute.manager.ComputeManager.'
@@ -675,13 +682,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_update_scheduler_instance_info', mock.NonCallableMock())
     def test_init_host_no_instances(self, mock_destroy_evac_instances,
-                                    mock_get_by_host):
+                                    mock_get_by_host, mock_error_interrupted):
         """Tests the case that init_host runs and there are no instances
         on this host yet (it's brand new). Uses NonCallableMock for the
         methods we assert should not be called.
         """
         self.compute.init_host()
 
+        mock_error_interrupted.assert_called_once_with(mock.ANY, set())
+
     @mock.patch('nova.objects.InstanceList')
     @mock.patch('nova.objects.MigrationList.get_by_filters')
     def test_cleanup_host(self, mock_miglist_get, mock_instance_list):
@@ -711,6 +720,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.assertFalse(mock_register.called)
 
     @mock.patch('nova.context.RequestContext.elevated')
+    @mock.patch.object(manager.ComputeManager,
+                       '_error_out_instances_whose_build_was_interrupted')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.scheduler.utils.resources_from_flavor')
     @mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
@@ -725,7 +736,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
             mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
             mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
-            mock_get_node, mock_elevated):
+            mock_get_node, mock_error_interrupted, mock_elevated):
         our_host = self.compute.host
         not_our_host = 'not-' + our_host
 
@@ -775,6 +786,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock.ANY, mock.ANY, mock.ANY)
         mock_save.assert_called_once_with()
 
+        mock_error_interrupted.assert_called_once_with(
+            self.context, {deleted_instance.uuid})
+
+    @mock.patch.object(manager.ComputeManager,
+                       '_error_out_instances_whose_build_was_interrupted')
     @mock.patch.object(context, 'get_admin_context')
     @mock.patch.object(objects.InstanceList, 'get_by_host')
     @mock.patch.object(fake_driver.FakeDriver, 'init_host')
@@ -783,7 +799,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                        '_destroy_evacuated_instances')
     def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
             mock_init_instance, mock_init_host, mock_host_get,
-            mock_admin_ctxt):
+            mock_admin_ctxt, mock_error_interrupted):
         """Assert that init_instance is not called for instances that are
         evacuating from the host during init_host.
         """
@@ -804,6 +820,145 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
 
         mock_init_instance.assert_called_once_with(
             self.context, active_instance)
+        mock_error_interrupted.assert_called_once_with(
+            self.context, {active_instance.uuid, evacuating_instance.uuid})
+
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceList, 'get_by_filters')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations, mock_get_instances, mock_instance_save):
+
+        mock_get_nodes.return_value = ['fake-node']
+        mock_get_by_host_and_node.return_value = objects.ComputeNode(
+            host=self.compute.host, uuid=uuids.cn_uuid)
+
+        active_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.active_instance)
+        evacuating_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.evac_instance)
+        interrupted_instance = fake_instance.fake_instance_obj(
+            self.context, host=None, uuid=uuids.interrupted_instance,
+            vm_state=vm_states.BUILDING)
+
+        # We have 3 different instances. We need consumers for each instance
+        # in placement and an extra consumer that is not an instance.
+        allocations = {
+            uuids.active_instance: "fake-resources-active",
+            uuids.evac_instance: "fake-resources-evacuating",
+            uuids.interrupted_instance: "fake-resources-interrupted",
+            uuids.not_an_instance: "fake-resources-not-an-instance",
+        }
+        mock_get_allocations.return_value = allocations
+
+        # The get is called with a uuid filter containing
+        # interrupted_instance and not_an_instance, but it will only return
+        # the interrupted_instance as not_an_instance does not match any
+        # instance in the db.
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[interrupted_instance])
+
+        # interrupted_instance is not in the already-handled set passed in
+        # because it is not assigned to this compute and therefore was not
+        # processed by init_host via _init_instance.
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context,
+            {inst.uuid for inst in [active_instance, evacuating_instance]})
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_called_once_with(
+            self.context,
+            {'vm_state': 'building',
+             'uuid': {uuids.interrupted_instance, uuids.not_an_instance}},
+            expected_attrs=[])
+
+        # This is expected to be called only once, for interrupted_instance.
+        mock_instance_save.assert_called_once_with()
+        self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
+
+    @mock.patch.object(manager.LOG, 'warning')
+    @mock.patch.object(
+        fake_driver.FakeDriver, 'get_available_nodes',
+        side_effect=exception.VirtDriverNotReady)
+    def test_init_host_with_interrupted_instance_build_driver_not_ready(
+            self, mock_get_nodes, mock_log_warning):
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_log_warning.assert_called_once_with(
+            "Virt driver is not ready. Therefore unable to error out any "
+            "instances stuck in BUILDING state on this node. If this is "
+            "the first time this service is starting on this host, then "
+            "you can ignore this warning.")
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_compute_node_not_found(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations):
+
+        mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
+        mock_get_by_host_and_node.side_effect = [
+            exception.ComputeHostNotFound(host='fake-node1'),
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn_uuid)]
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        # Check that nova skips the node that is not found in the db and
+        # continues with the next one.
+        mock_get_by_host_and_node.assert_has_calls(
+            [
+                mock.call(self.context, self.compute.host, 'fake-node1'),
+                mock.call(self.context, self.compute.host, 'fake-node2'),
+            ]
+        )
+
+        # Placement is only queried for the existing compute node.
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_compute_rp_not_found(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations):
+
+        mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
+        mock_get_by_host_and_node.side_effect = [
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn1_uuid),
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn2_uuid),
+        ]
+
+        mock_get_allocations.side_effect = [
+            {},
+            {uuids.active_instance: "fake-resources"}
+        ]
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, {uuids.active_instance})
+
+        # Check that nova skips the node that is not found in placement and
+        # continues with the next one.
+        mock_get_allocations.assert_has_calls(
+            [
+                mock.call(self.context, uuids.cn1_uuid),
+                mock.call(self.context, uuids.cn2_uuid),
+            ]
+        )
+
     def test_init_instance_with_binding_failed_vif_type(self):
         # this instance will plug a 'binding_failed' vif
@@ -3609,6 +3764,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self._do_test_set_admin_password_driver_error(
             exc, vm_states.ACTIVE, None, expected_exception)
 
+    def test_destroy_evacuated_instances_no_migrations(self):
+        with mock.patch(
+                'nova.objects.MigrationList.get_by_filters') as migration_list:
+            migration_list.return_value = []
+
+            result = self.compute._destroy_evacuated_instances(self.context)
+            self.assertEqual({}, result)
+
     def test_destroy_evacuated_instances(self):
         our_host = self.compute.host
         flavor = objects.Flavor()

