Make VIM tolerant of compute service failures

When the VIM detects that the nova compute service is down on
a worker host, it attempts to migrate instances off that host
(by "disabling" the host). However, migration isn't possible
while the compute service is down. The VIM then fails the
instances, which will eventually result in their evacuation (if
the host goes offline) or a reboot of the instances (if the
compute service recovers).

In a containerized deployment, when the libvirt pod is restarted
(e.g. when the stx-openstack application is re-applied), nova
reports that the compute service is down (for a short period
of time), which causes the undesirable behaviour described
above. This change updates the VIM so that it does not disable
the host in this case and instead just raises an alarm to
indicate that the compute service has failed.

Change-Id: I186d8d76bbcd87405bafec47deb92ec24580640e
Closes-Bug: 1833096
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
(cherry picked from commit a9004988dc)
This commit is contained in:
Bart Wensley 2019-08-16 13:48:09 -05:00
parent ac00a68b22
commit 4ea74a99c9
2 changed files with 35 additions and 12 deletions

View File

@ -53,8 +53,12 @@ class EnabledState(state_machine.State):
return HOST_STATE.DISABLING
elif HOST_EVENT.TASK_COMPLETED == event:
# Do not disable this host if only the compute service is disabled.
# We will raise an alarm, but there is no way to safely move work
# off the host if the compute service is down.
if objects.HOST_SERVICE_STATE.ENABLED != \
host.host_service_state_aggregate():
host.host_service_state_aggregate(
ignore_services=[objects.HOST_SERVICES.COMPUTE]):
if not host.host_services_locked:
DLOG.info("Host services are not enabled on %s. "
"Disabling host." % host.name)
@ -62,6 +66,7 @@ class EnabledState(state_machine.State):
else:
DLOG.info("Host services are not enabled on %s. "
"Host services are locked." % host.name)
elif HOST_EVENT.TASK_FAILED == event:
DLOG.info("Audit failed for %s." % host.name)

View File

@ -185,10 +185,12 @@ class Host(ObjectData):
"""
return self._host_service_state[service]
def host_service_state_aggregate(self):
def host_service_state_aggregate(self, ignore_services=None):
"""
Returns the overall state of the host services
"""
if ignore_services is None:
ignore_services = []
all_enabled = True
at_least_one_failed = False
for service, service_state in self._host_service_state.items():
@ -196,6 +198,9 @@ class Host(ObjectData):
# there is no query function for that service.
if service == HOST_SERVICES.CONTAINER:
continue
# Ignore services we were told to ignore
if service in ignore_services:
continue
all_enabled = all_enabled and \
(service_state == HOST_SERVICE_STATE.ENABLED)
at_least_one_failed = at_least_one_failed or \
@ -758,30 +763,39 @@ class Host(ObjectData):
if service is not None:
if host_service_state == self._host_service_state[service]:
# No change to the state of the service
return
self._host_service_state[service] = host_service_state
# Host services logs and alarms only apply to worker hosts
if 'worker' in self.personality:
host_service_state_overall = \
self.host_service_state_aggregate()
if (HOST_SERVICE_STATE.ENABLED ==
host_service_state_overall):
# Host services logs and alarms only apply to the compute service on
# worker hosts
if 'worker' in self.personality and HOST_SERVICES.COMPUTE == service:
if HOST_SERVICE_STATE.ENABLED == host_service_state:
self._events = event_log.host_issue_log(
self, event_log.EVENT_ID.HOST_SERVICES_ENABLED)
alarm.host_clear_alarm(self._alarms)
self._alarms[:] = list()
elif (HOST_SERVICE_STATE.DISABLED ==
host_service_state_overall):
elif HOST_SERVICE_STATE.DISABLED == host_service_state:
# Always log the disabled compute service
self._events = event_log.host_issue_log(
self, event_log.EVENT_ID.HOST_SERVICES_DISABLED)
# Clear any previous alarms for this host
alarm.host_clear_alarm(self._alarms)
self._alarms[:] = list()
# Alarm the disabled compute service if the host is still
# enabled and is not being locked. Alarm it as a failure.
if self.nfvi_host_is_enabled():
if reason is None:
additional_text = ''
else:
additional_text = ", %s" % reason
self._alarms = alarm.host_raise_alarm(
self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED,
additional_text=additional_text)
elif (HOST_SERVICE_STATE.FAILED ==
host_service_state_overall):
elif HOST_SERVICE_STATE.FAILED == host_service_state:
if reason is None:
additional_text = ''
else:
@ -790,6 +804,10 @@ class Host(ObjectData):
self._events = event_log.host_issue_log(
self, event_log.EVENT_ID.HOST_SERVICES_FAILED,
additional_text=additional_text)
# Clear any previous alarms for this host
alarm.host_clear_alarm(self._alarms)
self._alarms[:] = list()
# Alarm the failed compute service
self._alarms = alarm.host_raise_alarm(
self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED,
additional_text=additional_text)