diff --git a/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py b/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py index a9672d70..c9342c22 100755 --- a/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py +++ b/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py @@ -53,8 +53,12 @@ class EnabledState(state_machine.State): return HOST_STATE.DISABLING elif HOST_EVENT.TASK_COMPLETED == event: + # Do not disable this host if only the compute service is disabled. + # We will raise an alarm, but there is no way to safely move work + # off the host if the compute service is down. if objects.HOST_SERVICE_STATE.ENABLED != \ - host.host_service_state_aggregate(): + host.host_service_state_aggregate( + ignore_services=[objects.HOST_SERVICES.COMPUTE]): if not host.host_services_locked: DLOG.info("Host services are not enabled on %s. " "Disabling host." % host.name) @@ -62,6 +66,7 @@ class EnabledState(state_machine.State): else: DLOG.info("Host services are not enabled on %s. " "Host services are locked." % host.name) + elif HOST_EVENT.TASK_FAILED == event: DLOG.info("Audit failed for %s." % host.name) diff --git a/nfv/nfv-vim/nfv_vim/objects/_host.py b/nfv/nfv-vim/nfv_vim/objects/_host.py index bbc3c8f7..536aa92e 100755 --- a/nfv/nfv-vim/nfv_vim/objects/_host.py +++ b/nfv/nfv-vim/nfv_vim/objects/_host.py @@ -185,10 +185,12 @@ class Host(ObjectData): """ return self._host_service_state[service] - def host_service_state_aggregate(self): + def host_service_state_aggregate(self, ignore_services=None): """ Returns the overall state of the host services """ + if ignore_services is None: + ignore_services = [] all_enabled = True at_least_one_failed = False for service, service_state in self._host_service_state.items(): @@ -196,6 +198,9 @@ class Host(ObjectData): # there is no query function for that sevice. if service == HOST_SERVICES.CONTAINER: continue + # Ignore services we were told to ignore + if service in ignore_services: + continue all_enabled = all_enabled and \ (service_state == HOST_SERVICE_STATE.ENABLED) at_least_one_failed = at_least_one_failed or \ @@ -758,30 +763,39 @@ class Host(ObjectData): if service is not None: if host_service_state == self._host_service_state[service]: + # No change to the state of the service return self._host_service_state[service] = host_service_state - # Host services logs and alarms only apply to worker hosts - if 'worker' in self.personality: - host_service_state_overall = \ - self.host_service_state_aggregate() - if (HOST_SERVICE_STATE.ENABLED == - host_service_state_overall): + # Host services logs and alarms only apply to the compute service on + # worker hosts + if 'worker' in self.personality and HOST_SERVICES.COMPUTE == service: + if HOST_SERVICE_STATE.ENABLED == host_service_state: self._events = event_log.host_issue_log( self, event_log.EVENT_ID.HOST_SERVICES_ENABLED) alarm.host_clear_alarm(self._alarms) self._alarms[:] = list() - elif (HOST_SERVICE_STATE.DISABLED == - host_service_state_overall): + elif HOST_SERVICE_STATE.DISABLED == host_service_state: + # Always log the disabled compute service self._events = event_log.host_issue_log( self, event_log.EVENT_ID.HOST_SERVICES_DISABLED) + # Clear any previous alarms for this host alarm.host_clear_alarm(self._alarms) self._alarms[:] = list() + # Alarm the disabled compute service if the host is still + # enabled and is not being locked. Alarm it as a failure. + if self.nfvi_host_is_enabled(): + if reason is None: + additional_text = '' + else: + additional_text = ", %s" % reason + self._alarms = alarm.host_raise_alarm( + self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED, + additional_text=additional_text) - elif (HOST_SERVICE_STATE.FAILED == - host_service_state_overall): + elif HOST_SERVICE_STATE.FAILED == host_service_state: if reason is None: additional_text = '' else: @@ -790,6 +804,10 @@ class Host(ObjectData): self._events = event_log.host_issue_log( self, event_log.EVENT_ID.HOST_SERVICES_FAILED, additional_text=additional_text) + # Clear any previous alarms for this host + alarm.host_clear_alarm(self._alarms) + self._alarms[:] = list() + # Alarm the failed compute service self._alarms = alarm.host_raise_alarm( self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED, additional_text=additional_text)