Set "disabled reason" for compute service.

Masakari never set a reason when it disabled a compute service.
A configurable "disabled reason" has been added to the config options.

Closes-Bug: 1936181
Change-Id: I998f7884195b93927773c7186d61c13670a53662
This commit is contained in:
Mitya_Eremeev 2021-07-14 12:11:01 +03:00
parent f2e830f927
commit c861437b52
7 changed files with 42 additions and 3 deletions

View File

@ -27,6 +27,11 @@ host_recovery_group = cfg.OptGroup(
title='Host failure recovery options', title='Host failure recovery options',
help="Configuration options for host failure recovery") help="Configuration options for host failure recovery")
# Option group named 'process_failure': groups the configuration options
# that control recovery from process failures.
process_recovery_group = cfg.OptGroup(
'process_failure',
title='Process failure recovery options',
help="Configuration options for process failure recovery")
customized_recovery_flow_group = cfg.OptGroup( customized_recovery_flow_group = cfg.OptGroup(
'taskflow_driver_recovery_flows', 'taskflow_driver_recovery_flows',
title='Customized recovery flow Options', title='Customized recovery flow Options',
@ -80,6 +85,10 @@ Operators can decide whether reserved_host should be added to aggregate group
of failed compute host. When set to True, reserved host will be added to the of failed compute host. When set to True, reserved host will be added to the
aggregate group of failed compute host. When set to False, the reserved_host aggregate group of failed compute host. When set to False, the reserved_host
will not be added to the aggregate group of failed compute host."""), will not be added to the aggregate group of failed compute host."""),
cfg.StrOpt("service_disable_reason",
default="Masakari detected host failed.",
help="Compute disable reason in case Masakari detects host "
"failure."),
] ]
instance_failure_options = [ instance_failure_options = [
@ -220,14 +229,23 @@ The allowed values for this option is comma separated dictionary of object
names in between ``{`` and ``}``.""")) names in between ``{`` and ``}``."""))
] ]
# Options for process failure recovery.
# service_disable_reason: the reason string for disabling the compute
# service when Masakari detects a process failure.
process_failure_opts = [
cfg.StrOpt("service_disable_reason",
default="Masakari detected process failed.",
help="Compute disable reason in case Masakari detects process "
"failure."),
]
def register_opts(conf): def register_opts(conf):
conf.register_group(instance_recovery_group) conf.register_group(instance_recovery_group)
conf.register_group(host_recovery_group) conf.register_group(host_recovery_group)
conf.register_group(process_recovery_group)
conf.register_group(customized_recovery_flow_group) conf.register_group(customized_recovery_flow_group)
conf.register_group(taskflow_group) conf.register_group(taskflow_group)
conf.register_opts(instance_failure_options, group=instance_recovery_group) conf.register_opts(instance_failure_options, group=instance_recovery_group)
conf.register_opts(host_failure_opts, group=host_recovery_group) conf.register_opts(host_failure_opts, group=host_recovery_group)
conf.register_opts(process_failure_opts, group=process_recovery_group)
conf.register_opts(taskflow_driver_recovery_flows, conf.register_opts(taskflow_driver_recovery_flows,
group=customized_recovery_flow_group) group=customized_recovery_flow_group)
conf.register_opts(taskflow_options, group=taskflow_group) conf.register_opts(taskflow_options, group=taskflow_group)
@ -237,6 +255,7 @@ def list_opts():
return { return {
instance_recovery_group.name: instance_failure_options, instance_recovery_group.name: instance_failure_options,
host_recovery_group.name: host_failure_opts, host_recovery_group.name: host_failure_opts,
process_recovery_group.name: process_failure_opts,
taskflow_group.name: taskflow_options taskflow_group.name: taskflow_options
} }

View File

@ -48,7 +48,8 @@ class DisableComputeServiceTask(base.MasakariTask):
def execute(self, host_name): def execute(self, host_name):
msg = "Disabling compute service on host: '%s'" % host_name msg = "Disabling compute service on host: '%s'" % host_name
self.update_details(msg) self.update_details(msg)
self.novaclient.enable_disable_service(self.context, host_name) self.novaclient.enable_disable_service(self.context, host_name,
reason=CONF.host_failure.service_disable_reason)
# Sleep until nova-compute service is marked as disabled. # Sleep until nova-compute service is marked as disabled.
log_msg = ("Sleeping %(wait)s sec before starting recovery " log_msg = ("Sleeping %(wait)s sec before starting recovery "
"thread until nova recognizes the node down.") "thread until nova recognizes the node down.")

View File

@ -45,7 +45,8 @@ class DisableComputeNodeTask(base.MasakariTask):
if not self.novaclient.is_service_disabled(self.context, host_name, if not self.novaclient.is_service_disabled(self.context, host_name,
process_name): process_name):
# disable compute node on given host # disable compute node on given host
self.novaclient.enable_disable_service(self.context, host_name) self.novaclient.enable_disable_service(self.context, host_name,
reason=CONF.process_failure.service_disable_reason)
msg = "Disabled compute service on host: '%s'" % host_name msg = "Disabled compute service on host: '%s'" % host_name
self.update_details(msg, 1.0) self.update_details(msg, 1.0)
else: else:

View File

@ -52,6 +52,7 @@ class HostFailureTestCase(test.TestCase):
self.instance_host = "fake-host" self.instance_host = "fake-host"
self.novaclient = nova.API() self.novaclient = nova.API()
self.fake_client = fakes.FakeNovaClient() self.fake_client = fakes.FakeNovaClient()
self.disabled_reason = CONF.host_failure.service_disable_reason
def _verify_instance_evacuated(self, old_instance_list): def _verify_instance_evacuated(self, old_instance_list):
for server in old_instance_list: for server in old_instance_list:
@ -86,7 +87,7 @@ class HostFailureTestCase(test.TestCase):
task.execute(self.instance_host) task.execute(self.instance_host)
mock_enable_disable.assert_called_once_with( mock_enable_disable.assert_called_once_with(
self.ctxt, self.instance_host) self.ctxt, self.instance_host, reason=self.disabled_reason)
def _test_instance_list(self, instances_evacuation_count): def _test_instance_list(self, instances_evacuation_count):
task = host_failure.PrepareHAEnabledInstancesTask(self.ctxt, task = host_failure.PrepareHAEnabledInstancesTask(self.ctxt,

View File

@ -20,12 +20,15 @@ Unit Tests for process failure TaskFlow
from unittest import mock from unittest import mock
from masakari.compute import nova from masakari.compute import nova
from masakari import conf
from masakari import context from masakari import context
from masakari.engine.drivers.taskflow import process_failure from masakari.engine.drivers.taskflow import process_failure
from masakari import exception from masakari import exception
from masakari import test from masakari import test
from masakari.tests.unit import fakes from masakari.tests.unit import fakes
CONF = conf.CONF
class ProcessFailureTestCase(test.TestCase): class ProcessFailureTestCase(test.TestCase):
@ -39,6 +42,7 @@ class ProcessFailureTestCase(test.TestCase):
# overriding 'wait_period_after_service_update' to 2 seconds # overriding 'wait_period_after_service_update' to 2 seconds
# to reduce the wait period. # to reduce the wait period.
self.override_config('wait_period_after_service_update', 2) self.override_config('wait_period_after_service_update', 2)
self.disabled_reason = CONF.process_failure.service_disable_reason
@mock.patch('masakari.compute.nova.novaclient') @mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'

View File

@ -172,6 +172,13 @@ class FakeNovaClient(object):
services.append(service) services.append(service)
return services return services
def disable_log_reason(self, service_id, reason):
    """Mark a fake service as disabled and record why.

    Scans ``self._services`` for entries whose ``id`` equals
    ``service_id``. If several entries share the id, the last one in
    iteration order is the one updated.

    :param service_id: id of the service to disable.
    :param reason: value stored in the service's ``disabled_reason``.
    :raises LookupError: if no service has the given id. Previously this
        case failed with an accidental ``UnboundLocalError``.
    """
    target = None
    for candidate in self._services:
        if candidate.id == service_id:
            target = candidate

    if target is None:
        # Fail with a clear message instead of touching an unbound local.
        raise LookupError("No service found with id %r" % (service_id,))

    target.status = 'disabled'
    target.disabled_reason = reason
def __init__(self): def __init__(self):
self.servers = FakeNovaClient.ServerManager() self.servers = FakeNovaClient.ServerManager()
self.services = FakeNovaClient.Services() self.services = FakeNovaClient.Services()

View File

@ -0,0 +1,6 @@
---
features:
- |
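Masakari now sets a "disabled reason" on the Nova compute service
when it disables the service after a host or process failure.
The reason can be customised per failure type via the
``service_disable_reason`` option in the ``[host_failure]`` and
``[process_failure]`` config sections.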