Add ha_enabled_instance_metadata_key config option

Previously Masakari enforced a common instance metadata key that
controlled how the HA protection applied to the instance:
"HA_Enabled".
Moreover, it was the same whether considering the instance or host
failure.
There are cases where users (operators) would like to independently
configure it per failure class and in general customize its name.

This patch implements the requested behaviour.
New tests are included.

Change-Id: I9dddf784da422f239ec304a2c0b8f24625914fc5
Implements: blueprint customisable-ha-enabled-instance-metadata-key
This commit is contained in:
Radosław Piliszek 2020-07-13 21:04:41 +02:00
parent 9492987c31
commit 4397088da7
7 changed files with 148 additions and 19 deletions

View File

@ -43,13 +43,26 @@ host_failure_opts = [
cfg.BoolOpt('evacuate_all_instances', cfg.BoolOpt('evacuate_all_instances',
default=True, default=True,
help=""" help="""
Operators can decide whether all instances or only those instances which Operators can decide whether all instances or only those instances which have
contain metadata key 'HA_Enabled=True' should be allowed for evacuation from ``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True`` should be
a failed source compute node. When set to True, it will evacuate all instances allowed for evacuation from a failed source compute node.
from a failed source compute node. First preference will be given to those When set to True, it will evacuate all instances from a failed source compute
instances which contain 'HA_Enabled=True' metadata key, and then it will node.
evacuate the remaining ones. When set to False, it will evacuate only those First preference will be given to those instances which have
instances which contain 'HA_Enabled=True' metadata key."""), ``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True``,
and then it will evacuate the remaining ones.
When set to False, it will evacuate only those instances which have
``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True``.
"""),
cfg.StrOpt('ha_enabled_instance_metadata_key',
default='HA_Enabled',
help="""
Operators can decide on the instance metadata key naming that affects the
per-instance behaviour of ``[host_failure]\\evacuate_all_instances``.
The default is the same for both failure types (host, instance) but the value
can be overridden to make the metadata key different per failure type.
"""),
cfg.BoolOpt('ignore_instances_in_error_state', cfg.BoolOpt('ignore_instances_in_error_state',
default=False, default=False,
@ -74,12 +87,24 @@ instance_failure_options = [
default=False, default=False,
help=""" help="""
Operators can decide whether all instances or only those instances which Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be taken into account to have ``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``
recover from instance failure events. When set to True, it will execute should be taken into account to recover from instance failure events.
instance failure recovery actions for an instance irrespective of whether When set to True, it will execute instance failure recovery actions for an
that particular instance contains metadata key 'HA_Enabled=True' or not. instance irrespective of whether that particular instance has
``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``.
When set to False, it will only execute instance failure recovery actions When set to False, it will only execute instance failure recovery actions
for an instance which contain metadata key 'HA_Enabled=True'."""), for an instance which has
``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``.
"""),
cfg.StrOpt('ha_enabled_instance_metadata_key',
default='HA_Enabled',
help="""
Operators can decide on the instance metadata key naming that affects the
per-instance behaviour of ``[instance_failure]\\process_all_instances``.
The default is the same for both failure types (host, instance) but the value
can be overridden to make the metadata key different per failure type.
"""),
] ]
taskflow_options = [ taskflow_options = [

View File

@ -73,9 +73,11 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
ha_enabled_instances = [] ha_enabled_instances = []
non_ha_enabled_instances = [] non_ha_enabled_instances = []
ha_enabled_key = CONF.host_failure.ha_enabled_instance_metadata_key
for instance in instance_list: for instance in instance_list:
is_instance_ha_enabled = strutils.bool_from_string( is_instance_ha_enabled = strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False)) instance.metadata.get(ha_enabled_key, False))
if CONF.host_failure.ignore_instances_in_error_state and ( if CONF.host_failure.ignore_instances_in_error_state and (
getattr(instance, "OS-EXT-STS:vm_state") == "error"): getattr(instance, "OS-EXT-STS:vm_state") == "error"):
if is_instance_ha_enabled: if is_instance_ha_enabled:

View File

@ -43,12 +43,14 @@ class StopInstanceTask(base.MasakariTask):
"""Stop the instance for recovery.""" """Stop the instance for recovery."""
instance = self.novaclient.get_server(self.context, instance_uuid) instance = self.novaclient.get_server(self.context, instance_uuid)
ha_enabled_key = CONF.instance_failure.ha_enabled_instance_metadata_key
# If an instance is not HA_Enabled and "process_all_instances" config # If an instance is not HA_Enabled and "process_all_instances" config
# option is also disabled, then there is no need to take any recovery # option is also disabled, then there is no need to take any recovery
# action. # action.
if not CONF.instance_failure.process_all_instances and not ( if not CONF.instance_failure.process_all_instances and not (
strutils.bool_from_string( strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False))): instance.metadata.get(ha_enabled_key, False))):
msg = ("Skipping recovery for instance: %(instance_uuid)s as it is" msg = ("Skipping recovery for instance: %(instance_uuid)s as it is"
" not Ha_Enabled") % {'instance_uuid': instance_uuid} " not Ha_Enabled") % {'instance_uuid': instance_uuid}
LOG.info(msg) LOG.info(msg)

View File

@ -99,7 +99,9 @@ class HostFailureTestCase(test.TestCase):
self.assertNotEqual("error", self.assertNotEqual("error",
getattr(instance, "OS-EXT-STS:vm_state")) getattr(instance, "OS-EXT-STS:vm_state"))
if not CONF.host_failure.evacuate_all_instances: if not CONF.host_failure.evacuate_all_instances:
self.assertTrue(instance.metadata.get('HA_Enabled', False)) ha_enabled_key = (CONF.host_failure
.ha_enabled_instance_metadata_key)
self.assertTrue(instance.metadata.get(ha_enabled_key, False))
instance_uuid_list.append(instance.id) instance_uuid_list.append(instance.id)
@ -207,6 +209,54 @@ class HostFailureTestCase(test.TestCase):
mock.call('Evacuation process completed!', 1.0) mock.call('Evacuation process completed!', 1.0)
]) ])
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details')
def test_host_failure_flow_for_auto_recovery_custom_ha_key(
self, _mock_notify, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
self.override_config("evacuate_all_instances",
False, "host_failure")
ha_enabled_key = 'Ensure-My-HA'
self.override_config('ha_enabled_instance_metadata_key',
ha_enabled_key, 'host_failure')
# create test data
self.fake_client.servers.create(id="1", host=self.instance_host,
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
ha_enabled=True,
ha_enabled_key=ha_enabled_key)
# execute DisableComputeServiceTask
self._test_disable_compute_service(mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list(1)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
# verify progress details
_mock_notify.assert_has_calls([
mock.call("Disabling compute service on host: 'fake-host'"),
mock.call("Disabled compute service on host: 'fake-host'", 1.0),
mock.call('Preparing instances for evacuation'),
mock.call("Total instances running on failed host 'fake-host' is 2"
"", 0.3),
mock.call("Total HA Enabled instances count: '1'", 0.6),
mock.call("Instances to be evacuated are: '2'", 1.0),
mock.call("Start evacuation of instances from failed host "
"'fake-host', instance uuids are: '2'"),
mock.call("Evacuation of instance started: '2'", 0.5),
mock.call("Successfully evacuate instances '2' from host "
"'fake-host'", 0.7),
mock.call('Evacuation process completed!', 1.0)
])
@mock.patch('masakari.compute.nova.novaclient') @mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details') 'update_details')

View File

@ -93,6 +93,46 @@ class InstanceFailureTestCase(test.TestCase):
"' vm_state is ACTIVE", 1.0) "' vm_state is ACTIVE", 1.0)
]) ])
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details')
def test_instance_failure_flow_custom_ha_key(
self, _mock_notify, _mock_novaclient):
_mock_novaclient.return_value = self.fake_client
ha_enabled_key = 'Ensure-My-HA'
self.override_config('ha_enabled_instance_metadata_key',
ha_enabled_key, 'instance_failure')
# create test data with custom ha_enabled_key
self.fake_client.servers.create(self.instance_id,
host="fake-host",
ha_enabled=True,
ha_enabled_key=ha_enabled_key)
# test StopInstanceTask
self._test_stop_instance()
# test StartInstanceTask
task = instance_failure.StartInstanceTask(self.ctxt, self.novaclient)
task.execute(self.instance_id)
# test ConfirmInstanceActiveTask
self._test_confirm_instance_is_active()
# verify progress details
_mock_notify.assert_has_calls([
mock.call('Stopping instance: ' + self.instance_id),
mock.call("Stopped instance: '" + self.instance_id + "'", 1.0),
mock.call("Starting instance: '" + self.instance_id + "'"),
mock.call("Instance started: '" + self.instance_id + "'", 1.0),
mock.call("Confirming instance '" + self.instance_id +
"' vm_state is ACTIVE"),
mock.call("Confirmed instance '" + self.instance_id +
"' vm_state is ACTIVE", 1.0)
])
@mock.patch('masakari.compute.nova.novaclient') @mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details') 'update_details')

View File

@ -41,7 +41,7 @@ class FakeNovaClient(object):
class Server(object): class Server(object):
def __init__(self, id=None, uuid=None, host=None, vm_state=None, def __init__(self, id=None, uuid=None, host=None, vm_state=None,
task_state=None, power_state=1, ha_enabled=None, task_state=None, power_state=1, ha_enabled=None,
locked=False): ha_enabled_key='HA_Enabled', locked=False):
self.id = id self.id = id
self.uuid = uuid or uuidutils.generate_uuid() self.uuid = uuid or uuidutils.generate_uuid()
self.host = host self.host = host
@ -49,7 +49,7 @@ class FakeNovaClient(object):
setattr(self, 'OS-EXT-STS:vm_state', vm_state) setattr(self, 'OS-EXT-STS:vm_state', vm_state)
setattr(self, 'OS-EXT-STS:task_state', task_state) setattr(self, 'OS-EXT-STS:task_state', task_state)
setattr(self, 'OS-EXT-STS:power_state', power_state) setattr(self, 'OS-EXT-STS:power_state', power_state)
self.metadata = {"HA_Enabled": ha_enabled} self.metadata = {ha_enabled_key: ha_enabled}
self.locked = locked self.locked = locked
class ServerManager(object): class ServerManager(object):
@ -59,12 +59,14 @@ class FakeNovaClient(object):
self.stop_calls = [] self.stop_calls = []
def create(self, id, uuid=None, host=None, vm_state='active', def create(self, id, uuid=None, host=None, vm_state='active',
task_state=None, power_state=1, ha_enabled=False): task_state=None, power_state=1, ha_enabled=False,
ha_enabled_key='HA_Enabled'):
server = FakeNovaClient.Server(id=id, uuid=uuid, host=host, server = FakeNovaClient.Server(id=id, uuid=uuid, host=host,
vm_state=vm_state, vm_state=vm_state,
task_state=task_state, task_state=task_state,
power_state=power_state, power_state=power_state,
ha_enabled=ha_enabled) ha_enabled=ha_enabled,
ha_enabled_key=ha_enabled_key)
self._servers.append(server) self._servers.append(server)
return server return server

View File

@ -0,0 +1,8 @@
---
features:
- |
Adds ``ha_enabled_instance_metadata_key`` config option to ``host_failure``
and ``instance_failure`` config groups. This option allows operators to
override the default ``HA_Enabled`` instance metadata key which controls
the behaviour of Masakari towards the instance. This way one can have
different keys for different failure types (host vs instance failures).