Add ha_enabled_instance_metadata_key config option

Previously Masakari enforced a common instance metadata key that
controlled how the HA protection applied to the instance:
"HA_Enabled".
Moreover, it was the same whether considering the instance or host
failure.
There are cases where users (operators) would like to independently
configure it per failure class and in general customize its name.

This patch implements the requested behaviour.
New tests are included.

Change-Id: I9dddf784da422f239ec304a2c0b8f24625914fc5
Implements: blueprint customisable-ha-enabled-instance-metadata-key
This commit is contained in:
Radosław Piliszek 2020-07-13 21:04:41 +02:00
parent 9492987c31
commit 4397088da7
7 changed files with 148 additions and 19 deletions

View File

@ -43,13 +43,26 @@ host_failure_opts = [
cfg.BoolOpt('evacuate_all_instances',
default=True,
help="""
Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be allowed for evacuation from
a failed source compute node. When set to True, it will evacuate all instances
from a failed source compute node. First preference will be given to those
instances which contain 'HA_Enabled=True' metadata key, and then it will
evacuate the remaining ones. When set to False, it will evacuate only those
instances which contain 'HA_Enabled=True' metadata key."""),
Operators can decide whether all instances or only those instances which have
``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True`` should be
allowed for evacuation from a failed source compute node.
When set to True, it will evacuate all instances from a failed source compute
node.
First preference will be given to those instances which have
``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True``,
and then it will evacuate the remaining ones.
When set to False, it will evacuate only those instances which have
``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True``.
"""),
cfg.StrOpt('ha_enabled_instance_metadata_key',
default='HA_Enabled',
help="""
Operators can decide on the instance metadata key naming that affects the
per-instance behaviour of ``[host_failure]\\evacuate_all_instances``.
The default is the same for both failure types (host, instance) but the value
can be overridden to make the metadata key different per failure type.
"""),
cfg.BoolOpt('ignore_instances_in_error_state',
default=False,
@ -74,12 +87,24 @@ instance_failure_options = [
default=False,
help="""
Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be taken into account to
recover from instance failure events. When set to True, it will execute
instance failure recovery actions for an instance irrespective of whether
that particular instance contains metadata key 'HA_Enabled=True' or not.
have ``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``
should be taken into account to recover from instance failure events.
When set to True, it will execute instance failure recovery actions for an
instance irrespective of whether that particular instance has
``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``.
When set to False, it will only execute instance failure recovery actions
for an instance which contain metadata key 'HA_Enabled=True'."""),
for an instance which has
``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``.
"""),
cfg.StrOpt('ha_enabled_instance_metadata_key',
default='HA_Enabled',
help="""
Operators can decide on the instance metadata key naming that affects the
per-instance behaviour of ``[instance_failure]\\process_all_instances``.
The default is the same for both failure types (host, instance) but the value
can be overridden to make the metadata key different per failure type.
"""),
]
taskflow_options = [

View File

@ -73,9 +73,11 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
ha_enabled_instances = []
non_ha_enabled_instances = []
ha_enabled_key = CONF.host_failure.ha_enabled_instance_metadata_key
for instance in instance_list:
is_instance_ha_enabled = strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False))
instance.metadata.get(ha_enabled_key, False))
if CONF.host_failure.ignore_instances_in_error_state and (
getattr(instance, "OS-EXT-STS:vm_state") == "error"):
if is_instance_ha_enabled:

View File

@ -43,12 +43,14 @@ class StopInstanceTask(base.MasakariTask):
"""Stop the instance for recovery."""
instance = self.novaclient.get_server(self.context, instance_uuid)
ha_enabled_key = CONF.instance_failure.ha_enabled_instance_metadata_key
# If an instance is not HA_Enabled and "process_all_instances" config
# option is also disabled, then there is no need to take any recovery
# action.
if not CONF.instance_failure.process_all_instances and not (
strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False))):
instance.metadata.get(ha_enabled_key, False))):
msg = ("Skipping recovery for instance: %(instance_uuid)s as it is"
" not Ha_Enabled") % {'instance_uuid': instance_uuid}
LOG.info(msg)

View File

@ -99,7 +99,9 @@ class HostFailureTestCase(test.TestCase):
self.assertNotEqual("error",
getattr(instance, "OS-EXT-STS:vm_state"))
if not CONF.host_failure.evacuate_all_instances:
self.assertTrue(instance.metadata.get('HA_Enabled', False))
ha_enabled_key = (CONF.host_failure
.ha_enabled_instance_metadata_key)
self.assertTrue(instance.metadata.get(ha_enabled_key, False))
instance_uuid_list.append(instance.id)
@ -207,6 +209,54 @@ class HostFailureTestCase(test.TestCase):
mock.call('Evacuation process completed!', 1.0)
])
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details')
def test_host_failure_flow_for_auto_recovery_custom_ha_key(
self, _mock_notify, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
self.override_config("evacuate_all_instances",
False, "host_failure")
ha_enabled_key = 'Ensure-My-HA'
self.override_config('ha_enabled_instance_metadata_key',
ha_enabled_key, 'host_failure')
# create test data
self.fake_client.servers.create(id="1", host=self.instance_host,
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
ha_enabled=True,
ha_enabled_key=ha_enabled_key)
# execute DisableComputeServiceTask
self._test_disable_compute_service(mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list(1)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
# verify progress details
_mock_notify.assert_has_calls([
mock.call("Disabling compute service on host: 'fake-host'"),
mock.call("Disabled compute service on host: 'fake-host'", 1.0),
mock.call('Preparing instances for evacuation'),
mock.call("Total instances running on failed host 'fake-host' is 2"
"", 0.3),
mock.call("Total HA Enabled instances count: '1'", 0.6),
mock.call("Instances to be evacuated are: '2'", 1.0),
mock.call("Start evacuation of instances from failed host "
"'fake-host', instance uuids are: '2'"),
mock.call("Evacuation of instance started: '2'", 0.5),
mock.call("Successfully evacuate instances '2' from host "
"'fake-host'", 0.7),
mock.call('Evacuation process completed!', 1.0)
])
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details')

View File

@ -93,6 +93,46 @@ class InstanceFailureTestCase(test.TestCase):
"' vm_state is ACTIVE", 1.0)
])
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details')
def test_instance_failure_flow_custom_ha_key(
self, _mock_notify, _mock_novaclient):
_mock_novaclient.return_value = self.fake_client
ha_enabled_key = 'Ensure-My-HA'
self.override_config('ha_enabled_instance_metadata_key',
ha_enabled_key, 'instance_failure')
# create test data with custom ha_enabled_key
self.fake_client.servers.create(self.instance_id,
host="fake-host",
ha_enabled=True,
ha_enabled_key=ha_enabled_key)
# test StopInstanceTask
self._test_stop_instance()
# test StartInstanceTask
task = instance_failure.StartInstanceTask(self.ctxt, self.novaclient)
task.execute(self.instance_id)
# test ConfirmInstanceActiveTask
self._test_confirm_instance_is_active()
# verify progress details
_mock_notify.assert_has_calls([
mock.call('Stopping instance: ' + self.instance_id),
mock.call("Stopped instance: '" + self.instance_id + "'", 1.0),
mock.call("Starting instance: '" + self.instance_id + "'"),
mock.call("Instance started: '" + self.instance_id + "'", 1.0),
mock.call("Confirming instance '" + self.instance_id +
"' vm_state is ACTIVE"),
mock.call("Confirmed instance '" + self.instance_id +
"' vm_state is ACTIVE", 1.0)
])
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'
'update_details')

View File

@ -41,7 +41,7 @@ class FakeNovaClient(object):
class Server(object):
def __init__(self, id=None, uuid=None, host=None, vm_state=None,
task_state=None, power_state=1, ha_enabled=None,
locked=False):
ha_enabled_key='HA_Enabled', locked=False):
self.id = id
self.uuid = uuid or uuidutils.generate_uuid()
self.host = host
@ -49,7 +49,7 @@ class FakeNovaClient(object):
setattr(self, 'OS-EXT-STS:vm_state', vm_state)
setattr(self, 'OS-EXT-STS:task_state', task_state)
setattr(self, 'OS-EXT-STS:power_state', power_state)
self.metadata = {"HA_Enabled": ha_enabled}
self.metadata = {ha_enabled_key: ha_enabled}
self.locked = locked
class ServerManager(object):
@ -59,12 +59,14 @@ class FakeNovaClient(object):
self.stop_calls = []
def create(self, id, uuid=None, host=None, vm_state='active',
task_state=None, power_state=1, ha_enabled=False):
task_state=None, power_state=1, ha_enabled=False,
ha_enabled_key='HA_Enabled'):
server = FakeNovaClient.Server(id=id, uuid=uuid, host=host,
vm_state=vm_state,
task_state=task_state,
power_state=power_state,
ha_enabled=ha_enabled)
ha_enabled=ha_enabled,
ha_enabled_key=ha_enabled_key)
self._servers.append(server)
return server

View File

@ -0,0 +1,8 @@
---
features:
- |
Adds ``ha_enabled_instance_metadata_key`` config option to ``host_failure``
and ``instance_failure`` config groups. This option allows operators to
override the default ``HA_Enabled`` instance metadata key which controls
the behaviour of Masakari towards the instance. This way one can have
different keys for different failure types (host vs instance failures).