From 4397088da71d3a5d75a61b44f0ec4e0e711cbe97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= Date: Mon, 13 Jul 2020 21:04:41 +0200 Subject: [PATCH] Add ha_enabled_instance_metadata_key config option Previously Masakari enforced a common instance metadata key that controlled how the HA protection applied to the instance: "HA_Enabled". Moreover, it was the same whether considering the instance or host failure. There are cases where users (operators) would like to independently configure it per failure class and in general customize its name. This patch implements the requested behaviour. New tests are included. Change-Id: I9dddf784da422f239ec304a2c0b8f24625914fc5 Implements: blueprint customisable-ha-enabled-instance-metadata-key --- masakari/conf/engine_driver.py | 49 ++++++++++++----- .../engine/drivers/taskflow/host_failure.py | 4 +- .../drivers/taskflow/instance_failure.py | 4 +- .../taskflow/test_host_failure_flow.py | 52 ++++++++++++++++++- .../taskflow/test_instance_failure_flow.py | 40 ++++++++++++++ masakari/tests/unit/fakes.py | 10 ++-- ...nstance-metadata-key-af511ea2aac96690.yaml | 8 +++ 7 files changed, 148 insertions(+), 19 deletions(-) create mode 100644 releasenotes/notes/customisable-ha-enabled-instance-metadata-key-af511ea2aac96690.yaml diff --git a/masakari/conf/engine_driver.py b/masakari/conf/engine_driver.py index 12b9f3b4..0bc3bbdc 100644 --- a/masakari/conf/engine_driver.py +++ b/masakari/conf/engine_driver.py @@ -43,13 +43,26 @@ host_failure_opts = [ cfg.BoolOpt('evacuate_all_instances', default=True, help=""" -Operators can decide whether all instances or only those instances which -contain metadata key 'HA_Enabled=True' should be allowed for evacuation from -a failed source compute node. When set to True, it will evacuate all instances -from a failed source compute node. First preference will be given to those -instances which contain 'HA_Enabled=True' metadata key, and then it will -evacuate the remaining ones. When set to False, it will evacuate only those -instances which contain 'HA_Enabled=True' metadata key."""), +Operators can decide whether all instances or only those instances which have +``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True`` should be +allowed for evacuation from a failed source compute node. +When set to True, it will evacuate all instances from a failed source compute +node. +First preference will be given to those instances which have +``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True``, +and then it will evacuate the remaining ones. +When set to False, it will evacuate only those instances which have +``[host_failure]\\ha_enabled_instance_metadata_key`` set to ``True``. + """), + + cfg.StrOpt('ha_enabled_instance_metadata_key', + default='HA_Enabled', + help=""" +Operators can decide on the instance metadata key naming that affects the +per-instance behaviour of ``[host_failure]\\evacuate_all_instances``. +The default is the same for both failure types (host, instance) but the value +can be overridden to make the metadata key different per failure type. + """), cfg.BoolOpt('ignore_instances_in_error_state', default=False, @@ -74,12 +87,24 @@ instance_failure_options = [ default=False, help=""" Operators can decide whether all instances or only those instances which -contain metadata key 'HA_Enabled=True' should be taken into account to -recover from instance failure events. When set to True, it will execute -instance failure recovery actions for an instance irrespective of whether -that particular instance contains metadata key 'HA_Enabled=True' or not. +have ``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True`` +should be taken into account to recover from instance failure events. +When set to True, it will execute instance failure recovery actions for an +instance irrespective of whether that particular instance has +``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``. When set to False, it will only execute instance failure recovery actions -for an instance which contain metadata key 'HA_Enabled=True'."""), +for an instance which has +``[instance_failure]\\ha_enabled_instance_metadata_key`` set to ``True``. + """), + + cfg.StrOpt('ha_enabled_instance_metadata_key', + default='HA_Enabled', + help=""" +Operators can decide on the instance metadata key naming that affects the +per-instance behaviour of ``[instance_failure]\\process_all_instances``. +The default is the same for both failure types (host, instance) but the value +can be overridden to make the metadata key different per failure type. + """), ] taskflow_options = [ diff --git a/masakari/engine/drivers/taskflow/host_failure.py b/masakari/engine/drivers/taskflow/host_failure.py index a9f6427e..ddcc5c87 100644 --- a/masakari/engine/drivers/taskflow/host_failure.py +++ b/masakari/engine/drivers/taskflow/host_failure.py @@ -73,9 +73,11 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask): ha_enabled_instances = [] non_ha_enabled_instances = [] + ha_enabled_key = CONF.host_failure.ha_enabled_instance_metadata_key + for instance in instance_list: is_instance_ha_enabled = strutils.bool_from_string( - instance.metadata.get('HA_Enabled', False)) + instance.metadata.get(ha_enabled_key, False)) if CONF.host_failure.ignore_instances_in_error_state and ( getattr(instance, "OS-EXT-STS:vm_state") == "error"): if is_instance_ha_enabled: diff --git a/masakari/engine/drivers/taskflow/instance_failure.py b/masakari/engine/drivers/taskflow/instance_failure.py index 3b9276cb..692bd9d4 100644 --- a/masakari/engine/drivers/taskflow/instance_failure.py +++ b/masakari/engine/drivers/taskflow/instance_failure.py @@ -43,12 +43,14 @@ class StopInstanceTask(base.MasakariTask): """Stop the instance for recovery.""" instance = self.novaclient.get_server(self.context, instance_uuid) + ha_enabled_key = CONF.instance_failure.ha_enabled_instance_metadata_key + # If an instance is not HA_Enabled and "process_all_instances" config # option is also disabled, then there is no need to take any recovery # action. if not CONF.instance_failure.process_all_instances and not ( strutils.bool_from_string( - instance.metadata.get('HA_Enabled', False))): + instance.metadata.get(ha_enabled_key, False))): msg = ("Skipping recovery for instance: %(instance_uuid)s as it is" " not Ha_Enabled") % {'instance_uuid': instance_uuid} LOG.info(msg) diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py index f63bb1ec..5f70ede3 100644 --- a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py +++ b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py @@ -99,7 +99,9 @@ class HostFailureTestCase(test.TestCase): self.assertNotEqual("error", getattr(instance, "OS-EXT-STS:vm_state")) if not CONF.host_failure.evacuate_all_instances: - self.assertTrue(instance.metadata.get('HA_Enabled', False)) + ha_enabled_key = (CONF.host_failure + .ha_enabled_instance_metadata_key) + self.assertTrue(instance.metadata.get(ha_enabled_key, False)) instance_uuid_list.append(instance.id) @@ -207,6 +209,54 @@ class HostFailureTestCase(test.TestCase): mock.call('Evacuation process completed!', 1.0) ]) + @mock.patch('masakari.compute.nova.novaclient') + @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' + 'update_details') + def test_host_failure_flow_for_auto_recovery_custom_ha_key( + self, _mock_notify, _mock_novaclient, mock_unlock, mock_lock, + mock_enable_disable): + _mock_novaclient.return_value = self.fake_client + self.override_config("evacuate_all_instances", + False, "host_failure") + + ha_enabled_key = 'Ensure-My-HA' + + self.override_config('ha_enabled_instance_metadata_key', + ha_enabled_key, 'host_failure') + + # create test data + self.fake_client.servers.create(id="1", host=self.instance_host, + ha_enabled=True) + self.fake_client.servers.create(id="2", host=self.instance_host, + ha_enabled=True, + ha_enabled_key=ha_enabled_key) + + # execute DisableComputeServiceTask + self._test_disable_compute_service(mock_enable_disable) + + # execute PrepareHAEnabledInstancesTask + instance_list = self._test_instance_list(1) + + # execute EvacuateInstancesTask + self._evacuate_instances(instance_list, mock_enable_disable) + + # verify progress details + _mock_notify.assert_has_calls([ + mock.call("Disabling compute service on host: 'fake-host'"), + mock.call("Disabled compute service on host: 'fake-host'", 1.0), + mock.call('Preparing instances for evacuation'), + mock.call("Total instances running on failed host 'fake-host' is 2" + "", 0.3), + mock.call("Total HA Enabled instances count: '1'", 0.6), + mock.call("Instances to be evacuated are: '2'", 1.0), + mock.call("Start evacuation of instances from failed host " + "'fake-host', instance uuids are: '2'"), + mock.call("Evacuation of instance started: '2'", 0.5), + mock.call("Successfully evacuate instances '2' from host " + "'fake-host'", 0.7), + mock.call('Evacuation process completed!', 1.0) + ]) + @mock.patch('masakari.compute.nova.novaclient') @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' 'update_details') diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py index 5ae18618..3c2db18c 100644 --- a/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py +++ b/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py @@ -93,6 +93,46 @@ class InstanceFailureTestCase(test.TestCase): "' vm_state is ACTIVE", 1.0) ]) + @mock.patch('masakari.compute.nova.novaclient') + @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' + 'update_details') + def test_instance_failure_flow_custom_ha_key( + self, _mock_notify, _mock_novaclient): + _mock_novaclient.return_value = self.fake_client + + ha_enabled_key = 'Ensure-My-HA' + + self.override_config('ha_enabled_instance_metadata_key', + ha_enabled_key, 'instance_failure') + + # create test data with custom ha_enabled_key + self.fake_client.servers.create(self.instance_id, + host="fake-host", + ha_enabled=True, + ha_enabled_key=ha_enabled_key) + + # test StopInstanceTask + self._test_stop_instance() + + # test StartInstanceTask + task = instance_failure.StartInstanceTask(self.ctxt, self.novaclient) + task.execute(self.instance_id) + + # test ConfirmInstanceActiveTask + self._test_confirm_instance_is_active() + + # verify progress details + _mock_notify.assert_has_calls([ + mock.call('Stopping instance: ' + self.instance_id), + mock.call("Stopped instance: '" + self.instance_id + "'", 1.0), + mock.call("Starting instance: '" + self.instance_id + "'"), + mock.call("Instance started: '" + self.instance_id + "'", 1.0), + mock.call("Confirming instance '" + self.instance_id + + "' vm_state is ACTIVE"), + mock.call("Confirmed instance '" + self.instance_id + + "' vm_state is ACTIVE", 1.0) + ]) + @mock.patch('masakari.compute.nova.novaclient') @mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.' 'update_details') diff --git a/masakari/tests/unit/fakes.py b/masakari/tests/unit/fakes.py index 19a80a16..74c38576 100644 --- a/masakari/tests/unit/fakes.py +++ b/masakari/tests/unit/fakes.py @@ -41,7 +41,7 @@ class FakeNovaClient(object): class Server(object): def __init__(self, id=None, uuid=None, host=None, vm_state=None, task_state=None, power_state=1, ha_enabled=None, - locked=False): + ha_enabled_key='HA_Enabled', locked=False): self.id = id self.uuid = uuid or uuidutils.generate_uuid() self.host = host @@ -49,7 +49,7 @@ class FakeNovaClient(object): setattr(self, 'OS-EXT-STS:vm_state', vm_state) setattr(self, 'OS-EXT-STS:task_state', task_state) setattr(self, 'OS-EXT-STS:power_state', power_state) - self.metadata = {"HA_Enabled": ha_enabled} + self.metadata = {ha_enabled_key: ha_enabled} self.locked = locked class ServerManager(object): @@ -59,12 +59,14 @@ class FakeNovaClient(object): self.stop_calls = [] def create(self, id, uuid=None, host=None, vm_state='active', - task_state=None, power_state=1, ha_enabled=False): + task_state=None, power_state=1, ha_enabled=False, + ha_enabled_key='HA_Enabled'): server = FakeNovaClient.Server(id=id, uuid=uuid, host=host, vm_state=vm_state, task_state=task_state, power_state=power_state, - ha_enabled=ha_enabled) + ha_enabled=ha_enabled, + ha_enabled_key=ha_enabled_key) self._servers.append(server) return server diff --git a/releasenotes/notes/customisable-ha-enabled-instance-metadata-key-af511ea2aac96690.yaml b/releasenotes/notes/customisable-ha-enabled-instance-metadata-key-af511ea2aac96690.yaml new file mode 100644 index 00000000..dbaace65 --- /dev/null +++ b/releasenotes/notes/customisable-ha-enabled-instance-metadata-key-af511ea2aac96690.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Adds ``ha_enabled_instance_metadata_key`` config option to ``host_failure`` + and ``instance_failure`` config groups. This option allows operators to + override the default ``HA_Enabled`` instance metadata key which controls + the behaviour of Masakari towards the instance. This way one can have + different keys for different failure types (host vs instance failures).