diff --git a/masakari/conf/engine_driver.py b/masakari/conf/engine_driver.py index 376df76b..f1fa6d0b 100644 --- a/masakari/conf/engine_driver.py +++ b/masakari/conf/engine_driver.py @@ -39,6 +39,15 @@ instances which contain 'HA_Enabled=True' metadata key, and then it will evacuate the remaining ones. When set to False, it will evacuate only those instances which contain 'HA_Enabled=True' metadata key."""), + cfg.BoolOpt('ignore_instances_in_error_state', + default=False, + help=""" +Operators can decide whether error instances should be allowed for evacuation +from a failed source compute node or not. When set to True, it will ignore +error instances from evacuation from a failed source compute node. When set to +False, it will evacuate error instances along with other instances from a +failed source compute node."""), + cfg.BoolOpt("add_reserved_host_to_aggregate", default=False, help=""" diff --git a/masakari/engine/drivers/taskflow/host_failure.py b/masakari/engine/drivers/taskflow/host_failure.py index 9f440ccf..355292b9 100644 --- a/masakari/engine/drivers/taskflow/host_failure.py +++ b/masakari/engine/drivers/taskflow/host_failure.py @@ -70,17 +70,35 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask): self.novaclient = novaclient def execute(self, context, host_name): + def _filter_instances(instance_list): + ha_enabled_instances = [] + non_ha_enabled_instances = [] + + for instance in instance_list: + is_instance_ha_enabled = strutils.bool_from_string( + instance.metadata.get('HA_Enabled', False)) + if CONF.host_failure.ignore_instances_in_error_state and ( + getattr(instance, "OS-EXT-STS:vm_state") == "error"): + if is_instance_ha_enabled: + msg = ("Ignoring recovery of HA_Enabled instance " + "'%(instance_id)s' as it is in 'error' state.") + LOG.info(msg, {'instance_id': instance.id}) + continue + + if is_instance_ha_enabled: + ha_enabled_instances.append(instance) + else: + non_ha_enabled_instances.append(instance) + + if CONF.host_failure.evacuate_all_instances: + ha_enabled_instances.extend(non_ha_enabled_instances) + + return ha_enabled_instances + instance_list = self.novaclient.get_servers(context, host_name) - if CONF.host_failure.evacuate_all_instances: - instance_list = sorted( - instance_list, key=lambda k: strutils.bool_from_string( - k.metadata.get('HA_Enabled', False)), reverse=True) - else: - instance_list = ( - [instance for instance in instance_list if - strutils.bool_from_string(instance.metadata.get('HA_Enabled', - False))]) + instance_list = _filter_instances(instance_list) + if not instance_list: msg = _('No instances to evacuate on host: %s.') % host_name LOG.info(msg) diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py index 45172bcd..1ee2e56f 100644 --- a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py +++ b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py @@ -53,15 +53,21 @@ class HostFailureTestCase(test.TestCase): self.novaclient = nova.API() self.fake_client = fakes.FakeNovaClient() - def _verify_instance_evacuated(self): - for server in self.novaclient.get_servers(self.ctxt, - self.instance_host): + def _verify_instance_evacuated(self, old_instance_list): + for server in old_instance_list: instance = self.novaclient.get_server(self.ctxt, server.id) self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'), ['active', 'stopped', 'error']) - self.assertNotEqual(self.instance_host, - getattr(instance, - 'OS-EXT-SRV-ATTR:hypervisor_hostname')) + + if CONF.host_failure.ignore_instances_in_error_state and getattr( + server, 'OS-EXT-STS:vm_state') == 'error': + self.assertEqual( + self.instance_host, getattr( + instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname')) + else: + self.assertNotEqual( + self.instance_host, getattr( + instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname')) def _test_disable_compute_service(self, mock_enable_disable): task = host_failure.DisableComputeServiceTask(self.novaclient) @@ -70,24 +76,27 @@ class HostFailureTestCase(test.TestCase): mock_enable_disable.assert_called_once_with( self.ctxt, self.instance_host) - def _test_instance_list(self): + def _test_instance_list(self, instances_evacuation_count): task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient) - instance_list = task.execute( - self.ctxt, self.instance_host) - evacuate_all_instances = CONF.host_failure.evacuate_all_instances + instance_list = task.execute(self.ctxt, self.instance_host) - if evacuate_all_instances: - self.assertEqual(len(self.fake_client.servers.list()), - len(instance_list['instance_list'])) - else: - for instance in instance_list['instance_list']: + for instance in instance_list['instance_list']: + if CONF.host_failure.ignore_instances_in_error_state: + self.assertNotEqual("error", + getattr(instance, "OS-EXT-STS:vm_state")) + if not CONF.host_failure.evacuate_all_instances: self.assertTrue(instance.metadata.get('HA_Enabled', False)) + self.assertEqual(instances_evacuation_count, + len(instance_list['instance_list'])) + return instance_list def _evacuate_instances(self, instance_list, mock_enable_disable, reserved_host=None): task = host_failure.EvacuateInstancesTask(self.novaclient) + old_instance_list = copy.deepcopy(instance_list['instance_list']) + if reserved_host: task.execute(self.ctxt, self.instance_host, instance_list['instance_list'], @@ -99,7 +108,7 @@ class HostFailureTestCase(test.TestCase): self.ctxt, self.instance_host, instance_list['instance_list']) # make sure instance is active and has different host - self._verify_instance_evacuated() + self._verify_instance_evacuated(old_instance_list) @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow_for_auto_recovery( @@ -118,7 +127,7 @@ class HostFailureTestCase(test.TestCase): self._test_disable_compute_service(mock_enable_disable) # execute PrepareHAEnabledInstancesTask - instance_list = self._test_instance_list() + instance_list = self._test_instance_list(2) # execute EvacuateInstancesTask self._evacuate_instances(instance_list, mock_enable_disable) @@ -146,7 +155,7 @@ class HostFailureTestCase(test.TestCase): self._test_disable_compute_service(mock_enable_disable) # execute PrepareHAEnabledInstancesTask - instance_list = self._test_instance_list() + instance_list = self._test_instance_list(2) # execute EvacuateInstancesTask with mock.patch.object(host_obj.Host, "save") as mock_save: @@ -182,6 +191,50 @@ class HostFailureTestCase(test.TestCase): # execute EvacuateInstancesTask self._evacuate_instances(instance_list, mock_enable_disable) + @mock.patch('masakari.compute.nova.novaclient') + def test_host_failure_flow_ignore_error_instances( + self, _mock_novaclient, mock_unlock, mock_lock, + mock_enable_disable): + self.override_config("ignore_instances_in_error_state", + True, "host_failure") + self.override_config("evacuate_all_instances", + True, "host_failure") + _mock_novaclient.return_value = self.fake_client + + # create ha_enabled test data + self.fake_client.servers.create(id="1", host=self.instance_host, + vm_state='error', + ha_enabled=True) + self.fake_client.servers.create(id="2", host=self.instance_host, + vm_state='active', + ha_enabled=True) + + # execute PrepareHAEnabledInstancesTask + instance_list = self._test_instance_list(1) + + # execute EvacuateInstancesTask + self._evacuate_instances(instance_list, mock_enable_disable) + + @mock.patch('masakari.compute.nova.novaclient') + def test_host_failure_flow_ignore_error_instances_raise_skip_host_recovery( + self, _mock_novaclient, mock_unlock, mock_lock, + mock_enable_disable): + self.override_config("ignore_instances_in_error_state", + True, "host_failure") + self.override_config("evacuate_all_instances", + False, "host_failure") + _mock_novaclient.return_value = self.fake_client + + # create ha_enabled test data + self.fake_client.servers.create(id="1", host=self.instance_host, + vm_state='error', + ha_enabled=True) + + # execute PrepareHAEnabledInstancesTask + task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient) + self.assertRaises(exception.SkipHostRecoveryException, task.execute, + self.ctxt, self.instance_host) + @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow_all_instances_active_resized_instance( self, _mock_novaclient, mock_unlock, mock_lock, diff --git a/releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml b/releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml new file mode 100644 index 00000000..0c1ee97d --- /dev/null +++ b/releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml @@ -0,0 +1,17 @@ +--- +features: + - | + Operators can decide whether error instances should be allowed for + evacuation along with other instances from a failed source compute node + or not. Added a new config option ``ignore_instances_in_error_state`` to + achieve this. When set to True, masakari will skip the recovery of error + instances otherwise it will evacuate error instances as well from a failed + source compute node. + + To use this feature, following config option need to be set under + ``host_failure`` section in 'masakari.conf' file:: + + [host_failure] + ignore_instances_in_error_state = False + + The default value for this config option is set to False.