diff --git a/masakari/conf/__init__.py b/masakari/conf/__init__.py index cb4d47f2..53a3e963 100644 --- a/masakari/conf/__init__.py +++ b/masakari/conf/__init__.py @@ -19,6 +19,7 @@ from masakari.conf import api from masakari.conf import base from masakari.conf import database from masakari.conf import engine +from masakari.conf import engine_driver from masakari.conf import exceptions from masakari.conf import nova from masakari.conf import osapi_v1 @@ -33,6 +34,7 @@ api.register_opts(CONF) base.register_opts(CONF) database.register_opts(CONF) engine.register_opts(CONF) +engine_driver.register_opts(CONF) exceptions.register_opts(CONF) nova.register_opts(CONF) osapi_v1.register_opts(CONF) diff --git a/masakari/conf/engine_driver.py b/masakari/conf/engine_driver.py new file mode 100644 index 00000000..0c8f289d --- /dev/null +++ b/masakari/conf/engine_driver.py @@ -0,0 +1,68 @@ +# Copyright 2016 NTT DATA +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from oslo_config import cfg + + +instance_recovery_group = cfg.OptGroup( + 'instance_failure', + title='Instance failure recovery options', + help="Configuration options for instance failure recovery") + +host_recovery_group = cfg.OptGroup( + 'host_failure', + title='Host failure recovery options', + help="Configuration options for host failure recovery") + + +host_failure_opts = [ + cfg.BoolOpt('evacuate_all_instances', + default=True, + help=""" +Operators can decide whether all instances or only those instances which +contain metadata key 'HA_Enabled=True' should be allowed for evacuation from +a failed source compute node. When set to True, it will evacuate all instances +from a failed source compute node. First preference will be given to those +instances which contain 'HA_Enabled=True' metadata key, and then it will +evacuate the remaining ones. When set to False, it will evacuate only those +instances which contain 'HA_Enabled=True' metadata key."""), +] + +instance_failure_options = [ + cfg.BoolOpt('process_all_instances', + default=False, + help=""" +Operators can decide whether all instances or only those instances which +contain metadata key 'HA_Enabled=True' should be taken into account to +recover from instance failure events. When set to True, it will execute +instance failure recovery actions for an instance irrespective of whether +that particular instance contains metadata key 'HA_Enabled=True' or not. 
+When set to False, it will only execute instance failure recovery actions +for an instance which contain metadata key 'HA_Enabled=True'."""), +] + + +def register_opts(conf): + conf.register_group(instance_recovery_group) + conf.register_group(host_recovery_group) + conf.register_opts(instance_failure_options, group=instance_recovery_group) + conf.register_opts(host_failure_opts, group=host_recovery_group) + + +def list_opts(): + return { + instance_recovery_group.name: instance_failure_options, + host_recovery_group.name: host_failure_opts + } diff --git a/masakari/engine/drivers/taskflow/host_failure.py b/masakari/engine/drivers/taskflow/host_failure.py index 4b26aa1c..a0095dbc 100644 --- a/masakari/engine/drivers/taskflow/host_failure.py +++ b/masakari/engine/drivers/taskflow/host_failure.py @@ -54,7 +54,7 @@ class DisableComputeServiceTask(base.MasakariTask): class PrepareHAEnabledInstancesTask(base.MasakariTask): """Get all HA_Enabled instances.""" - default_provides = set(["ha_enabled_instances"]) + default_provides = set(["instance_list"]) def __init__(self, novaclient): requires = ["host_name"] @@ -63,29 +63,34 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask): self.novaclient = novaclient def execute(self, context, host_name): - all_instances = self.novaclient.get_servers(context, host_name) - ha_enabled_instances = ( - [instance for instance in all_instances - if strutils.bool_from_string(instance.metadata.get('HA_Enabled', - False), - strict=True)]) + instance_list = self.novaclient.get_servers(context, host_name) + + if CONF.host_failure.evacuate_all_instances: + instance_list = sorted( + instance_list, key=lambda k: strutils.bool_from_string( + k.metadata.get('HA_Enabled', False)), reverse=True) + else: + instance_list = ( + [instance for instance in instance_list if + strutils.bool_from_string(instance.metadata.get('HA_Enabled', + False))]) return { - "ha_enabled_instances": ha_enabled_instances, + "instance_list": instance_list, } class 
AutoEvacuationInstancesTask(base.MasakariTask): - default_provides = set(["ha_enabled_instances"]) + default_provides = set(["instance_list"]) def __init__(self, novaclient): - requires = ["ha_enabled_instances"] + requires = ["instance_list"] super(AutoEvacuationInstancesTask, self).__init__(addons=[ACTION], requires=requires) self.novaclient = novaclient - def execute(self, context, ha_enabled_instances): - for instance in ha_enabled_instances: + def execute(self, context, instance_list): + for instance in instance_list: vm_state = getattr(instance, "OS-EXT-STS:vm_state") if vm_state in ['active', 'error', 'resized', 'stopped']: # Evacuate API only evacuates an instance in @@ -99,20 +104,20 @@ class AutoEvacuationInstancesTask(base.MasakariTask): self.novaclient.evacuate_instance(context, instance.id) return { - "ha_enabled_instances": ha_enabled_instances, + "instance_list": instance_list, } class ConfirmEvacuationTask(base.MasakariTask): def __init__(self, novaclient): - requires = ["ha_enabled_instances", "host_name"] + requires = ["instance_list", "host_name"] super(ConfirmEvacuationTask, self).__init__(addons=[ACTION], requires=requires) self.novaclient = novaclient - def execute(self, context, ha_enabled_instances, host_name): + def execute(self, context, instance_list, host_name): failed_evacuation_instances = [] - for instance in ha_enabled_instances: + for instance in instance_list: def _wait_for_evacuation(): new_instance = self.novaclient.get_server(context, instance.id) instance_host = getattr(new_instance, diff --git a/masakari/engine/drivers/taskflow/instance_failure.py b/masakari/engine/drivers/taskflow/instance_failure.py index 1ae15315..ea2a422e 100644 --- a/masakari/engine/drivers/taskflow/instance_failure.py +++ b/masakari/engine/drivers/taskflow/instance_failure.py @@ -45,9 +45,12 @@ class StopInstanceTask(base.MasakariTask): """Stop the instance for recovery.""" instance = self.novaclient.get_server(context, instance_uuid) - # If instance is 
not HA_Enabled then exit from the flow - if not strutils.bool_from_string(instance.metadata.get( - 'HA_Enabled', False), strict=True): + # If an instance is not HA_Enabled and "process_all_instances" config + # option is also disabled, then there is no need to take any recovery + # action. + if not CONF.instance_failure.process_all_instances and not ( + strutils.bool_from_string( + instance.metadata.get('HA_Enabled', False))): LOG.info(_LI("Skipping recovery for instance: %s as it is " "not Ha_Enabled."), instance_uuid) raise exception.SkipInstanceRecoveryException() diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py index ca031a99..4d8aa8e9 100644 --- a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py +++ b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py @@ -21,12 +21,15 @@ import copy import mock from masakari.compute import nova +from masakari import conf from masakari import context from masakari.engine.drivers.taskflow import host_failure from masakari import exception from masakari import test from masakari.tests.unit import fakes +CONF = conf.CONF + class HostFailureTestCase(test.TestCase): @@ -38,6 +41,8 @@ class HostFailureTestCase(test.TestCase): # reduce the wait period. 
self.override_config("wait_period_after_evacuation", 2) self.override_config("wait_period_after_service_disabled", 2) + self.override_config("evacuate_all_instances", + False, "host_failure") self.instance_host = "fake-host" self.novaclient = nova.API() self.fake_client = fakes.FakeNovaClient() @@ -60,26 +65,31 @@ class HostFailureTestCase(test.TestCase): mock_disable.assert_called_once_with(self.instance_host, "nova-compute") - def _test_ha_enabled_instances(self): + def _test_instance_list(self): task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient) - ha_enabled_instances = task.execute(self.ctxt, self.instance_host) + instance_list = task.execute( + self.ctxt, self.instance_host) + evacuate_all_instances = CONF.host_failure.evacuate_all_instances - for instance in ha_enabled_instances['ha_enabled_instances']: - self.assertTrue(instance.metadata.get( - 'HA_Enabled')) + if evacuate_all_instances: + self.assertEqual(len(self.fake_client.servers.list()), + len(instance_list['instance_list'])) + else: + for instance in instance_list['instance_list']: + self.assertTrue(instance.metadata.get('HA_Enabled', False)) - return ha_enabled_instances + return instance_list - def _auto_evacuate_instances(self, ha_enabled_instances): + def _auto_evacuate_instances(self, instance_list): task = host_failure.AutoEvacuationInstancesTask(self.novaclient) - ha_enabled_instances = task.execute( - self.ctxt, ha_enabled_instances['ha_enabled_instances']) + instance_list = task.execute( + self.ctxt, instance_list['instance_list']) - return ha_enabled_instances + return instance_list - def _test_confirm_evacuate_task(self, ha_enabled_instances): + def _test_confirm_evacuate_task(self, instance_list): task = host_failure.ConfirmEvacuationTask(self.novaclient) - task.execute(self.ctxt, ha_enabled_instances['ha_enabled_instances'], + task.execute(self.ctxt, instance_list['instance_list'], self.instance_host) # make sure instance is active and has different host 
self._verify_instance_evacuated() @@ -87,25 +97,26 @@ class HostFailureTestCase(test.TestCase): @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow(self, _mock_novaclient): _mock_novaclient.return_value = self.fake_client + self.override_config("evacuate_all_instances", + True, "host_failure") # create test data self.fake_client.servers.create(id="1", host=self.instance_host, ha_enabled=True) - self.fake_client.servers.create(id="2", host=self.instance_host, - ha_enabled=True) + self.fake_client.servers.create(id="2", host=self.instance_host) # execute DisableComputeServiceTask self._test_disable_compute_service() # execute PrepareHAEnabledInstancesTask - ha_enabled_instances = self._test_ha_enabled_instances() + instance_list = self._test_instance_list() # execute AutoEvacuationInstancesTask - ha_enabled_instances = self._auto_evacuate_instances( - ha_enabled_instances) + instance_list = self._auto_evacuate_instances( + instance_list) # execute ConfirmEvacuationTask - self._test_confirm_evacuate_task(ha_enabled_instances) + self._test_confirm_evacuate_task(instance_list) @mock.patch('masakari.compute.nova.novaclient') def test_auto_evacuate_instances_task(self, _mock_novaclient): @@ -121,7 +132,7 @@ class HostFailureTestCase(test.TestCase): self._test_disable_compute_service() # execute PrepareHAEnabledInstancesTask - ha_enabled_instances = self._test_ha_enabled_instances() + instance_list = self._test_instance_list() # execute AutoEvacuationInstancesTask task = host_failure.AutoEvacuationInstancesTask(self.novaclient) @@ -130,7 +141,7 @@ class HostFailureTestCase(test.TestCase): with mock.patch.object(fakes.FakeNovaClient.ServerManager, "evacuate") as mock_evacuate: task.execute(self.ctxt, - ha_enabled_instances['ha_enabled_instances']) + instance_list['instance_list']) self.assertEqual(2, mock_evacuate.call_count) @mock.patch('masakari.compute.nova.novaclient') @@ -146,8 +157,8 @@ class HostFailureTestCase(test.TestCase): # execute 
PrepareHAEnabledInstancesTask task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient) - ha_enabled_instances = task.execute(self.ctxt, self.instance_host) - self.assertEqual(0, len(ha_enabled_instances['ha_enabled_instances'])) + instance_list = task.execute(self.ctxt, self.instance_host) + self.assertEqual(0, len(instance_list['instance_list'])) @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow_evacuation_failed(self, _mock_novaclient): @@ -157,13 +168,13 @@ class HostFailureTestCase(test.TestCase): server = self.fake_client.servers.create(id="1", host=self.instance_host, ha_enabled=True) - ha_enabled_instances = { - "ha_enabled_instances": self.fake_client.servers.list() + instance_list = { + "instance_list": self.fake_client.servers.list() } # execute AutoEvacuationInstancesTask - ha_enabled_instances = self._auto_evacuate_instances( - ha_enabled_instances) + instance_list = self._auto_evacuate_instances( + instance_list) def fake_get_server(context, host): # assume that while evacuating instance goes into error state @@ -176,7 +187,7 @@ class HostFailureTestCase(test.TestCase): task = host_failure.ConfirmEvacuationTask(self.novaclient) self.assertRaises( exception.AutoRecoveryFailureException, task.execute, - self.ctxt, ha_enabled_instances['ha_enabled_instances'], + self.ctxt, instance_list['instance_list'], self.instance_host) @mock.patch('masakari.compute.nova.novaclient') @@ -190,16 +201,16 @@ class HostFailureTestCase(test.TestCase): self.fake_client.servers.create(id="2", host=self.instance_host, vm_state="resized", ha_enabled=True) - ha_enabled_instances = { - "ha_enabled_instances": self.fake_client.servers.list() + instance_list = { + "instance_list": self.fake_client.servers.list() } # execute AutoEvacuationInstancesTask - ha_enabled_instances = self._auto_evacuate_instances( - ha_enabled_instances) + instance_list = self._auto_evacuate_instances( + instance_list) # execute ConfirmEvacuationTask - 
self._test_confirm_evacuate_task(ha_enabled_instances) + self._test_confirm_evacuate_task(instance_list) @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow_shutdown_instance(self, _mock_novaclient): @@ -212,16 +223,16 @@ class HostFailureTestCase(test.TestCase): self.fake_client.servers.create(id="2", host=self.instance_host, vm_state="stopped", ha_enabled=True) - ha_enabled_instances = { - "ha_enabled_instances": self.fake_client.servers.list() + instance_list = { + "instance_list": self.fake_client.servers.list() } # execute AutoEvacuationInstancesTask - ha_enabled_instances = self._auto_evacuate_instances( - ha_enabled_instances) + instance_list = self._auto_evacuate_instances( + instance_list) # execute ConfirmEvacuationTask - self._test_confirm_evacuate_task(ha_enabled_instances) + self._test_confirm_evacuate_task(instance_list) @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow_instance_in_error(self, _mock_novaclient): @@ -234,13 +245,13 @@ class HostFailureTestCase(test.TestCase): self.fake_client.servers.create(id="2", host=self.instance_host, vm_state="error", ha_enabled=True) - ha_enabled_instances = { - "ha_enabled_instances": self.fake_client.servers.list() + instance_list = { + "instance_list": self.fake_client.servers.list() } # execute AutoEvacuationInstancesTask - ha_enabled_instances = self._auto_evacuate_instances( - ha_enabled_instances) + instance_list = self._auto_evacuate_instances( + instance_list) # execute ConfirmEvacuationTask - self._test_confirm_evacuate_task(ha_enabled_instances) + self._test_confirm_evacuate_task(instance_list) diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py index adcb9b4b..2c851e5f 100644 --- a/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py +++ b/masakari/tests/unit/engine/drivers/taskflow/test_instance_failure_flow.py @@ -40,6 
+40,8 @@ class InstanceFailureTestCase(test.TestCase): # reduce the wait period. self.override_config('wait_period_after_power_off', 2) self.override_config('wait_period_after_power_on', 2) + self.override_config("process_all_instances", + False, "instance_failure") def _test_stop_instance(self): task = instance_failure.StopInstanceTask(self.novaclient) @@ -130,6 +132,29 @@ class InstanceFailureTestCase(test.TestCase): exception.SkipInstanceRecoveryException, task.execute, self.ctxt, self.instance_id) + @mock.patch('masakari.compute.nova.novaclient') + def test_instance_failure_flow_not_ha_enabled_but_conf_option_is_set( + self, _mock_novaclient): + # Setting this config option to True indicates masakari has to recover + # the instance irrespective of whether it is HA_Enabled or not. + self.override_config("process_all_instances", + True, "instance_failure") + _mock_novaclient.return_value = self.fake_client + + # create test data + self.fake_client.servers.create(self.instance_id, + host="fake-host", vm_state="resized") + + # test StopInstanceTask + self._test_stop_instance() + + # test StartInstanceTask + task = instance_failure.StartInstanceTask(self.novaclient) + task.execute(self.ctxt, self.instance_id) + + # test ConfirmInstanceActiveTask + self._test_confirm_instance_is_active() + @mock.patch('masakari.compute.nova.novaclient') def test_instance_failure_flow_start_failed(self, _mock_novaclient): _mock_novaclient.return_value = self.fake_client diff --git a/releasenotes/notes/add_ha_enabled_config_options-54a9270a5993d20a.yaml b/releasenotes/notes/add_ha_enabled_config_options-54a9270a5993d20a.yaml new file mode 100644 index 00000000..a5d050b0 --- /dev/null +++ b/releasenotes/notes/add_ha_enabled_config_options-54a9270a5993d20a.yaml @@ -0,0 +1,23 @@ +--- +features: + - Added two new config options: + + evacuate_all_instances: + Operators can decide whether all instances or only those instances + which contain metadata key 'HA_Enabled=True' should be allowed 
for + evacuation from a failed source compute node. When set to True, it will + evacuate all instances from a failed source compute node. First + preference will be given to those instances which contain + 'HA_Enabled=True' metadata key, and then it will evacuate the remaining + ones. When set to False, it will evacuate only those instances which + contain 'HA_Enabled=True' metadata key. + + process_all_instances: + Operators can decide whether all instances or only those instances + which contain metadata key 'HA_Enabled=True' should be taken into + account to recover from instance failure events. When set to True, + it will execute instance failure recovery actions for an instance + irrespective of whether that particular instance contains metadata key + 'HA_Enabled=True' or not. When set to False, it will only execute + instance failure recovery actions for an instance which contains + metadata key 'HA_Enabled=True'. \ No newline at end of file