Add ha enabled config options
evacuate_all_instances: This option will allow operators to decide whether all instances or only those instances which are "HA_Enabled" should be allowed for evacuation from a failed source compute node. process_all_instances: This option will allow operators to decide whether all instances or only those instances which are "HA_Enabled" should be taken into account to recover from instance_failure events. Implements: blueprint ha-enabled-config-options Change-Id: I9998295f03e7663f1da79fe5024b1a7553355ac2
This commit is contained in:
parent
6db17bc33a
commit
3001a7ccc3
@ -19,6 +19,7 @@ from masakari.conf import api
|
||||
from masakari.conf import base
|
||||
from masakari.conf import database
|
||||
from masakari.conf import engine
|
||||
from masakari.conf import engine_driver
|
||||
from masakari.conf import exceptions
|
||||
from masakari.conf import nova
|
||||
from masakari.conf import osapi_v1
|
||||
@ -33,6 +34,7 @@ api.register_opts(CONF)
|
||||
base.register_opts(CONF)
|
||||
database.register_opts(CONF)
|
||||
engine.register_opts(CONF)
|
||||
engine_driver.register_opts(CONF)
|
||||
exceptions.register_opts(CONF)
|
||||
nova.register_opts(CONF)
|
||||
osapi_v1.register_opts(CONF)
|
||||
|
68
masakari/conf/engine_driver.py
Normal file
68
masakari/conf/engine_driver.py
Normal file
@ -0,0 +1,68 @@
|
||||
# Copyright 2016 NTT DATA
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
|
||||
|
||||
# Option group for the settings that control instance failure recovery.
instance_recovery_group = cfg.OptGroup(
    name='instance_failure',
    title='Instance failure recovery options',
    help="Configuration options for instance failure recovery",
)

# Option group for the settings that control host failure recovery.
host_recovery_group = cfg.OptGroup(
    name='host_failure',
    title='Host failure recovery options',
    help="Configuration options for host failure recovery",
)
|
||||
|
||||
|
||||
# Options that belong to the host failure recovery group.
host_failure_opts = [
    cfg.BoolOpt(
        'evacuate_all_instances',
        default=True,
        help="""
Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be allowed for evacuation from
a failed source compute node. When set to True, it will evacuate all instances
from a failed source compute node. First preference will be given to those
instances which contain 'HA_Enabled=True' metadata key, and then it will
evacuate the remaining ones. When set to False, it will evacuate only those
instances which contain 'HA_Enabled=True' metadata key.""",
    ),
]
|
||||
|
||||
# Options that belong to the instance failure recovery group.
instance_failure_options = [
    cfg.BoolOpt(
        'process_all_instances',
        default=False,
        help="""
Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be taken into account to
recover from instance failure events. When set to True, it will execute
instance failure recovery actions for an instance irrespective of whether
that particular instance contains metadata key 'HA_Enabled=True' or not.
When set to False, it will only execute instance failure recovery actions
for an instance which contain metadata key 'HA_Enabled=True'.""",
    ),
]
|
||||
|
||||
|
||||
def register_opts(conf):
    """Register the failure recovery groups and their options on ``conf``.

    Both groups are registered first, then each group's options are
    registered under it.

    :param conf: configuration object providing ``register_group`` and
                 ``register_opts``.
    """
    grouped_options = (
        (instance_recovery_group, instance_failure_options),
        (host_recovery_group, host_failure_opts),
    )

    # Register every group before any option, in declaration order.
    for group, _ in grouped_options:
        conf.register_group(group)

    for group, options in grouped_options:
        conf.register_opts(options, group=group)
|
||||
|
||||
|
||||
def list_opts():
    """Return this module's options keyed by the name of their group.

    :returns: dict mapping each option group's name to its list of options.
    """
    options_by_group = {}
    options_by_group[instance_recovery_group.name] = instance_failure_options
    options_by_group[host_recovery_group.name] = host_failure_opts
    return options_by_group
|
@ -54,7 +54,7 @@ class DisableComputeServiceTask(base.MasakariTask):
|
||||
|
||||
class PrepareHAEnabledInstancesTask(base.MasakariTask):
|
||||
"""Get all HA_Enabled instances."""
|
||||
default_provides = set(["ha_enabled_instances"])
|
||||
default_provides = set(["instance_list"])
|
||||
|
||||
def __init__(self, novaclient):
|
||||
requires = ["host_name"]
|
||||
@ -63,29 +63,34 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
|
||||
self.novaclient = novaclient
|
||||
|
||||
def execute(self, context, host_name):
|
||||
all_instances = self.novaclient.get_servers(context, host_name)
|
||||
ha_enabled_instances = (
|
||||
[instance for instance in all_instances
|
||||
if strutils.bool_from_string(instance.metadata.get('HA_Enabled',
|
||||
False),
|
||||
strict=True)])
|
||||
instance_list = self.novaclient.get_servers(context, host_name)
|
||||
|
||||
if CONF.host_failure.evacuate_all_instances:
|
||||
instance_list = sorted(
|
||||
instance_list, key=lambda k: strutils.bool_from_string(
|
||||
k.metadata.get('HA_Enabled', False)), reverse=True)
|
||||
else:
|
||||
instance_list = (
|
||||
[instance for instance in instance_list if
|
||||
strutils.bool_from_string(instance.metadata.get('HA_Enabled',
|
||||
False))])
|
||||
|
||||
return {
|
||||
"ha_enabled_instances": ha_enabled_instances,
|
||||
"instance_list": instance_list,
|
||||
}
|
||||
|
||||
|
||||
class AutoEvacuationInstancesTask(base.MasakariTask):
|
||||
default_provides = set(["ha_enabled_instances"])
|
||||
default_provides = set(["instance_list"])
|
||||
|
||||
def __init__(self, novaclient):
|
||||
requires = ["ha_enabled_instances"]
|
||||
requires = ["instance_list"]
|
||||
super(AutoEvacuationInstancesTask, self).__init__(addons=[ACTION],
|
||||
requires=requires)
|
||||
self.novaclient = novaclient
|
||||
|
||||
def execute(self, context, ha_enabled_instances):
|
||||
for instance in ha_enabled_instances:
|
||||
def execute(self, context, instance_list):
|
||||
for instance in instance_list:
|
||||
vm_state = getattr(instance, "OS-EXT-STS:vm_state")
|
||||
if vm_state in ['active', 'error', 'resized', 'stopped']:
|
||||
# Evacuate API only evacuates an instance in
|
||||
@ -99,20 +104,20 @@ class AutoEvacuationInstancesTask(base.MasakariTask):
|
||||
self.novaclient.evacuate_instance(context, instance.id)
|
||||
|
||||
return {
|
||||
"ha_enabled_instances": ha_enabled_instances,
|
||||
"instance_list": instance_list,
|
||||
}
|
||||
|
||||
|
||||
class ConfirmEvacuationTask(base.MasakariTask):
|
||||
def __init__(self, novaclient):
|
||||
requires = ["ha_enabled_instances", "host_name"]
|
||||
requires = ["instance_list", "host_name"]
|
||||
super(ConfirmEvacuationTask, self).__init__(addons=[ACTION],
|
||||
requires=requires)
|
||||
self.novaclient = novaclient
|
||||
|
||||
def execute(self, context, ha_enabled_instances, host_name):
|
||||
def execute(self, context, instance_list, host_name):
|
||||
failed_evacuation_instances = []
|
||||
for instance in ha_enabled_instances:
|
||||
for instance in instance_list:
|
||||
def _wait_for_evacuation():
|
||||
new_instance = self.novaclient.get_server(context, instance.id)
|
||||
instance_host = getattr(new_instance,
|
||||
|
@ -45,9 +45,12 @@ class StopInstanceTask(base.MasakariTask):
|
||||
"""Stop the instance for recovery."""
|
||||
instance = self.novaclient.get_server(context, instance_uuid)
|
||||
|
||||
# If instance is not HA_Enabled then exit from the flow
|
||||
if not strutils.bool_from_string(instance.metadata.get(
|
||||
'HA_Enabled', False), strict=True):
|
||||
# If an instance is not HA_Enabled and "process_all_instances" config
|
||||
# option is also disabled, then there is no need to take any recovery
|
||||
# action.
|
||||
if not CONF.instance_failure.process_all_instances and not (
|
||||
strutils.bool_from_string(
|
||||
instance.metadata.get('HA_Enabled', False))):
|
||||
LOG.info(_LI("Skipping recovery for instance: %s as it is "
|
||||
"not Ha_Enabled."), instance_uuid)
|
||||
raise exception.SkipInstanceRecoveryException()
|
||||
|
@ -21,12 +21,15 @@ import copy
|
||||
import mock
|
||||
|
||||
from masakari.compute import nova
|
||||
from masakari import conf
|
||||
from masakari import context
|
||||
from masakari.engine.drivers.taskflow import host_failure
|
||||
from masakari import exception
|
||||
from masakari import test
|
||||
from masakari.tests.unit import fakes
|
||||
|
||||
CONF = conf.CONF
|
||||
|
||||
|
||||
class HostFailureTestCase(test.TestCase):
|
||||
|
||||
@ -38,6 +41,8 @@ class HostFailureTestCase(test.TestCase):
|
||||
# reduce the wait period.
|
||||
self.override_config("wait_period_after_evacuation", 2)
|
||||
self.override_config("wait_period_after_service_disabled", 2)
|
||||
self.override_config("evacuate_all_instances",
|
||||
False, "host_failure")
|
||||
self.instance_host = "fake-host"
|
||||
self.novaclient = nova.API()
|
||||
self.fake_client = fakes.FakeNovaClient()
|
||||
@ -60,26 +65,31 @@ class HostFailureTestCase(test.TestCase):
|
||||
mock_disable.assert_called_once_with(self.instance_host,
|
||||
"nova-compute")
|
||||
|
||||
def _test_ha_enabled_instances(self):
|
||||
def _test_instance_list(self):
|
||||
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
|
||||
ha_enabled_instances = task.execute(self.ctxt, self.instance_host)
|
||||
instance_list = task.execute(
|
||||
self.ctxt, self.instance_host)
|
||||
evacuate_all_instances = CONF.host_failure.evacuate_all_instances
|
||||
|
||||
for instance in ha_enabled_instances['ha_enabled_instances']:
|
||||
self.assertTrue(instance.metadata.get(
|
||||
'HA_Enabled'))
|
||||
if evacuate_all_instances:
|
||||
self.assertEqual(len(self.fake_client.servers.list()),
|
||||
len(instance_list['instance_list']))
|
||||
else:
|
||||
for instance in instance_list['instance_list']:
|
||||
self.assertTrue(instance.metadata.get('HA_Enabled', False))
|
||||
|
||||
return ha_enabled_instances
|
||||
return instance_list
|
||||
|
||||
def _auto_evacuate_instances(self, ha_enabled_instances):
|
||||
def _auto_evacuate_instances(self, instance_list):
|
||||
task = host_failure.AutoEvacuationInstancesTask(self.novaclient)
|
||||
ha_enabled_instances = task.execute(
|
||||
self.ctxt, ha_enabled_instances['ha_enabled_instances'])
|
||||
instance_list = task.execute(
|
||||
self.ctxt, instance_list['instance_list'])
|
||||
|
||||
return ha_enabled_instances
|
||||
return instance_list
|
||||
|
||||
def _test_confirm_evacuate_task(self, ha_enabled_instances):
|
||||
def _test_confirm_evacuate_task(self, instance_list):
|
||||
task = host_failure.ConfirmEvacuationTask(self.novaclient)
|
||||
task.execute(self.ctxt, ha_enabled_instances['ha_enabled_instances'],
|
||||
task.execute(self.ctxt, instance_list['instance_list'],
|
||||
self.instance_host)
|
||||
# make sure instance is active and has different host
|
||||
self._verify_instance_evacuated()
|
||||
@ -87,25 +97,26 @@ class HostFailureTestCase(test.TestCase):
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow(self, _mock_novaclient):
|
||||
_mock_novaclient.return_value = self.fake_client
|
||||
self.override_config("evacuate_all_instances",
|
||||
True, "host_failure")
|
||||
|
||||
# create test data
|
||||
self.fake_client.servers.create(id="1", host=self.instance_host,
|
||||
ha_enabled=True)
|
||||
self.fake_client.servers.create(id="2", host=self.instance_host,
|
||||
ha_enabled=True)
|
||||
self.fake_client.servers.create(id="2", host=self.instance_host)
|
||||
|
||||
# execute DisableComputeServiceTask
|
||||
self._test_disable_compute_service()
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
ha_enabled_instances = self._test_ha_enabled_instances()
|
||||
instance_list = self._test_instance_list()
|
||||
|
||||
# execute AutoEvacuationInstancesTask
|
||||
ha_enabled_instances = self._auto_evacuate_instances(
|
||||
ha_enabled_instances)
|
||||
instance_list = self._auto_evacuate_instances(
|
||||
instance_list)
|
||||
|
||||
# execute ConfirmEvacuationTask
|
||||
self._test_confirm_evacuate_task(ha_enabled_instances)
|
||||
self._test_confirm_evacuate_task(instance_list)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_auto_evacuate_instances_task(self, _mock_novaclient):
|
||||
@ -121,7 +132,7 @@ class HostFailureTestCase(test.TestCase):
|
||||
self._test_disable_compute_service()
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
ha_enabled_instances = self._test_ha_enabled_instances()
|
||||
instance_list = self._test_instance_list()
|
||||
|
||||
# execute AutoEvacuationInstancesTask
|
||||
task = host_failure.AutoEvacuationInstancesTask(self.novaclient)
|
||||
@ -130,7 +141,7 @@ class HostFailureTestCase(test.TestCase):
|
||||
with mock.patch.object(fakes.FakeNovaClient.ServerManager,
|
||||
"evacuate") as mock_evacuate:
|
||||
task.execute(self.ctxt,
|
||||
ha_enabled_instances['ha_enabled_instances'])
|
||||
instance_list['instance_list'])
|
||||
self.assertEqual(2, mock_evacuate.call_count)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
@ -146,8 +157,8 @@ class HostFailureTestCase(test.TestCase):
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
|
||||
ha_enabled_instances = task.execute(self.ctxt, self.instance_host)
|
||||
self.assertEqual(0, len(ha_enabled_instances['ha_enabled_instances']))
|
||||
instance_list = task.execute(self.ctxt, self.instance_host)
|
||||
self.assertEqual(0, len(instance_list['instance_list']))
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_evacuation_failed(self, _mock_novaclient):
|
||||
@ -157,13 +168,13 @@ class HostFailureTestCase(test.TestCase):
|
||||
server = self.fake_client.servers.create(id="1",
|
||||
host=self.instance_host,
|
||||
ha_enabled=True)
|
||||
ha_enabled_instances = {
|
||||
"ha_enabled_instances": self.fake_client.servers.list()
|
||||
instance_list = {
|
||||
"instance_list": self.fake_client.servers.list()
|
||||
}
|
||||
|
||||
# execute AutoEvacuationInstancesTask
|
||||
ha_enabled_instances = self._auto_evacuate_instances(
|
||||
ha_enabled_instances)
|
||||
instance_list = self._auto_evacuate_instances(
|
||||
instance_list)
|
||||
|
||||
def fake_get_server(context, host):
|
||||
# assume that while evacuating instance goes into error state
|
||||
@ -176,7 +187,7 @@ class HostFailureTestCase(test.TestCase):
|
||||
task = host_failure.ConfirmEvacuationTask(self.novaclient)
|
||||
self.assertRaises(
|
||||
exception.AutoRecoveryFailureException, task.execute,
|
||||
self.ctxt, ha_enabled_instances['ha_enabled_instances'],
|
||||
self.ctxt, instance_list['instance_list'],
|
||||
self.instance_host)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
@ -190,16 +201,16 @@ class HostFailureTestCase(test.TestCase):
|
||||
self.fake_client.servers.create(id="2", host=self.instance_host,
|
||||
vm_state="resized",
|
||||
ha_enabled=True)
|
||||
ha_enabled_instances = {
|
||||
"ha_enabled_instances": self.fake_client.servers.list()
|
||||
instance_list = {
|
||||
"instance_list": self.fake_client.servers.list()
|
||||
}
|
||||
|
||||
# execute AutoEvacuationInstancesTask
|
||||
ha_enabled_instances = self._auto_evacuate_instances(
|
||||
ha_enabled_instances)
|
||||
instance_list = self._auto_evacuate_instances(
|
||||
instance_list)
|
||||
|
||||
# execute ConfirmEvacuationTask
|
||||
self._test_confirm_evacuate_task(ha_enabled_instances)
|
||||
self._test_confirm_evacuate_task(instance_list)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_shutdown_instance(self, _mock_novaclient):
|
||||
@ -212,16 +223,16 @@ class HostFailureTestCase(test.TestCase):
|
||||
self.fake_client.servers.create(id="2", host=self.instance_host,
|
||||
vm_state="stopped",
|
||||
ha_enabled=True)
|
||||
ha_enabled_instances = {
|
||||
"ha_enabled_instances": self.fake_client.servers.list()
|
||||
instance_list = {
|
||||
"instance_list": self.fake_client.servers.list()
|
||||
}
|
||||
|
||||
# execute AutoEvacuationInstancesTask
|
||||
ha_enabled_instances = self._auto_evacuate_instances(
|
||||
ha_enabled_instances)
|
||||
instance_list = self._auto_evacuate_instances(
|
||||
instance_list)
|
||||
|
||||
# execute ConfirmEvacuationTask
|
||||
self._test_confirm_evacuate_task(ha_enabled_instances)
|
||||
self._test_confirm_evacuate_task(instance_list)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_instance_in_error(self, _mock_novaclient):
|
||||
@ -234,13 +245,13 @@ class HostFailureTestCase(test.TestCase):
|
||||
self.fake_client.servers.create(id="2", host=self.instance_host,
|
||||
vm_state="error",
|
||||
ha_enabled=True)
|
||||
ha_enabled_instances = {
|
||||
"ha_enabled_instances": self.fake_client.servers.list()
|
||||
instance_list = {
|
||||
"instance_list": self.fake_client.servers.list()
|
||||
}
|
||||
|
||||
# execute AutoEvacuationInstancesTask
|
||||
ha_enabled_instances = self._auto_evacuate_instances(
|
||||
ha_enabled_instances)
|
||||
instance_list = self._auto_evacuate_instances(
|
||||
instance_list)
|
||||
|
||||
# execute ConfirmEvacuationTask
|
||||
self._test_confirm_evacuate_task(ha_enabled_instances)
|
||||
self._test_confirm_evacuate_task(instance_list)
|
||||
|
@ -40,6 +40,8 @@ class InstanceFailureTestCase(test.TestCase):
|
||||
# reduce the wait period.
|
||||
self.override_config('wait_period_after_power_off', 2)
|
||||
self.override_config('wait_period_after_power_on', 2)
|
||||
self.override_config("process_all_instances",
|
||||
False, "instance_failure")
|
||||
|
||||
def _test_stop_instance(self):
|
||||
task = instance_failure.StopInstanceTask(self.novaclient)
|
||||
@ -130,6 +132,29 @@ class InstanceFailureTestCase(test.TestCase):
|
||||
exception.SkipInstanceRecoveryException, task.execute,
|
||||
self.ctxt, self.instance_id)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_instance_failure_flow_not_ha_enabled_but_conf_option_is_set(
|
||||
self, _mock_novaclient):
|
||||
# Setting this config option to True indicates masakari has to recover
|
||||
# the instance irrespective of whether it is HA_Enabled or not.
|
||||
self.override_config("process_all_instances",
|
||||
True, "instance_failure")
|
||||
_mock_novaclient.return_value = self.fake_client
|
||||
|
||||
# create test data
|
||||
self.fake_client.servers.create(self.instance_id,
|
||||
host="fake-host", vm_state="resized")
|
||||
|
||||
# test StopInstanceTask
|
||||
self._test_stop_instance()
|
||||
|
||||
# test StartInstanceTask
|
||||
task = instance_failure.StartInstanceTask(self.novaclient)
|
||||
task.execute(self.ctxt, self.instance_id)
|
||||
|
||||
# test ConfirmInstanceActiveTask
|
||||
self._test_confirm_instance_is_active()
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_instance_failure_flow_start_failed(self, _mock_novaclient):
|
||||
_mock_novaclient.return_value = self.fake_client
|
||||
|
@ -0,0 +1,23 @@
|
||||
---
|
||||
features:
|
||||
- Added two new config options:
|
||||
|
||||
evacuate_all_instances:
|
||||
Operators can decide whether all instances or only those instances
|
||||
which contain metadata key 'HA_Enabled=True' should be allowed for
|
||||
evacuation from a failed source compute node. When set to True, it will
|
||||
evacuate all instances from a failed source compute node. First
|
||||
preference will be given to those instances which contain
|
||||
'HA_Enabled=True' metadata key, and then it will evacuate the remaining
|
||||
ones. When set to False, it will evacuate only those instances which
|
||||
contain 'HA_Enabled=True' metadata key.
|
||||
|
||||
process_all_instances:
|
||||
Operators can decide whether all instances or only those instances
|
||||
which contain metadata key 'HA_Enabled=True' should be taken into
|
||||
account to recover from instance failure events. When set to True,
|
||||
it will execute instance failure recovery actions for an instance
|
||||
irrespective of whether that particular instance contains metadata key
|
||||
'HA_Enabled=True' or not. When set to False, it will only execute
|
||||
instance failure recovery actions for an instance which contains
|
||||
metadata key 'HA_Enabled=True'.
|
Loading…
Reference in New Issue
Block a user