Add ha enabled config options

evacuate_all_instances:
This option allows operators to decide whether all instances, or only
those instances which are "HA_Enabled", should be evacuated from a
failed source compute node.

process_all_instances:
This option allows operators to decide whether all instances, or only
those instances which are "HA_Enabled", should be considered for
recovery from instance_failure events.

Implements: blueprint ha-enabled-config-options
Change-Id: I9998295f03e7663f1da79fe5024b1a7553355ac2
This commit is contained in:
hussainchachuliya 2016-11-14 18:27:17 +05:30 committed by Dinesh Bhor
parent 6db17bc33a
commit 3001a7ccc3
7 changed files with 198 additions and 61 deletions

View File

@ -19,6 +19,7 @@ from masakari.conf import api
from masakari.conf import base
from masakari.conf import database
from masakari.conf import engine
from masakari.conf import engine_driver
from masakari.conf import exceptions
from masakari.conf import nova
from masakari.conf import osapi_v1
@ -33,6 +34,7 @@ api.register_opts(CONF)
base.register_opts(CONF)
database.register_opts(CONF)
engine.register_opts(CONF)
engine_driver.register_opts(CONF)
exceptions.register_opts(CONF)
nova.register_opts(CONF)
osapi_v1.register_opts(CONF)

View File

@ -0,0 +1,68 @@
# Copyright 2016 NTT DATA
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_config import cfg
# Option group for the settings that apply to instance failure recovery.
instance_recovery_group = cfg.OptGroup(
    name='instance_failure',
    title='Instance failure recovery options',
    help="Configuration options for instance failure recovery",
)

# Option group for the settings that apply to host failure recovery.
host_recovery_group = cfg.OptGroup(
    name='host_failure',
    title='Host failure recovery options',
    help="Configuration options for host failure recovery",
)
# Help text for the 'evacuate_all_instances' option. It is kept flush-left
# because the surrounding whitespace is part of the string.
_EVACUATE_ALL_INSTANCES_HELP = """
Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be allowed for evacuation from
a failed source compute node. When set to True, it will evacuate all instances
from a failed source compute node. First preference will be given to those
instances which contain 'HA_Enabled=True' metadata key, and then it will
evacuate the remaining ones. When set to False, it will evacuate only those
instances which contain 'HA_Enabled=True' metadata key."""

# Options registered under the host failure recovery group.
host_failure_opts = [
    cfg.BoolOpt(
        'evacuate_all_instances',
        default=True,
        help=_EVACUATE_ALL_INSTANCES_HELP,
    ),
]
# Help text for the 'process_all_instances' option. It is kept flush-left
# because the surrounding whitespace is part of the string.
_PROCESS_ALL_INSTANCES_HELP = """
Operators can decide whether all instances or only those instances which
contain metadata key 'HA_Enabled=True' should be taken into account to
recover from instance failure events. When set to True, it will execute
instance failure recovery actions for an instance irrespective of whether
that particular instance contains metadata key 'HA_Enabled=True' or not.
When set to False, it will only execute instance failure recovery actions
for an instance which contain metadata key 'HA_Enabled=True'."""

# Options registered under the instance failure recovery group.
instance_failure_options = [
    cfg.BoolOpt(
        'process_all_instances',
        default=False,
        help=_PROCESS_ALL_INSTANCES_HELP,
    ),
]
def register_opts(conf):
    """Register the recovery option groups and their options on ``conf``.

    Both groups are registered first, then each group's option list is
    registered under its group.

    :param conf: configuration object exposing ``register_group`` and
        ``register_opts``.
    """
    grouped_opts = (
        (instance_recovery_group, instance_failure_options),
        (host_recovery_group, host_failure_opts),
    )

    # Two passes keep the call order: every group first, then the options.
    for group, _ in grouped_opts:
        conf.register_group(group)
    for group, opts in grouped_opts:
        conf.register_opts(opts, group=group)
def list_opts():
    """Return the options defined in this module, keyed by group name.

    :returns: dict mapping each option group's ``name`` to the list of
        options that belongs to that group.
    """
    opts_by_group = {}
    opts_by_group[instance_recovery_group.name] = instance_failure_options
    opts_by_group[host_recovery_group.name] = host_failure_opts
    return opts_by_group

View File

@ -54,7 +54,7 @@ class DisableComputeServiceTask(base.MasakariTask):
class PrepareHAEnabledInstancesTask(base.MasakariTask):
"""Get all HA_Enabled instances."""
default_provides = set(["ha_enabled_instances"])
default_provides = set(["instance_list"])
def __init__(self, novaclient):
requires = ["host_name"]
@ -63,29 +63,34 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
self.novaclient = novaclient
def execute(self, context, host_name):
all_instances = self.novaclient.get_servers(context, host_name)
ha_enabled_instances = (
[instance for instance in all_instances
if strutils.bool_from_string(instance.metadata.get('HA_Enabled',
False),
strict=True)])
instance_list = self.novaclient.get_servers(context, host_name)
if CONF.host_failure.evacuate_all_instances:
instance_list = sorted(
instance_list, key=lambda k: strutils.bool_from_string(
k.metadata.get('HA_Enabled', False)), reverse=True)
else:
instance_list = (
[instance for instance in instance_list if
strutils.bool_from_string(instance.metadata.get('HA_Enabled',
False))])
return {
"ha_enabled_instances": ha_enabled_instances,
"instance_list": instance_list,
}
class AutoEvacuationInstancesTask(base.MasakariTask):
default_provides = set(["ha_enabled_instances"])
default_provides = set(["instance_list"])
def __init__(self, novaclient):
requires = ["ha_enabled_instances"]
requires = ["instance_list"]
super(AutoEvacuationInstancesTask, self).__init__(addons=[ACTION],
requires=requires)
self.novaclient = novaclient
def execute(self, context, ha_enabled_instances):
for instance in ha_enabled_instances:
def execute(self, context, instance_list):
for instance in instance_list:
vm_state = getattr(instance, "OS-EXT-STS:vm_state")
if vm_state in ['active', 'error', 'resized', 'stopped']:
# Evacuate API only evacuates an instance in
@ -99,20 +104,20 @@ class AutoEvacuationInstancesTask(base.MasakariTask):
self.novaclient.evacuate_instance(context, instance.id)
return {
"ha_enabled_instances": ha_enabled_instances,
"instance_list": instance_list,
}
class ConfirmEvacuationTask(base.MasakariTask):
def __init__(self, novaclient):
requires = ["ha_enabled_instances", "host_name"]
requires = ["instance_list", "host_name"]
super(ConfirmEvacuationTask, self).__init__(addons=[ACTION],
requires=requires)
self.novaclient = novaclient
def execute(self, context, ha_enabled_instances, host_name):
def execute(self, context, instance_list, host_name):
failed_evacuation_instances = []
for instance in ha_enabled_instances:
for instance in instance_list:
def _wait_for_evacuation():
new_instance = self.novaclient.get_server(context, instance.id)
instance_host = getattr(new_instance,

View File

@ -45,9 +45,12 @@ class StopInstanceTask(base.MasakariTask):
"""Stop the instance for recovery."""
instance = self.novaclient.get_server(context, instance_uuid)
# If instance is not HA_Enabled then exit from the flow
if not strutils.bool_from_string(instance.metadata.get(
'HA_Enabled', False), strict=True):
# If an instance is not HA_Enabled and "process_all_instances" config
# option is also disabled, then there is no need to take any recovery
# action.
if not CONF.instance_failure.process_all_instances and not (
strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False))):
LOG.info(_LI("Skipping recovery for instance: %s as it is "
"not Ha_Enabled."), instance_uuid)
raise exception.SkipInstanceRecoveryException()

View File

@ -21,12 +21,15 @@ import copy
import mock
from masakari.compute import nova
from masakari import conf
from masakari import context
from masakari.engine.drivers.taskflow import host_failure
from masakari import exception
from masakari import test
from masakari.tests.unit import fakes
CONF = conf.CONF
class HostFailureTestCase(test.TestCase):
@ -38,6 +41,8 @@ class HostFailureTestCase(test.TestCase):
# reduce the wait period.
self.override_config("wait_period_after_evacuation", 2)
self.override_config("wait_period_after_service_disabled", 2)
self.override_config("evacuate_all_instances",
False, "host_failure")
self.instance_host = "fake-host"
self.novaclient = nova.API()
self.fake_client = fakes.FakeNovaClient()
@ -60,26 +65,31 @@ class HostFailureTestCase(test.TestCase):
mock_disable.assert_called_once_with(self.instance_host,
"nova-compute")
def _test_ha_enabled_instances(self):
def _test_instance_list(self):
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
ha_enabled_instances = task.execute(self.ctxt, self.instance_host)
instance_list = task.execute(
self.ctxt, self.instance_host)
evacuate_all_instances = CONF.host_failure.evacuate_all_instances
for instance in ha_enabled_instances['ha_enabled_instances']:
self.assertTrue(instance.metadata.get(
'HA_Enabled'))
if evacuate_all_instances:
self.assertEqual(len(self.fake_client.servers.list()),
len(instance_list['instance_list']))
else:
for instance in instance_list['instance_list']:
self.assertTrue(instance.metadata.get('HA_Enabled', False))
return ha_enabled_instances
return instance_list
def _auto_evacuate_instances(self, ha_enabled_instances):
def _auto_evacuate_instances(self, instance_list):
task = host_failure.AutoEvacuationInstancesTask(self.novaclient)
ha_enabled_instances = task.execute(
self.ctxt, ha_enabled_instances['ha_enabled_instances'])
instance_list = task.execute(
self.ctxt, instance_list['instance_list'])
return ha_enabled_instances
return instance_list
def _test_confirm_evacuate_task(self, ha_enabled_instances):
def _test_confirm_evacuate_task(self, instance_list):
task = host_failure.ConfirmEvacuationTask(self.novaclient)
task.execute(self.ctxt, ha_enabled_instances['ha_enabled_instances'],
task.execute(self.ctxt, instance_list['instance_list'],
self.instance_host)
# make sure instance is active and has different host
self._verify_instance_evacuated()
@ -87,25 +97,26 @@ class HostFailureTestCase(test.TestCase):
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow(self, _mock_novaclient):
_mock_novaclient.return_value = self.fake_client
self.override_config("evacuate_all_instances",
True, "host_failure")
# create test data
self.fake_client.servers.create(id="1", host=self.instance_host,
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host)
# execute DisableComputeServiceTask
self._test_disable_compute_service()
# execute PrepareHAEnabledInstancesTask
ha_enabled_instances = self._test_ha_enabled_instances()
instance_list = self._test_instance_list()
# execute AutoEvacuationInstancesTask
ha_enabled_instances = self._auto_evacuate_instances(
ha_enabled_instances)
instance_list = self._auto_evacuate_instances(
instance_list)
# execute ConfirmEvacuationTask
self._test_confirm_evacuate_task(ha_enabled_instances)
self._test_confirm_evacuate_task(instance_list)
@mock.patch('masakari.compute.nova.novaclient')
def test_auto_evacuate_instances_task(self, _mock_novaclient):
@ -121,7 +132,7 @@ class HostFailureTestCase(test.TestCase):
self._test_disable_compute_service()
# execute PrepareHAEnabledInstancesTask
ha_enabled_instances = self._test_ha_enabled_instances()
instance_list = self._test_instance_list()
# execute AutoEvacuationInstancesTask
task = host_failure.AutoEvacuationInstancesTask(self.novaclient)
@ -130,7 +141,7 @@ class HostFailureTestCase(test.TestCase):
with mock.patch.object(fakes.FakeNovaClient.ServerManager,
"evacuate") as mock_evacuate:
task.execute(self.ctxt,
ha_enabled_instances['ha_enabled_instances'])
instance_list['instance_list'])
self.assertEqual(2, mock_evacuate.call_count)
@mock.patch('masakari.compute.nova.novaclient')
@ -146,8 +157,8 @@ class HostFailureTestCase(test.TestCase):
# execute PrepareHAEnabledInstancesTask
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
ha_enabled_instances = task.execute(self.ctxt, self.instance_host)
self.assertEqual(0, len(ha_enabled_instances['ha_enabled_instances']))
instance_list = task.execute(self.ctxt, self.instance_host)
self.assertEqual(0, len(instance_list['instance_list']))
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_evacuation_failed(self, _mock_novaclient):
@ -157,13 +168,13 @@ class HostFailureTestCase(test.TestCase):
server = self.fake_client.servers.create(id="1",
host=self.instance_host,
ha_enabled=True)
ha_enabled_instances = {
"ha_enabled_instances": self.fake_client.servers.list()
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute AutoEvacuationInstancesTask
ha_enabled_instances = self._auto_evacuate_instances(
ha_enabled_instances)
instance_list = self._auto_evacuate_instances(
instance_list)
def fake_get_server(context, host):
# assume that while evacuating instance goes into error state
@ -176,7 +187,7 @@ class HostFailureTestCase(test.TestCase):
task = host_failure.ConfirmEvacuationTask(self.novaclient)
self.assertRaises(
exception.AutoRecoveryFailureException, task.execute,
self.ctxt, ha_enabled_instances['ha_enabled_instances'],
self.ctxt, instance_list['instance_list'],
self.instance_host)
@mock.patch('masakari.compute.nova.novaclient')
@ -190,16 +201,16 @@ class HostFailureTestCase(test.TestCase):
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="resized",
ha_enabled=True)
ha_enabled_instances = {
"ha_enabled_instances": self.fake_client.servers.list()
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute AutoEvacuationInstancesTask
ha_enabled_instances = self._auto_evacuate_instances(
ha_enabled_instances)
instance_list = self._auto_evacuate_instances(
instance_list)
# execute ConfirmEvacuationTask
self._test_confirm_evacuate_task(ha_enabled_instances)
self._test_confirm_evacuate_task(instance_list)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_shutdown_instance(self, _mock_novaclient):
@ -212,16 +223,16 @@ class HostFailureTestCase(test.TestCase):
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="stopped",
ha_enabled=True)
ha_enabled_instances = {
"ha_enabled_instances": self.fake_client.servers.list()
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute AutoEvacuationInstancesTask
ha_enabled_instances = self._auto_evacuate_instances(
ha_enabled_instances)
instance_list = self._auto_evacuate_instances(
instance_list)
# execute ConfirmEvacuationTask
self._test_confirm_evacuate_task(ha_enabled_instances)
self._test_confirm_evacuate_task(instance_list)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_instance_in_error(self, _mock_novaclient):
@ -234,13 +245,13 @@ class HostFailureTestCase(test.TestCase):
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="error",
ha_enabled=True)
ha_enabled_instances = {
"ha_enabled_instances": self.fake_client.servers.list()
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute AutoEvacuationInstancesTask
ha_enabled_instances = self._auto_evacuate_instances(
ha_enabled_instances)
instance_list = self._auto_evacuate_instances(
instance_list)
# execute ConfirmEvacuationTask
self._test_confirm_evacuate_task(ha_enabled_instances)
self._test_confirm_evacuate_task(instance_list)

View File

@ -40,6 +40,8 @@ class InstanceFailureTestCase(test.TestCase):
# reduce the wait period.
self.override_config('wait_period_after_power_off', 2)
self.override_config('wait_period_after_power_on', 2)
self.override_config("process_all_instances",
False, "instance_failure")
def _test_stop_instance(self):
task = instance_failure.StopInstanceTask(self.novaclient)
@ -130,6 +132,29 @@ class InstanceFailureTestCase(test.TestCase):
exception.SkipInstanceRecoveryException, task.execute,
self.ctxt, self.instance_id)
@mock.patch('masakari.compute.nova.novaclient')
def test_instance_failure_flow_not_ha_enabled_but_conf_option_is_set(
self, _mock_novaclient):
# Setting this config option to True indicates masakari has to recover
# the instance irrespective of whether it is HA_Enabled or not.
self.override_config("process_all_instances",
True, "instance_failure")
_mock_novaclient.return_value = self.fake_client
# create test data
self.fake_client.servers.create(self.instance_id,
host="fake-host", vm_state="resized")
# test StopInstanceTask
self._test_stop_instance()
# test StartInstanceTask
task = instance_failure.StartInstanceTask(self.novaclient)
task.execute(self.ctxt, self.instance_id)
# test ConfirmInstanceActiveTask
self._test_confirm_instance_is_active()
@mock.patch('masakari.compute.nova.novaclient')
def test_instance_failure_flow_start_failed(self, _mock_novaclient):
_mock_novaclient.return_value = self.fake_client

View File

@ -0,0 +1,23 @@
---
features:
- Added two new config options:
evacuate_all_instances:
Operators can decide whether all instances or only those instances
which contain metadata key 'HA_Enabled=True' should be allowed for
evacuation from a failed source compute node. When set to True, it will
evacuate all instances from a failed source compute node. First
preference will be given to those instances which contain
'HA_Enabled=True' metadata key, and then it will evacuate the remaining
ones. When set to False, it will evacuate only those instances which
contain 'HA_Enabled=True' metadata key.
process_all_instances:
Operators can decide whether all instances or only those instances
which contain metadata key 'HA_Enabled=True' should be taken into
account to recover from instance failure events. When set to True,
it will execute instance failure recovery actions for an instance
irrespective of whether that particular instance contains metadata key
'HA_Enabled=True' or not. When set to False, it will only execute
instance failure recovery actions for an instance which contains the
metadata key 'HA_Enabled=True'.