diff --git a/releasenotes/notes/host-maintenance-strategy-41f640927948fb56.yaml b/releasenotes/notes/host-maintenance-strategy-41f640927948fb56.yaml new file mode 100644 index 000000000..3131db9fa --- /dev/null +++ b/releasenotes/notes/host-maintenance-strategy-41f640927948fb56.yaml @@ -0,0 +1,9 @@ +--- +features: + - | + Added a strategy for one compute node maintenance, + without having the user's application being interrupted. + If given one backup node, the strategy will firstly + migrate all instances from the maintenance node to + the backup node. If the backup node is not provided, + it will migrate all instances, relying on nova-scheduler. diff --git a/setup.cfg b/setup.cfg index 51424727d..7239c2293 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,6 +58,7 @@ watcher_goals = noisy_neighbor = watcher.decision_engine.goal.goals:NoisyNeighborOptimization saving_energy = watcher.decision_engine.goal.goals:SavingEnergy hardware_maintenance = watcher.decision_engine.goal.goals:HardwareMaintenance + cluster_maintaining = watcher.decision_engine.goal.goals:ClusterMaintaining watcher_scoring_engines = dummy_scorer = watcher.decision_engine.scoring.dummy_scorer:DummyScorer @@ -80,6 +81,7 @@ watcher_strategies = noisy_neighbor = watcher.decision_engine.strategy.strategies.noisy_neighbor:NoisyNeighbor storage_capacity_balance = watcher.decision_engine.strategy.strategies.storage_capacity_balance:StorageCapacityBalance zone_migration = watcher.decision_engine.strategy.strategies.zone_migration:ZoneMigration + host_maintenance = watcher.decision_engine.strategy.strategies.host_maintenance:HostMaintenance watcher_actions = migrate = watcher.applier.actions.migration:Migrate diff --git a/watcher/decision_engine/goal/goals.py b/watcher/decision_engine/goal/goals.py index 965d69c9e..2a8748035 100644 --- a/watcher/decision_engine/goal/goals.py +++ b/watcher/decision_engine/goal/goals.py @@ -241,3 +241,28 @@ class HardwareMaintenance(base.Goal): def 
get_efficacy_specification(cls): """The efficacy spec for the current goal""" return specs.HardwareMaintenance() + + +class ClusterMaintaining(base.Goal): + """ClusterMaintenance + + This goal is used to maintain compute nodes + without having the user's application being interrupted. + """ + + @classmethod + def get_name(cls): + return "cluster_maintaining" + + @classmethod + def get_display_name(cls): + return _("Cluster Maintaining") + + @classmethod + def get_translatable_display_name(cls): + return "Cluster Maintaining" + + @classmethod + def get_efficacy_specification(cls): + """The efficacy spec for the current goal""" + return specs.Unclassified() diff --git a/watcher/decision_engine/strategy/strategies/__init__.py b/watcher/decision_engine/strategy/strategies/__init__.py index fc3a20c8e..c48529543 100644 --- a/watcher/decision_engine/strategy/strategies/__init__.py +++ b/watcher/decision_engine/strategy/strategies/__init__.py @@ -18,6 +18,7 @@ from watcher.decision_engine.strategy.strategies import actuation from watcher.decision_engine.strategy.strategies import basic_consolidation from watcher.decision_engine.strategy.strategies import dummy_strategy from watcher.decision_engine.strategy.strategies import dummy_with_scorer +from watcher.decision_engine.strategy.strategies import host_maintenance from watcher.decision_engine.strategy.strategies import noisy_neighbor from watcher.decision_engine.strategy.strategies import outlet_temp_control from watcher.decision_engine.strategy.strategies import saving_energy @@ -44,9 +45,10 @@ WorkloadStabilization = workload_stabilization.WorkloadStabilization UniformAirflow = uniform_airflow.UniformAirflow NoisyNeighbor = noisy_neighbor.NoisyNeighbor ZoneMigration = zone_migration.ZoneMigration +HostMaintenance = host_maintenance.HostMaintenance __all__ = ("Actuator", "BasicConsolidation", "OutletTempControl", "DummyStrategy", "DummyWithScorer", "VMWorkloadConsolidation", "WorkloadBalance", "WorkloadStabilization", 
"UniformAirflow", "NoisyNeighbor", "SavingEnergy", "StorageCapacityBalance", - "ZoneMigration") + "ZoneMigration", "HostMaintenance") diff --git a/watcher/decision_engine/strategy/strategies/base.py b/watcher/decision_engine/strategy/strategies/base.py old mode 100644 new mode 100755 index 93f735006..cad186a3d --- a/watcher/decision_engine/strategy/strategies/base.py +++ b/watcher/decision_engine/strategy/strategies/base.py @@ -471,3 +471,13 @@ class ZoneMigrationBaseStrategy(BaseStrategy): @classmethod def get_goal_name(cls): return "hardware_maintenance" + + +@six.add_metaclass(abc.ABCMeta) +class HostMaintenanceBaseStrategy(BaseStrategy): + + REASON_FOR_MAINTAINING = 'watcher_maintaining' + + @classmethod + def get_goal_name(cls): + return "cluster_maintaining" diff --git a/watcher/decision_engine/strategy/strategies/host_maintenance.py b/watcher/decision_engine/strategy/strategies/host_maintenance.py new file mode 100644 index 000000000..5693ac84c --- /dev/null +++ b/watcher/decision_engine/strategy/strategies/host_maintenance.py @@ -0,0 +1,331 @@ +# -*- encoding: utf-8 -*- +# Copyright (c) 2017 chinac.com +# +# Authors: suzhengwei +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from oslo_log import log +import six + +from watcher._i18n import _ +from watcher.common import exception as wexc +from watcher.decision_engine.model import element +from watcher.decision_engine.strategy.strategies import base + +LOG = log.getLogger(__name__) + + +class HostMaintenance(base.HostMaintenanceBaseStrategy): + """[PoC]Host Maintenance + + *Description* + + It is a migration strategy for one compute node maintenance, + without having the user's application being interrupted. + If given one backup node, the strategy will firstly + migrate all instances from the maintenance node to + the backup node. If the backup node is not provided, + it will migrate all instances, relying on nova-scheduler. + + *Requirements* + + * You must have at least 2 physical compute nodes to run this strategy. + + *Limitations* + + - This is a proof of concept that is not meant to be used in production + - It migrates all instances from one host to other hosts. It's better to + execute such strategy when load is not heavy, and use this algorithm + with `ONESHOT` audit. 
+ - It assume that cold and live migrations are possible + """ + + INSTANCE_MIGRATION = "migrate" + CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state" + REASON_FOR_DISABLE = 'watcher_disabled' + + def __init__(self, config, osc=None): + super(HostMaintenance, self).__init__(config, osc) + + @classmethod + def get_name(cls): + return "host_maintenance" + + @classmethod + def get_display_name(cls): + return _("Host Maintenance Strategy") + + @classmethod + def get_translatable_display_name(cls): + return "Host Maintenance Strategy" + + @classmethod + def get_schema(cls): + return { + "properties": { + "maintenance_node": { + "description": "The name of the compute node which " + "need maintenance", + "type": "string", + }, + "backup_node": { + "description": "The name of the compute node which " + "will backup the maintenance node.", + "type": "string", + }, + }, + "required": ["maintenance_node"], + } + + def get_disabled_compute_nodes_with_reason(self, reason=None): + return {uuid: cn for uuid, cn in + self.compute_model.get_all_compute_nodes().items() + if cn.state == element.ServiceState.ONLINE.value and + cn.status == element.ServiceState.DISABLED.value and + cn.disabled_reason == reason} + + def get_disabled_compute_nodes(self): + return self.get_disabled_compute_nodes_with_reason( + self.REASON_FOR_DISABLE) + + def get_instance_state_str(self, instance): + """Get instance state in string format""" + if isinstance(instance.state, six.string_types): + return instance.state + elif isinstance(instance.state, element.InstanceState): + return instance.state.value + else: + LOG.error('Unexpected instance state type, ' + 'state=%(state)s, state_type=%(st)s.', + dict(state=instance.state, + st=type(instance.state))) + raise wexc.WatcherException + + def get_node_status_str(self, node): + """Get node status in string format""" + if isinstance(node.status, six.string_types): + return node.status + elif isinstance(node.status, element.ServiceState): + return 
node.status.value + else: + LOG.error('Unexpected node status type, ' + 'status=%(status)s, status_type=%(st)s.', + dict(status=node.status, + st=type(node.status))) + raise wexc.WatcherException + + def get_node_capacity(self, node): + """Collect cpu, ram and disk capacity of a node. + + :param node: node object + :return: dict(cpu(cores), ram(MB), disk(B)) + """ + return dict(cpu=node.vcpus, + ram=node.memory, + disk=node.disk_capacity) + + def get_node_used(self, node): + """Collect cpu, ram and disk used of a node. + + :param node: node object + :return: dict(cpu(cores), ram(MB), disk(B)) + """ + vcpus_used = 0 + memory_used = 0 + disk_used = 0 + for instance in self.compute_model.get_node_instances(node): + vcpus_used += instance.vcpus + memory_used += instance.memory + disk_used += instance.disk + + return dict(cpu=vcpus_used, + ram=memory_used, + disk=disk_used) + + def get_node_free(self, node): + """Collect cpu, ram and disk free of a node. + + :param node: node object + :return: dict(cpu(cores), ram(MB), disk(B)) + """ + node_capacity = self.get_node_capacity(node) + node_used = self.get_node_used(node) + return dict(cpu=node_capacity['cpu']-node_used['cpu'], + ram=node_capacity['ram']-node_used['ram'], + disk=node_capacity['disk']-node_used['disk'], + ) + + def host_fits(self, source_node, destination_node): + """check host fits + + return True if VMs could intensively migrate + from source_node to destination_node. 
+ """ + + source_node_used = self.get_node_used(source_node) + destination_node_free = self.get_node_free(destination_node) + metrics = ['cpu', 'ram'] + for m in metrics: + if source_node_used[m] > destination_node_free[m]: + return False + return True + + def add_action_enable_compute_node(self, node): + """Add an action for node enabler into the solution.""" + params = {'state': element.ServiceState.ENABLED.value} + self.solution.add_action( + action_type=self.CHANGE_NOVA_SERVICE_STATE, + resource_id=node.uuid, + input_parameters=params) + + def add_action_maintain_compute_node(self, node): + """Add an action for node maintenance into the solution.""" + params = {'state': element.ServiceState.DISABLED.value, + 'disabled_reason': self.REASON_FOR_MAINTAINING} + self.solution.add_action( + action_type=self.CHANGE_NOVA_SERVICE_STATE, + resource_id=node.uuid, + input_parameters=params) + + def enable_compute_node_if_disabled(self, node): + node_status_str = self.get_node_status_str(node) + if node_status_str != element.ServiceState.ENABLED.value: + self.add_action_enable_compute_node(node) + + def instance_migration(self, instance, src_node, des_node=None): + """Add an action for instance migration into the solution. + + :param instance: instance object + :param src_node: node object + :param des_node: node object. 
if None, the instance will be + migrated relying on nova-scheduler + :return: None + """ + instance_state_str = self.get_instance_state_str(instance) + if instance_state_str == element.InstanceState.ACTIVE.value: + migration_type = 'live' + else: + migration_type = 'cold' + + params = {'migration_type': migration_type, + 'source_node': src_node.uuid} + if des_node: + params['destination_node'] = des_node.uuid + self.solution.add_action(action_type=self.INSTANCE_MIGRATION, + resource_id=instance.uuid, + input_parameters=params) + + def host_migration(self, source_node, destination_node): + """host migration + + Migrate all instances from source_node to destination_node. + Active instances use "live-migrate", + and other instances use "cold-migrate" + """ + instances = self.compute_model.get_node_instances(source_node) + for instance in instances: + self.instance_migration(instance, source_node, destination_node) + + def safe_maintain(self, maintenance_node, backup_node=None): + """safe maintain one compute node + + Migrate all instances of the maintenance_node intensively to the + backup host. If users didn't give the backup host, it will select + one unused node to backup the maintaining node. + + It calculates the resources of both the backup node and maintaining + node to evaluate the migrations from maintaining node to backup node. + If all instances of the maintaining node can be migrated to + the backup node, it will set the maintaining node in + 'watcher_maintaining' status, and add the migrations to solution. + """ + # If user gives a backup node with required capacity, then migrate + # all instances from the maintaining node to the backup node. 
+ if backup_node: + if self.host_fits(maintenance_node, backup_node): + self.enable_compute_node_if_disabled(backup_node) + self.add_action_maintain_compute_node(maintenance_node) + self.host_migration(maintenance_node, backup_node) + return True + + # If users didn't give the backup host, select one unused node + # with required capacity, then migrate all instances + # from maintaining node to it. + nodes = sorted( + self.get_disabled_compute_nodes().values(), + key=lambda x: self.get_node_capacity(x)['cpu']) + if maintenance_node in nodes: + nodes.remove(maintenance_node) + + for node in nodes: + if self.host_fits(maintenance_node, node): + self.enable_compute_node_if_disabled(node) + self.add_action_maintain_compute_node(maintenance_node) + self.host_migration(maintenance_node, node) + return True + + return False + + def try_maintain(self, maintenance_node): + """try to maintain one compute node + + It firstly sets the maintenance_node in 'watcher_maintaining' status. + Then it tries to migrate all instances of the maintenance node, relying + on nova-scheduler. 
+ """ + self.add_action_maintain_compute_node(maintenance_node) + instances = self.compute_model.get_node_instances(maintenance_node) + for instance in instances: + self.instance_migration(instance, maintenance_node) + + def pre_execute(self): + LOG.debug(self.compute_model.to_string()) + + if not self.compute_model: + raise wexc.ClusterStateNotDefined() + + if self.compute_model.stale: + raise wexc.ClusterStateStale() + + def do_execute(self): + LOG.info(_('Executing Host Maintenance Migration Strategy')) + + maintenance_node = self.input_parameters.get('maintenance_node') + backup_node = self.input_parameters.get('backup_node') + + # if no VMs in the maintenance_node, just maintain the compute node + src_node = self.compute_model.get_node_by_uuid(maintenance_node) + if len(self.compute_model.get_node_instances(src_node)) == 0: + if (src_node.disabled_reason != + self.REASON_FOR_MAINTAINING): + self.add_action_maintain_compute_node(src_node) + return + + if backup_node: + des_node = self.compute_model.get_node_by_uuid(backup_node) + else: + des_node = None + + if not self.safe_maintain(src_node, des_node): + self.try_maintain(src_node) + + def post_execute(self): + """Post-execution phase + + This can be used to compute the global efficacy + """ + LOG.debug(self.solution.actions) + LOG.debug(self.compute_model.to_string()) diff --git a/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py b/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py new file mode 100755 index 000000000..d1c21e354 --- /dev/null +++ b/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py @@ -0,0 +1,206 @@ +# -*- encoding: utf-8 -*- +# Copyright (c) 2017 chinac.com +# +# Authors: suzhengwei +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import mock + +from watcher.common import exception +from watcher.decision_engine.model import model_root +from watcher.decision_engine.strategy import strategies +from watcher.tests import base +from watcher.tests.decision_engine.model import faker_cluster_state + + +class TestHostMaintenance(base.TestCase): + + def setUp(self): + super(TestHostMaintenance, self).setUp() + + # fake cluster + self.fake_cluster = faker_cluster_state.FakerModelCollector() + + p_model = mock.patch.object( + strategies.HostMaintenance, "compute_model", + new_callable=mock.PropertyMock) + self.m_model = p_model.start() + self.addCleanup(p_model.stop) + + p_audit_scope = mock.patch.object( + strategies.HostMaintenance, "audit_scope", + new_callable=mock.PropertyMock + ) + self.m_audit_scope = p_audit_scope.start() + self.addCleanup(p_audit_scope.stop) + + self.m_audit_scope.return_value = mock.Mock() + + self.m_model.return_value = model_root.ModelRoot() + self.strategy = strategies.HostMaintenance(config=mock.Mock()) + + def test_exception_stale_cdm(self): + self.fake_cluster.set_cluster_data_model_as_stale() + self.m_model.return_value = self.fake_cluster.cluster_data_model + + self.assertRaises( + exception.ClusterStateNotDefined, + self.strategy.execute) + + def test_get_node_capacity(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid("Node_0") + node_capacity = dict(cpu=40, ram=132, disk=250) + self.assertEqual(node_capacity, + self.strategy.get_node_capacity(node_0)) + + def 
test_get_node_used(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid("Node_0") + node_used = dict(cpu=20, ram=4, disk=40) + self.assertEqual(node_used, + self.strategy.get_node_used(node_0)) + + def test_get_node_free(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid("Node_0") + node_free = dict(cpu=20, ram=128, disk=210) + self.assertEqual(node_free, + self.strategy.get_node_free(node_0)) + + def test_host_fits(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid("Node_0") + node_1 = model.get_node_by_uuid("Node_1") + self.assertTrue(self.strategy.host_fits(node_0, node_1)) + + def test_add_action_enable_compute_node(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid('Node_0') + self.strategy.add_action_enable_compute_node(node_0) + expected = [{'action_type': 'change_nova_service_state', + 'input_parameters': { + 'state': 'enabled', + 'resource_id': 'Node_0'}}] + self.assertEqual(expected, self.strategy.solution.actions) + + def test_add_action_maintain_compute_node(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid('Node_0') + self.strategy.add_action_maintain_compute_node(node_0) + expected = [{'action_type': 'change_nova_service_state', + 'input_parameters': { + 'state': 'disabled', + 'disabled_reason': 'watcher_maintaining', + 'resource_id': 'Node_0'}}] + self.assertEqual(expected, self.strategy.solution.actions) + + def test_instance_migration(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid('Node_0') + node_1 = model.get_node_by_uuid('Node_1') + instance_0 = model.get_instance_by_uuid("INSTANCE_0") + 
self.strategy.instance_migration(instance_0, node_0, node_1) + self.assertEqual(1, len(self.strategy.solution.actions)) + expected = [{'action_type': 'migrate', + 'input_parameters': {'destination_node': node_1.uuid, + 'source_node': node_0.uuid, + 'migration_type': 'live', + 'resource_id': instance_0.uuid}}] + self.assertEqual(expected, self.strategy.solution.actions) + + def test_instance_migration_without_dest_node(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid('Node_0') + instance_0 = model.get_instance_by_uuid("INSTANCE_0") + self.strategy.instance_migration(instance_0, node_0) + self.assertEqual(1, len(self.strategy.solution.actions)) + expected = [{'action_type': 'migrate', + 'input_parameters': {'source_node': node_0.uuid, + 'migration_type': 'live', + 'resource_id': instance_0.uuid}}] + self.assertEqual(expected, self.strategy.solution.actions) + + def test_host_migration(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = model.get_node_by_uuid('Node_0') + node_1 = model.get_node_by_uuid('Node_1') + instance_0 = model.get_instance_by_uuid("INSTANCE_0") + instance_1 = model.get_instance_by_uuid("INSTANCE_1") + self.strategy.host_migration(node_0, node_1) + self.assertEqual(2, len(self.strategy.solution.actions)) + expected = [{'action_type': 'migrate', + 'input_parameters': {'destination_node': node_1.uuid, + 'source_node': node_0.uuid, + 'migration_type': 'live', + 'resource_id': instance_0.uuid}}, + {'action_type': 'migrate', + 'input_parameters': {'destination_node': node_1.uuid, + 'source_node': node_0.uuid, + 'migration_type': 'live', + 'resource_id': instance_1.uuid}}] + self.assertIn(expected[0], self.strategy.solution.actions) + self.assertIn(expected[1], self.strategy.solution.actions) + + def test_safe_maintain(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_0 = 
model.get_node_by_uuid('Node_0') + node_1 = model.get_node_by_uuid('Node_1') + self.assertFalse(self.strategy.safe_maintain(node_0)) + self.assertFalse(self.strategy.safe_maintain(node_1)) + + def test_try_maintain(self): + model = self.fake_cluster.generate_scenario_1() + self.m_model.return_value = model + node_1 = model.get_node_by_uuid('Node_1') + self.strategy.try_maintain(node_1) + self.assertEqual(2, len(self.strategy.solution.actions)) + + def test_strategy(self): + model = self.fake_cluster. \ + generate_scenario_9_with_3_active_plus_1_disabled_nodes() + self.m_model.return_value = model + node_2 = model.get_node_by_uuid('Node_2') + node_3 = model.get_node_by_uuid('Node_3') + instance_4 = model.get_instance_by_uuid("INSTANCE_4") + if not self.strategy.safe_maintain(node_2, node_3): + self.strategy.try_maintain(node_2) + expected = [{'action_type': 'change_nova_service_state', + 'input_parameters': { + 'resource_id': 'Node_3', + 'state': 'enabled'}}, + {'action_type': 'change_nova_service_state', + 'input_parameters': { + 'resource_id': 'Node_2', + 'state': 'disabled', + 'disabled_reason': 'watcher_maintaining'}}, + {'action_type': 'migrate', + 'input_parameters': { + 'destination_node': node_3.uuid, + 'source_node': node_2.uuid, + 'migration_type': 'live', + 'resource_id': instance_4.uuid}}] + self.assertEqual(expected, self.strategy.solution.actions)