From 5c86a54d20f5034c06073617446e0b0bc2e1436d Mon Sep 17 00:00:00 2001
From: suzhengwei
Date: Fri, 30 Jun 2017 18:59:09 +0800
Subject: [PATCH] Workload balance based on CPU or RAM utilization

With the new input parameter "metrics", the strategy decides whether to
migrate a VM based on CPU or memory utilization.

Change-Id: I35cce3495c8dacad64ea6c6ee71082a85e9e0a83
---
 doc/source/strategies/workload_balance.rst    |  6 +-
 ...e-on-cpu-or-ram-util-3ff4ee968c32b2ed.yaml |  7 ++
 .../strategy/strategies/workload_balance.py   | 84 ++++++++++++-------
 .../model/ceilometer_metrics.py               | 16 ++++
 .../model/data/scenario_6_with_2_nodes.xml    |  8 +-
 .../decision_engine/model/gnocchi_metrics.py  | 16 ++++
 .../strategies/test_workload_balance.py       | 26 ++++--
 7 files changed, 121 insertions(+), 42 deletions(-)
 create mode 100644 releasenotes/notes/workload-balance-base-on-cpu-or-ram-util-3ff4ee968c32b2ed.yaml

diff --git a/doc/source/strategies/workload_balance.rst b/doc/source/strategies/workload_balance.rst
index ea09c6e8a..89703efcc 100644
--- a/doc/source/strategies/workload_balance.rst
+++ b/doc/source/strategies/workload_balance.rst
@@ -25,6 +25,7 @@ The *workload_balance* strategy requires the following metrics:
 metric                  service name plugins comment
 ======================= ============ ======= =======
 ``cpu_util``            ceilometer_  none
+``memory.resident``     ceilometer_  none
 ======================= ============ ======= =======
 
 .. _ceilometer: http://docs.openstack.org/admin-guide/telemetry-measurements.html#openstack-compute
@@ -66,6 +67,9 @@ Strategy parameters are:
 ============== ====== ============= ====================================
 parameter      type   default Value description
 ============== ====== ============= ====================================
+``metrics``    String 'cpu_util'    Workload balance based on cpu or ram
+                                    utilization. choice: ['cpu_util',
+                                    'memory.resident']
 ``threshold``  Number 25.0          Workload threshold for migration
 ``period``     Number 300           Aggregate time period of ceilometer
 ============== ====== ============= ====================================
@@ -90,7 +94,7 @@ How to use it ?
       at1 workload_balancing --strategy workload_balance
 
     $ openstack optimize audit create -a at1 -p threshold=26.0 \
-      -p period=310
+      -p period=310 -p metrics=cpu_util
 
 External Links
 --------------
diff --git a/releasenotes/notes/workload-balance-base-on-cpu-or-ram-util-3ff4ee968c32b2ed.yaml b/releasenotes/notes/workload-balance-base-on-cpu-or-ram-util-3ff4ee968c32b2ed.yaml
new file mode 100644
index 000000000..f9b10ed83
--- /dev/null
+++ b/releasenotes/notes/workload-balance-base-on-cpu-or-ram-util-3ff4ee968c32b2ed.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - The existing workload_balance strategy only considered
+    the CPU workload of VMs. This feature improves the
+    strategy. With the new input parameter "metrics", it
+    decides whether to migrate a VM based on CPU or memory
+    utilization.
\ No newline at end of file
diff --git a/watcher/decision_engine/strategy/strategies/workload_balance.py b/watcher/decision_engine/strategy/strategies/workload_balance.py
index 63e638c8c..fb3aa5b58 100644
--- a/watcher/decision_engine/strategy/strategies/workload_balance.py
+++ b/watcher/decision_engine/strategy/strategies/workload_balance.py
@@ -22,7 +22,7 @@
 *Description*
 
 This strategy migrates a VM based on the VM workload of the hosts.
-It makes decision to migrate a workload whenever a host's CPU
+It makes decision to migrate a workload whenever a host's CPU or RAM
 utilization % is higher than the specified threshold.
The VM to be moved should make the host close to average workload of all hosts nodes. @@ -32,7 +32,7 @@ hosts nodes. * Hardware: compute node should use the same physical CPUs * Software: Ceilometer component ceilometer-agent-compute running in each compute node, and Ceilometer API can - report such telemetry "cpu_util" successfully. + report such telemetry "cpu_util" and "memory.resident" successfully. * You must have at least 2 physical compute nodes to run this strategy. @@ -69,16 +69,16 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): It is a migration strategy based on the VM workload of physical servers. It generates solutions to move a workload whenever a server's - CPU utilization % is higher than the specified threshold. + CPU or RAM utilization % is higher than the specified threshold. The VM to be moved should make the host close to average workload of all compute nodes. *Requirements* - * Hardware: compute node should use the same physical CPUs + * Hardware: compute node should use the same physical CPUs/RAMs * Software: Ceilometer component ceilometer-agent-compute running in each compute node, and Ceilometer API can report such telemetry - "cpu_util" successfully. + "cpu_util" and "memory.resident" successfully. * You must have at least 2 physical compute nodes to run this strategy *Limitations* @@ -91,8 +91,12 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): """ # The meter to report CPU utilization % of VM in ceilometer - METER_NAME = "cpu_util" # Unit: %, value range is [0 , 100] + CPU_METER_NAME = "cpu_util" + + # The meter to report memory resident of VM in ceilometer + # Unit: MB + MEM_METER_NAME = "memory.resident" MIGRATION = "migrate" @@ -104,9 +108,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): :param osc: :py:class:`~.OpenStackClients` instance """ super(WorkloadBalance, self).__init__(config, osc) - # the migration plan will be triggered when the CPU utilization % - # reaches threshold - self._meter = self.METER_NAME + # the migration plan will be triggered when the CPU or RAM + # utilization % reaches threshold + self._meter = None self._ceilometer = None self._gnocchi = None @@ -151,6 +155,13 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): # Mandatory default setting for each element return { "properties": { + "metrics": { + "description": "Workload balance based on metrics: " + "cpu or ram utilization", + "type": "string", + "choice": ["cpu_util", "memory.resident"], + "default": "cpu_util" + }, "threshold": { "description": "workload threshold for migration", "type": "number", @@ -251,18 +262,21 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): cores_available = host.vcpus - cores_used disk_available = host.disk - disk_used mem_available = host.memory - mem_used - if ( - cores_available >= required_cores and - disk_available >= required_disk and + if (cores_available >= required_cores and mem_available >= required_mem and + disk_available >= required_disk): + if (self._meter == self.CPU_METER_NAME and ((src_instance_workload + workload) < - self.threshold / 100 * host.vcpus) - ): - destination_hosts.append(instance_data) + self.threshold / 100 * host.vcpus)): + destination_hosts.append(instance_data) + if (self._meter == self.MEM_METER_NAME and + ((src_instance_workload + workload) < + self.threshold / 100 * host.memory)): + destination_hosts.append(instance_data) return destination_hosts - def group_hosts_by_cpu_util(self): + def group_hosts_by_cpu_or_ram_util(self): 
"""Calculate the workloads of each node try to find out the nodes which have reached threshold @@ -286,10 +300,10 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): instances = self.compute_model.get_node_instances(node) node_workload = 0.0 for instance in instances: - cpu_util = None + instance_util = None try: if self.config.datasource == "ceilometer": - cpu_util = self.ceilometer.statistic_aggregation( + instance_util = self.ceilometer.statistic_aggregation( resource_id=instance.uuid, meter_name=self._meter, period=self._period, @@ -298,7 +312,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): stop_time = datetime.datetime.utcnow() start_time = stop_time - datetime.timedelta( seconds=int(self._period)) - cpu_util = self.gnocchi.statistic_aggregation( + instance_util = self.gnocchi.statistic_aggregation( resource_id=instance.uuid, metric=self._meter, granularity=self.granularity, @@ -308,23 +322,32 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): ) except Exception as exc: LOG.exception(exc) - LOG.error("Can not get cpu_util from %s", + LOG.error("Can not get %s from %s", self._meter, self.config.datasource) continue - if cpu_util is None: - LOG.debug("Instance (%s): cpu_util is None", instance.uuid) + if instance_util is None: + LOG.debug("Instance (%s): %s is None", + instance.uuid, self._meter) continue - workload_cache[instance.uuid] = cpu_util * instance.vcpus / 100 + if self._meter == self.CPU_METER_NAME: + workload_cache[instance.uuid] = (instance_util * + instance.vcpus / 100) + else: + workload_cache[instance.uuid] = instance_util node_workload += workload_cache[instance.uuid] - LOG.debug("VM (%s): cpu_util %f", instance.uuid, cpu_util) - node_cpu_util = node_workload / node.vcpus * 100 + LOG.debug("VM (%s): %s %f", instance.uuid, self._meter, + instance_util) cluster_workload += node_workload + if self._meter == self.CPU_METER_NAME: + node_util = node_workload / node.vcpus * 100 + else: + node_util = node_workload / node.memory * 100 instance_data = { - 'node': node, "cpu_util": node_cpu_util, + 'node': node, self._meter: node_util, 'workload': node_workload} - if node_cpu_util >= self.threshold: + if node_util >= self.threshold: # mark the node to release resources overload_hosts.append(instance_data) else: @@ -356,8 +379,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): """ self.threshold = self.input_parameters.threshold self._period = self.input_parameters.period + self._meter = self.input_parameters.metrics source_nodes, target_nodes, avg_workload, workload_cache = ( - self.group_hosts_by_cpu_util()) + self.group_hosts_by_cpu_or_ram_util()) if not source_nodes: LOG.debug("No hosts require optimization") @@ -373,7 +397,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): # choose the server with largest cpu_util source_nodes = sorted(source_nodes, reverse=True, - key=lambda x: (x[self.METER_NAME])) + key=lambda x: (x[self._meter])) instance_to_migrate = self.choose_instance_to_migrate( source_nodes, avg_workload, workload_cache) @@ -391,7 +415,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): "be because of there's no enough CPU/Memory/DISK") return self.solution destination_hosts = sorted(destination_hosts, - key=lambda x: (x["cpu_util"])) + key=lambda x: (x[self._meter])) # always use the host with lowerest CPU utilization mig_destination_node = destination_hosts[0]['node'] # generate solution to migrate the instance to the dest server, diff --git 
a/watcher/tests/decision_engine/model/ceilometer_metrics.py b/watcher/tests/decision_engine/model/ceilometer_metrics.py index 9c5d33660..1103fb41f 100644 --- a/watcher/tests/decision_engine/model/ceilometer_metrics.py +++ b/watcher/tests/decision_engine/model/ceilometer_metrics.py @@ -54,6 +54,8 @@ class FakeCeilometerMetrics(object): result = 0.0 if meter_name == "cpu_util": result = self.get_average_usage_instance_cpu_wb(resource_id) + elif meter_name == "memory.resident": + result = self.get_average_usage_instance_memory_wb(resource_id) return result def mock_get_statistics_nn(self, resource_id, meter_name, period, @@ -211,6 +213,20 @@ class FakeCeilometerMetrics(object): mock['INSTANCE_4'] = 10 return float(mock[str(uuid)]) + @staticmethod + def get_average_usage_instance_memory_wb(uuid): + mock = {} + # node 0 + mock['INSTANCE_1'] = 30 + # node 1 + mock['INSTANCE_3'] = 12 + mock['INSTANCE_4'] = 12 + if uuid not in mock.keys(): + # mock[uuid] = random.randint(1, 4) + mock[uuid] = 12 + + return mock[str(uuid)] + @staticmethod def get_average_usage_instance_cpu(uuid): """The last VM CPU usage values to average diff --git a/watcher/tests/decision_engine/model/data/scenario_6_with_2_nodes.xml b/watcher/tests/decision_engine/model/data/scenario_6_with_2_nodes.xml index c12eabaa5..636827c63 100644 --- a/watcher/tests/decision_engine/model/data/scenario_6_with_2_nodes.xml +++ b/watcher/tests/decision_engine/model/data/scenario_6_with_2_nodes.xml @@ -1,10 +1,10 @@ - - + + - - + + diff --git a/watcher/tests/decision_engine/model/gnocchi_metrics.py b/watcher/tests/decision_engine/model/gnocchi_metrics.py index 982bcac6c..8c0a75e4b 100644 --- a/watcher/tests/decision_engine/model/gnocchi_metrics.py +++ b/watcher/tests/decision_engine/model/gnocchi_metrics.py @@ -50,6 +50,8 @@ class FakeGnocchiMetrics(object): result = 0.0 if metric == "cpu_util": result = self.get_average_usage_instance_cpu_wb(resource_id) + elif metric == "memory.resident": + result = self.get_average_usage_instance_memory_wb(resource_id) return result @staticmethod @@ -242,3 +244,17 @@ class FakeGnocchiMetrics(object): mock['INSTANCE_3'] = 20 mock['INSTANCE_4'] = 10 return float(mock[str(uuid)]) + + @staticmethod + def get_average_usage_instance_memory_wb(uuid): + mock = {} + # node 0 + mock['INSTANCE_1'] = 30 + # node 1 + mock['INSTANCE_3'] = 12 + mock['INSTANCE_4'] = 12 + if uuid not in mock.keys(): + # mock[uuid] = random.randint(1, 4) + mock[uuid] = 12 + + return mock[str(uuid)] diff --git a/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py b/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py index 36e06e641..68efa4cb2 100644 --- a/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py +++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py @@ -74,10 +74,12 @@ class TestWorkloadBalance(base.TestCase): self.strategy = strategies.WorkloadBalance( config=mock.Mock(datasource=self.datasource)) self.strategy.input_parameters = utils.Struct() - self.strategy.input_parameters.update({'threshold': 25.0, + self.strategy.input_parameters.update({'metrics': 'cpu_util', + 'threshold': 25.0, 'period': 300}) self.strategy.threshold = 25.0 self.strategy._period = 300 + self.strategy._meter = "cpu_util" def test_calc_used_resource(self): model = self.fake_cluster.generate_scenario_6_with_2_nodes() @@ -86,21 +88,31 @@ class TestWorkloadBalance(base.TestCase): cores_used, mem_used, disk_used = ( self.strategy.calculate_used_resource(node)) - 
self.assertEqual((cores_used, mem_used, disk_used), (20, 4, 40)) + self.assertEqual((cores_used, mem_used, disk_used), (20, 64, 40)) def test_group_hosts_by_cpu_util(self): model = self.fake_cluster.generate_scenario_6_with_2_nodes() self.m_model.return_value = model self.strategy.threshold = 30 - n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util() + n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() self.assertEqual(n1[0]['node'].uuid, 'Node_0') self.assertEqual(n2[0]['node'].uuid, 'Node_1') self.assertEqual(avg, 8.0) + def test_group_hosts_by_ram_util(self): + model = self.fake_cluster.generate_scenario_6_with_2_nodes() + self.m_model.return_value = model + self.strategy._meter = "memory.resident" + self.strategy.threshold = 30 + n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() + self.assertEqual(n1[0]['node'].uuid, 'Node_0') + self.assertEqual(n2[0]['node'].uuid, 'Node_1') + self.assertEqual(avg, 33.0) + def test_choose_instance_to_migrate(self): model = self.fake_cluster.generate_scenario_6_with_2_nodes() self.m_model.return_value = model - n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util() + n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() instance_to_mig = self.strategy.choose_instance_to_migrate( n1, avg, w_map) self.assertEqual(instance_to_mig[0].uuid, 'Node_0') @@ -110,7 +122,7 @@ class TestWorkloadBalance(base.TestCase): def test_choose_instance_notfound(self): model = self.fake_cluster.generate_scenario_6_with_2_nodes() self.m_model.return_value = model - n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util() + n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() instances = model.get_all_instances() [model.remove_instance(inst) for inst in instances.values()] instance_to_mig = self.strategy.choose_instance_to_migrate( @@ -122,7 +134,7 @@ class TestWorkloadBalance(base.TestCase): self.m_model.return_value = model self.strategy.datasource = mock.MagicMock( statistic_aggregation=self.fake_metrics.mock_get_statistics_wb) - n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util() + n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() instance_to_mig = self.strategy.choose_instance_to_migrate( n1, avg, w_map) dest_hosts = self.strategy.filter_destination_hosts( @@ -202,7 +214,7 @@ class TestWorkloadBalance(base.TestCase): m_gnocchi.statistic_aggregation = mock.Mock( side_effect=self.fake_metrics.mock_get_statistics_wb) instance0 = model.get_instance_by_uuid("INSTANCE_0") - self.strategy.group_hosts_by_cpu_util() + self.strategy.group_hosts_by_cpu_or_ram_util() if self.strategy.config.datasource == "ceilometer": m_ceilometer.statistic_aggregation.assert_any_call( aggregate='avg', meter_name='cpu_util',
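
The core of this change is how group_hosts_by_cpu_or_ram_util() normalizes
each instance's workload for the selected meter before comparing a node
against the threshold: "cpu_util" is a percentage, so it is converted into
consumed vCPUs (cpu_util * vcpus / 100), while "memory.resident" is already
an absolute amount in MB; the node total is then divided by the node's vCPU
count or memory respectively. Below is a minimal standalone sketch of that
normalization; the classes and numbers are illustrative stand-ins, not
Watcher's actual compute model.

    # Minimal illustrative sketch of the per-meter workload normalization.
    # FakeInstance/FakeNode are stand-ins for Watcher's compute model objects.
    from dataclasses import dataclass
    from typing import List

    CPU_METER = "cpu_util"          # reported as %, range [0, 100]
    MEM_METER = "memory.resident"   # reported in MB


    @dataclass
    class FakeInstance:
        vcpus: int
        util: float   # datasource value for the chosen meter


    @dataclass
    class FakeNode:
        vcpus: int
        memory: int   # MB
        instances: List[FakeInstance]


    def node_utilization(node: FakeNode, meter: str) -> float:
        """Return the node's utilization % for the chosen meter."""
        workload = 0.0
        for inst in node.instances:
            if meter == CPU_METER:
                # cpu_util % -> number of vCPUs actually consumed
                workload += inst.util * inst.vcpus / 100
            else:
                # memory.resident is already an absolute amount in MB
                workload += inst.util
        capacity = node.vcpus if meter == CPU_METER else node.memory
        return workload / capacity * 100


    node = FakeNode(vcpus=40, memory=132,
                    instances=[FakeInstance(vcpus=10, util=30.0)])
    print(node_utilization(node, CPU_METER))   # (30 * 10 / 100) / 40 * 100 = 7.5
    print(node_utilization(node, MEM_METER))   # 30 / 132 * 100 ~= 22.7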