Merge "workload balance base on cpu or ram util"

This commit is contained in:
Jenkins 2017-08-15 08:37:58 +00:00 committed by Gerrit Code Review
commit e5c3df0c2f
7 changed files with 121 additions and 42 deletions

View File

@ -25,6 +25,7 @@ The *workload_balance* strategy requires the following metrics:
metric service name plugins comment
======================= ============ ======= =======
``cpu_util`` ceilometer_ none
``memory.resident`` ceilometer_ none
======================= ============ ======= =======
.. _ceilometer: http://docs.openstack.org/admin-guide/telemetry-measurements.html#openstack-compute
@ -66,6 +67,9 @@ Strategy parameters are:
============== ====== ============= ====================================
parameter type default value description
============== ====== ============= ====================================
``metrics`` String 'cpu_util' Workload balance based on cpu or ram
utilization. choice: ['cpu_util',
'memory.resident']
``threshold`` Number 25.0 Workload threshold for migration
``period`` Number 300 Aggregate time period of ceilometer
============== ====== ============= ====================================
@ -90,7 +94,7 @@ How to use it ?
at1 workload_balancing --strategy workload_balance
$ openstack optimize audit create -a at1 -p threshold=26.0 \
-p period=310
-p period=310 -p metrics=cpu_util
External Links
--------------

View File

@ -0,0 +1,7 @@
---
features:
- The existing workload_balance strategy made migration decisions based
only on the VM CPU workload. This feature improves the strategy: via the
new input parameter "metrics", it can decide to migrate a VM based on
either CPU or memory utilization.

View File

@ -22,7 +22,7 @@
*Description*
This strategy migrates a VM based on the VM workload of the hosts.
It makes a decision to migrate a workload whenever a host's CPU
It makes a decision to migrate a workload whenever a host's CPU or RAM
utilization % is higher than the specified threshold. The VM to
be moved should bring the host close to the average workload of all
host nodes.
@ -32,7 +32,7 @@ hosts nodes.
* Hardware: compute node should use the same physical CPUs
* Software: Ceilometer component ceilometer-agent-compute
running in each compute node, and Ceilometer API can
report such telemetry "cpu_util" successfully.
report such telemetry "cpu_util" and "memory.resident" successfully.
* You must have at least 2 physical compute nodes to run
this strategy.
@ -69,16 +69,16 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
It is a migration strategy based on the VM workload of physical
servers. It generates solutions to move a workload whenever a server's
CPU utilization % is higher than the specified threshold.
CPU or RAM utilization % is higher than the specified threshold.
The VM to be moved should bring the host close to the average workload
of all compute nodes.
*Requirements*
* Hardware: compute node should use the same physical CPUs
* Hardware: compute node should use the same physical CPUs/RAMs
* Software: Ceilometer component ceilometer-agent-compute running
in each compute node, and Ceilometer API can report such telemetry
"cpu_util" successfully.
"cpu_util" and "memory.resident" successfully.
* You must have at least 2 physical compute nodes to run this strategy
*Limitations*
@ -91,8 +91,12 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
"""
# The meter to report CPU utilization % of VM in ceilometer
METER_NAME = "cpu_util"
# Unit: %, value range is [0 , 100]
CPU_METER_NAME = "cpu_util"
# The meter to report memory resident of VM in ceilometer
# Unit: MB
MEM_METER_NAME = "memory.resident"
MIGRATION = "migrate"
@ -104,9 +108,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
:param osc: :py:class:`~.OpenStackClients` instance
"""
super(WorkloadBalance, self).__init__(config, osc)
# the migration plan will be triggered when the CPU utilization %
# reaches threshold
self._meter = self.METER_NAME
# the migration plan will be triggered when the CPU or RAM
# utilization % reaches threshold
self._meter = None
self._ceilometer = None
self._gnocchi = None
@ -151,6 +155,13 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
# Mandatory default setting for each element
return {
"properties": {
"metrics": {
"description": "Workload balance based on metrics: "
"cpu or ram utilization",
"type": "string",
"choice": ["cpu_util", "memory.resident"],
"default": "cpu_util"
},
"threshold": {
"description": "workload threshold for migration",
"type": "number",
@ -251,18 +262,21 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
cores_available = host.vcpus - cores_used
disk_available = host.disk - disk_used
mem_available = host.memory - mem_used
if (
cores_available >= required_cores and
disk_available >= required_disk and
if (cores_available >= required_cores and
mem_available >= required_mem and
disk_available >= required_disk):
if (self._meter == self.CPU_METER_NAME and
((src_instance_workload + workload) <
self.threshold / 100 * host.vcpus)
):
destination_hosts.append(instance_data)
self.threshold / 100 * host.vcpus)):
destination_hosts.append(instance_data)
if (self._meter == self.MEM_METER_NAME and
((src_instance_workload + workload) <
self.threshold / 100 * host.memory)):
destination_hosts.append(instance_data)
return destination_hosts
def group_hosts_by_cpu_util(self):
def group_hosts_by_cpu_or_ram_util(self):
"""Calculate the workloads of each node
try to find out the nodes which have reached threshold
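As a self-contained sketch of the destination check added in the hunk above (names are illustrative, not the actual Watcher method): a host only qualifies as a migration target if, after taking on the instance's workload, it stays below the threshold for the selected meter.

# Illustrative sketch, not the Watcher implementation: the destination
# check expressed as a standalone function. Workloads are in "used vCPUs"
# for cpu_util and in MB for memory.resident.
def fits_destination(meter, host_workload, instance_workload,
                     threshold, host_vcpus, host_memory_mb):
    projected = host_workload + instance_workload
    if meter == "cpu_util":
        # threshold is a percentage of the host's vCPU capacity
        return projected < threshold / 100.0 * host_vcpus
    if meter == "memory.resident":
        # threshold is a percentage of the host's RAM capacity
        return projected < threshold / 100.0 * host_memory_mb
    return False

# 25% of a 40-vCPU host allows up to 10 "used vCPUs" after the migration.
print(fits_destination("cpu_util", 6.0, 3.0, 25.0, 40, 131072))      # True
print(fits_destination("memory.resident", 20000.0, 15000.0, 25.0, 40,
                       131072))                                      # False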
@ -286,10 +300,10 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
instances = self.compute_model.get_node_instances(node)
node_workload = 0.0
for instance in instances:
cpu_util = None
instance_util = None
try:
if self.config.datasource == "ceilometer":
cpu_util = self.ceilometer.statistic_aggregation(
instance_util = self.ceilometer.statistic_aggregation(
resource_id=instance.uuid,
meter_name=self._meter,
period=self._period,
@ -298,7 +312,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
stop_time = datetime.datetime.utcnow()
start_time = stop_time - datetime.timedelta(
seconds=int(self._period))
cpu_util = self.gnocchi.statistic_aggregation(
instance_util = self.gnocchi.statistic_aggregation(
resource_id=instance.uuid,
metric=self._meter,
granularity=self.granularity,
@ -308,23 +322,32 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
)
except Exception as exc:
LOG.exception(exc)
LOG.error("Can not get cpu_util from %s",
LOG.error("Can not get %s from %s", self._meter,
self.config.datasource)
continue
if cpu_util is None:
LOG.debug("Instance (%s): cpu_util is None", instance.uuid)
if instance_util is None:
LOG.debug("Instance (%s): %s is None",
instance.uuid, self._meter)
continue
workload_cache[instance.uuid] = cpu_util * instance.vcpus / 100
if self._meter == self.CPU_METER_NAME:
workload_cache[instance.uuid] = (instance_util *
instance.vcpus / 100)
else:
workload_cache[instance.uuid] = instance_util
node_workload += workload_cache[instance.uuid]
LOG.debug("VM (%s): cpu_util %f", instance.uuid, cpu_util)
node_cpu_util = node_workload / node.vcpus * 100
LOG.debug("VM (%s): %s %f", instance.uuid, self._meter,
instance_util)
cluster_workload += node_workload
if self._meter == self.CPU_METER_NAME:
node_util = node_workload / node.vcpus * 100
else:
node_util = node_workload / node.memory * 100
instance_data = {
'node': node, "cpu_util": node_cpu_util,
'node': node, self._meter: node_util,
'workload': node_workload}
if node_cpu_util >= self.threshold:
if node_util >= self.threshold:
# mark the node to release resources
overload_hosts.append(instance_data)
else:
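A minimal sketch (with hypothetical names) of the normalization the loop above performs for each meter: cpu_util samples are percentages per instance and are converted to used vCPUs, while memory.resident samples are absolute values summed directly; node utilization is then expressed as a percentage of the node's vCPU or memory capacity.

# Illustrative sketch of the per-node utilization computed above; not the
# actual Watcher code. `samples` is a list of (instance_util, instance_vcpus).
def node_utilization(meter, samples, node_vcpus, node_memory):
    if meter == "cpu_util":
        # percent-per-instance -> "used vCPUs", then percent of node capacity
        workload = sum(util * vcpus / 100.0 for util, vcpus in samples)
        return workload / node_vcpus * 100.0
    # memory.resident: absolute values, instance vcpus are irrelevant
    workload = sum(util for util, _ in samples)
    return workload / node_memory * 100.0

# Two 10-vCPU instances at 80% and 20% CPU on a 40-vCPU node:
# (8 + 2) / 40 * 100 = 25% node utilization.
print(node_utilization("cpu_util", [(80, 10), (20, 10)], 40, 132))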
@ -356,8 +379,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
"""
self.threshold = self.input_parameters.threshold
self._period = self.input_parameters.period
self._meter = self.input_parameters.metrics
source_nodes, target_nodes, avg_workload, workload_cache = (
self.group_hosts_by_cpu_util())
self.group_hosts_by_cpu_or_ram_util())
if not source_nodes:
LOG.debug("No hosts require optimization")
@ -373,7 +397,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
# choose the server with largest cpu_util
source_nodes = sorted(source_nodes,
reverse=True,
key=lambda x: (x[self.METER_NAME]))
key=lambda x: (x[self._meter]))
instance_to_migrate = self.choose_instance_to_migrate(
source_nodes, avg_workload, workload_cache)
@ -391,7 +415,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
"be because of there's no enough CPU/Memory/DISK")
return self.solution
destination_hosts = sorted(destination_hosts,
key=lambda x: (x["cpu_util"]))
key=lambda x: (x[self._meter]))
# always use the host with the lowest CPU or RAM utilization
mig_destination_node = destination_hosts[0]['node']
# generate solution to migrate the instance to the dest server,
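The sort key above now depends on the selected meter. A small illustrative example (hypothetical data) of ordering candidate hosts by whichever meter is in use, so the least-loaded destination comes first:

# Illustrative only: host entries are keyed by the active meter, and the
# candidates are sorted ascending so index 0 is the least-loaded target.
meter = "memory.resident"
destination_hosts = [
    {"node": "Node_1", meter: 18.0, "workload": 24.0},
    {"node": "Node_2", meter: 9.0, "workload": 12.0},
]
destination_hosts = sorted(destination_hosts, key=lambda x: x[meter])
print(destination_hosts[0]["node"])  # Node_2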

View File

@ -54,6 +54,8 @@ class FakeCeilometerMetrics(object):
result = 0.0
if meter_name == "cpu_util":
result = self.get_average_usage_instance_cpu_wb(resource_id)
elif meter_name == "memory.resident":
result = self.get_average_usage_instance_memory_wb(resource_id)
return result
def mock_get_statistics_nn(self, resource_id, meter_name, period,
@ -211,6 +213,20 @@ class FakeCeilometerMetrics(object):
mock['INSTANCE_4'] = 10
return float(mock[str(uuid)])
@staticmethod
def get_average_usage_instance_memory_wb(uuid):
mock = {}
# node 0
mock['INSTANCE_1'] = 30
# node 1
mock['INSTANCE_3'] = 12
mock['INSTANCE_4'] = 12
if uuid not in mock.keys():
# mock[uuid] = random.randint(1, 4)
mock[uuid] = 12
return mock[str(uuid)]
@staticmethod
def get_average_usage_instance_cpu(uuid):
"""The last VM CPU usage values to average

View File

@ -1,10 +1,10 @@
<ModelRoot>
<ComputeNode human_id="" uuid="Node_0" status="enabled" state="up" id="0" hostname="hostname_0" vcpus="40" disk="250" disk_capacity="250" memory="132">
<Instance state="active" human_id="" uuid="73b09e16-35b7-4922-804e-e8f5d9b740fc" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_1" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="73b09e16-35b7-4922-804e-e8f5d9b740fc" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_1" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
</ComputeNode>
<ComputeNode human_id="" uuid="Node_1" status="enabled" state="up" id="1" hostname="hostname_1" vcpus="40" disk="250" disk_capacity="250" memory="132">
<Instance state="active" human_id="" uuid="INSTANCE_3" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_4" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_3" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_4" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
</ComputeNode>
</ModelRoot>

View File

@ -50,6 +50,8 @@ class FakeGnocchiMetrics(object):
result = 0.0
if metric == "cpu_util":
result = self.get_average_usage_instance_cpu_wb(resource_id)
elif metric == "memory.resident":
result = self.get_average_usage_instance_memory_wb(resource_id)
return result
@staticmethod
@ -242,3 +244,17 @@ class FakeGnocchiMetrics(object):
mock['INSTANCE_3'] = 20
mock['INSTANCE_4'] = 10
return float(mock[str(uuid)])
@staticmethod
def get_average_usage_instance_memory_wb(uuid):
mock = {}
# node 0
mock['INSTANCE_1'] = 30
# node 1
mock['INSTANCE_3'] = 12
mock['INSTANCE_4'] = 12
if uuid not in mock.keys():
# mock[uuid] = random.randint(1, 4)
mock[uuid] = 12
return mock[str(uuid)]

View File

@ -74,10 +74,12 @@ class TestWorkloadBalance(base.TestCase):
self.strategy = strategies.WorkloadBalance(
config=mock.Mock(datasource=self.datasource))
self.strategy.input_parameters = utils.Struct()
self.strategy.input_parameters.update({'threshold': 25.0,
self.strategy.input_parameters.update({'metrics': 'cpu_util',
'threshold': 25.0,
'period': 300})
self.strategy.threshold = 25.0
self.strategy._period = 300
self.strategy._meter = "cpu_util"
def test_calc_used_resource(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
@ -86,21 +88,31 @@ class TestWorkloadBalance(base.TestCase):
cores_used, mem_used, disk_used = (
self.strategy.calculate_used_resource(node))
self.assertEqual((cores_used, mem_used, disk_used), (20, 4, 40))
self.assertEqual((cores_used, mem_used, disk_used), (20, 64, 40))
def test_group_hosts_by_cpu_util(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
self.strategy.threshold = 30
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
self.assertEqual(n1[0]['node'].uuid, 'Node_0')
self.assertEqual(n2[0]['node'].uuid, 'Node_1')
self.assertEqual(avg, 8.0)
def test_group_hosts_by_ram_util(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
self.strategy._meter = "memory.resident"
self.strategy.threshold = 30
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
self.assertEqual(n1[0]['node'].uuid, 'Node_0')
self.assertEqual(n2[0]['node'].uuid, 'Node_1')
self.assertEqual(avg, 33.0)
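As a back-of-the-envelope check of the expected values above (assuming the returned average is the cluster workload divided by the node count, and using the memory.resident mocks added earlier, where unmocked UUIDs default to 12):

# Hedged sanity check for test_group_hosts_by_ram_util, not part of the test.
node_0 = 12 + 30   # unmocked instance (default 12) + INSTANCE_1
node_1 = 12 + 12   # INSTANCE_3 + INSTANCE_4
host_memory = 132  # per the scenario_6 compute-node model
print(node_0 / host_memory * 100)  # ~31.8 >= threshold 30 -> overloaded node
print(node_1 / host_memory * 100)  # ~18.2 <  threshold 30 -> candidate node
print((node_0 + node_1) / 2)       # 33.0, matching the expected avg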
def test_choose_instance_to_migrate(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
instance_to_mig = self.strategy.choose_instance_to_migrate(
n1, avg, w_map)
self.assertEqual(instance_to_mig[0].uuid, 'Node_0')
@ -110,7 +122,7 @@ class TestWorkloadBalance(base.TestCase):
def test_choose_instance_notfound(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
instances = model.get_all_instances()
[model.remove_instance(inst) for inst in instances.values()]
instance_to_mig = self.strategy.choose_instance_to_migrate(
@ -122,7 +134,7 @@ class TestWorkloadBalance(base.TestCase):
self.m_model.return_value = model
self.strategy.datasource = mock.MagicMock(
statistic_aggregation=self.fake_metrics.mock_get_statistics_wb)
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
instance_to_mig = self.strategy.choose_instance_to_migrate(
n1, avg, w_map)
dest_hosts = self.strategy.filter_destination_hosts(
@ -202,7 +214,7 @@ class TestWorkloadBalance(base.TestCase):
m_gnocchi.statistic_aggregation = mock.Mock(
side_effect=self.fake_metrics.mock_get_statistics_wb)
instance0 = model.get_instance_by_uuid("INSTANCE_0")
self.strategy.group_hosts_by_cpu_util()
self.strategy.group_hosts_by_cpu_or_ram_util()
if self.strategy.config.datasource == "ceilometer":
m_ceilometer.statistic_aggregation.assert_any_call(
aggregate='avg', meter_name='cpu_util',