Merge "workload balance base on cpu or ram util"

This commit is contained in:
Jenkins 2017-08-15 08:37:58 +00:00 committed by Gerrit Code Review
commit e5c3df0c2f
7 changed files with 121 additions and 42 deletions

View File

@ -25,6 +25,7 @@ The *workload_balance* strategy requires the following metrics:
metric service name plugins comment
======================= ============ ======= =======
``cpu_util`` ceilometer_ none
``memory.resident`` ceilometer_ none
======================= ============ ======= =======
.. _ceilometer: http://docs.openstack.org/admin-guide/telemetry-measurements.html#openstack-compute
@ -66,6 +67,9 @@ Strategy parameters are:
============== ====== ============= ====================================
parameter type default value description
============== ====== ============= ====================================
``metrics`` String 'cpu_util' Workload balance based on cpu or ram
utilization. choice: ['cpu_util',
'memory.resident']
``threshold`` Number 25.0 Workload threshold for migration
``period`` Number 300 Aggregate time period of ceilometer
============== ====== ============= ====================================
@ -90,7 +94,7 @@ How to use it ?
at1 workload_balancing --strategy workload_balance
$ openstack optimize audit create -a at1 -p threshold=26.0 \
-p period=310
-p period=310 -p metrics=cpu_util
External Links
--------------

View File

@ -0,0 +1,7 @@
---
features:
- The existing workload_balance strategy made migration decisions based
only on the VM CPU workload. This feature improves the strategy: via the
new input parameter "metrics", it can decide to migrate a VM based on
either CPU or memory utilization.

View File

@ -22,7 +22,7 @@
*Description*
This strategy migrates a VM based on the VM workload of the hosts.
It makes a decision to migrate a workload whenever a host's CPU
It makes a decision to migrate a workload whenever a host's CPU or RAM
utilization % is higher than the specified threshold. The VM to
be moved should bring the host close to the average workload of all
host nodes.
@ -32,7 +32,7 @@ hosts nodes.
* Hardware: compute node should use the same physical CPUs
* Software: Ceilometer component ceilometer-agent-compute
running in each compute node, and Ceilometer API can
report such telemetry "cpu_util" successfully.
report such telemetry "cpu_util" and "memory.resident" successfully.
* You must have at least 2 physical compute nodes to run
this strategy.
@ -69,16 +69,16 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
It is a migration strategy based on the VM workload of physical
servers. It generates solutions to move a workload whenever a server's
CPU utilization % is higher than the specified threshold.
CPU or RAM utilization % is higher than the specified threshold.
The VM to be moved should bring the host close to the average workload
of all compute nodes.
*Requirements*
* Hardware: compute node should use the same physical CPUs
* Hardware: compute node should use the same physical CPUs/RAMs
* Software: Ceilometer component ceilometer-agent-compute running
in each compute node, and Ceilometer API can report such telemetry
"cpu_util" successfully.
"cpu_util" and "memory.resident" successfully.
* You must have at least 2 physical compute nodes to run this strategy
*Limitations*
@ -91,8 +91,12 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
"""
# The meter to report CPU utilization % of VM in ceilometer
METER_NAME = "cpu_util"
# Unit: %, value range is [0 , 100]
CPU_METER_NAME = "cpu_util"
# The meter to report memory resident of VM in ceilometer
# Unit: MB
MEM_METER_NAME = "memory.resident"
MIGRATION = "migrate"
@ -104,9 +108,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
:param osc: :py:class:`~.OpenStackClients` instance
"""
super(WorkloadBalance, self).__init__(config, osc)
# the migration plan will be triggered when the CPU utilization %
# reaches threshold
self._meter = self.METER_NAME
# the migration plan will be triggered when the CPU or RAM
# utilization % reaches threshold
self._meter = None
self._ceilometer = None
self._gnocchi = None
@ -151,6 +155,13 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
# Mandatory default setting for each element
return {
"properties": {
"metrics": {
"description": "Workload balance based on metrics: "
"cpu or ram utilization",
"type": "string",
"choice": ["cpu_util", "memory.resident"],
"default": "cpu_util"
},
"threshold": {
"description": "workload threshold for migration",
"type": "number",
@ -251,18 +262,21 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
cores_available = host.vcpus - cores_used
disk_available = host.disk - disk_used
mem_available = host.memory - mem_used
if (
cores_available >= required_cores and
disk_available >= required_disk and
if (cores_available >= required_cores and
mem_available >= required_mem and
disk_available >= required_disk):
if (self._meter == self.CPU_METER_NAME and
((src_instance_workload + workload) <
self.threshold / 100 * host.vcpus)
):
destination_hosts.append(instance_data)
self.threshold / 100 * host.vcpus)):
destination_hosts.append(instance_data)
if (self._meter == self.MEM_METER_NAME and
((src_instance_workload + workload) <
self.threshold / 100 * host.memory)):
destination_hosts.append(instance_data)
return destination_hosts
def group_hosts_by_cpu_util(self):
def group_hosts_by_cpu_or_ram_util(self):
"""Calculate the workloads of each node
try to find out the nodes which have reached threshold
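As a self-contained sketch of the destination check added in the hunk above (names are illustrative, not the actual Watcher method): a host only qualifies as a migration target if, after taking on the instance's workload, it stays below the threshold for the selected meter.

# Illustrative sketch, not the Watcher implementation: the destination
# check expressed as a standalone function. Workloads are in "used vCPUs"
# for cpu_util and in MB for memory.resident.
def fits_destination(meter, host_workload, instance_workload,
                     threshold, host_vcpus, host_memory_mb):
    projected = host_workload + instance_workload
    if meter == "cpu_util":
        # threshold is a percentage of the host's vCPU capacity
        return projected < threshold / 100.0 * host_vcpus
    if meter == "memory.resident":
        # threshold is a percentage of the host's RAM capacity
        return projected < threshold / 100.0 * host_memory_mb
    return False

# 25% of a 40-vCPU host allows up to 10 "used vCPUs" after the migration.
print(fits_destination("cpu_util", 6.0, 3.0, 25.0, 40, 131072))      # True
print(fits_destination("memory.resident", 20000.0, 15000.0, 25.0, 40,
                       131072))                                      # False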
@ -286,10 +300,10 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
instances = self.compute_model.get_node_instances(node)
node_workload = 0.0
for instance in instances:
cpu_util = None
instance_util = None
try:
if self.config.datasource == "ceilometer":
cpu_util = self.ceilometer.statistic_aggregation(
instance_util = self.ceilometer.statistic_aggregation(
resource_id=instance.uuid,
meter_name=self._meter,
period=self._period,
@ -298,7 +312,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
stop_time = datetime.datetime.utcnow()
start_time = stop_time - datetime.timedelta(
seconds=int(self._period))
cpu_util = self.gnocchi.statistic_aggregation(
instance_util = self.gnocchi.statistic_aggregation(
resource_id=instance.uuid,
metric=self._meter,
granularity=self.granularity,
@ -308,23 +322,32 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
)
except Exception as exc:
LOG.exception(exc)
LOG.error("Can not get cpu_util from %s",
LOG.error("Can not get %s from %s", self._meter,
self.config.datasource)
continue
if cpu_util is None:
LOG.debug("Instance (%s): cpu_util is None", instance.uuid)
if instance_util is None:
LOG.debug("Instance (%s): %s is None",
instance.uuid, self._meter)
continue
workload_cache[instance.uuid] = cpu_util * instance.vcpus / 100
if self._meter == self.CPU_METER_NAME:
workload_cache[instance.uuid] = (instance_util *
instance.vcpus / 100)
else:
workload_cache[instance.uuid] = instance_util
node_workload += workload_cache[instance.uuid]
LOG.debug("VM (%s): cpu_util %f", instance.uuid, cpu_util)
node_cpu_util = node_workload / node.vcpus * 100
LOG.debug("VM (%s): %s %f", instance.uuid, self._meter,
instance_util)
cluster_workload += node_workload
if self._meter == self.CPU_METER_NAME:
node_util = node_workload / node.vcpus * 100
else:
node_util = node_workload / node.memory * 100
instance_data = {
'node': node, "cpu_util": node_cpu_util,
'node': node, self._meter: node_util,
'workload': node_workload}
if node_cpu_util >= self.threshold:
if node_util >= self.threshold:
# mark the node to release resources
overload_hosts.append(instance_data)
else:
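A minimal sketch (with hypothetical names) of the normalization the loop above performs for each meter: cpu_util samples are percentages per instance and are converted to used vCPUs, while memory.resident samples are absolute values summed directly; node utilization is then expressed as a percentage of the node's vCPU or memory capacity.

# Illustrative sketch of the per-node utilization computed above; not the
# actual Watcher code. `samples` is a list of (instance_util, instance_vcpus).
def node_utilization(meter, samples, node_vcpus, node_memory):
    if meter == "cpu_util":
        # percent-per-instance -> "used vCPUs", then percent of node capacity
        workload = sum(util * vcpus / 100.0 for util, vcpus in samples)
        return workload / node_vcpus * 100.0
    # memory.resident: absolute values, instance vcpus are irrelevant
    workload = sum(util for util, _ in samples)
    return workload / node_memory * 100.0

# Two 10-vCPU instances at 80% and 20% CPU on a 40-vCPU node:
# (8 + 2) / 40 * 100 = 25% node utilization.
print(node_utilization("cpu_util", [(80, 10), (20, 10)], 40, 132))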
@ -356,8 +379,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
"""
self.threshold = self.input_parameters.threshold
self._period = self.input_parameters.period
self._meter = self.input_parameters.metrics
source_nodes, target_nodes, avg_workload, workload_cache = (
self.group_hosts_by_cpu_util())
self.group_hosts_by_cpu_or_ram_util())
if not source_nodes:
LOG.debug("No hosts require optimization")
@ -373,7 +397,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
# choose the server with largest cpu_util
source_nodes = sorted(source_nodes,
reverse=True,
key=lambda x: (x[self.METER_NAME]))
key=lambda x: (x[self._meter]))
instance_to_migrate = self.choose_instance_to_migrate(
source_nodes, avg_workload, workload_cache)
@ -391,7 +415,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
"be because of there's no enough CPU/Memory/DISK")
return self.solution
destination_hosts = sorted(destination_hosts,
key=lambda x: (x["cpu_util"]))
key=lambda x: (x[self._meter]))
# always use the host with the lowest CPU or RAM utilization
mig_destination_node = destination_hosts[0]['node']
# generate solution to migrate the instance to the dest server,
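The sort key above now depends on the selected meter. A small illustrative example (hypothetical data) of ordering candidate hosts by whichever meter is in use, so the least-loaded destination comes first:

# Illustrative only: host entries are keyed by the active meter, and the
# candidates are sorted ascending so index 0 is the least-loaded target.
meter = "memory.resident"
destination_hosts = [
    {"node": "Node_1", meter: 18.0, "workload": 24.0},
    {"node": "Node_2", meter: 9.0, "workload": 12.0},
]
destination_hosts = sorted(destination_hosts, key=lambda x: x[meter])
print(destination_hosts[0]["node"])  # Node_2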

View File

@ -54,6 +54,8 @@ class FakeCeilometerMetrics(object):
result = 0.0
if meter_name == "cpu_util":
result = self.get_average_usage_instance_cpu_wb(resource_id)
elif meter_name == "memory.resident":
result = self.get_average_usage_instance_memory_wb(resource_id)
return result
def mock_get_statistics_nn(self, resource_id, meter_name, period,
@ -211,6 +213,20 @@ class FakeCeilometerMetrics(object):
mock['INSTANCE_4'] = 10
return float(mock[str(uuid)])
@staticmethod
def get_average_usage_instance_memory_wb(uuid):
mock = {}
# node 0
mock['INSTANCE_1'] = 30
# node 1
mock['INSTANCE_3'] = 12
mock['INSTANCE_4'] = 12
if uuid not in mock.keys():
# mock[uuid] = random.randint(1, 4)
mock[uuid] = 12
return mock[str(uuid)]
@staticmethod
def get_average_usage_instance_cpu(uuid):
"""The last VM CPU usage values to average

View File

@ -1,10 +1,10 @@
<ModelRoot>
<ComputeNode human_id="" uuid="Node_0" status="enabled" state="up" id="0" hostname="hostname_0" vcpus="40" disk="250" disk_capacity="250" memory="132">
<Instance state="active" human_id="" uuid="73b09e16-35b7-4922-804e-e8f5d9b740fc" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_1" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="73b09e16-35b7-4922-804e-e8f5d9b740fc" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_1" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
</ComputeNode>
<ComputeNode human_id="" uuid="Node_1" status="enabled" state="up" id="1" hostname="hostname_1" vcpus="40" disk="250" disk_capacity="250" memory="132">
<Instance state="active" human_id="" uuid="INSTANCE_3" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_4" vcpus="10" disk="20" disk_capacity="20" memory="2" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_3" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
<Instance state="active" human_id="" uuid="INSTANCE_4" vcpus="10" disk="20" disk_capacity="20" memory="32" metadata='{"optimize": true,"top": "floor", "nested": {"x": "y"}}'/>
</ComputeNode>
</ModelRoot>

View File

@ -50,6 +50,8 @@ class FakeGnocchiMetrics(object):
result = 0.0
if metric == "cpu_util":
result = self.get_average_usage_instance_cpu_wb(resource_id)
elif metric == "memory.resident":
result = self.get_average_usage_instance_memory_wb(resource_id)
return result
@staticmethod
@ -242,3 +244,17 @@ class FakeGnocchiMetrics(object):
mock['INSTANCE_3'] = 20
mock['INSTANCE_4'] = 10
return float(mock[str(uuid)])
@staticmethod
def get_average_usage_instance_memory_wb(uuid):
mock = {}
# node 0
mock['INSTANCE_1'] = 30
# node 1
mock['INSTANCE_3'] = 12
mock['INSTANCE_4'] = 12
if uuid not in mock.keys():
# mock[uuid] = random.randint(1, 4)
mock[uuid] = 12
return mock[str(uuid)]

View File

@ -74,10 +74,12 @@ class TestWorkloadBalance(base.TestCase):
self.strategy = strategies.WorkloadBalance(
config=mock.Mock(datasource=self.datasource))
self.strategy.input_parameters = utils.Struct()
self.strategy.input_parameters.update({'threshold': 25.0,
self.strategy.input_parameters.update({'metrics': 'cpu_util',
'threshold': 25.0,
'period': 300})
self.strategy.threshold = 25.0
self.strategy._period = 300
self.strategy._meter = "cpu_util"
def test_calc_used_resource(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
@ -86,21 +88,31 @@ class TestWorkloadBalance(base.TestCase):
cores_used, mem_used, disk_used = (
self.strategy.calculate_used_resource(node))
self.assertEqual((cores_used, mem_used, disk_used), (20, 4, 40))
self.assertEqual((cores_used, mem_used, disk_used), (20, 64, 40))
def test_group_hosts_by_cpu_util(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
self.strategy.threshold = 30
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
self.assertEqual(n1[0]['node'].uuid, 'Node_0')
self.assertEqual(n2[0]['node'].uuid, 'Node_1')
self.assertEqual(avg, 8.0)
def test_group_hosts_by_ram_util(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
self.strategy._meter = "memory.resident"
self.strategy.threshold = 30
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
self.assertEqual(n1[0]['node'].uuid, 'Node_0')
self.assertEqual(n2[0]['node'].uuid, 'Node_1')
self.assertEqual(avg, 33.0)
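As a back-of-the-envelope check of the expected values above (assuming the returned average is the cluster workload divided by the node count, and using the memory.resident mocks added earlier, where unmocked UUIDs default to 12):

# Hedged sanity check for test_group_hosts_by_ram_util, not part of the test.
node_0 = 12 + 30   # unmocked instance (default 12) + INSTANCE_1
node_1 = 12 + 12   # INSTANCE_3 + INSTANCE_4
host_memory = 132  # per the scenario_6 compute-node model
print(node_0 / host_memory * 100)  # ~31.8 >= threshold 30 -> overloaded node
print(node_1 / host_memory * 100)  # ~18.2 <  threshold 30 -> candidate node
print((node_0 + node_1) / 2)       # 33.0, matching the expected avg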
def test_choose_instance_to_migrate(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
instance_to_mig = self.strategy.choose_instance_to_migrate(
n1, avg, w_map)
self.assertEqual(instance_to_mig[0].uuid, 'Node_0')
@ -110,7 +122,7 @@ class TestWorkloadBalance(base.TestCase):
def test_choose_instance_notfound(self):
model = self.fake_cluster.generate_scenario_6_with_2_nodes()
self.m_model.return_value = model
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
instances = model.get_all_instances()
[model.remove_instance(inst) for inst in instances.values()]
instance_to_mig = self.strategy.choose_instance_to_migrate(
@ -122,7 +134,7 @@ class TestWorkloadBalance(base.TestCase):
self.m_model.return_value = model
self.strategy.datasource = mock.MagicMock(
statistic_aggregation=self.fake_metrics.mock_get_statistics_wb)
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_util()
n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util()
instance_to_mig = self.strategy.choose_instance_to_migrate(
n1, avg, w_map)
dest_hosts = self.strategy.filter_destination_hosts(
@ -202,7 +214,7 @@ class TestWorkloadBalance(base.TestCase):
m_gnocchi.statistic_aggregation = mock.Mock(
side_effect=self.fake_metrics.mock_get_statistics_wb)
instance0 = model.get_instance_by_uuid("INSTANCE_0")
self.strategy.group_hosts_by_cpu_util()
self.strategy.group_hosts_by_cpu_or_ram_util()
if self.strategy.config.datasource == "ceilometer":
m_ceilometer.statistic_aggregation.assert_any_call(
aggregate='avg', meter_name='cpu_util',