diff --git a/doc/source/strategies/vm_workload_consolidation.rst b/doc/source/strategies/vm_workload_consolidation.rst index 41bab95de..ab6bc9ae7 100644 --- a/doc/source/strategies/vm_workload_consolidation.rst +++ b/doc/source/strategies/vm_workload_consolidation.rst @@ -26,9 +26,15 @@ metric service name plugins comment ``memory.resident`` ceilometer_ none ``memory`` ceilometer_ none ``disk.root.size`` ceilometer_ none +``compute.node.cpu.percent`` ceilometer_ none (optional) need to set the + ``compute_monitors`` option + to ``cpu.virt_driver`` in the + nova.conf. +``hardware.memory.used`` ceilometer_ SNMP_ (optional) ============================ ============ ======= ========================= .. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute +.. _SNMP: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#snmp-based-meters Cluster data model ****************** diff --git a/watcher/decision_engine/strategy/strategies/vm_workload_consolidation.py b/watcher/decision_engine/strategy/strategies/vm_workload_consolidation.py index f26810d13..07fbc06e9 100644 --- a/watcher/decision_engine/strategy/strategies/vm_workload_consolidation.py +++ b/watcher/decision_engine/strategy/strategies/vm_workload_consolidation.py @@ -18,7 +18,10 @@ # limitations under the License. 
# +import collections + from oslo_log import log +import oslo_utils from watcher._i18n import _ from watcher.applier.actions import migration @@ -67,7 +70,8 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy): AGGREGATE = 'mean' DATASOURCE_METRICS = ['instance_ram_allocated', 'instance_cpu_usage', - 'instance_ram_usage', 'instance_root_disk_size'] + 'instance_ram_usage', 'instance_root_disk_size', + 'host_cpu_usage', 'host_ram_usage'] MIGRATION = "migrate" CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state" @@ -77,6 +81,11 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy): self.number_of_migrations = 0 self.number_of_released_nodes = 0 self.datasource_instance_data_cache = dict() + self.datasource_node_data_cache = dict() + # Host metric adjustments that take into account planned + # migrations. + self.host_metric_delta = collections.defaultdict( + lambda: collections.defaultdict(int)) @classmethod def get_name(cls): @@ -227,6 +236,18 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy): destination_node) self.number_of_migrations += 1 + instance_util = self.get_instance_utilization(instance) + self.host_metric_delta[source_node.hostname]['cpu'] -= ( + instance_util['cpu']) + # We'll deduct the vm allocated memory. + self.host_metric_delta[source_node.hostname]['ram'] -= ( + instance.memory) + + self.host_metric_delta[destination_node.hostname]['cpu'] += ( + instance_util['cpu']) + self.host_metric_delta[destination_node.hostname]['ram'] += ( + instance.memory) + def disable_unused_nodes(self): """Generate actions for disabling unused nodes. 
@@ -289,6 +310,21 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy): disk=instance_disk_util) return self.datasource_instance_data_cache.get(instance.uuid) + def _get_node_total_utilization(self, node): + if node.hostname in self.datasource_node_data_cache: + return self.datasource_node_data_cache[node.hostname] + + cpu = self.datasource_backend.get_host_cpu_usage( + node, self.period, self.AGGREGATE, + self.granularity) + ram = self.datasource_backend.get_host_ram_usage( + node, self.period, self.AGGREGATE, + self.granularity) + + self.datasource_node_data_cache[node.hostname] = dict( + cpu=cpu, ram=ram) + return self.datasource_node_data_cache[node.hostname] + def get_node_utilization(self, node): """Collect cpu, ram and disk utilization statistics of a node. @@ -309,7 +345,33 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy): LOG.debug("instance utilization: %s %s", instance, instance_util) - return dict(cpu=node_cpu_util, ram=node_ram_util, + total_node_util = self._get_node_total_utilization(node) + total_node_cpu_util = total_node_util['cpu'] or 0 + if total_node_cpu_util: + total_node_cpu_util = total_node_cpu_util * node.vcpus / 100 + # account for planned migrations + total_node_cpu_util += self.host_metric_delta[node.hostname]['cpu'] + + total_node_ram_util = total_node_util['ram'] or 0 + if total_node_ram_util: + total_node_ram_util /= oslo_utils.units.Ki + total_node_ram_util += self.host_metric_delta[node.hostname]['ram'] + + LOG.debug( + "node utilization: %s. 
" + "total instance cpu: %s, " + "total instance ram: %s, " + "total instance disk: %s, " + "total host cpu: %s, " + "total host ram: %s, " + "node delta usage: %s.", + node, + node_cpu_util, node_ram_util, node_disk_util, + total_node_cpu_util, total_node_ram_util, + self.host_metric_delta[node.hostname]) + + return dict(cpu=max(node_cpu_util, total_node_cpu_util), + ram=max(node_ram_util, total_node_ram_util), disk=node_disk_util) def get_node_capacity(self, node): diff --git a/watcher/tests/decision_engine/model/faker_cluster_and_metrics.py b/watcher/tests/decision_engine/model/faker_cluster_and_metrics.py index 2335f85ab..5beb285e5 100644 --- a/watcher/tests/decision_engine/model/faker_cluster_and_metrics.py +++ b/watcher/tests/decision_engine/model/faker_cluster_and_metrics.py @@ -80,7 +80,7 @@ class FakerModelCollector(base.BaseClusterDataModelCollector): return self.load_model('scenario_4_with_metrics.xml') -class FakeCeilometerMetrics(object): +class FakeGnocchiMetrics(object): def __init__(self, model): self.model = model @@ -90,6 +90,9 @@ class FakeCeilometerMetrics(object): if meter_name == 'host_cpu_usage': return self.get_compute_node_cpu_util( resource, period, aggregate, granularity) + elif meter_name == 'host_ram_usage': + return self.get_compute_node_ram_util( + resource, period, aggregate, granularity) elif meter_name == 'instance_cpu_usage': return self.get_instance_cpu_util( resource, period, aggregate, granularity) @@ -110,109 +113,27 @@ class FakeCeilometerMetrics(object): Returns relative node CPU utilization <0, 100>. 
:param r_id: resource id """ - node_uuid = '%s_%s' % (resource.uuid, resource.hostname) - node = self.model.get_node_by_uuid(node_uuid) + node = self.model.get_node_by_uuid(resource.uuid) instances = self.model.get_node_instances(node) util_sum = 0.0 - for instance_uuid in instances: - instance = self.model.get_instance_by_uuid(instance_uuid) + for instance in instances: total_cpu_util = instance.vcpus * self.get_instance_cpu_util( - instance.uuid) + instance, period, aggregate, granularity) util_sum += total_cpu_util / 100.0 util_sum /= node.vcpus return util_sum * 100.0 - @staticmethod - def get_instance_cpu_util(resource, period, aggregate, - granularity): - instance_cpu_util = dict() - instance_cpu_util['INSTANCE_0'] = 10 - instance_cpu_util['INSTANCE_1'] = 30 - instance_cpu_util['INSTANCE_2'] = 60 - instance_cpu_util['INSTANCE_3'] = 20 - instance_cpu_util['INSTANCE_4'] = 40 - instance_cpu_util['INSTANCE_5'] = 50 - instance_cpu_util['INSTANCE_6'] = 100 - instance_cpu_util['INSTANCE_7'] = 100 - instance_cpu_util['INSTANCE_8'] = 100 - instance_cpu_util['INSTANCE_9'] = 100 - return instance_cpu_util[str(resource.uuid)] - - @staticmethod - def get_instance_ram_util(resource, period, aggregate, - granularity): - instance_ram_util = dict() - instance_ram_util['INSTANCE_0'] = 1 - instance_ram_util['INSTANCE_1'] = 2 - instance_ram_util['INSTANCE_2'] = 4 - instance_ram_util['INSTANCE_3'] = 8 - instance_ram_util['INSTANCE_4'] = 3 - instance_ram_util['INSTANCE_5'] = 2 - instance_ram_util['INSTANCE_6'] = 1 - instance_ram_util['INSTANCE_7'] = 2 - instance_ram_util['INSTANCE_8'] = 4 - instance_ram_util['INSTANCE_9'] = 8 - return instance_ram_util[str(resource.uuid)] - - @staticmethod - def get_instance_disk_root_size(resource, period, aggregate, - granularity): - instance_disk_util = dict() - instance_disk_util['INSTANCE_0'] = 10 - instance_disk_util['INSTANCE_1'] = 15 - instance_disk_util['INSTANCE_2'] = 30 - instance_disk_util['INSTANCE_3'] = 35 - 
instance_disk_util['INSTANCE_4'] = 20 - instance_disk_util['INSTANCE_5'] = 25 - instance_disk_util['INSTANCE_6'] = 25 - instance_disk_util['INSTANCE_7'] = 25 - instance_disk_util['INSTANCE_8'] = 25 - instance_disk_util['INSTANCE_9'] = 25 - return instance_disk_util[str(resource.uuid)] - - -class FakeGnocchiMetrics(object): - def __init__(self, model): - self.model = model - - def mock_get_statistics(self, resource=None, resource_type=None, - meter_name=None, period=300, aggregate='mean', - granularity=300): - if meter_name == 'host_cpu_usage': - return self.get_compute_node_cpu_util( - resource, period, aggregate, granularity) - elif meter_name == 'instance_cpu_usage': - return self.get_instance_cpu_util( - resource, period, aggregate, granularity) - elif meter_name == 'instance_ram_usage': - return self.get_instance_ram_util( - resource, period, aggregate, granularity) - elif meter_name == 'instance_root_disk_size': - return self.get_instance_disk_root_size( - resource, period, aggregate, granularity) - - def get_compute_node_cpu_util(self, resource, period, aggregate, + def get_compute_node_ram_util(self, resource, period, aggregate, granularity): - """Calculates node utilization dynamicaly. - - node CPU utilization should consider - and corelate with actual instance-node mappings - provided within a cluster model. - Returns relative node CPU utilization <0, 100>. - - :param r_id: resource id - """ - node_uuid = "%s_%s" % (resource.uuid, resource.hostname) - node = self.model.get_node_by_uuid(node_uuid) + # Returns mock host ram usage in KB based on the allocated + # instances. 
+ node = self.model.get_node_by_uuid(resource.uuid) instances = self.model.get_node_instances(node) util_sum = 0.0 - for instance_uuid in instances: - instance = self.model.get_instance_by_uuid(instance_uuid) - total_cpu_util = instance.vcpus * self.get_instance_cpu_util( - instance.uuid) - util_sum += total_cpu_util / 100.0 - util_sum /= node.vcpus - return util_sum * 100.0 + for instance in instances: + util_sum += self.get_instance_ram_util( + instance, period, aggregate, granularity) + return util_sum / 1024 @staticmethod def get_instance_cpu_util(resource, period, aggregate, @@ -261,3 +182,9 @@ class FakeGnocchiMetrics(object): instance_disk_util['INSTANCE_8'] = 25 instance_disk_util['INSTANCE_9'] = 25 return instance_disk_util[str(resource.uuid)] + + +# TODO(lpetrut): consider dropping Ceilometer support, it was deprecated +# in Ocata. +class FakeCeilometerMetrics(FakeGnocchiMetrics): + pass diff --git a/watcher/tests/decision_engine/strategy/strategies/test_vm_workload_consolidation.py b/watcher/tests/decision_engine/strategy/strategies/test_vm_workload_consolidation.py index 489429a8d..b6f795e11 100644 --- a/watcher/tests/decision_engine/strategy/strategies/test_vm_workload_consolidation.py +++ b/watcher/tests/decision_engine/strategy/strategies/test_vm_workload_consolidation.py @@ -64,6 +64,10 @@ class TestVMWorkloadConsolidation(TestBaseStrategy): self.fake_metrics.get_instance_ram_util), get_instance_root_disk_size=( self.fake_metrics.get_instance_disk_root_size), + get_host_cpu_usage=( + self.fake_metrics.get_compute_node_cpu_util), + get_host_ram_usage=( + self.fake_metrics.get_compute_node_ram_util) ) self.strategy = strategies.VMWorkloadConsolidation( config=mock.Mock(datasources=self.datasource)) @@ -88,6 +92,71 @@ class TestVMWorkloadConsolidation(TestBaseStrategy): node_util, self.strategy.get_node_utilization(node_0)) + def test_get_node_utilization_using_host_metrics(self): + model = self.fake_c_cluster.generate_scenario_1() + 
self.m_c_model.return_value = model + self.fake_metrics.model = model + node_0 = model.get_node_by_uuid("Node_0") + + # "get_node_utilization" is expected to return the maximum + # between the host metrics and the sum of the instance metrics. + data_src = self.m_datasource.return_value + cpu_usage = 30 + data_src.get_host_cpu_usage = mock.Mock(return_value=cpu_usage) + data_src.get_host_ram_usage = mock.Mock(return_value=512 * 1024) + + exp_cpu_usage = cpu_usage * node_0.vcpus / 100 + exp_node_util = dict(cpu=exp_cpu_usage, ram=512, disk=10) + self.assertEqual( + exp_node_util, + self.strategy.get_node_utilization(node_0)) + + def test_get_node_utilization_after_migrations(self): + model = self.fake_c_cluster.generate_scenario_1() + self.m_c_model.return_value = model + self.fake_metrics.model = model + node_0 = model.get_node_by_uuid("Node_0") + node_1 = model.get_node_by_uuid("Node_1") + + data_src = self.m_datasource.return_value + cpu_usage = 30 + host_ram_usage_mb = 512 + data_src.get_host_cpu_usage = mock.Mock(return_value=cpu_usage) + data_src.get_host_ram_usage = mock.Mock( + return_value=host_ram_usage_mb * 1024) + + instance_uuid = 'INSTANCE_0' + instance = model.get_instance_by_uuid(instance_uuid) + self.strategy.add_migration(instance, node_0, node_1) + + instance_util = self.strategy.get_instance_utilization(instance) + + # Ensure that we take into account planned migrations when + # determining node utilization + exp_node_0_cpu_usage = ( + cpu_usage * node_0.vcpus) / 100 - instance_util['cpu'] + exp_node_1_cpu_usage = ( + cpu_usage * node_1.vcpus) / 100 + instance_util['cpu'] + + exp_node_0_ram_usage = host_ram_usage_mb - instance.memory + exp_node_1_ram_usage = host_ram_usage_mb + instance.memory + + exp_node_0_util = dict( + cpu=exp_node_0_cpu_usage, + ram=exp_node_0_ram_usage, + disk=0) + exp_node_1_util = dict( + cpu=exp_node_1_cpu_usage, + ram=exp_node_1_ram_usage, + disk=25) + + self.assertEqual( + exp_node_0_util, + 
self.strategy.get_node_utilization(node_0)) + self.assertEqual( + exp_node_1_util, + self.strategy.get_node_utilization(node_1)) + def test_get_node_capacity(self): model = self.fake_c_cluster.generate_scenario_1() self.m_c_model.return_value = model