Merge "vm workload consolidation: use actual host metrics"

This commit is contained in:
Zuul 2023-12-01 01:51:39 +00:00 committed by Gerrit Code Review
commit 9492c2190e
4 changed files with 160 additions and 96 deletions

View File

@ -26,9 +26,15 @@ metric service name plugins comment
``memory.resident`` ceilometer_ none ``memory.resident`` ceilometer_ none
``memory`` ceilometer_ none ``memory`` ceilometer_ none
``disk.root.size`` ceilometer_ none ``disk.root.size`` ceilometer_ none
``compute.node.cpu.percent`` ceilometer_ none (optional) need to set the
``compute_monitors`` option
to ``cpu.virt_driver`` in the
nova.conf.
``hardware.memory.used`` ceilometer_ SNMP_ (optional)
============================ ============ ======= ========================= ============================ ============ ======= =========================
.. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute .. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute
.. _SNMP: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#snmp-based-meters
Cluster data model Cluster data model
****************** ******************

View File

@ -18,7 +18,10 @@
# limitations under the License. # limitations under the License.
# #
import collections
from oslo_log import log from oslo_log import log
import oslo_utils
from watcher._i18n import _ from watcher._i18n import _
from watcher.applier.actions import migration from watcher.applier.actions import migration
@ -67,7 +70,8 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy):
AGGREGATE = 'mean' AGGREGATE = 'mean'
DATASOURCE_METRICS = ['instance_ram_allocated', 'instance_cpu_usage', DATASOURCE_METRICS = ['instance_ram_allocated', 'instance_cpu_usage',
'instance_ram_usage', 'instance_root_disk_size'] 'instance_ram_usage', 'instance_root_disk_size',
'host_cpu_usage', 'host_ram_usage']
MIGRATION = "migrate" MIGRATION = "migrate"
CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state" CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state"
@ -77,6 +81,11 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy):
self.number_of_migrations = 0 self.number_of_migrations = 0
self.number_of_released_nodes = 0 self.number_of_released_nodes = 0
self.datasource_instance_data_cache = dict() self.datasource_instance_data_cache = dict()
self.datasource_node_data_cache = dict()
# Host metric adjustments that take into account planned
# migrations.
self.host_metric_delta = collections.defaultdict(
lambda: collections.defaultdict(int))
@classmethod @classmethod
def get_name(cls): def get_name(cls):
@ -227,6 +236,18 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy):
destination_node) destination_node)
self.number_of_migrations += 1 self.number_of_migrations += 1
instance_util = self.get_instance_utilization(instance)
self.host_metric_delta[source_node.hostname]['cpu'] -= (
instance_util['cpu'])
# We'll deduce the vm allocated memory.
self.host_metric_delta[source_node.hostname]['ram'] -= (
instance.memory)
self.host_metric_delta[destination_node.hostname]['cpu'] += (
instance_util['cpu'])
self.host_metric_delta[destination_node.hostname]['ram'] += (
instance.memory)
def disable_unused_nodes(self): def disable_unused_nodes(self):
"""Generate actions for disabling unused nodes. """Generate actions for disabling unused nodes.
@ -289,6 +310,21 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy):
disk=instance_disk_util) disk=instance_disk_util)
return self.datasource_instance_data_cache.get(instance.uuid) return self.datasource_instance_data_cache.get(instance.uuid)
def _get_node_total_utilization(self, node):
if node.hostname in self.datasource_node_data_cache:
return self.datasource_node_data_cache[node.hostname]
cpu = self.datasource_backend.get_host_cpu_usage(
node, self.period, self.AGGREGATE,
self.granularity)
ram = self.datasource_backend.get_host_ram_usage(
node, self.period, self.AGGREGATE,
self.granularity)
self.datasource_node_data_cache[node.hostname] = dict(
cpu=cpu, ram=ram)
return self.datasource_node_data_cache[node.hostname]
def get_node_utilization(self, node): def get_node_utilization(self, node):
"""Collect cpu, ram and disk utilization statistics of a node. """Collect cpu, ram and disk utilization statistics of a node.
@ -309,7 +345,33 @@ class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy):
LOG.debug("instance utilization: %s %s", LOG.debug("instance utilization: %s %s",
instance, instance_util) instance, instance_util)
return dict(cpu=node_cpu_util, ram=node_ram_util, total_node_util = self._get_node_total_utilization(node)
total_node_cpu_util = total_node_util['cpu'] or 0
if total_node_cpu_util:
total_node_cpu_util = total_node_cpu_util * node.vcpus / 100
# account for planned migrations
total_node_cpu_util += self.host_metric_delta[node.hostname]['cpu']
total_node_ram_util = total_node_util['ram'] or 0
if total_node_ram_util:
total_node_ram_util /= oslo_utils.units.Ki
total_node_ram_util += self.host_metric_delta[node.hostname]['ram']
LOG.debug(
"node utilization: %s. "
"total instance cpu: %s, "
"total instance ram: %s, "
"total instance disk: %s, "
"total host cpu: %s, "
"total host ram: %s, "
"node delta usage: %s.",
node,
node_cpu_util, node_ram_util, node_disk_util,
total_node_cpu_util, total_node_ram_util,
self.host_metric_delta[node.hostname])
return dict(cpu=max(node_cpu_util, total_node_cpu_util),
ram=max(node_ram_util, total_node_ram_util),
disk=node_disk_util) disk=node_disk_util)
def get_node_capacity(self, node): def get_node_capacity(self, node):

View File

@ -80,7 +80,7 @@ class FakerModelCollector(base.BaseClusterDataModelCollector):
return self.load_model('scenario_4_with_metrics.xml') return self.load_model('scenario_4_with_metrics.xml')
class FakeCeilometerMetrics(object): class FakeGnocchiMetrics(object):
def __init__(self, model): def __init__(self, model):
self.model = model self.model = model
@ -90,6 +90,9 @@ class FakeCeilometerMetrics(object):
if meter_name == 'host_cpu_usage': if meter_name == 'host_cpu_usage':
return self.get_compute_node_cpu_util( return self.get_compute_node_cpu_util(
resource, period, aggregate, granularity) resource, period, aggregate, granularity)
elif meter_name == 'host_ram_usage':
return self.get_compute_node_ram_util(
resource, period, aggregate, granularity)
elif meter_name == 'instance_cpu_usage': elif meter_name == 'instance_cpu_usage':
return self.get_instance_cpu_util( return self.get_instance_cpu_util(
resource, period, aggregate, granularity) resource, period, aggregate, granularity)
@ -110,18 +113,28 @@ class FakeCeilometerMetrics(object):
Returns relative node CPU utilization <0, 100>. Returns relative node CPU utilization <0, 100>.
:param r_id: resource id :param r_id: resource id
""" """
node_uuid = '%s_%s' % (resource.uuid, resource.hostname) node = self.model.get_node_by_uuid(resource.uuid)
node = self.model.get_node_by_uuid(node_uuid)
instances = self.model.get_node_instances(node) instances = self.model.get_node_instances(node)
util_sum = 0.0 util_sum = 0.0
for instance_uuid in instances: for instance in instances:
instance = self.model.get_instance_by_uuid(instance_uuid)
total_cpu_util = instance.vcpus * self.get_instance_cpu_util( total_cpu_util = instance.vcpus * self.get_instance_cpu_util(
instance.uuid) instance, period, aggregate, granularity)
util_sum += total_cpu_util / 100.0 util_sum += total_cpu_util / 100.0
util_sum /= node.vcpus util_sum /= node.vcpus
return util_sum * 100.0 return util_sum * 100.0
def get_compute_node_ram_util(self, resource, period, aggregate,
granularity):
# Returns mock host ram usage in KB based on the allocated
# instances.
node = self.model.get_node_by_uuid(resource.uuid)
instances = self.model.get_node_instances(node)
util_sum = 0.0
for instance in instances:
util_sum += self.get_instance_ram_util(
instance, period, aggregate, granularity)
return util_sum / 1024
@staticmethod @staticmethod
def get_instance_cpu_util(resource, period, aggregate, def get_instance_cpu_util(resource, period, aggregate,
granularity): granularity):
@ -171,93 +184,7 @@ class FakeCeilometerMetrics(object):
return instance_disk_util[str(resource.uuid)] return instance_disk_util[str(resource.uuid)]
class FakeGnocchiMetrics(object): # TODO(lpetrut): consider dropping Ceilometer support, it was deprecated
def __init__(self, model): # in Ocata.
self.model = model class FakeCeilometerMetrics(FakeGnocchiMetrics):
pass
def mock_get_statistics(self, resource=None, resource_type=None,
meter_name=None, period=300, aggregate='mean',
granularity=300):
if meter_name == 'host_cpu_usage':
return self.get_compute_node_cpu_util(
resource, period, aggregate, granularity)
elif meter_name == 'instance_cpu_usage':
return self.get_instance_cpu_util(
resource, period, aggregate, granularity)
elif meter_name == 'instance_ram_usage':
return self.get_instance_ram_util(
resource, period, aggregate, granularity)
elif meter_name == 'instance_root_disk_size':
return self.get_instance_disk_root_size(
resource, period, aggregate, granularity)
def get_compute_node_cpu_util(self, resource, period, aggregate,
granularity):
"""Calculates node utilization dynamicaly.
node CPU utilization should consider
and corelate with actual instance-node mappings
provided within a cluster model.
Returns relative node CPU utilization <0, 100>.
:param r_id: resource id
"""
node_uuid = "%s_%s" % (resource.uuid, resource.hostname)
node = self.model.get_node_by_uuid(node_uuid)
instances = self.model.get_node_instances(node)
util_sum = 0.0
for instance_uuid in instances:
instance = self.model.get_instance_by_uuid(instance_uuid)
total_cpu_util = instance.vcpus * self.get_instance_cpu_util(
instance.uuid)
util_sum += total_cpu_util / 100.0
util_sum /= node.vcpus
return util_sum * 100.0
@staticmethod
def get_instance_cpu_util(resource, period, aggregate,
granularity):
instance_cpu_util = dict()
instance_cpu_util['INSTANCE_0'] = 10
instance_cpu_util['INSTANCE_1'] = 30
instance_cpu_util['INSTANCE_2'] = 60
instance_cpu_util['INSTANCE_3'] = 20
instance_cpu_util['INSTANCE_4'] = 40
instance_cpu_util['INSTANCE_5'] = 50
instance_cpu_util['INSTANCE_6'] = 100
instance_cpu_util['INSTANCE_7'] = 100
instance_cpu_util['INSTANCE_8'] = 100
instance_cpu_util['INSTANCE_9'] = 100
return instance_cpu_util[str(resource.uuid)]
@staticmethod
def get_instance_ram_util(resource, period, aggregate,
granularity):
instance_ram_util = dict()
instance_ram_util['INSTANCE_0'] = 1
instance_ram_util['INSTANCE_1'] = 2
instance_ram_util['INSTANCE_2'] = 4
instance_ram_util['INSTANCE_3'] = 8
instance_ram_util['INSTANCE_4'] = 3
instance_ram_util['INSTANCE_5'] = 2
instance_ram_util['INSTANCE_6'] = 1
instance_ram_util['INSTANCE_7'] = 2
instance_ram_util['INSTANCE_8'] = 4
instance_ram_util['INSTANCE_9'] = 8
return instance_ram_util[str(resource.uuid)]
@staticmethod
def get_instance_disk_root_size(resource, period, aggregate,
granularity):
instance_disk_util = dict()
instance_disk_util['INSTANCE_0'] = 10
instance_disk_util['INSTANCE_1'] = 15
instance_disk_util['INSTANCE_2'] = 30
instance_disk_util['INSTANCE_3'] = 35
instance_disk_util['INSTANCE_4'] = 20
instance_disk_util['INSTANCE_5'] = 25
instance_disk_util['INSTANCE_6'] = 25
instance_disk_util['INSTANCE_7'] = 25
instance_disk_util['INSTANCE_8'] = 25
instance_disk_util['INSTANCE_9'] = 25
return instance_disk_util[str(resource.uuid)]

View File

@ -64,6 +64,10 @@ class TestVMWorkloadConsolidation(TestBaseStrategy):
self.fake_metrics.get_instance_ram_util), self.fake_metrics.get_instance_ram_util),
get_instance_root_disk_size=( get_instance_root_disk_size=(
self.fake_metrics.get_instance_disk_root_size), self.fake_metrics.get_instance_disk_root_size),
get_host_cpu_usage=(
self.fake_metrics.get_compute_node_cpu_util),
get_host_ram_usage=(
self.fake_metrics.get_compute_node_ram_util)
) )
self.strategy = strategies.VMWorkloadConsolidation( self.strategy = strategies.VMWorkloadConsolidation(
config=mock.Mock(datasources=self.datasource)) config=mock.Mock(datasources=self.datasource))
@ -88,6 +92,71 @@ class TestVMWorkloadConsolidation(TestBaseStrategy):
node_util, node_util,
self.strategy.get_node_utilization(node_0)) self.strategy.get_node_utilization(node_0))
def test_get_node_utilization_using_host_metrics(self):
model = self.fake_c_cluster.generate_scenario_1()
self.m_c_model.return_value = model
self.fake_metrics.model = model
node_0 = model.get_node_by_uuid("Node_0")
# "get_node_utilization" is expected to return the maximum
# between the host metrics and the sum of the instance metrics.
data_src = self.m_datasource.return_value
cpu_usage = 30
data_src.get_host_cpu_usage = mock.Mock(return_value=cpu_usage)
data_src.get_host_ram_usage = mock.Mock(return_value=512 * 1024)
exp_cpu_usage = cpu_usage * node_0.vcpus / 100
exp_node_util = dict(cpu=exp_cpu_usage, ram=512, disk=10)
self.assertEqual(
exp_node_util,
self.strategy.get_node_utilization(node_0))
def test_get_node_utilization_after_migrations(self):
model = self.fake_c_cluster.generate_scenario_1()
self.m_c_model.return_value = model
self.fake_metrics.model = model
node_0 = model.get_node_by_uuid("Node_0")
node_1 = model.get_node_by_uuid("Node_1")
data_src = self.m_datasource.return_value
cpu_usage = 30
host_ram_usage_mb = 512
data_src.get_host_cpu_usage = mock.Mock(return_value=cpu_usage)
data_src.get_host_ram_usage = mock.Mock(
return_value=host_ram_usage_mb * 1024)
instance_uuid = 'INSTANCE_0'
instance = model.get_instance_by_uuid(instance_uuid)
self.strategy.add_migration(instance, node_0, node_1)
instance_util = self.strategy.get_instance_utilization(instance)
# Ensure that we take into account planned migrations when
# determining node utilization
exp_node_0_cpu_usage = (
cpu_usage * node_0.vcpus) / 100 - instance_util['cpu']
exp_node_1_cpu_usage = (
cpu_usage * node_1.vcpus) / 100 + instance_util['cpu']
exp_node_0_ram_usage = host_ram_usage_mb - instance.memory
exp_node_1_ram_usage = host_ram_usage_mb + instance.memory
exp_node_0_util = dict(
cpu=exp_node_0_cpu_usage,
ram=exp_node_0_ram_usage,
disk=0)
exp_node_1_util = dict(
cpu=exp_node_1_cpu_usage,
ram=exp_node_1_ram_usage,
disk=25)
self.assertEqual(
exp_node_0_util,
self.strategy.get_node_utilization(node_0))
self.assertEqual(
exp_node_1_util,
self.strategy.get_node_utilization(node_1))
def test_get_node_capacity(self): def test_get_node_capacity(self):
model = self.fake_c_cluster.generate_scenario_1() model = self.fake_c_cluster.generate_scenario_1()
self.m_c_model.return_value = model self.m_c_model.return_value = model