Support tot_data for metrics

The metrics code was just looking at the deltas between the last thirty
seconds and the new sample.  The cycles it returned to nova were simply
how many cycles were used in that thirty second sample.

Nova needs a running total.  Unfortunately this is a little complicated
in PowerVM as the metrics code keeps the VMs total (no matter which host
it runs on).  That's a weird statement.  To clarify, the base metrics
code will actually move the CPU cycles of a VM onto its new host when it
migrates.  This is different from libvirt which tracks it via the
cycles spent by the process on the host (so post migration, the
'counter' resets to 0).

This means that a VM that spends a million cycles on the source host
will have the cpu cycle counter start at a million when it lands on the
target host.

This change set takes this into account by determining the thirty second
sample delta between the calls.  It then keeps an internal to nova (via
an in memory cache) the total cycles.  This should account for the Live
Migration situation noted above.

Change-Id: Ie03c0dc927bf78f06354cbcbc15565bd3cfd780a
Closes-Bug: 1572820
This commit is contained in:
Drew Thorstensen
2016-04-26 23:36:52 -05:00
committed by Gautam Prasad
parent df3eda3f29
commit 52138dfdc4
2 changed files with 176 additions and 92 deletions

View File

@@ -80,34 +80,36 @@ class TestHostCPUStats(test.TestCase):
return lpar
return None
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats.'
'_get_fw_cycles_delta')
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats._get_cpu_freq')
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats.'
'_get_total_cycles')
'_get_total_cycles_delta')
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats.'
'_gather_user_cycles')
'_gather_user_cycles_delta')
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
def test_update_internal_metric(self, mock_ensure_ltm, mock_refresh,
mock_user_cycles, mock_total_cycles,
mock_cpu_freq):
def test_update_internal_metric(
self, mock_ensure_ltm, mock_refresh, mock_user_cycles,
mock_total_cycles, mock_cpu_freq, mock_fw_cycles):
host_stats = pvm_host.HostCPUStats(self.adpt, 'host_uuid')
mock_cpu_freq.return_value = 4116
# Make sure None is returned if there is no data.
host_stats.cur_phyp = None
host_stats._update_internal_metric()
self.assertIsNone(host_stats.cur_data)
expect = {'iowait': 0, 'idle': 0, 'kernel': 0, 'user': 0,
'frequency': 0}
self.assertEqual(expect, host_stats.tot_data)
# Create mock phyp objects to test with
mock_phyp = mock.MagicMock()
mock_phyp.sample.system_firmware.utilized_proc_cycles = 58599310268
mock_prev_phyp = mock.MagicMock(
sample=mock.MagicMock(
system_firmware=mock.MagicMock(
utilized_proc_cycles=58599310268)))
mock_fw_cycles.return_value = 58599310268
mock_prev_phyp = mock.MagicMock()
# Mock methods not currently under test
mock_user_cycles.return_value = 0
mock_user_cycles.return_value = 50
mock_total_cycles.return_value = 1.6125945178663e+16
# Make the 'prev' the current...for the first pass
@@ -117,10 +119,10 @@ class TestHostCPUStats(test.TestCase):
# Validate the dictionary... No user cycles because all of the
# previous data is empty.
expect = {'iowait': 0, 'idle': 1.6125886579352732e+16,
'kernel': 58599310268, 'user': 0,
expect = {'iowait': 0, 'idle': 1.6125886579352682e+16,
'kernel': 58599310268, 'user': 50,
'frequency': 4116}
self.assertEqual(expect, host_stats.cur_data)
self.assertEqual(expect, host_stats.tot_data)
# Mock methods not currently under test
mock_user_cycles.return_value = 30010090000
@@ -129,29 +131,34 @@ class TestHostCPUStats(test.TestCase):
# Now 'increment' it with a new current/previous
host_stats.cur_phyp = mock_phyp
host_stats.prev_phyp = mock_prev_phyp
mock_user_cycles.return_value = 100000
host_stats._update_internal_metric()
# Validate this dictionary. Note that the values are still higher
# overall, even though we add the 'deltas' from each VM.
expect = {'iowait': 0, 'idle': 1.6125856569262732e+16,
'kernel': 58599310268, 'user': 30010090000,
# Validate this dictionary. Note that these values are 'higher'
# because this is a running total.
new_kern = 58599310268 * 2
expect = {'iowait': 0, 'idle': 3.2251773158605416e+16,
'kernel': new_kern, 'user': 100050,
'frequency': 4116}
self.assertEqual(expect, host_stats.cur_data)
self.assertEqual(expect, host_stats.tot_data)
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats.'
'_get_total_cycles')
'_get_fw_cycles_delta')
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats.'
'_gather_user_cycles')
'_get_total_cycles_delta')
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats.'
'_gather_user_cycles_delta')
@mock.patch('nova_powervm.virt.powervm.host.HostCPUStats._get_cpu_freq')
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
def test_update_internal_metric_bad_total(
self, mock_ensure_ltm, mock_refresh, mock_cpu_freq,
mock_user_cycles, mock_tot_cycles):
mock_user_cycles, mock_tot_cycles, mock_fw_cycles):
"""Validates that if the total cycles are off, we handle."""
host_stats = pvm_host.HostCPUStats(self.adpt, 'host_uuid')
mock_cpu_freq.return_value = 4116
mock_user_cycles.return_value = 30010090000
mock_fw_cycles.return_value = 58599310268
# Mock the total cycles to some really low number.
mock_tot_cycles.return_value = 5
@@ -170,7 +177,7 @@ class TestHostCPUStats(test.TestCase):
# negative number.
expect = {'iowait': 0, 'idle': 0, 'kernel': 58599310268,
'user': 30010090000, 'frequency': 4116}
self.assertEqual(expect, host_stats.cur_data)
self.assertEqual(expect, host_stats.tot_data)
@mock.patch('subprocess.check_output')
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@@ -185,8 +192,8 @@ class TestHostCPUStats(test.TestCase):
'_delta_proc_cycles')
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
def test_gather_user_cycles(self, mock_ensure_ltm, mock_refresh,
mock_cycles):
def test_gather_user_cycles_delta(self, mock_ensure_ltm, mock_refresh,
mock_cycles):
# Crete objects to test with
host_stats = pvm_host.HostCPUStats(self.adpt, 'host_uuid')
mock_phyp = mock.MagicMock()
@@ -198,21 +205,15 @@ class TestHostCPUStats(test.TestCase):
# Test that we can run with previous samples and then without.
host_stats.cur_phyp = mock_phyp
host_stats.prev_phyp = mock_prev_phyp
resp = host_stats._gather_user_cycles()
resp = host_stats._gather_user_cycles_delta()
self.assertEqual(30010090000, resp)
# Last, test to make sure the previous data is used.
host_stats.prev_data = {'user': 1000000}
resp = host_stats._gather_user_cycles()
self.assertEqual(30011090000, resp)
# Now test if there is no previous sample. Since there are no previous
# samples, it will be 0 (except it WILL default up to the previous
# min cycles, which we just set to 1000000).
# samples, it will be 0.
host_stats.prev_phyp = None
mock_cycles.return_value = 0
resp = host_stats._gather_user_cycles()
self.assertEqual(1000000, resp)
resp = host_stats._gather_user_cycles_delta()
self.assertEqual(0, resp)
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
@@ -226,7 +227,7 @@ class TestHostCPUStats(test.TestCase):
# is deleted and a new one takes its place (LPAR ID 6)
delta = host_stats._delta_proc_cycles(mock_phyp.sample.lpars,
mock_prev_phyp.sample.lpars)
self.assertEqual(10010090000, delta)
self.assertEqual(10010000000, delta)
# Now test as if there is no previous data. This results in 0 as they
# could have all been LPMs with months of cycles (rather than 30
@@ -235,6 +236,16 @@ class TestHostCPUStats(test.TestCase):
self.assertEqual(0, delta2)
self.assertNotEqual(delta2, delta)
# Test that if previous sample had 0 values, the sample is not
# considered for evaluation, and resultant delta cycles is 0.
prev_lpar_sample = mock_prev_phyp.sample.lpars[0].processor
prev_lpar_sample.util_cap_proc_cycles = 0
prev_lpar_sample.util_uncap_proc_cycles = 0
prev_lpar_sample.donated_proc_cycles = 0
delta3 = host_stats._delta_proc_cycles(mock_phyp.sample.lpars,
mock_prev_phyp.sample.lpars)
self.assertEqual(0, delta3)
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
def test_delta_user_cycles(self, mock_ensure_ltm, mock_refresh):
@@ -243,15 +254,17 @@ class TestHostCPUStats(test.TestCase):
mock_phyp, mock_prev_phyp = self._get_mock_phyps()
mock_phyp.sample.lpars[0].processor.util_cap_proc_cycles = 250000
mock_phyp.sample.lpars[0].processor.util_uncap_proc_cycles = 250000
mock_phyp.sample.lpars[0].processor.donated_proc_cycles = 500
mock_prev_phyp.sample.lpars[0].processor.util_cap_proc_cycles = 0
num = 455000
mock_prev_phyp.sample.lpars[0].processor.util_uncap_proc_cycles = num
mock_prev_phyp.sample.lpars[0].processor.donated_proc_cycles = 1000
# Test that a previous sample allows us to gather just the delta.
new_elem = self._get_sample(4, mock_phyp.sample)
old_elem = self._get_sample(4, mock_prev_phyp.sample)
delta = host_stats._delta_user_cycles(new_elem, old_elem)
self.assertEqual(45000, delta)
self.assertEqual(45500, delta)
# Validate the scenario where we don't have a previous. Should default
# to 0, given no context of why the previous sample did not have the
@@ -302,13 +315,54 @@ class TestHostCPUStats(test.TestCase):
host_stats = pvm_host.HostCPUStats(self.adpt, 'host_uuid')
mock_phyp = mock.MagicMock()
mock_phyp.sample = mock.MagicMock()
mock_phyp.sample.processor.configurable_proc_units = 1
mock_phyp.sample.time_based_cycles = 1.6125945178663e+16
mock_phyp.sample.processor.configurable_proc_units = 5
mock_phyp.sample.time_based_cycles = 500
host_stats.cur_phyp = mock_phyp
# Make sure we get the full system cycles.
max_cycles = host_stats._get_total_cycles()
self.assertEqual(1.6125945178663e+16, max_cycles)
max_cycles = host_stats._get_total_cycles_delta()
self.assertEqual(2500, max_cycles)
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
def test_get_total_cycles_diff_cores(self, mock_ensure_ltm, mock_refresh):
# Mock objects to test with
host_stats = pvm_host.HostCPUStats(self.adpt, 'host_uuid')
# Latest Sample
mock_phyp = mock.MagicMock(sample=mock.MagicMock())
mock_phyp.sample.processor.configurable_proc_units = 48
mock_phyp.sample.time_based_cycles = 1000
host_stats.cur_phyp = mock_phyp
# Earlier sample. Use a higher proc unit sample
mock_phyp = mock.MagicMock(sample=mock.MagicMock())
mock_phyp.sample.processor.configurable_proc_units = 1
mock_phyp.sample.time_based_cycles = 500
host_stats.prev_phyp = mock_phyp
# Make sure we get the full system cycles.
max_cycles = host_stats._get_total_cycles_delta()
self.assertEqual(24000, max_cycles)
@mock.patch('pypowervm.tasks.monitor.util.MetricCache._refresh_if_needed')
@mock.patch('pypowervm.tasks.monitor.util.ensure_ltm_monitors')
def test_get_firmware_cycles(self, mock_ensure_ltm, mock_refresh):
# Mock objects to test with
host_stats = pvm_host.HostCPUStats(self.adpt, 'host_uuid')
# Latest Sample
mock_phyp = mock.MagicMock(sample=mock.MagicMock())
mock_phyp.sample.system_firmware.utilized_proc_cycles = 2000
# Previous Sample
prev_phyp = mock.MagicMock(sample=mock.MagicMock())
prev_phyp.sample.system_firmware.utilized_proc_cycles = 1000
host_stats.cur_phyp = mock_phyp
host_stats.prev_phyp = prev_phyp
# Get delta
delta_firmware_cycles = host_stats._get_fw_cycles_delta()
self.assertEqual(1000, delta_firmware_cycles)
def _get_mock_phyps(self):
"""Helper method to return cur_phyp and prev_phyp."""
@@ -316,12 +370,14 @@ class TestHostCPUStats(test.TestCase):
mock_lpar_4A.configure_mock(id=4, name='A')
mock_lpar_4A.processor = mock.MagicMock(
util_cap_proc_cycles=5005045000,
util_uncap_proc_cycles=5005045000)
util_uncap_proc_cycles=5005045000,
donated_proc_cycles=10000)
mock_lpar_4A_prev = mock.Mock()
mock_lpar_4A_prev.configure_mock(id=4, name='A')
mock_lpar_4A_prev.processor = mock.MagicMock(
util_cap_proc_cycles=0,
util_uncap_proc_cycles=0)
util_cap_proc_cycles=40000,
util_uncap_proc_cycles=40000,
donated_proc_cycles=0)
mock_phyp = mock.MagicMock(sample=mock.MagicMock(lpars=[mock_lpar_4A]))
mock_prev_phyp = mock.MagicMock(
sample=mock.MagicMock(lpars=[mock_lpar_4A_prev]))

View File

@@ -112,9 +112,18 @@ class HostCPUStats(pcm_util.MetricCache):
:param host_uuid: The UUID of the host CEC to maintain a metrics
cache for.
"""
# A dictionary to store the number of cycles spent. This is defined
# in the _update_internal_metric method.
self.cur_data, self.prev_data = None, None
# This represents the current state of cycles spent on the system.
# These are used to figure out usage statistics. As such, they are
# tied to the start of the nova compute process.
#
# - idle: Total idle cycles on the compute host.
# - kernel: How many cycles the hypervisor has consumed. Not a direct
# analogy to KVM
# - user: The amount of time spent by the VM's themselves.
# - iowait: Not used in PowerVM, but needed for nova.
# - frequency: The CPU frequency
self.tot_data = {'idle': 0, 'kernel': 0, 'user': 0, 'iowait': 0,
'frequency': 0}
# Invoke the parent to seed the metrics. Don't include VIO - will
# result in quicker calls.
@@ -136,7 +145,7 @@ class HostCPUStats(pcm_util.MetricCache):
# Return the dictionary format of the cycles as derived by the
# _update_internal_metric method. If there is no data yet, None would
# be the result.
return self.cur_data
return self.tot_data
def _update_internal_metric(self):
"""Uses the latest stats from the cache, and parses to Nova format.
@@ -147,53 +156,53 @@ class HostCPUStats(pcm_util.MetricCache):
# If there is no 'new' data (perhaps sampling is not turned on) then
# return no data.
if self.cur_phyp is None:
self.cur_data = None
return
# Move the current data to the previous. The previous data is used
# for some internal calculations. Blank out the current data just
# in case of error. Don't want to persist two copies of same.
self.prev_data, self.cur_data = self.cur_data, None
# Compute the cycles spent in FW since last collection.
fw_cycles_delta = self._get_fw_cycles_delta()
# Now we need the firmware cycles.
fw_cycles = self.cur_phyp.sample.system_firmware.utilized_proc_cycles
# Compute the cycles the system spent since last run.
tot_cycles_delta = self._get_total_cycles_delta()
# Compute the max cycles.
tot_cycles = self._get_total_cycles()
# Get the total user cycles.
user_cycles = self._gather_user_cycles()
# Get the user cycles since last run
user_cycles_delta = self._gather_user_cycles_delta()
# Make sure that the total cycles is higher than the user/fw cycles.
# Should not happen, but just in case there is any precision loss from
# CPU data back to system.
if user_cycles + fw_cycles > tot_cycles:
LOG.warning(_LW("Host CPU Metrics determined that the total "
"cycles reported was less than the used cycles. "
"This indicates an issue with the PCM data. "
"Please investigate the results."))
tot_cycles = user_cycles + fw_cycles
if user_cycles_delta + fw_cycles_delta > tot_cycles_delta:
LOG.warning(_LW(
"Host CPU Metrics determined that the total cycles reported "
"was less than the used cycles. This indicates an issue with "
"the PCM data. Please investigate the results.\n"
"Total Delta Cycles: %(tot_cycles)d\n"
"User Delta Cycles: %(user_cycles)d\n"
"Firmware Delta Cycles: %(fw_cycles)d"),
{'tot_cycles': tot_cycles_delta, 'fw_cycles': fw_cycles_delta,
'user_cycles': user_cycles_delta})
tot_cycles_delta = user_cycles_delta + fw_cycles_delta
# Idle is the subtraction of all.
idle_cycles = tot_cycles - user_cycles - fw_cycles
idle_delta_cycles = (tot_cycles_delta - user_cycles_delta -
fw_cycles_delta)
# Get the processor frequency.
freq = self._get_cpu_freq()
# The only moving cycles are idle, kernel and user.
self.tot_data['idle'] += idle_delta_cycles
self.tot_data['kernel'] += fw_cycles_delta
self.tot_data['user'] += user_cycles_delta
# Now save these cycles to the internal data structure.
self.cur_data = {'idle': idle_cycles, 'kernel': fw_cycles,
'user': user_cycles, 'iowait': 0, 'frequency': freq}
# Frequency doesn't accumulate like the others. So this stays static.
self.tot_data['frequency'] = self._get_cpu_freq()
def _gather_user_cycles(self):
"""The estimated total user cycles.
def _gather_user_cycles_delta(self):
"""The estimated user cycles of all VMs/VIOSes since last run.
The sample data includes information about how much CPU has been used
by workloads and the Virtual I/O Servers. There is not one global
counter that can be used to obtain the CPU spent cycles.
This method will calculate the delta of workload (and I/O Server)
cycles between the previous sample and the current sample, and then
add it to the previous 'user cycles'.
cycles between the previous sample and the current sample.
There are edge cases for this however. If a VM is deleted or migrated
its cycles will no longer be taken into account. The algorithm takes
@@ -220,11 +229,7 @@ class HostCPUStats(pcm_util.MetricCache):
vios_delta_cycles = self._delta_proc_cycles(vios_cur_samples,
vios_prev_samples)
# The used cycles is the total of used cycles from before along with
# the new delta cycles.
prev_user_cycles = (0 if self.prev_data is None
else self.prev_data['user'])
return prev_user_cycles + vm_delta_cycles + vios_delta_cycles
return vm_delta_cycles + vios_delta_cycles
@staticmethod
def _get_cpu_freq():
@@ -270,11 +275,20 @@ class HostCPUStats(pcm_util.MetricCache):
# included.
if prev_sample is None:
return 0
# If the previous sample values are all 0 (happens when VM is just
# migrated, phyp creates entry for VM with 0 values), then ignore the
# sample.
if (prev_sample.processor.util_cap_proc_cycles ==
prev_sample.processor.util_uncap_proc_cycles ==
prev_sample.processor.donated_proc_cycles == 0):
return 0
prev_amount = (prev_sample.processor.util_cap_proc_cycles +
prev_sample.processor.util_uncap_proc_cycles)
prev_sample.processor.util_uncap_proc_cycles -
prev_sample.processor.donated_proc_cycles)
cur_amount = (cur_sample.processor.util_cap_proc_cycles +
cur_sample.processor.util_uncap_proc_cycles)
cur_sample.processor.util_uncap_proc_cycles -
cur_sample.processor.donated_proc_cycles)
return cur_amount - prev_amount
@staticmethod
@@ -293,16 +307,30 @@ class HostCPUStats(pcm_util.MetricCache):
return prev_sample
return None
def _get_total_cycles(self):
"""Returns the 'total cycles' on the system.
def _get_total_cycles_delta(self):
"""Returns the 'total cycles' on the system since last sample.
:return: The estimated total cycles spent
:return: The total delta cycles since the last run.
"""
sample = self.cur_phyp.sample
cur_cores = sample.processor.configurable_proc_units
cur_cycles_per_core = sample.time_based_cycles
# Gather the estimated cycle count
total_procs = sample.processor.configurable_proc_units
cycles_per_sec = sample.time_based_cycles
est_total_cycles_per_sec = total_procs * cycles_per_sec
if self.prev_phyp:
prev_cycles_per_core = self.prev_phyp.sample.time_based_cycles
else:
prev_cycles_per_core = 0
return est_total_cycles_per_sec
# Get the delta cycles between the cores.
delta_cycles_per_core = cur_cycles_per_core - prev_cycles_per_core
# Total cycles since last sample is the 'per cpu' cycles spent
# times the number of active cores.
return delta_cycles_per_core * cur_cores
def _get_fw_cycles_delta(self):
"""Returns the number of cycles spent on firmware since last sample."""
cur_fw = self.cur_phyp.sample.system_firmware.utilized_proc_cycles
prev_fw = (self.prev_phyp.sample.system_firmware.utilized_proc_cycles
if self.prev_phyp else 0)
return cur_fw - prev_fw