Adjust test_autohold to catch stats errors

This associates resources with only the held node and validates
that we emit stats that:

* Include the node while it is running a job
* Do not include the node after the job is complete and the node is held
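
In spirit, the new assertions reduce to the pattern sketched below. This is a minimal, self-contained illustration of the helper pair the diff adds to `ZuulTestCase` (`assertReportedStat` and the new `assertUnReportedStat`); the standalone function names, the `stats` list, and the packet values here are hypothetical, while the stat key and the `key:value|kind` statsd wire format match what the test checks.

```python
# Minimal sketch of the assertReportedStat/assertUnReportedStat pattern.
# Packets use the statsd wire format "key:value|kind"; the sample values
# are made up, not captured from a real Zuul run.

class StatException(Exception):
    # Expected stat missing, or an unexpected stat found.
    pass


def assert_reported_stat(stats, key, value=None, kind=None):
    # Scan the queued packets for a matching key/value/kind.
    for packet in stats:
        k, _, rest = packet.partition(':')
        v, _, knd = rest.partition('|')
        if k == key and value in (None, v) and kind in (None, knd):
            return v
    raise StatException("Key %s not found in reported stats" % key)


def assert_unreported_stat(stats, key, value=None, kind=None):
    # Invert the check: succeed only if the lookup fails.
    try:
        v = assert_reported_stat(stats, key, value=value, kind=kind)
    except StatException:
        return
    raise StatException("Key %s found in reported stats: %s" % (key, v))


# While the job runs, the held node's RAM counts toward the tenant gauge,
# and no zero value may have been emitted for it:
stats = ['zuul.nodepool.resources.tenant.tenant-one.ram:1024|g']
key = 'zuul.nodepool.resources.tenant.tenant-one.ram'
assert_reported_stat(stats, key, value='1024', kind='g')
assert_unreported_stat(stats, key, value='0', kind='g')
```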

Change-Id: I832005dacd5e8c2ca00840fd5976834e88cfdf98
James E. Blair 2021-08-20 08:46:47 -07:00
parent 919c5a3654
commit c22c63ab5f
2 changed files with 53 additions and 23 deletions

tests/base.py

```diff
@@ -215,6 +215,11 @@ def registerProjects(source_name, client, config):
         client.addProjectByName(project)
 
 
+class StatException(Exception):
+    # Used by assertReportedStat
+    pass
+
+
 class GerritDriverMock(GerritDriver):
     def __init__(self, registry, changes: Dict[str, Dict[str, Change]],
                  upstream_root: str, additional_event_queues, poller_events,
@@ -5426,7 +5431,7 @@ class ZuulTestCase(BaseTestCase):
         self.assertEqual(self.getZKTree(client.WAITER_ROOT), [])
         self.assertEqual(self.getZKTree(client.LOCK_ROOT), [])
 
-    def assertReportedStat(self, key, value=None, kind=None):
+    def assertReportedStat(self, key, value=None, kind=None, timeout=5):
         """Check statsd output
 
         Check statsd return values. A ``value`` should specify a
@@ -5443,6 +5448,8 @@ class ZuulTestCase(BaseTestCase):
           - ``ms`` timing
           - ``s`` set
 
+        :arg int timeout: How long to wait for the stat to appear
+
         :returns: The value
 
         """
@@ -5450,7 +5457,7 @@ class ZuulTestCase(BaseTestCase):
             self.assertNotEqual(kind, None)
 
         start = time.time()
-        while time.time() < (start + 5):
+        while time.time() <= (start + timeout):
             # Note our fake statsd just queues up results in a queue.
             # We just keep going through them until we find one that
             # matches, or fail out. If statsd pipelines are used,
@@ -5480,7 +5487,7 @@ class ZuulTestCase(BaseTestCase):
                     already_set_keys.update([k])
                 for k in already_set_keys:
                     if key != k and key.startswith(k):
-                        raise Exception(
+                        raise StatException(
                             "Key %s is a gauge/counter and "
                             "we are trying to set subkey %s" % (k, key))
@@ -5513,7 +5520,16 @@ class ZuulTestCase(BaseTestCase):
                     return s_value
             time.sleep(0.1)
 
-        raise Exception("Key %s not found in reported stats" % key)
+        raise StatException("Key %s not found in reported stats" % key)
+
+    def assertUnReportedStat(self, key, value=None, kind=None):
+        try:
+            value = self.assertReportedStat(key, value=value,
+                                            kind=kind, timeout=0)
+        except StatException:
+            return
+        raise StatException("Key %s found in reported stats: %s" %
+                            (key, value))
 
     def assertBuilds(self, builds):
         """Assert that the running builds are as described.
```

tests/unit/test_scheduler.py

```diff
@@ -1922,13 +1922,6 @@ class TestScheduler(ZuulTestCase):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
                                           self.gearman_server.port)
         self.addCleanup(client.shutdown)
 
-        # Set resources so we can examine the code path for updating
-        # the stats on autohold.
-        self.fake_nodepool.resources = {
-            'cores': 2,
-            'ram': 1024,
-            'instances': 1,
-        }
         r = client.autohold('tenant-one', 'org/project', 'project-test2',
                             "", "", "reason text", 1)
         self.assertTrue(r)
@@ -1966,17 +1959,40 @@ class TestScheduler(ZuulTestCase):
                 break
         self.assertIsNone(held_node)
 
-        self.hold_jobs_in_queue = True
+        # Hold in build to check the stats
+        self.executor_server.hold_jobs_in_build = True
 
         # Now test that failed jobs are autoheld
 
+        # Set resources only for this node so we can examine the code
+        # path for updating the stats on autohold.
+        self.fake_nodepool.resources = {
+            'cores': 2,
+            'ram': 1024,
+            'instances': 1,
+        }
+
+        # Some convenience variables for checking these stats.
+        tenant_ram_stat = 'zuul.nodepool.resources.tenant.tenant-one.ram'
+        project_ram_stat = ('zuul.nodepool.resources.project.'
+                            'review_example_com/org/project.ram')
+
         B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
         self.executor_server.failJob('project-test2', B)
         self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
         self.waitUntilSettled()
 
-        self.hold_jobs_in_queue = False
-        self.executor_api.release()
+        # Get the build request object
+        build = list(self.scheds.first.sched.executor.builds.values())[0]
+
+        # We should report using the held node's resources
+        self.assertReportedStat(tenant_ram_stat, value='1024', kind='g')
+        self.assertReportedStat(project_ram_stat, value='1024', kind='g')
+        self.assertUnReportedStat(tenant_ram_stat, value='0', kind='g')
+        self.assertUnReportedStat(project_ram_stat, value='0', kind='g')
+
+        self.executor_server.hold_jobs_in_build = False
+        self.executor_server.release()
         self.waitUntilSettled()
 
         self.assertEqual(B.data['status'], 'NEW')
@@ -2009,7 +2025,14 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(1, len(request2.nodes))
         self.assertEqual(1, len(request2.nodes[0]["nodes"]))
 
+        # We should now report that we no longer use the nodes resources
+        self.assertReportedStat(tenant_ram_stat, value='1024', kind='g')
+        self.assertReportedStat(project_ram_stat, value='1024', kind='g')
+        self.assertReportedStat(tenant_ram_stat, value='0', kind='g')
+        self.assertReportedStat(project_ram_stat, value='0', kind='g')
+
         # Another failed change should not hold any more nodes
+        self.fake_nodepool.resources = {}
         C = self.fake_gerrit.addFakeChange('org/project', 'master', 'C')
         self.executor_server.failJob('project-test2', C)
         self.fake_gerrit.addEvent(C.getPatchsetCreatedEvent(1))
@@ -2036,15 +2059,6 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(3, len(node_states))
         self.assertEqual([zuul.model.STATE_USED] * 3, node_states)
 
-        # The resources should be reported
-        self.assertReportedStat(
-            'zuul.nodepool.resources.tenant.tenant-one.ram',
-            value='1024', kind='g')
-        self.assertReportedStat(
-            'zuul.nodepool.resources.project.'
-            'review_example_com/org/project.ram',
-            value='1024', kind='g')
-
     @simple_layout('layouts/autohold.yaml')
     def test_autohold_info(self):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
```
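
Two details of the new flow are worth spelling out. First, the loop bound in `assertReportedStat` changed from `<` to `<=`, apparently so that a `timeout=0` call can still make a pass over the stats already received; that is what lets `assertUnReportedStat` check "this was never emitted" without waiting out the full five-second default. Second, the arrangement can be simulated end to end. The following sketch wires together the two illustrative helpers from earlier on this page; the names and packet values are hypothetical, not Zuul's real test harness:

```python
import socket
import time

# Assumes the FakeStatsd and assert_* sketches defined earlier on this
# page; both are illustrations, not Zuul's actual test fixtures.
server = FakeStatsd()
server.start()

client = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

# "Job running": the held node's resources count toward the gauges.
client.sendto(b'zuul.nodepool.resources.tenant.tenant-one.ram:1024|g',
              ('127.0.0.1', server.port))
time.sleep(0.2)  # give the listener thread a moment to drain the socket

received = []
while not server.stats.empty():
    received.append(server.stats.get())

key = 'zuul.nodepool.resources.tenant.tenant-one.ram'
assert_reported_stat(received, key, value='1024', kind='g')
assert_unreported_stat(received, key, value='0', kind='g')
print("gauge reported while the job runs; no zero value emitted")
```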