Adjust test_autohold to catch stats errors

This associates resources with only the held node and validates
that we emit stats that:

* Include the node while it is running a job
* Do not include the node after the job is complete and the node is held
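
In spirit, the new assertions reduce to the pattern sketched below. This is a minimal, self-contained illustration of the helper pair the diff adds to `ZuulTestCase` (`assertReportedStat` and the new `assertUnReportedStat`); the standalone function names, the `stats` list, and the packet values here are hypothetical, while the stat key and the `key:value|kind` statsd wire format match what the test checks.

```python
# Minimal sketch of the assertReportedStat/assertUnReportedStat pattern.
# Packets use the statsd wire format "key:value|kind"; the sample values
# are made up, not captured from a real Zuul run.

class StatException(Exception):
    # Expected stat missing, or an unexpected stat found.
    pass


def assert_reported_stat(stats, key, value=None, kind=None):
    # Scan the queued packets for a matching key/value/kind.
    for packet in stats:
        k, _, rest = packet.partition(':')
        v, _, knd = rest.partition('|')
        if k == key and value in (None, v) and kind in (None, knd):
            return v
    raise StatException("Key %s not found in reported stats" % key)


def assert_unreported_stat(stats, key, value=None, kind=None):
    # Invert the check: succeed only if the lookup fails.
    try:
        v = assert_reported_stat(stats, key, value=value, kind=kind)
    except StatException:
        return
    raise StatException("Key %s found in reported stats: %s" % (key, v))


# While the job runs, the held node's RAM counts toward the tenant gauge,
# and no zero value may have been emitted for it:
stats = ['zuul.nodepool.resources.tenant.tenant-one.ram:1024|g']
key = 'zuul.nodepool.resources.tenant.tenant-one.ram'
assert_reported_stat(stats, key, value='1024', kind='g')
assert_unreported_stat(stats, key, value='0', kind='g')
```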

Change-Id: I832005dacd5e8c2ca00840fd5976834e88cfdf98
James E. Blair 2021-08-20 08:46:47 -07:00
parent 919c5a3654
commit c22c63ab5f
2 changed files with 53 additions and 23 deletions

tests/base.py

```diff
@@ -215,6 +215,11 @@ def registerProjects(source_name, client, config):
         client.addProjectByName(project)
 
 
+class StatException(Exception):
+    # Used by assertReportedStat
+    pass
+
+
 class GerritDriverMock(GerritDriver):
     def __init__(self, registry, changes: Dict[str, Dict[str, Change]],
                  upstream_root: str, additional_event_queues, poller_events,
@@ -5426,7 +5431,7 @@ class ZuulTestCase(BaseTestCase):
         self.assertEqual(self.getZKTree(client.WAITER_ROOT), [])
         self.assertEqual(self.getZKTree(client.LOCK_ROOT), [])
 
-    def assertReportedStat(self, key, value=None, kind=None):
+    def assertReportedStat(self, key, value=None, kind=None, timeout=5):
         """Check statsd output
 
         Check statsd return values. A ``value`` should specify a
@@ -5443,6 +5448,8 @@ class ZuulTestCase(BaseTestCase):
           - ``ms`` timing
           - ``s`` set
 
+        :arg int timeout: How long to wait for the stat to appear
+
         :returns: The value
 
         """
@@ -5450,7 +5457,7 @@ class ZuulTestCase(BaseTestCase):
             self.assertNotEqual(kind, None)
 
         start = time.time()
-        while time.time() < (start + 5):
+        while time.time() <= (start + timeout):
             # Note our fake statsd just queues up results in a queue.
             # We just keep going through them until we find one that
             # matches, or fail out. If statsd pipelines are used,
@@ -5480,7 +5487,7 @@ class ZuulTestCase(BaseTestCase):
                     already_set_keys.update([k])
                 for k in already_set_keys:
                     if key != k and key.startswith(k):
-                        raise Exception(
+                        raise StatException(
                             "Key %s is a gauge/counter and "
                             "we are trying to set subkey %s" % (k, key))
@@ -5513,7 +5520,16 @@ class ZuulTestCase(BaseTestCase):
                     return s_value
             time.sleep(0.1)
 
-        raise Exception("Key %s not found in reported stats" % key)
+        raise StatException("Key %s not found in reported stats" % key)
+
+    def assertUnReportedStat(self, key, value=None, kind=None):
+        try:
+            value = self.assertReportedStat(key, value=value,
+                                            kind=kind, timeout=0)
+        except StatException:
+            return
+        raise StatException("Key %s found in reported stats: %s" %
+                            (key, value))
 
     def assertBuilds(self, builds):
         """Assert that the running builds are as described.
```

tests/unit/test_scheduler.py

```diff
@@ -1922,13 +1922,6 @@ class TestScheduler(ZuulTestCase):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
                                           self.gearman_server.port)
         self.addCleanup(client.shutdown)
 
-        # Set resources so we can examine the code path for updating
-        # the stats on autohold.
-        self.fake_nodepool.resources = {
-            'cores': 2,
-            'ram': 1024,
-            'instances': 1,
-        }
         r = client.autohold('tenant-one', 'org/project', 'project-test2',
                             "", "", "reason text", 1)
         self.assertTrue(r)
@@ -1966,17 +1959,40 @@ class TestScheduler(ZuulTestCase):
                 break
         self.assertIsNone(held_node)
 
-        self.hold_jobs_in_queue = True
+        # Hold in build to check the stats
+        self.executor_server.hold_jobs_in_build = True
 
         # Now test that failed jobs are autoheld
 
+        # Set resources only for this node so we can examine the code
+        # path for updating the stats on autohold.
+        self.fake_nodepool.resources = {
+            'cores': 2,
+            'ram': 1024,
+            'instances': 1,
+        }
+
+        # Some convenience variables for checking these stats.
+        tenant_ram_stat = 'zuul.nodepool.resources.tenant.tenant-one.ram'
+        project_ram_stat = ('zuul.nodepool.resources.project.'
+                            'review_example_com/org/project.ram')
+
         B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
         self.executor_server.failJob('project-test2', B)
         self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
         self.waitUntilSettled()
 
-        self.hold_jobs_in_queue = False
-        self.executor_api.release()
+        # Get the build request object
+        build = list(self.scheds.first.sched.executor.builds.values())[0]
+
+        # We should report using the held node's resources
+        self.assertReportedStat(tenant_ram_stat, value='1024', kind='g')
+        self.assertReportedStat(project_ram_stat, value='1024', kind='g')
+        self.assertUnReportedStat(tenant_ram_stat, value='0', kind='g')
+        self.assertUnReportedStat(project_ram_stat, value='0', kind='g')
+
+        self.executor_server.hold_jobs_in_build = False
+        self.executor_server.release()
         self.waitUntilSettled()
 
         self.assertEqual(B.data['status'], 'NEW')
@@ -2009,7 +2025,14 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(1, len(request2.nodes))
         self.assertEqual(1, len(request2.nodes[0]["nodes"]))
 
+        # We should now report that we no longer use the nodes resources
+        self.assertReportedStat(tenant_ram_stat, value='1024', kind='g')
+        self.assertReportedStat(project_ram_stat, value='1024', kind='g')
+        self.assertReportedStat(tenant_ram_stat, value='0', kind='g')
+        self.assertReportedStat(project_ram_stat, value='0', kind='g')
+
         # Another failed change should not hold any more nodes
+        self.fake_nodepool.resources = {}
         C = self.fake_gerrit.addFakeChange('org/project', 'master', 'C')
         self.executor_server.failJob('project-test2', C)
         self.fake_gerrit.addEvent(C.getPatchsetCreatedEvent(1))
@@ -2036,15 +2059,6 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(3, len(node_states))
         self.assertEqual([zuul.model.STATE_USED] * 3, node_states)
 
-        # The resources should be reported
-        self.assertReportedStat(
-            'zuul.nodepool.resources.tenant.tenant-one.ram',
-            value='1024', kind='g')
-        self.assertReportedStat(
-            'zuul.nodepool.resources.project.'
-            'review_example_com/org/project.ram',
-            value='1024', kind='g')
-
     @simple_layout('layouts/autohold.yaml')
     def test_autohold_info(self):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
```
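
Two details of the new flow are worth spelling out. First, the loop bound in `assertReportedStat` changed from `<` to `<=`, apparently so that a `timeout=0` call can still make a pass over the stats already received; that is what lets `assertUnReportedStat` check "this was never emitted" without waiting out the full five-second default. Second, the arrangement can be simulated end to end. The following sketch wires together the two illustrative helpers from earlier on this page; the names and packet values are hypothetical, not Zuul's real test harness:

```python
import socket
import time

# Assumes the FakeStatsd and assert_* sketches defined earlier on this
# page; both are illustrations, not Zuul's actual test fixtures.
server = FakeStatsd()
server.start()

client = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

# "Job running": the held node's resources count toward the gauges.
client.sendto(b'zuul.nodepool.resources.tenant.tenant-one.ram:1024|g',
              ('127.0.0.1', server.port))
time.sleep(0.2)  # give the listener thread a moment to drain the socket

received = []
while not server.stats.empty():
    received.append(server.stats.get())

key = 'zuul.nodepool.resources.tenant.tenant-one.ram'
assert_reported_stat(received, key, value='1024', kind='g')
assert_unreported_stat(received, key, value='0', kind='g')
print("gauge reported while the job runs; no zero value emitted")
```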