From 1fdf9f5490b3d58956816c23b8f3d4d1dc10086e Mon Sep 17 00:00:00 2001
From: "James E. Blair" <jim@acmegating.com>
Date: Fri, 23 Jul 2021 10:46:39 -0700
Subject: [PATCH] Report all build[set] completions to DB

We missed some cases where builds might be aborted and the results
not reported to the database.  This updates the test framework to
assert that tests end with no open builds or buildsets in the
database.

To fix the actual issues, we need to report some build completions
in the scheduler instead of the pipeline manager.  So to do that,
we grab a SQL reporter object when initializing the scheduler
(and we therefore no longer need to do so when initializing the
pipeline manager).  The SQL reporter isn't like the rest of the
reporters -- it isn't pipeline specific, so a single global instance
is fine.

Finally, initializing the SQL reporter during scheduler init had
some conflicts with a unit test which tested that the merger could
load "source-only" connections.  That test actually verified that
the *scheduler* loaded source-only connections.  So to correct this,
it now verifies that the executor (which has a merger and is under
the same constraints as the merger for this purpose) can do so.  We
no longer need the source_only flag in tests.

Change-Id: I1a983dcc9f4e5282c11af23813a4ca1c0f8e9d9d
---
 tests/base.py                  | 41 ++++++++++++++++++++++++++++------
 tests/unit/test_connection.py  | 13 +++++------
 tests/unit/test_scheduler.py   |  2 +-
 zuul/driver/sql/sqlreporter.py |  4 ++--
 zuul/manager/__init__.py       | 11 +++++----
 zuul/scheduler.py              | 26 +++++++++++++++++++++
 6 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/tests/base.py b/tests/base.py
index 1a2b5025de..0b474426ea 100644
--- a/tests/base.py
+++ b/tests/base.py
@@ -66,6 +66,7 @@ from git.util import IterableList
 import yaml
 import paramiko
 import prometheus_client.exposition
+import sqlalchemy
 
 from zuul.driver.sql.sqlconnection import DatabaseSession
 from zuul.model import (
@@ -4143,7 +4144,7 @@ class SymLink(object):
 class SchedulerTestApp:
     def __init__(self, log, config, changes, additional_event_queues,
                  upstream_root, rpcclient, poller_events,
-                 git_url_with_auth, source_only, fake_sql,
+                 git_url_with_auth, fake_sql,
                  add_cleanup, validate_tenants, instance_id):
         self.log = log
         self.config = config
@@ -4162,7 +4163,7 @@ class SchedulerTestApp:
             add_cleanup,
             fake_sql,
         )
-        self.connections.configure(self.config, source_only=source_only)
+        self.connections.configure(self.config)
 
         self.sched = TestScheduler(self.config, self.connections, self)
         self.sched.log = logging.getLogger(f"zuul.Scheduler-{instance_id}")
@@ -4212,7 +4213,7 @@ class SchedulerTestManager:
 
     def create(self, log, config, changes, additional_event_queues,
                upstream_root, rpcclient, poller_events,
-               git_url_with_auth, source_only, fake_sql, add_cleanup,
+               git_url_with_auth, fake_sql, add_cleanup,
                validate_tenants):
         # Since the config contains a regex we cannot use copy.deepcopy()
         # as this will raise an exception with Python <3.7
@@ -4233,7 +4234,7 @@ class SchedulerTestManager:
         app = SchedulerTestApp(log, scheduler_config, changes,
                                additional_event_queues, upstream_root,
                                rpcclient, poller_events,
-                               git_url_with_auth, source_only,
+                               git_url_with_auth,
                                fake_sql, add_cleanup,
                                validate_tenants, instance_id)
         self.instances.append(app)
@@ -4346,7 +4347,6 @@ class ZuulTestCase(BaseTestCase):
     use_ssl: bool = False
     git_url_with_auth: bool = False
     log_console_port: int = 19885
-    source_only: bool = False
     fake_sql: bool = False
     validate_tenants = None
 
@@ -4504,7 +4504,7 @@ class ZuulTestCase(BaseTestCase):
             self.upstream_root, self.rpcclient, self.poller_events,
             self.git_url_with_auth, self.addCleanup, True)
         executor_connections.configure(self.config,
-                                       source_only=self.source_only)
+                                       source_only=True)
         self.executor_api = TestingExecutorApi(self.zk_client)
         self.executor_server = RecordingExecutorServer(
             self.config,
@@ -4522,7 +4522,7 @@ class ZuulTestCase(BaseTestCase):
         self.scheds.create(
             self.log, self.config, self.changes,
             self.additional_event_queues, self.upstream_root, self.rpcclient,
-            self.poller_events, self.git_url_with_auth, self.source_only,
+            self.poller_events, self.git_url_with_auth,
             self.fake_sql, self.addCleanup, self.validate_tenants)
 
         self.merge_server = None
@@ -4830,6 +4830,32 @@ class ZuulTestCase(BaseTestCase):
                 with open(os.path.join(root, fn)) as f:
                     self.assertTrue(f.read() in test_keys)
 
+    def assertSQLState(self):
+        reporter = self.scheds.first.connections.getSqlReporter(None)
+        with self.scheds.first.connections.getSqlConnection().\
+             engine.connect() as conn:
+
+            try:
+                result = conn.execute(
+                    sqlalchemy.sql.select(
+                        reporter.connection.zuul_buildset_table)
+                )
+            except sqlalchemy.exc.ProgrammingError:
+                # Table doesn't exist
+                return
+
+            for buildset in result.fetchall():
+                self.assertIsNotNone(buildset.result)
+
+            result = conn.execute(
+                sqlalchemy.sql.select(reporter.connection.zuul_build_table)
+            )
+
+            for build in result.fetchall():
+                self.assertIsNotNone(build.result)
+                self.assertIsNotNone(build.start_time)
+                self.assertIsNotNone(build.end_time)
+
     def assertFinalState(self):
         self.log.debug("Assert final state")
         # Make sure no jobs are running
@@ -4847,6 +4873,7 @@ class ZuulTestCase(BaseTestCase):
         self.assertEmptyQueues()
         self.assertNodepoolState()
         self.assertNoGeneratedKeys()
+        self.assertSQLState()
         ipm = zuul.manager.independent.IndependentPipelineManager
         for tenant in self.scheds.first.sched.abide.tenants.values():
             for pipeline in tenant.layout.pipelines.values():
diff --git a/tests/unit/test_connection.py b/tests/unit/test_connection.py
index c3a8bbf049..3f5e48f00a 100644
--- a/tests/unit/test_connection.py
+++ b/tests/unit/test_connection.py
@@ -539,17 +539,16 @@ class TestMultipleGerrits(ZuulTestCase):
 class TestConnectionsMerger(ZuulTestCase):
     config_file = 'zuul-connections-merger.conf'
     tenant_config_file = 'config/single-tenant/main.yaml'
-    source_only = True
 
     def test_connections_merger(self):
         "Test merger only configures source connections"
 
-        self.assertIn("gerrit", self.scheds.first.connections.connections)
-        self.assertIn("github", self.scheds.first.connections.connections)
-        self.assertNotIn("smtp", self.scheds.first.connections.connections)
-        self.assertNotIn("sql", self.scheds.first.connections.connections)
-        self.assertNotIn("timer", self.scheds.first.connections.connections)
-        self.assertNotIn("zuul", self.scheds.first.connections.connections)
+        self.assertIn("gerrit", self.executor_server.connections.connections)
+        self.assertIn("github", self.executor_server.connections.connections)
+        self.assertNotIn("smtp", self.executor_server.connections.connections)
+        self.assertNotIn("sql", self.executor_server.connections.connections)
+        self.assertNotIn("timer", self.executor_server.connections.connections)
+        self.assertNotIn("zuul", self.executor_server.connections.connections)
 
 
 class TestConnectionsCgit(ZuulTestCase):
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index cfb1629aba..f3d72f41b6 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -92,7 +92,7 @@ class TestSchedulerZone(ZuulTestCase):
             self.upstream_root, self.rpcclient, self.poller_events,
             self.git_url_with_auth, self.addCleanup, self.fake_sql)
         executor_connections.configure(self.config,
-                                       source_only=self.source_only)
+                                       source_only=True)
         self.executor_server_unzoned = RecordingExecutorServer(
             config,
             connections=executor_connections,
diff --git a/zuul/driver/sql/sqlreporter.py b/zuul/driver/sql/sqlreporter.py
index b3359315e6..c2cc173546 100644
--- a/zuul/driver/sql/sqlreporter.py
+++ b/zuul/driver/sql/sqlreporter.py
@@ -90,7 +90,7 @@ class SQLReporter(BaseReporter):
             )
             return db_buildset
 
-    def reportBuildsetEnd(self, buildset, action, final):
+    def reportBuildsetEnd(self, buildset, action, final, result=None):
         if not buildset.uuid:
             return
         if final:
@@ -102,7 +102,7 @@ class SQLReporter(BaseReporter):
             db_buildset = db.getBuildset(
                 tenant=buildset.item.pipeline.tenant.name, uuid=buildset.uuid)
             if db_buildset:
-                db_buildset.result = buildset.result
+                db_buildset.result = buildset.result or result
                 db_buildset.message = message
             elif buildset.builds:
                 self.log.error("Unable to find buildset "
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index bb967631f7..788e13f551 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -60,7 +60,7 @@ class PipelineManager(metaclass=ABCMeta):
         self.ref_filters = []
         # Cached dynamic layouts (layout uuid -> layout)
         self._layout_cache = {}
-        self.sql = self.sched.connections.getSqlReporter(pipeline)
+        self.sql = self.sched.sql
 
     def __str__(self):
         return "<%s %s>" % (self.__class__.__name__, self.pipeline.name)
@@ -196,7 +196,7 @@ class PipelineManager(metaclass=ABCMeta):
                 )
         # This might be called after canceljobs, which also sets a
         # non-final 'cancel' result.
-        self.sql.reportBuildsetEnd(item.current_build_set, 'cancel',
+        self.sql.reportBuildsetEnd(item.current_build_set, 'dequeue',
                                    final=False)
 
     def sendReport(self, action_reporters, item, message=None):
@@ -554,6 +554,7 @@ class PipelineManager(metaclass=ABCMeta):
         # Without this check, all items with a valid result would be reported
         # twice.
         if not item.current_build_set.result and item.live:
+            item.setReportedResult('DEQUEUED')
             self.reportDequeue(item)
 
     def removeItem(self, item):
@@ -716,8 +717,11 @@ class PipelineManager(metaclass=ABCMeta):
         # reported immediately afterwards during queue processing.
         if (prime and item.current_build_set.ref and not
                 item.didBundleStartReporting()):
+            # Force a dequeued result here because we haven't actually
+            # reported the item, but we are done with this buildset.
             self.sql.reportBuildsetEnd(
-                item.current_build_set, 'dequeue', final=False)
+                item.current_build_set, 'dequeue', final=False,
+                result='DEQUEUED')
             item.resetAllBuilds()
 
         for job in jobs_to_cancel:
@@ -1422,7 +1426,6 @@ class PipelineManager(metaclass=ABCMeta):
         item = build.build_set.item
 
         log.debug("Build %s of %s completed" % (build, item.change))
-        self.sql.reportBuildEnd(build, final=(not build.retry))
         item.pipeline.tenant.semaphore_handler.release(item, build.job)
 
         if item.getJob(build.job.name) is None:
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index ecec8d890a..923dfb4d34 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -145,6 +145,7 @@ class Scheduler(threading.Thread):
 
         self._zuul_app = app
         self.connections = connections
+        self.sql = self.connections.getSqlReporter(None)
         self.statsd = get_statsd(config)
         self.rpc = rpclistener.RPCListener(config, self)
         self.rpc_slow = rpclistener.RPCListenerSlow(config, self)
@@ -1124,6 +1125,13 @@ class Scheduler(threading.Thread):
                             item.getJob(request_job),
                         )
                     )
+                try:
+                    self.sql.reportBuildsetEnd(
+                        item.current_build_set, 'dequeue',
+                        final=False, result='DEQUEUED')
+                except Exception:
+                    self.log.exception(
+                        "Error reporting buildset completion to DB:")
 
             for build in builds_to_cancel:
                 self.log.info(
@@ -1216,6 +1224,13 @@ class Scheduler(threading.Thread):
                             item.getJob(request_job),
                         )
                     )
+                try:
+                    self.sql.reportBuildsetEnd(
+                        item.current_build_set, 'dequeue',
+                        final=False, result='DEQUEUED')
+                except Exception:
+                    self.log.exception(
+                        "Error reporting buildset completion to DB:")
 
             for build in builds_to_cancel:
                 self.log.info(
@@ -1826,6 +1841,11 @@ class Scheduler(threading.Thread):
         # internal dict after it's added to the report queue.
         self.executor.removeBuild(build)
 
+        try:
+            self.sql.reportBuildEnd(build, final=(not build.retry))
+        except Exception:
+            log.exception("Error reporting build completion to DB:")
+
         if build.build_set is not build.build_set.item.current_build_set:
             log.debug("Build %s is not in the current build set", build)
             return
@@ -2054,6 +2074,12 @@ class Scheduler(threading.Thread):
                         del self.executor.builds[build.uuid]
                     except KeyError:
                         pass
+                    try:
+                        self.sql.reportBuildEnd(build, final=False)
+                    except Exception:
+                        self.log.exception(
+                            "Error reporting build completion to DB:")
+
             else:
                 nodeset = buildset.getJobNodeSet(job_name)
                 if nodeset: