Report NODE_FAILURES caused by node request failures to SQL database

Currently, NODE_FAILURE results are not reported via SQL in case the
node request failed. This is because those results are directly
evaluated in the pipeline manager before the build is even started.
Thus, there are no build result events sent by the executor and the
"normal" build result event handling is skipped for those builds.

As those build results are not stored in the database they are also not
visible in the UI. Thus, there could be cases where a buildset failed
because of a NODE_FAILURE, but all builds that are shown were
successful.

To fix this, call the SQL reporter directly when the NODE_FAILURE is
evaluated in the pipeline manager.

Also adapt the reportBuildEnd() method in the SQL reporter so that the
build entry is created if it is not present. This could be the case
if the build started event was not processed or did not happen at all
(e.g. for the NODE_FAILURE results or any result that is created via a
"fake build" directly in the pipeline manager).

Change-Id: I2603a7ccf26a41e6747c9276cb37c9b0fd668f75
This commit is contained in:
Felix Edel
2022-01-31 13:23:58 +01:00
parent cd50028d7b
commit d5a801e5f0
4 changed files with 67 additions and 20 deletions

View File

@@ -130,10 +130,11 @@ class TestSQLConnectionMysql(ZuulTestCase):
sa.sql.select([reporter.connection.zuul_buildset_table]))
buildsets = result.fetchall()
self.assertEqual(3, len(buildsets))
self.assertEqual(4, len(buildsets))
buildset0 = buildsets[0]
buildset1 = buildsets[1]
buildset2 = buildsets[2]
buildset3 = buildsets[3]
self.assertEqual('check', buildset0['pipeline'])
self.assertEqual('org/project', buildset0['project'])
@@ -201,6 +202,25 @@ class TestSQLConnectionMysql(ZuulTestCase):
buildset2_builds[0]['job_name'])
self.assertEqual("SUCCESS", buildset2_builds[0]['result'])
buildset3_builds = conn.execute(
sa.sql.select([
reporter.connection.zuul_build_table
]).where(
reporter.connection.zuul_build_table.c.buildset_id ==
buildset3['id']
)
).fetchall()
self.assertEqual(
'project-test1', buildset3_builds[1]['job_name'])
self.assertEqual('NODE_FAILURE', buildset3_builds[1]['result'])
self.assertEqual(None, buildset3_builds[1]['log_url'])
self.assertIsNotNone(buildset3_builds[1]['start_time'])
self.assertIsNotNone(buildset3_builds[1]['end_time'])
self.assertGreaterEqual(
buildset3_builds[1]['end_time'],
buildset3_builds[1]['start_time'])
self.executor_server.hold_jobs_in_build = True
# Add a success result
@@ -229,6 +249,21 @@ class TestSQLConnectionMysql(ZuulTestCase):
self.orderedRelease()
self.waitUntilSettled()
# Add a node_failure result
self.fake_nodepool.pause()
C = self.fake_gerrit.addFakeChange('org/project', 'master', 'C')
C.addApproval('Code-Review', 2)
self.fake_gerrit.addEvent(C.addApproval('Approved', 1))
self.waitUntilSettled()
self.orderedRelease()
self.waitUntilSettled()
req = self.fake_nodepool.getNodeRequests()[0]
self.fake_nodepool.addFailRequest(req)
self.fake_nodepool.unpause()
self.waitUntilSettled()
self.orderedRelease()
self.waitUntilSettled()
check_results()
def test_sql_results_retry_builds(self):

View File

@@ -91,31 +91,19 @@ class SQLReporter(BaseReporter):
f"{buildset.uuid} in DB")
def reportBuildStart(self, build):
buildset = build.build_set
start_time = build.start_time or time.time()
start = datetime.datetime.fromtimestamp(start_time,
tz=datetime.timezone.utc)
with self.connection.getSession() as db:
db_buildset = db.getBuildset(
tenant=buildset.item.pipeline.tenant.name, uuid=buildset.uuid)
db_build = db_buildset.createBuild(
uuid=build.uuid,
job_name=build.job.name,
start_time=start,
voting=build.job.voting,
nodeset=build.job.nodeset.name,
)
db_build = self._createBuild(db, build)
return db_build
def reportBuildEnd(self, build, tenant, final):
end_time = build.end_time or time.time()
end = datetime.datetime.fromtimestamp(end_time,
tz=datetime.timezone.utc)
with self.connection.getSession() as db:
db_build = db.getBuild(tenant=tenant, uuid=build.uuid)
if not db_build:
return None
db_build = self._createBuild(db, build)
end_time = build.end_time or time.time()
end = datetime.datetime.fromtimestamp(
end_time, tz=datetime.timezone.utc)
db_build.result = build.result
db_build.end_time = end
@@ -136,6 +124,23 @@ class SQLReporter(BaseReporter):
return db_build
def _createBuild(self, db, build):
    """Create the database entry for ``build`` and return it.

    The build's buildset is looked up through ``db.getBuildset()`` using
    the tenant name and buildset uuid taken from ``build.build_set``; the
    new entry is then created via that buildset's ``createBuild()``.

    :param db: session object providing ``getBuildset()``.
    :param build: build providing ``uuid``, ``start_time``, ``job`` and
        ``build_set``.
    :returns: whatever ``createBuild()`` returns for the new entry.
    """
    # Use the current time when the build carries no (or a falsy) start
    # time, and always store a timezone-aware UTC value.
    started_at = datetime.datetime.fromtimestamp(
        build.start_time or time.time(), tz=datetime.timezone.utc)

    build_set = build.build_set
    parent_entry = db.getBuildset(
        tenant=build_set.item.pipeline.tenant.name, uuid=build_set.uuid)

    # Collected in the same order the keyword arguments were originally
    # evaluated.
    columns = {
        "uuid": build.uuid,
        "job_name": build.job.name,
        "start_time": started_at,
        "voting": build.job.voting,
        "nodeset": build.job.nodeset.name,
    }
    return parent_entry.createBuild(**columns)
def getBuilds(self, *args, **kw):
    """Return the Build objects found by the connection.

    All positional and keyword arguments are passed through unchanged to
    ``self.connection.getBuilds()``.
    """
    builds = self.connection.getBuilds(*args, **kw)
    return builds

View File

@@ -1693,7 +1693,13 @@ class PipelineManager(metaclass=ABCMeta):
log.info("Node request %s: failure for %s",
request, request.job_name)
job = build_set.item.getJob(request.job_name)
build_set.item.setNodeRequestFailure(job)
fakebuild = build_set.item.setNodeRequestFailure(job)
try:
self.sql.reportBuildEnd(
fakebuild, tenant=build_set.item.pipeline.tenant.name,
final=True)
except Exception:
log.exception("Error reporting build completion to DB:")
self._resumeBuilds(build_set)
tenant = build_set.item.pipeline.tenant
tenant.semaphore_handler.release(build_set.item, job)

View File

@@ -4772,6 +4772,7 @@ class QueueItem(zkobject.ZKObject):
)
self.addBuild(fakebuild)
self.setResult(fakebuild)
return fakebuild
def setDequeuedNeedingChange(self):
self.updateAttributes(