Emit stats for more build results

We currently typically only emit build stats for success and failure,
but do not emit stats for NODE_FAILURE, CANCELED, SKIPPED, etc.

To round out the feature so that all build results are emitted, this
includes a small refactor to report build stats at the same time we
report build results to the database (it almost all cases).  One
exception to that is when we receive a non-current build result --
that generally happens after a build is canceled, so we don't need
to emit the stats again.

Change-Id: I3bdf4e2643e151e2eae9f204f62cdc106df876b4
This commit is contained in:
James E. Blair 2023-09-26 07:19:18 -07:00
parent 1da1c5e014
commit 1a226acbd0
4 changed files with 33 additions and 9 deletions

View File

@ -0,0 +1,7 @@
---
features:
- |
Monitoring stats for builds are now emitted for more results.
Previously they were emitted for some build results such as
`SUCCESS`, `FAILURE`, and `ERROR` but now include `NODE_FAILURE`
and other result values.

View File

@ -5933,6 +5933,18 @@ For CI problems and help debugging, contact ci@example.org"""
A.messages[1]))
self.assertTrue(
'Skipped due to failed job project-merge' in A.messages[1])
self.assertReportedStat(
'zuul.nodepool.requests.requested.total', value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.requested.label.label1',
value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.failed.label.label1',
value='1', kind='c')
self.assertReportedStat(
'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.'
'org_project.master.job.project-merge.NODE_FAILURE', value='1',
kind='c')
def test_nodepool_resources(self):
"Test that resources are reported"

View File

@ -5472,7 +5472,7 @@ class QueueItem(zkobject.ZKObject):
job=job, build_set=self.current_build_set,
error_detail=str(e), result='FAILURE')
self.addBuild(fakebuild)
self.pipeline.manager.sql.reportBuildEnd(
self.pipeline.manager.sched.reportBuildEnd(
fakebuild,
tenant=self.pipeline.tenant.name,
final=True)
@ -5881,7 +5881,7 @@ class QueueItem(zkobject.ZKObject):
error_detail=skipped_reason,
result='SKIPPED')
self.addBuild(fakebuild)
self.pipeline.manager.sql.reportBuildEnd(
self.pipeline.manager.sched.reportBuildEnd(
fakebuild,
tenant=self.pipeline.tenant.name,
final=True)
@ -5897,7 +5897,7 @@ class QueueItem(zkobject.ZKObject):
result='NODE_FAILURE',
)
self.addBuild(fakebuild)
self.pipeline.manager.sql.reportBuildEnd(
self.pipeline.manager.sched.reportBuildEnd(
fakebuild,
tenant=self.pipeline.tenant.name,
final=True)
@ -5958,7 +5958,7 @@ class QueueItem(zkobject.ZKObject):
job=job, build_set=self.current_build_set,
error_detail=msg, result='SKIPPED')
self.addBuild(fakebuild)
self.pipeline.manager.sql.reportBuildEnd(
self.pipeline.manager.sched.reportBuildEnd(
fakebuild,
tenant=self.pipeline.tenant.name,
final=True)
@ -5972,7 +5972,7 @@ class QueueItem(zkobject.ZKObject):
job=job, build_set=self.current_build_set,
error_detail=msg, result='SKIPPED')
self.addBuild(fakebuild)
self.pipeline.manager.sql.reportBuildEnd(
self.pipeline.manager.sched.reportBuildEnd(
fakebuild,
tenant=self.pipeline.tenant.name,
final=True)

View File

@ -887,6 +887,10 @@ class Scheduler(threading.Thread):
event, needs_result=False
)
def reportBuildEnd(self, build, tenant, final):
self.sql.reportBuildEnd(build, tenant=tenant, final=final)
self._reportBuildStats(build)
def _reportBuildStats(self, build):
# Note, as soon as the result is set, other threads may act
# upon this, even though the event hasn't been fully
@ -2904,6 +2908,9 @@ class Scheduler(threading.Thread):
)
self._cleanupCompletedBuild(build)
# This usually happens when we have already canceled a
# build and reported stats for it, so don't send stats in
# this case.
try:
self.sql.reportBuildEnd(
build, tenant=pipeline.tenant.name,
@ -2974,11 +2981,9 @@ class Scheduler(threading.Thread):
}
tracing.endSavedSpan(build.span_info, attributes=attributes)
self._reportBuildStats(build)
self._cleanupCompletedBuild(build)
try:
self.sql.reportBuildEnd(
self.reportBuildEnd(
build, tenant=pipeline.tenant.name, final=(not build.retry))
except Exception:
log.exception("Error reporting build completion to DB:")
@ -3130,7 +3135,7 @@ class Scheduler(threading.Thread):
# anymore once the pipeline is removed.
self.executor.removeBuild(build)
try:
self.sql.reportBuildEnd(
self.reportBuildEnd(
build, build.build_set.item.pipeline.tenant.name,
final=False)
except Exception: