From 1a226acbd022968914574fdeb467b78bc5bfcc77 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 26 Sep 2023 07:19:18 -0700 Subject: [PATCH] Emit stats for more build results We currently typically only emit build stats for success and failure, but do not emit stats for NODE_FAILURE, CANCELED, SKIPPED, etc. To round out the feature so that all build results are emitted, this includes a small refactor to report build stats at the same time we report build results to the database (it almost all cases). One exception to that is when we receive a non-current build result -- that generally happens after a build is canceled, so we don't need to emit the stats again. Change-Id: I3bdf4e2643e151e2eae9f204f62cdc106df876b4 --- .../notes/build-result-stats-5e80a82a78faeb61.yaml | 7 +++++++ tests/unit/test_scheduler.py | 12 ++++++++++++ zuul/model.py | 10 +++++----- zuul/scheduler.py | 13 +++++++++---- 4 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/build-result-stats-5e80a82a78faeb61.yaml diff --git a/releasenotes/notes/build-result-stats-5e80a82a78faeb61.yaml b/releasenotes/notes/build-result-stats-5e80a82a78faeb61.yaml new file mode 100644 index 0000000000..77c313ec4a --- /dev/null +++ b/releasenotes/notes/build-result-stats-5e80a82a78faeb61.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Monitoring stats for builds are now emitted for more results. + Previously they were emitted for some build results such as + `SUCCESS`, `FAILURE`, and `ERROR` but now include `NODE_FAILURE` + and other result values. diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index f0507ac46e..c242ca3a8e 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -5933,6 +5933,18 @@ For CI problems and help debugging, contact ci@example.org""" A.messages[1])) self.assertTrue( 'Skipped due to failed job project-merge' in A.messages[1]) + self.assertReportedStat( + 'zuul.nodepool.requests.requested.total', value='1', kind='c') + self.assertReportedStat( + 'zuul.nodepool.requests.requested.label.label1', + value='1', kind='c') + self.assertReportedStat( + 'zuul.nodepool.requests.failed.label.label1', + value='1', kind='c') + self.assertReportedStat( + 'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.' + 'org_project.master.job.project-merge.NODE_FAILURE', value='1', + kind='c') def test_nodepool_resources(self): "Test that resources are reported" diff --git a/zuul/model.py b/zuul/model.py index 6888c22a6f..2577e9bf61 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -5472,7 +5472,7 @@ class QueueItem(zkobject.ZKObject): job=job, build_set=self.current_build_set, error_detail=str(e), result='FAILURE') self.addBuild(fakebuild) - self.pipeline.manager.sql.reportBuildEnd( + self.pipeline.manager.sched.reportBuildEnd( fakebuild, tenant=self.pipeline.tenant.name, final=True) @@ -5881,7 +5881,7 @@ class QueueItem(zkobject.ZKObject): error_detail=skipped_reason, result='SKIPPED') self.addBuild(fakebuild) - self.pipeline.manager.sql.reportBuildEnd( + self.pipeline.manager.sched.reportBuildEnd( fakebuild, tenant=self.pipeline.tenant.name, final=True) @@ -5897,7 +5897,7 @@ class QueueItem(zkobject.ZKObject): result='NODE_FAILURE', ) self.addBuild(fakebuild) - self.pipeline.manager.sql.reportBuildEnd( + self.pipeline.manager.sched.reportBuildEnd( fakebuild, tenant=self.pipeline.tenant.name, final=True) @@ -5958,7 +5958,7 @@ class QueueItem(zkobject.ZKObject): job=job, build_set=self.current_build_set, error_detail=msg, result='SKIPPED') self.addBuild(fakebuild) - self.pipeline.manager.sql.reportBuildEnd( + self.pipeline.manager.sched.reportBuildEnd( fakebuild, tenant=self.pipeline.tenant.name, final=True) @@ -5972,7 +5972,7 @@ class QueueItem(zkobject.ZKObject): job=job, build_set=self.current_build_set, error_detail=msg, result='SKIPPED') self.addBuild(fakebuild) - self.pipeline.manager.sql.reportBuildEnd( + self.pipeline.manager.sched.reportBuildEnd( fakebuild, tenant=self.pipeline.tenant.name, final=True) diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 21820708af..b3b0174e8d 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -887,6 +887,10 @@ class Scheduler(threading.Thread): event, needs_result=False ) + def reportBuildEnd(self, build, tenant, final): + self.sql.reportBuildEnd(build, tenant=tenant, final=final) + self._reportBuildStats(build) + def _reportBuildStats(self, build): # Note, as soon as the result is set, other threads may act # upon this, even though the event hasn't been fully @@ -2904,6 +2908,9 @@ class Scheduler(threading.Thread): ) self._cleanupCompletedBuild(build) + # This usually happens when we have already canceled a + # build and reported stats for it, so don't send stats in + # this case. try: self.sql.reportBuildEnd( build, tenant=pipeline.tenant.name, @@ -2974,11 +2981,9 @@ class Scheduler(threading.Thread): } tracing.endSavedSpan(build.span_info, attributes=attributes) - self._reportBuildStats(build) - self._cleanupCompletedBuild(build) try: - self.sql.reportBuildEnd( + self.reportBuildEnd( build, tenant=pipeline.tenant.name, final=(not build.retry)) except Exception: log.exception("Error reporting build completion to DB:") @@ -3130,7 +3135,7 @@ class Scheduler(threading.Thread): # anymore once the pipeline is removed. self.executor.removeBuild(build) try: - self.sql.reportBuildEnd( + self.reportBuildEnd( build, build.build_set.item.pipeline.tenant.name, final=False) except Exception: