Browse Source

Merge "Fix missing semaphore release on node failure"

changes/48/635148/1
Zuul 2 years ago
committed by Gerrit Code Review
parent
commit
069a550f81
5 changed files with 75 additions and 0 deletions
  1. +6
    -0
      tests/fixtures/config/semaphore/git/common-config/zuul.yaml
  2. +1
    -0
      tests/fixtures/config/semaphore/git/org_project4/README
  3. +1
    -0
      tests/fixtures/config/semaphore/main.yaml
  4. +64
    -0
      tests/unit/test_scheduler.py
  5. +3
    -0
      zuul/manager/__init__.py

+ 6
- 0
tests/fixtures/config/semaphore/git/common-config/zuul.yaml View File

@ -103,3 +103,9 @@
- project-test1
- semaphore-one-test1-resources-first
- semaphore-one-test2-resources-first
- project:
name: org/project4
check:
jobs:
- semaphore-one-test1-resources-first

+ 1
- 0
tests/fixtures/config/semaphore/git/org_project4/README View File

@ -0,0 +1 @@
test

+ 1
- 0
tests/fixtures/config/semaphore/main.yaml View File

@ -9,3 +9,4 @@
- org/project1
- org/project2
- org/project3
- org/project4

+ 64
- 0
tests/unit/test_scheduler.py View File

@ -6251,6 +6251,38 @@ class TestSemaphore(ZuulTestCase):
self.assertEqual(A.reported, 1)
self.assertEqual(B.reported, 1)
def test_semaphore_node_failure(self):
"Test semaphore and node failure"
tenant = self.sched.abide.tenants.get('tenant-one')
# Pause nodepool so we can fail the node request later
self.fake_nodepool.pause()
A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# By default we first lock the semaphore and then get the nodes
# so at this point the semaphore needs to be aquired.
self.assertTrue('test-semaphore' in
tenant.semaphore_handler.semaphores)
# Fail the node request and unpause
req = self.fake_nodepool.getNodeRequests()[0]
self.fake_nodepool.addFailRequest(req)
self.fake_nodepool.unpause()
self.waitUntilSettled()
# At this point the job that holds the semaphore failed with
# node_failure and the semaphore must be released.
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.assertEquals(1, A.reported)
self.assertIn('semaphore-one-test3 semaphore-one-test3 : NODE_FAILURE',
A.messages[0])
def test_semaphore_resources_first(self):
"Test semaphores with max=1 (mutex) and get resources first"
tenant = self.sched.abide.tenants.get('tenant-one')
@ -6298,6 +6330,38 @@ class TestSemaphore(ZuulTestCase):
self.executor_server.release()
self.waitUntilSettled()
def test_semaphore_resources_first_node_failure(self):
"Test semaphore and node failure"
tenant = self.sched.abide.tenants.get('tenant-one')
# Pause nodepool so we can fail the node request later
self.fake_nodepool.pause()
A = self.fake_gerrit.addFakeChange('org/project4', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# With resources first we first get the nodes so at this point the
# semaphore must not be aquired.
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
# Fail the node request and unpause
req = self.fake_nodepool.getNodeRequests()[0]
self.fake_nodepool.addFailRequest(req)
self.fake_nodepool.unpause()
self.waitUntilSettled()
# At this point the job should never have acuired a semaphore so check
# that it still has not locked a semaphore.
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.assertEquals(1, A.reported)
self.assertIn('semaphore-one-test1-resources-first : NODE_FAILURE',
A.messages[0])
def test_semaphore_zk_error(self):
"Test semaphore release with zk error"
tenant = self.sched.abide.tenants.get('tenant-one')


+ 3
- 0
zuul/manager/__init__.py View File

@ -864,6 +864,9 @@ class PipelineManager(object):
(request, request.job.name,))
build_set.item.setNodeRequestFailure(request.job)
self._resumeBuilds(request.build_set)
tenant = build_set.item.pipeline.tenant
tenant.semaphore_handler.release(build_set.item, request.job)
self.log.info("Completed node request %s for job %s of item %s "
"with nodes %s" %
(request, request.job, build_set.item,


Loading…
Cancel
Save