Fix image build event race and add more image api tests

This change does two things:

1) It adds tests that verify that in the case where images are
   managed in a central repo, admins may not use the web api of
   the wrong tenant (ie, a tenant that is not actually managing
   the builds of these images) to trigger or delete a build or
   upload.

In expanding the tests to include more than one tenant, a previously
existing race condition was more reliably observed, so it is fixed:

2) The following race sequence could happen:

[1] launcher triggers builds for 2 missing images
[2] scheduler begins reporting the image buildset (2 jobs)
[2] scheduler adds image build artifact #1 to registry
[1] launcher receives signal from zk watch that the image registry
    was updated and checks for missing images
[1] launcher observes that image #2 is still missing and submits
    an image build trigger
[2] scheduler adds image build artifact #2 to registry
[2] scheduler removes queue item
[2] scheduler processes trigger event, begins new build of
    image #2

To resolve this, we note that the addition of multiple image artifacts
is a critical section.  We adjust the creation order so that the
sequence is now:

* Create artifact #1 with state=None
* Create artifact #2 with state=None
* Create upload #1 and update artifact #1 to ready
* Create upload #2 and update artifact #2 to ready

The critical section is now bounded by the condition of having any
IBAs with state=None.  We now check for that within the launcher and wait
for it to clear before we decide if any images are missing.

Change-Id: Iad1b68cf8c9cf6bd5f13dab0471e7bf5fd290fbf
This commit is contained in:
James E. Blair
2025-05-22 15:00:00 -07:00
parent 973f343a73
commit 36af7c37e1
10 changed files with 270 additions and 5 deletions

View File

@@ -0,0 +1,2 @@
- hosts: all
tasks: []

View File

@@ -0,0 +1,21 @@
- pipeline:
name: image
manager: independent
trigger:
zuul:
- event: image-build
success:
zuul:
image-built: true
image-validated: true
- project:
name: common-config
image:
jobs:
# These jobs should run
- build-debian-local-image
- build-ubuntu-local-image
# This job should not because we don't emit a trigger event
# for this image.
- build-fedora-local-image

View File

@@ -0,0 +1,134 @@
- pipeline:
name: check
manager: independent
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
- pipeline:
name: gate
manager: dependent
success-message: Build succeeded (gate).
trigger:
gerrit:
- event: comment-added
approval:
- Approved: 1
success:
gerrit:
Verified: 2
submit: true
failure:
gerrit:
Verified: -2
start:
gerrit:
Verified: 0
precedence: high
- pipeline:
name: post
manager: independent
trigger:
gerrit:
- event: ref-updated
ref: ^(?!refs/).*$
- pipeline:
name: tag
manager: independent
trigger:
gerrit:
- event: ref-updated
ref: ^refs/tags/.*$
- job:
name: base
parent: null
run: playbooks/base.yaml
nodeset:
nodes:
- label: ubuntu-xenial
name: controller
- job:
name: build-debian-local-image
image-build-name: debian-local
- job:
name: build-ubuntu-local-image
image-build-name: ubuntu-local
- job:
name: build-fedora-local-image
image-build-name: fedora-local
- image:
name: debian
type: cloud
- image:
name: debian-local
type: zuul
- image:
name: ubuntu-local
type: zuul
- image:
name: fedora-local
type: zuul
- flavor:
name: normal
- label:
name: debian-normal
image: debian
flavor: normal
- label:
name: debian-local-normal
image: debian-local
flavor: normal
# See zuul-image.yaml for the image build pipeline and config
- section:
name: aws-base
abstract: true
connection: aws
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
object-storage:
bucket-name: zuul
key-name: zuul
flavors:
- name: normal
instance-type: t3.medium
images:
- name: debian
image-id: ami-1e749f67
- name: debian-local
- name: ubuntu-local
# Don't add fedora here, so that we don't emit a trigger event
# for it.
- section:
name: aws-us-east-1
parent: aws-base
region: us-east-1
- provider:
name: aws-us-east-1-main
section: aws-us-east-1
labels:
- name: debian-normal
- name: debian-local-normal

View File

@@ -0,0 +1 @@
test

View File

@@ -0,0 +1 @@
test

View File

@@ -0,0 +1,22 @@
- tenant:
name: tenant-one
source:
gerrit:
config-projects:
- common-config:
include: ['job', 'nodeset', 'secret', 'pipeline', 'project', 'project-template', 'semaphore', 'queue', 'image', 'flavor', 'label', 'section', 'provider']
extra-config-paths: zuul-image.yaml
- tenant-one-config
untrusted-projects:
- org/project1
- tenant:
name: tenant-two
source:
gerrit:
config-projects:
- common-config:
include: ['job', 'nodeset', 'secret', 'pipeline', 'project', 'project-template', 'semaphore', 'queue', 'image', 'flavor', 'label', 'section', 'provider']
- tenant-two-config
untrusted-projects:
- org/project2

View File

@@ -1493,6 +1493,7 @@ class TestWeb(BaseTestWeb):
class TestWebProviders(LauncherBaseTestCase, WebMixin):
config_file = 'zuul-connections-nodepool.conf'
tenant_config_file = 'config/multi-tenant-provider/main.yaml'
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
def test_web_providers(self):
@@ -1590,7 +1591,6 @@ class TestWebProviders(LauncherBaseTestCase, WebMixin):
]
self.assertEqual(expected, data)
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
@return_data(
'build-debian-local-image',
'refs/heads/master',
@@ -1625,6 +1625,20 @@ class TestWebProviders(LauncherBaseTestCase, WebMixin):
f"api/tenant/tenant-one/image-build-artifact/{art['uuid']}"
)
self.assertEqual(401, resp.status_code, resp.text)
# Test that the wrong tenant fails, even with auth
authz = {'iss': 'zuul_operator',
'aud': 'zuul.example.com',
'sub': 'testuser',
'zuul': {
'admin': ['tenant-two', ]
},
'exp': int(time.time()) + 3600}
token = jwt.encode(authz, key='NoDanaOnlyZuul',
algorithm='HS256')
resp = self.delete_url(
f"api/tenant/tenant-two/image-build-artifact/{art['uuid']}",
headers={'Authorization': 'Bearer %s' % token})
self.assertEqual(404, resp.status_code, resp.text)
# Do it again with auth
authz = {'iss': 'zuul_operator',
'aud': 'zuul.example.com',
@@ -1645,7 +1659,6 @@ class TestWebProviders(LauncherBaseTestCase, WebMixin):
if 'build_artifacts' not in data[1]:
break
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
@return_data(
'build-debian-local-image',
'refs/heads/master',
@@ -1680,6 +1693,20 @@ class TestWebProviders(LauncherBaseTestCase, WebMixin):
f"api/tenant/tenant-one/image-upload/{upload['uuid']}"
)
self.assertEqual(401, resp.status_code, resp.text)
# Test that the wrong tenant fails, even with auth
authz = {'iss': 'zuul_operator',
'aud': 'zuul.example.com',
'sub': 'testuser',
'zuul': {
'admin': ['tenant-two', ]
},
'exp': int(time.time()) + 3600}
token = jwt.encode(authz, key='NoDanaOnlyZuul',
algorithm='HS256')
resp = self.delete_url(
f"api/tenant/tenant-two/image-upload/{upload['uuid']}",
headers={'Authorization': 'Bearer %s' % token})
self.assertEqual(404, resp.status_code, resp.text)
# Do it again with auth
authz = {'iss': 'zuul_operator',
'aud': 'zuul.example.com',
@@ -1700,7 +1727,6 @@ class TestWebProviders(LauncherBaseTestCase, WebMixin):
if 'build_artifacts' not in data[1]:
break
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
@return_data(
'build-debian-local-image',
'refs/heads/master',
@@ -1755,8 +1781,27 @@ class TestWebProviders(LauncherBaseTestCase, WebMixin):
dict(name='build-ubuntu-local-image', result='SUCCESS'),
dict(name='build-ubuntu-local-image', result='SUCCESS'),
], ordered=False)
# Try again with the wrong tenant
authz = {'iss': 'zuul_operator',
'aud': 'zuul.example.com',
'sub': 'testuser',
'zuul': {
'admin': ['tenant-two', ]
},
'exp': int(time.time()) + 3600}
token = jwt.encode(authz, key='NoDanaOnlyZuul',
algorithm='HS256')
resp = self.post_url(
"api/tenant/tenant-two/image/ubuntu-local/build",
headers={'Authorization': 'Bearer %s' % token})
self.assertEqual(200, resp.status_code, resp.text)
self.waitUntilSettled("image rebuild")
self.assertHistory([
dict(name='build-debian-local-image', result='SUCCESS'),
dict(name='build-ubuntu-local-image', result='SUCCESS'),
dict(name='build-ubuntu-local-image', result='SUCCESS'),
], ordered=False)
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
@return_data(
'build-debian-local-image',
'refs/heads/master',

View File

@@ -42,6 +42,7 @@ class ZuulReporter(BaseReporter):
continue
image_name = build.job.image_build_name
image = item.manager.tenant.layout.images[image_name]
ibas = []
for artifact in get_artifacts_from_result_data(
build.result_data,
logger=self.log):
@@ -51,7 +52,9 @@ class ZuulReporter(BaseReporter):
item.manager.tenant.name, image, build,
metadata, artifact['url'],
self.image_validated)
sched.createImageUploads(iba)
ibas.append(iba)
for iba in ibas:
sched.createImageUploads(iba)
def reportImageValidated(self, item):
sched = self.driver.sched

View File

@@ -1969,8 +1969,32 @@ class Launcher:
tenant_name, iba.name)
self.trigger_events[tenant_name].put(event.trigger_name, event)
def _waitForStableImageRegistry(self):
# Wait for any non-ready IBAs to become ready. This resolves
# a race condition where the scheduler might have created the
# first of more than one IBA in a buildset, and we were
# immediately triggered. Wait until the scheduler is finished
# so that if a buildset creates more than one IBA we have all
# the info.
now = time.time()
while True:
done = True
for iba in self.image_build_registry.getAllArtifacts():
if iba.state is not None:
continue
if now - iba.timestamp > 30:
# We'll only wait 30 seconds for this so we don't
# get stuck.
continue
done = False
break
if done:
return
time.sleep(1)
def checkMissingImages(self):
self.log.debug("Checking for missing images")
self._waitForStableImageRegistry()
for tenant_name, providers in self.tenant_providers.items():
images_by_project_branch = {}
for provider in providers:

View File

@@ -53,6 +53,18 @@ class ImageBuildRegistry(LockableZKObjectCache):
arts = sorted(arts, key=lambda x: x.timestamp)
return arts
def getAllArtifacts(self):
keys = []
for image_canonical_name in self.builds_by_image_name.keys():
keys.extend(self.builds_by_image_name[image_canonical_name])
arts = [self._cached_objects.get(key) for key in keys]
arts = [a for a in arts if a is not None]
# Sort in a stable order, primarily by timestamp, then format
# for identical timestamps.
arts = sorted(arts, key=lambda x: x.format)
arts = sorted(arts, key=lambda x: x.timestamp)
return arts
class ImageUploadRegistry(LockableZKObjectCache):