Fix multi-provider cleanup

The launcher's leaked resource cleanup method would overwrite the
list of potentially leaked resources after each endpoint; this meant
that only the last endpoint to run would actually have its resources
cleaned up.  Correct this by keeping the lists of potential leaks
indexed by endpoint.

Change-Id: I961cb53f53ccf5fc39e467e1ddde3034a46f1952
This commit is contained in:
James E. Blair 2025-05-06 08:16:00 -07:00
parent cd0c1be98f
commit aafe84d1b1
5 changed files with 180 additions and 23 deletions

View File

@ -84,9 +84,11 @@ class FakeOpenstackCloud:
log = logging.getLogger("zuul.FakeOpenstackCloud")
def __init__(self,
region,
needs_floating_ip=False,
auto_attach_floating_ip=True,
):
self._fake_region = region
self._fake_needs_floating_ip = needs_floating_ip
self._fake_auto_attach_floating_ip = auto_attach_floating_ip
self.servers = []
@ -153,7 +155,7 @@ class FakeOpenstackConnection:
self.config = FakeOpenstackConfig()
self.config.config = {}
self.config.config['image_format'] = 'qcow2'
self.config.config['region_name'] = 'region1'
self.config.config['region_name'] = cloud._fake_region or 'region1'
def _needs_floating_ip(self, server, nat_destination):
    # Test knob: answer comes from the fake cloud's configured flag
    # rather than any real network inspection. *server* and
    # *nat_destination* are accepted only to match the real signature.
    return self.cloud._fake_needs_floating_ip
@ -302,7 +304,7 @@ class FakeOpenstackConnection:
class FakeOpenstackProviderEndpoint(OpenstackProviderEndpoint):
def _getClient(self):
return self._fake_cloud._getConnection()
return self._fake_cloud[self.region]._getConnection()
def _expandServer(self, server):
    """Return *server* unchanged; the fake server record needs no expansion."""
    return server

View File

@ -0,0 +1,139 @@
- pipeline:
name: check
manager: independent
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
- pipeline:
name: gate
manager: dependent
success-message: Build succeeded (gate).
trigger:
gerrit:
- event: comment-added
approval:
- Approved: 1
success:
gerrit:
Verified: 2
submit: true
failure:
gerrit:
Verified: -2
start:
gerrit:
Verified: 0
precedence: high
- pipeline:
name: post
manager: independent
trigger:
gerrit:
- event: ref-updated
ref: ^(?!refs/).*$
- pipeline:
name: tag
manager: independent
trigger:
gerrit:
- event: ref-updated
ref: ^refs/tags/.*$
- job:
name: base
parent: null
run: playbooks/base.yaml
nodeset:
nodes:
- label: debian-normal
name: controller
- job:
name: check-job
run: playbooks/check.yaml
- job:
name: post-job
run: playbooks/post.yaml
- job:
name: tag-job
run: playbooks/tag.yaml
- project:
name: org/project
check:
jobs:
- check-job
gate:
jobs:
- check-job
post:
jobs:
- post-job
tag:
jobs:
- tag-job
- image:
name: debian
type: cloud
- flavor:
name: normal
- label:
name: debian-normal
image: debian
flavor: normal
- section:
name: openstack-base
abstract: true
connection: openstack
host-key-checking: false
boot-timeout: 120
launch-timeout: 600
launch-attempts: 2
key-name: zuul
floating-ip-cleanup: true
port-cleanup-interval: 1
flavors:
- name: normal
flavor-name: Fake Flavor
images:
- name: debian
image-id: fake-image
networks:
- fake-network
- section:
name: openstack
parent: openstack-base
- provider:
name: openstack-main
section: openstack
labels:
- name: debian-normal
- section:
name: openstack-extra
parent: openstack-base
region: region2
- provider:
name: openstack-extra-main
section: openstack-extra
labels: []
images: []
flavors: []

View File

@ -291,7 +291,8 @@ class TestAwsDriver(BaseCloudDriverTest):
def test_aws_diskimage_ebs_direct(self):
self._test_diskimage()
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
@simple_layout('layouts/nodepool-multi-provider.yaml',
enable_nodepool=True)
def test_aws_resource_cleanup(self):
self.waitUntilSettled()
self.launcher.cleanup_worker.INTERVAL = 1

View File

@ -63,27 +63,34 @@ class BaseOpenstackDriverTest(ZuulTestCase):
openstack_needs_floating_ip = False
openstack_auto_attach_floating_ip = True
def _makeFakeCloud(self, region):
    """Create a FakeOpenstackCloud for *region* and register it.

    The fake is stored in ``self.fake_cloud`` keyed by region (``None``
    is used for the default region elsewhere in this test class), so
    tests can exercise multiple provider endpoints side by side.
    """
    fake_cloud = FakeOpenstackCloud(
        region,
        needs_floating_ip=self.openstack_needs_floating_ip,
        auto_attach_floating_ip=self.openstack_auto_attach_floating_ip,
    )
    # Quota limits come from the per-test driver config, with defaults
    # generous enough not to interfere with unrelated tests.
    fake_cloud.max_instances =\
        self.test_config.driver.openstack.get('max_instances', 100)
    fake_cloud.max_cores =\
        self.test_config.driver.openstack.get('max_cores', 100)
    fake_cloud.max_ram =\
        self.test_config.driver.openstack.get('max_ram', 1000000)
    fake_cloud.max_volumes =\
        self.test_config.driver.openstack.get('max_volumes', 100)
    fake_cloud.max_volume_gb =\
        self.test_config.driver.openstack.get('max_volume_gb', 100)
    # Index by region so each endpoint resolves its own fake cloud.
    self.fake_cloud[region] = fake_cloud
def setUp(self):
self.initTestConfig()
self.useFixture(ImageMocksFixture())
clouds_yaml = os.path.join(FIXTURE_DIR, 'clouds.yaml')
self.useFixture(
fixtures.EnvironmentVariable('OS_CLIENT_CONFIG_FILE', clouds_yaml))
self.fake_cloud = FakeOpenstackCloud(
needs_floating_ip=self.openstack_needs_floating_ip,
auto_attach_floating_ip=self.openstack_auto_attach_floating_ip,
)
self.fake_cloud.max_instances =\
self.test_config.driver.openstack.get('max_instances', 100)
self.fake_cloud.max_cores =\
self.test_config.driver.openstack.get('max_cores', 100)
self.fake_cloud.max_ram =\
self.test_config.driver.openstack.get('max_ram', 1000000)
self.fake_cloud.max_volumes =\
self.test_config.driver.openstack.get('max_volumes', 100)
self.fake_cloud.max_volume_gb =\
self.test_config.driver.openstack.get('max_volume_gb', 100)
self.fake_cloud = {}
self._makeFakeCloud(None)
self._makeFakeCloud('region2')
self.patch(OpenstackDriver, '_endpoint_class',
FakeOpenstackProviderEndpoint)
self.patch(FakeOpenstackProviderEndpoint,
@ -136,11 +143,12 @@ class TestOpenstackDriver(BaseOpenstackDriverTest, BaseCloudDriverTest):
# Openstack-driver specific tests
@simple_layout('layouts/openstack/nodepool.yaml', enable_nodepool=True)
@simple_layout('layouts/openstack/nodepool-multi.yaml',
enable_nodepool=True)
def test_openstack_resource_cleanup(self):
self.waitUntilSettled()
self.launcher.cleanup_worker.INTERVAL = 1
conn = self.fake_cloud._getConnection()
conn = self.fake_cloud[None]._getConnection()
system_id = self.launcher.system.system_id
tags = {
'zuul_system_id': system_id,

View File

@ -681,6 +681,7 @@ class CleanupWorker:
def __init__(self, launcher):
self.launcher = launcher
self.wake_event = threading.Event()
# These are indexed by endpoint cname
self.possibly_leaked_nodes = {}
self.possibly_leaked_uploads = {}
self._running = False
@ -748,6 +749,10 @@ class CleanupWorker:
def cleanupLeakedResources(self, endpoint):
newly_leaked_nodes = {}
newly_leaked_uploads = {}
possibly_leaked_nodes = self.possibly_leaked_nodes.get(
endpoint.canonical_name, {})
possibly_leaked_uploads = self.possibly_leaked_uploads.get(
endpoint.canonical_name, {})
# Get a list of all providers that share this endpoint. This
# is because some providers may store resources like image
@ -765,7 +770,7 @@ class CleanupWorker:
upload_id = resource.metadata.get('zuul_upload_uuid')
if node_id and self.launcher.api.getProviderNode(node_id) is None:
newly_leaked_nodes[node_id] = resource
if node_id in self.possibly_leaked_nodes:
if node_id in possibly_leaked_nodes:
# We've seen this twice now, so it's not a race
# condition.
try:
@ -782,7 +787,7 @@ class CleanupWorker:
self.launcher.image_upload_registry.getItem(
upload_id) is None):
newly_leaked_uploads[upload_id] = resource
if upload_id in self.possibly_leaked_uploads:
if upload_id in possibly_leaked_uploads:
# We've seen this twice now, so it's not a race
# condition.
try:
@ -796,8 +801,10 @@ class CleanupWorker:
self.log.exception(
"Unable to delete leaked "
f"resource for upload {upload_id}")
self.possibly_leaked_nodes = newly_leaked_nodes
self.possibly_leaked_uploads = newly_leaked_uploads
self.possibly_leaked_nodes[endpoint.canonical_name] =\
newly_leaked_nodes
self.possibly_leaked_uploads[endpoint.canonical_name] =\
newly_leaked_uploads
class Launcher: