This does the following: * Retries node requests without penalty if they hit an unexpected quota error. The new node state "TEMPFAIL" is used for this. * Calculates total provider quota in a method similar to nodepool and caches it and updates it every 5 minutes. * Keeps track of realtime zuul quota usage by way of a custom TreeCache that adds and removes quota as it sees events. This lets us know real-time usage without needing to iterate over all nodes. * Avoids starting a node state machine if it is expected that the node will put the provider over quota. This is similar, but not quite the same as nodepool's idea of a paused provider. * Attempts to keep the percentage quota used of all providers roughly equivalent while still including some randomization. This is still likely not a final allocation algorithm. This has some issues that will need to be resolved in subsequent commits: * It is possible for a node request to be starved if multiple launchers are running for a provider, and there are requests of different sizes for that provider (the large one may never run if the small ones are able to fit in the last bit of quota). * Because we record quota failures as node objects in ZK, if a node request continually hits quota errors, our node records in ZK will grow without bound. Despite these issues, this change is useful for continued testing of the system, and since it is mainly focused with adding quota support and only minimally changes the node/request handling algorithms, it stands on its own. Change-Id: I6d65db2b103cc3f0aec7e46a6a63a393399c91eb
# (extraction residue: the rendering tool duplicated the file metadata
# "197 lines / 7.1 KiB / Python" here; converted to a comment so it is
# clearly not part of the source.)
# Copyright 2024 Acme Gating, LLC
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import fixtures
|
|
import testtools
|
|
from kazoo.exceptions import NoNodeError
|
|
|
|
from zuul import model
|
|
from zuul.launcher.client import LauncherClient
|
|
|
|
from tests.base import (
|
|
ZuulTestCase,
|
|
iterate_timeout,
|
|
)
|
|
|
|
|
|
class BaseCloudDriverTest(ZuulTestCase):
    """Common lifecycle/quota/diskimage test helpers for cloud drivers.

    Driver test classes subclass this and override the class attributes
    below to match the driver under test.
    """

    # Overridden by driver-specific subclasses.
    cloud_test_connection_type = 'ssh'
    cloud_test_image_format = ''
    cloud_test_provider_name = ''

    def setUp(self):
        # Swap in the fake nodescan implementation before the base
        # class starts any launcher components, so no real SSH
        # keyscans are attempted.
        patch = fixtures.MonkeyPatch(
            'zuul.launcher.server.NodescanRequest.FAKE', True)
        self.useFixture(patch)
        super().setUp()
|
|
|
|
def _getEndpoint(self):
|
|
# Use the launcher provider so that we're using the same ttl
|
|
# method caches.
|
|
for provider in self.launcher.tenant_providers['tenant-one']:
|
|
if provider.name == self.cloud_test_provider_name:
|
|
return provider.getEndpoint()
|
|
|
|
def _assertProviderNodeAttributes(self, pnode):
|
|
self.assertEqual(pnode.connection_type,
|
|
self.cloud_test_connection_type)
|
|
self.assertIsNotNone(pnode.interface_ip)
|
|
|
|
def _test_node_lifecycle(self, label):
|
|
# Call this in a test to run a node lifecycle
|
|
for _ in iterate_timeout(
|
|
30, "scheduler and launcher to have the same layout"):
|
|
if (self.scheds.first.sched.local_layout_state.get("tenant-one") ==
|
|
self.launcher.local_layout_state.get("tenant-one")):
|
|
break
|
|
endpoint = self._getEndpoint()
|
|
nodeset = model.NodeSet()
|
|
nodeset.addNode(model.Node("node", label))
|
|
|
|
ctx = self.createZKContext(None)
|
|
request = self.requestNodes([n.label for n in nodeset.getNodes()])
|
|
|
|
client = LauncherClient(self.zk_client, None)
|
|
request = client.getRequest(request.uuid)
|
|
|
|
self.assertEqual(request.state, model.NodesetRequest.State.FULFILLED)
|
|
self.assertEqual(len(request.nodes), 1)
|
|
|
|
client.acceptNodeset(request, nodeset)
|
|
self.waitUntilSettled()
|
|
|
|
with testtools.ExpectedException(NoNodeError):
|
|
# Request should be gone
|
|
request.refresh(ctx)
|
|
|
|
for node in nodeset.getNodes():
|
|
pnode = node._provider_node
|
|
self.assertIsNotNone(pnode)
|
|
self.assertTrue(pnode.hasLock())
|
|
self._assertProviderNodeAttributes(pnode)
|
|
|
|
for _ in iterate_timeout(10, "instances to appear"):
|
|
if len(list(endpoint.listInstances())) > 0:
|
|
break
|
|
client.useNodeset(nodeset)
|
|
self.waitUntilSettled()
|
|
|
|
for node in nodeset.getNodes():
|
|
pnode = node._provider_node
|
|
self.assertTrue(pnode.hasLock())
|
|
self.assertTrue(pnode.state, pnode.State.IN_USE)
|
|
|
|
client.returnNodeset(nodeset)
|
|
self.waitUntilSettled()
|
|
|
|
for node in nodeset.getNodes():
|
|
pnode = node._provider_node
|
|
self.assertFalse(pnode.hasLock())
|
|
self.assertTrue(pnode.state, pnode.State.USED)
|
|
|
|
for _ in iterate_timeout(60, "node to be deleted"):
|
|
try:
|
|
pnode.refresh(ctx)
|
|
except NoNodeError:
|
|
break
|
|
|
|
# Iterate here because the aws driver (at least) performs
|
|
# delayed async deletes.
|
|
for _ in iterate_timeout(60, "instances to be deleted"):
|
|
if len(list(endpoint.listInstances())) == 0:
|
|
break
|
|
|
|
def _test_quota(self, label):
|
|
for _ in iterate_timeout(
|
|
30, "scheduler and launcher to have the same layout"):
|
|
if (self.scheds.first.sched.local_layout_state.get("tenant-one") ==
|
|
self.launcher.local_layout_state.get("tenant-one")):
|
|
break
|
|
endpoint = self._getEndpoint()
|
|
nodeset1 = model.NodeSet()
|
|
nodeset1.addNode(model.Node("node", label))
|
|
|
|
nodeset2 = model.NodeSet()
|
|
nodeset2.addNode(model.Node("node", label))
|
|
|
|
ctx = self.createZKContext(None)
|
|
request1 = self.requestNodes([n.label for n in nodeset1.getNodes()])
|
|
request2 = self.requestNodes(
|
|
[n.label for n in nodeset2.getNodes()],
|
|
timeout=None)
|
|
|
|
client = LauncherClient(self.zk_client, None)
|
|
request1 = client.getRequest(request1.uuid)
|
|
|
|
self.assertEqual(request1.state, model.NodesetRequest.State.FULFILLED)
|
|
self.assertEqual(len(request1.nodes), 1)
|
|
|
|
client.acceptNodeset(request1, nodeset1)
|
|
client.useNodeset(nodeset1)
|
|
|
|
# We should still be waiting on request2.
|
|
|
|
# TODO: This is potentially racy (but only producing
|
|
# false-negatives) and also slow. We should find a way to
|
|
# determine if the launcher had paused a provider.
|
|
with testtools.ExpectedException(Exception):
|
|
self.waitForNodeRequest(request2, 10)
|
|
request2 = client.getRequest(request2.uuid)
|
|
self.assertEqual(request2.state, model.NodesetRequest.State.ACCEPTED)
|
|
|
|
client.returnNodeset(nodeset1)
|
|
self.waitUntilSettled()
|
|
|
|
for node in nodeset1.getNodes():
|
|
pnode = node._provider_node
|
|
for _ in iterate_timeout(60, "node to be deleted"):
|
|
try:
|
|
pnode.refresh(ctx)
|
|
except NoNodeError:
|
|
break
|
|
|
|
self.waitForNodeRequest(request2, 10)
|
|
request2 = client.getRequest(request2.uuid)
|
|
self.assertEqual(request2.state, model.NodesetRequest.State.FULFILLED)
|
|
self.assertEqual(len(request2.nodes), 1)
|
|
|
|
client.acceptNodeset(request2, nodeset2)
|
|
client.useNodeset(nodeset2)
|
|
client.returnNodeset(nodeset2)
|
|
self.waitUntilSettled()
|
|
|
|
# Iterate here because the aws driver (at least) performs
|
|
# delayed async deletes.
|
|
for _ in iterate_timeout(60, "instances to be deleted"):
|
|
if len(list(endpoint.listInstances())) == 0:
|
|
break
|
|
|
|
def _test_diskimage(self):
|
|
self.waitUntilSettled()
|
|
self.assertHistory([
|
|
dict(name='build-debian-local-image', result='SUCCESS'),
|
|
], ordered=False)
|
|
|
|
name = 'review.example.com%2Forg%2Fcommon-config/debian-local'
|
|
artifacts = self.launcher.image_build_registry.\
|
|
getArtifactsForImage(name)
|
|
self.assertEqual(1, len(artifacts))
|
|
self.assertEqual(self.cloud_test_image_format, artifacts[0].format)
|
|
self.assertTrue(artifacts[0].validated)
|
|
uploads = self.launcher.image_upload_registry.getUploadsForImage(
|
|
name)
|
|
self.assertEqual(1, len(uploads))
|
|
self.assertEqual(artifacts[0].uuid, uploads[0].artifact_uuid)
|
|
self.assertIsNotNone(uploads[0].external_id)
|
|
self.assertTrue(uploads[0].validated)
|