Handle create state-machine errors as node failure

Treat every exception raised by the create state-machine as a fatal
error: mark the node as failed and fail the node request. This mirrors
how Nodepool handles state-machine errors today.

This change also adds the missing `RuntimeConfigurationException`.

Change-Id: Iac0291d1c987781df1844fc4b9b64ef12912ad4f
This commit is contained in:
Simon Westphahl
2024-08-28 14:54:03 +02:00
committed by James E. Blair
parent 2378376ca6
commit 861967392e
5 changed files with 93 additions and 103 deletions

View File

@ -97,6 +97,9 @@
- flavor:
name: dedicated
- flavor:
name: invalid
- label:
name: debian-normal
image: debian
@ -112,6 +115,11 @@
image: debian
flavor: dedicated
- label:
name: debian-invalid
image: debian
flavor: invalid
- section:
name: aws-base
abstract: true
@ -130,6 +138,8 @@
- name: dedicated
instance-type: t3.large
dedicated-host: True
- name: invalid
instance-type: invalid
images:
- name: debian
image-id: ami-1e749f67
@ -149,3 +159,5 @@
key-name: zuul
- name: debian-dedicated
key-name: zuul
- name: debian-invalid
key-name: zuul

View File

@ -1398,6 +1398,10 @@ class TestNodepoolConfig(ZuulTestCase):
self.assertEqual('debian-dedicated', label.name)
self.assertEqual('debian', label.image)
self.assertEqual('dedicated', label.flavor)
label = layout.labels['debian-invalid']
self.assertEqual('debian-invalid', label.name)
self.assertEqual('debian', label.image)
self.assertEqual('invalid', label.flavor)
section = layout.sections['aws-base']
self.assertEqual('aws-base', section.name)
self.assertEqual(True, section.abstract)
@ -1406,11 +1410,12 @@ class TestNodepoolConfig(ZuulTestCase):
self.assertEqual('aws-us-east-1-main', provider_config.name)
self.assertEqual('aws-us-east-1', provider_config.section)
provider = layout.providers['aws-us-east-1-main']
self.assertEqual(3, len(provider.labels))
self.assertEqual(4, len(provider.labels))
labels = sorted([x for x in provider.labels.keys()])
self.assertEqual('debian-dedicated', labels[0])
self.assertEqual('debian-large', labels[1])
self.assertEqual('debian-normal', labels[2])
self.assertEqual('debian-invalid', labels[1])
self.assertEqual('debian-large', labels[2])
self.assertEqual('debian-normal', labels[3])
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
def test_section_inheritance(self):

View File

@ -30,6 +30,7 @@ from moto import mock_aws
from tests.base import (
ZuulTestCase,
iterate_timeout,
okay_tracebacks,
simple_layout,
return_data,
)
@ -249,43 +250,49 @@ class TestLauncher(ZuulTestCase):
self.assertEqual("test_external_id", uploads[0].external_id)
self.assertTrue(uploads[0].validated)
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
def test_launcher_missing_label(self):
def _requestNodes(self, labels):
result_queue = PipelineResultEventQueue(
self.zk_client, "tenant-one", "check")
labels = ["debian-normal", "debian-unavailable"]
with self.createZKContext(None) as ctx:
# Lock the pipeline, so we can grab the result event
with pipeline_lock(self.zk_client, "tenant-one", "check"):
request = model.NodesetRequest.new(
ctx,
tenant_name="tenant-one",
pipeline_name="check",
buildset_uuid=uuid.uuid4().hex,
job_uuid=uuid.uuid4().hex,
job_name="foobar",
labels=labels,
priority=100,
request_time=time.time(),
zuul_event_id=uuid.uuid4().hex,
span_info=None,
)
for _ in iterate_timeout(
10, "nodeset request to be fulfilled"):
result_events = list(result_queue)
if result_events:
for event in result_events:
# Remove event(s) from queue
result_queue.ack(event)
break
self.assertEqual(len(result_events), 1)
for event in result_queue:
self.assertIsInstance(event, model.NodesProvisionedEvent)
self.assertEqual(event.request_id, request.uuid)
self.assertEqual(event.build_set_uuid, request.buildset_uuid)
request.refresh(ctx)
return request
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
def test_launcher_missing_label(self):
ctx = self.createZKContext(None)
# Lock the pipeline, so we can grab the result event
with pipeline_lock(self.zk_client, "tenant-one", "check"):
request = model.NodesetRequest.new(
ctx,
tenant_name="tenant-one",
pipeline_name="check",
buildset_uuid=uuid.uuid4().hex,
job_uuid=uuid.uuid4().hex,
job_name="foobar",
labels=labels,
priority=100,
request_time=time.time(),
zuul_event_id=uuid.uuid4().hex,
span_info=None,
)
for _ in iterate_timeout(
10, "nodeset request to be fulfilled"):
result_events = list(result_queue)
if result_events:
for event in result_events:
# Remove event(s) from queue
result_queue.ack(event)
break
self.assertEqual(len(result_events), 1)
for event in result_queue:
self.assertEqual(event.request_id, request.uuid)
self.assertEqual(event.build_set_uuid, request.buildset_uuid)
request.refresh(ctx)
labels = ["debian-normal", "debian-unavailable"]
request = self._requestNodes(labels)
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
self.assertEqual(len(request.provider_nodes), 0)
@ -294,42 +301,14 @@ class TestLauncher(ZuulTestCase):
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
def test_node_lifecycle(self):
result_queue = PipelineResultEventQueue(
self.zk_client, "tenant-one", "check")
nodeset = model.NodeSet()
nodeset.addNode(model.Node("node", "debian-normal"))
ctx = self.createZKContext(None)
# Lock the pipeline, so we can grab the result event
with pipeline_lock(self.zk_client, "tenant-one", "check"):
model.NodesetRequest.new(
ctx,
tenant_name="tenant-one",
pipeline_name="check",
buildset_uuid=uuid.uuid4().hex,
job_uuid=uuid.uuid4().hex,
job_name="foobar",
labels=[n.label for n in nodeset.getNodes()],
priority=100,
request_time=time.time(),
zuul_event_id=uuid.uuid4().hex,
span_info=None,
)
for _ in iterate_timeout(
10, "nodeset request to be fulfilled"):
result_events = list(result_queue)
if result_events:
for event in result_events:
# Remove event(s) from queue
result_queue.ack(event)
break
self.assertEqual(len(result_events), 1)
for event in result_queue:
self.assertIsInstance(event, model.NodesProvisionedEvent)
request = self._requestNodes([n.label for n in nodeset.getNodes()])
client = LauncherClient(self.zk_client, None)
request = client.getRequest(event.request_id)
request = client.getRequest(request.uuid)
self.assertEqual(request.state, model.NodesetRequest.State.FULFILLED)
self.assertEqual(len(request.provider_nodes), 1)
@ -370,47 +349,15 @@ class TestLauncher(ZuulTestCase):
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
def test_lost_nodeset_request(self):
result_queue = PipelineResultEventQueue(
self.zk_client, "tenant-one", "check")
ctx = self.createZKContext(None)
# Lock the pipeline, so we can grab the result event
with pipeline_lock(self.zk_client, "tenant-one", "check"):
model.NodesetRequest.new(
ctx,
tenant_name="tenant-one",
pipeline_name="check",
buildset_uuid=uuid.uuid4().hex,
job_uuid=uuid.uuid4().hex,
job_name="foobar",
labels=["debian-normal"],
priority=100,
request_time=time.time(),
zuul_event_id=uuid.uuid4().hex,
span_info=None,
)
for _ in iterate_timeout(
10, "nodeset request to be fulfilled"):
result_events = list(result_queue)
if result_events:
for event in result_events:
# Remove event(s) from queue
result_queue.ack(event)
break
self.assertEqual(len(result_events), 1)
for event in result_queue:
self.assertIsInstance(event, model.NodesProvisionedEvent)
client = LauncherClient(self.zk_client, None)
request = client.getRequest(event.request_id)
request = self._requestNodes(["debian-normal"])
provider_nodes = []
for node_id in request.provider_nodes:
provider_nodes.append(model.ProviderNode.fromZK(
ctx, path=model.ProviderNode._getPath(node_id)))
client.deleteRequest(request)
request.delete(ctx)
self.waitUntilSettled()
with testtools.ExpectedException(NoNodeError):
@ -426,6 +373,29 @@ class TestLauncher(ZuulTestCase):
except NoNodeError:
break
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
@okay_tracebacks('_getQuotaForInstanceType')
def test_failed_node(self):
    # Request a node for the "debian-invalid" label and verify that
    # the request ends up FAILED while still referencing exactly one
    # provider node.  Afterwards delete the request and wait until
    # refreshing that node raises NoNodeError.
    # NOTE(review): the okay_tracebacks decorator presumably
    # whitelists the traceback expected from the failing create
    # state-machine -- confirm against tests.base.
    zk_ctx = self.createZKContext(None)
    request = self._requestNodes(["debian-invalid"])
    self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
    self.assertEqual(len(request.provider_nodes), 1)

    # Load the node objects before the request itself is deleted.
    failed_nodes = [
        model.ProviderNode.fromZK(
            zk_ctx, path=model.ProviderNode._getPath(node_id))
        for node_id in request.provider_nodes
    ]

    request.delete(zk_ctx)
    self.waitUntilSettled()

    for failed_node in failed_nodes:
        for _ in iterate_timeout(60, "node to be deleted"):
            try:
                failed_node.refresh(zk_ctx)
            except NoNodeError:
                # Refresh no longer finds the node: it has been removed.
                break
class TestLauncherImagePermissions(ZuulTestCase):
config_file = 'zuul-connections-nodepool.conf'

View File

@ -75,6 +75,10 @@ class CapacityException(Exception):
statsd_key = 'error.capacity'
class RuntimeConfigurationException(Exception):
    """Exception type for runtime configuration errors.

    NOTE(review): no raise site is visible in this change; the meaning
    is inferred from the class name -- confirm against the callers.
    """
# Authentication Exceptions
class AuthTokenException(Exception):
defaultMsg = 'Unknown Error'

View File

@ -396,10 +396,9 @@ class Launcher:
self._checkNode(node, log)
if node.state == model.ProviderNode.State.READY:
node.releaseLock()
except ProviderNodeError as err:
except Exception:
state = model.ProviderNode.State.FAILED
log.exception("Marking node %s as %s: %s", node,
state, err)
log.exception("Marking node %s as %s", node, state)
with self.createZKContext(node._lock, self.log) as ctx:
node.updateAttributes(ctx, state=state)