Handle create state-machine errors as node failure
Treat all exceptions from the create state-machine as fatal errors and fail the node request. This resembles how we handle state-machine errors in Nodepool today.

This change also adds the missing `RuntimeConfigurationException`.

Change-Id: Iac0291d1c987781df1844fc4b9b64ef12912ad4f
This commit is contained in:

committed by
James E. Blair

parent
2378376ca6
commit
861967392e
12
tests/fixtures/layouts/nodepool.yaml
vendored
12
tests/fixtures/layouts/nodepool.yaml
vendored
@ -97,6 +97,9 @@
|
||||
- flavor:
|
||||
name: dedicated
|
||||
|
||||
- flavor:
|
||||
name: invalid
|
||||
|
||||
- label:
|
||||
name: debian-normal
|
||||
image: debian
|
||||
@ -112,6 +115,11 @@
|
||||
image: debian
|
||||
flavor: dedicated
|
||||
|
||||
- label:
|
||||
name: debian-invalid
|
||||
image: debian
|
||||
flavor: invalid
|
||||
|
||||
- section:
|
||||
name: aws-base
|
||||
abstract: true
|
||||
@ -130,6 +138,8 @@
|
||||
- name: dedicated
|
||||
instance-type: t3.large
|
||||
dedicated-host: True
|
||||
- name: invalid
|
||||
instance-type: invalid
|
||||
images:
|
||||
- name: debian
|
||||
image-id: ami-1e749f67
|
||||
@ -149,3 +159,5 @@
|
||||
key-name: zuul
|
||||
- name: debian-dedicated
|
||||
key-name: zuul
|
||||
- name: debian-invalid
|
||||
key-name: zuul
|
||||
|
@ -1398,6 +1398,10 @@ class TestNodepoolConfig(ZuulTestCase):
|
||||
self.assertEqual('debian-dedicated', label.name)
|
||||
self.assertEqual('debian', label.image)
|
||||
self.assertEqual('dedicated', label.flavor)
|
||||
label = layout.labels['debian-invalid']
|
||||
self.assertEqual('debian-invalid', label.name)
|
||||
self.assertEqual('debian', label.image)
|
||||
self.assertEqual('invalid', label.flavor)
|
||||
section = layout.sections['aws-base']
|
||||
self.assertEqual('aws-base', section.name)
|
||||
self.assertEqual(True, section.abstract)
|
||||
@ -1406,11 +1410,12 @@ class TestNodepoolConfig(ZuulTestCase):
|
||||
self.assertEqual('aws-us-east-1-main', provider_config.name)
|
||||
self.assertEqual('aws-us-east-1', provider_config.section)
|
||||
provider = layout.providers['aws-us-east-1-main']
|
||||
self.assertEqual(3, len(provider.labels))
|
||||
self.assertEqual(4, len(provider.labels))
|
||||
labels = sorted([x for x in provider.labels.keys()])
|
||||
self.assertEqual('debian-dedicated', labels[0])
|
||||
self.assertEqual('debian-large', labels[1])
|
||||
self.assertEqual('debian-normal', labels[2])
|
||||
self.assertEqual('debian-invalid', labels[1])
|
||||
self.assertEqual('debian-large', labels[2])
|
||||
self.assertEqual('debian-normal', labels[3])
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
def test_section_inheritance(self):
|
||||
|
@ -30,6 +30,7 @@ from moto import mock_aws
|
||||
from tests.base import (
|
||||
ZuulTestCase,
|
||||
iterate_timeout,
|
||||
okay_tracebacks,
|
||||
simple_layout,
|
||||
return_data,
|
||||
)
|
||||
@ -249,43 +250,49 @@ class TestLauncher(ZuulTestCase):
|
||||
self.assertEqual("test_external_id", uploads[0].external_id)
|
||||
self.assertTrue(uploads[0].validated)
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
def test_launcher_missing_label(self):
|
||||
def _requestNodes(self, labels):
|
||||
result_queue = PipelineResultEventQueue(
|
||||
self.zk_client, "tenant-one", "check")
|
||||
labels = ["debian-normal", "debian-unavailable"]
|
||||
|
||||
with self.createZKContext(None) as ctx:
|
||||
# Lock the pipeline, so we can grab the result event
|
||||
with pipeline_lock(self.zk_client, "tenant-one", "check"):
|
||||
request = model.NodesetRequest.new(
|
||||
ctx,
|
||||
tenant_name="tenant-one",
|
||||
pipeline_name="check",
|
||||
buildset_uuid=uuid.uuid4().hex,
|
||||
job_uuid=uuid.uuid4().hex,
|
||||
job_name="foobar",
|
||||
labels=labels,
|
||||
priority=100,
|
||||
request_time=time.time(),
|
||||
zuul_event_id=uuid.uuid4().hex,
|
||||
span_info=None,
|
||||
)
|
||||
for _ in iterate_timeout(
|
||||
10, "nodeset request to be fulfilled"):
|
||||
result_events = list(result_queue)
|
||||
if result_events:
|
||||
for event in result_events:
|
||||
# Remove event(s) from queue
|
||||
result_queue.ack(event)
|
||||
break
|
||||
|
||||
self.assertEqual(len(result_events), 1)
|
||||
for event in result_queue:
|
||||
self.assertIsInstance(event, model.NodesProvisionedEvent)
|
||||
self.assertEqual(event.request_id, request.uuid)
|
||||
self.assertEqual(event.build_set_uuid, request.buildset_uuid)
|
||||
|
||||
request.refresh(ctx)
|
||||
return request
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
def test_launcher_missing_label(self):
|
||||
ctx = self.createZKContext(None)
|
||||
# Lock the pipeline, so we can grab the result event
|
||||
with pipeline_lock(self.zk_client, "tenant-one", "check"):
|
||||
request = model.NodesetRequest.new(
|
||||
ctx,
|
||||
tenant_name="tenant-one",
|
||||
pipeline_name="check",
|
||||
buildset_uuid=uuid.uuid4().hex,
|
||||
job_uuid=uuid.uuid4().hex,
|
||||
job_name="foobar",
|
||||
labels=labels,
|
||||
priority=100,
|
||||
request_time=time.time(),
|
||||
zuul_event_id=uuid.uuid4().hex,
|
||||
span_info=None,
|
||||
)
|
||||
for _ in iterate_timeout(
|
||||
10, "nodeset request to be fulfilled"):
|
||||
result_events = list(result_queue)
|
||||
if result_events:
|
||||
for event in result_events:
|
||||
# Remove event(s) from queue
|
||||
result_queue.ack(event)
|
||||
break
|
||||
|
||||
self.assertEqual(len(result_events), 1)
|
||||
for event in result_queue:
|
||||
self.assertEqual(event.request_id, request.uuid)
|
||||
self.assertEqual(event.build_set_uuid, request.buildset_uuid)
|
||||
|
||||
request.refresh(ctx)
|
||||
labels = ["debian-normal", "debian-unavailable"]
|
||||
request = self._requestNodes(labels)
|
||||
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
|
||||
self.assertEqual(len(request.provider_nodes), 0)
|
||||
|
||||
@ -294,42 +301,14 @@ class TestLauncher(ZuulTestCase):
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
def test_node_lifecycle(self):
|
||||
result_queue = PipelineResultEventQueue(
|
||||
self.zk_client, "tenant-one", "check")
|
||||
nodeset = model.NodeSet()
|
||||
nodeset.addNode(model.Node("node", "debian-normal"))
|
||||
|
||||
ctx = self.createZKContext(None)
|
||||
# Lock the pipeline, so we can grab the result event
|
||||
with pipeline_lock(self.zk_client, "tenant-one", "check"):
|
||||
model.NodesetRequest.new(
|
||||
ctx,
|
||||
tenant_name="tenant-one",
|
||||
pipeline_name="check",
|
||||
buildset_uuid=uuid.uuid4().hex,
|
||||
job_uuid=uuid.uuid4().hex,
|
||||
job_name="foobar",
|
||||
labels=[n.label for n in nodeset.getNodes()],
|
||||
priority=100,
|
||||
request_time=time.time(),
|
||||
zuul_event_id=uuid.uuid4().hex,
|
||||
span_info=None,
|
||||
)
|
||||
for _ in iterate_timeout(
|
||||
10, "nodeset request to be fulfilled"):
|
||||
result_events = list(result_queue)
|
||||
if result_events:
|
||||
for event in result_events:
|
||||
# Remove event(s) from queue
|
||||
result_queue.ack(event)
|
||||
break
|
||||
|
||||
self.assertEqual(len(result_events), 1)
|
||||
for event in result_queue:
|
||||
self.assertIsInstance(event, model.NodesProvisionedEvent)
|
||||
request = self._requestNodes([n.label for n in nodeset.getNodes()])
|
||||
|
||||
client = LauncherClient(self.zk_client, None)
|
||||
request = client.getRequest(event.request_id)
|
||||
request = client.getRequest(request.uuid)
|
||||
|
||||
self.assertEqual(request.state, model.NodesetRequest.State.FULFILLED)
|
||||
self.assertEqual(len(request.provider_nodes), 1)
|
||||
@ -370,47 +349,15 @@ class TestLauncher(ZuulTestCase):
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
def test_lost_nodeset_request(self):
|
||||
result_queue = PipelineResultEventQueue(
|
||||
self.zk_client, "tenant-one", "check")
|
||||
|
||||
ctx = self.createZKContext(None)
|
||||
# Lock the pipeline, so we can grab the result event
|
||||
with pipeline_lock(self.zk_client, "tenant-one", "check"):
|
||||
model.NodesetRequest.new(
|
||||
ctx,
|
||||
tenant_name="tenant-one",
|
||||
pipeline_name="check",
|
||||
buildset_uuid=uuid.uuid4().hex,
|
||||
job_uuid=uuid.uuid4().hex,
|
||||
job_name="foobar",
|
||||
labels=["debian-normal"],
|
||||
priority=100,
|
||||
request_time=time.time(),
|
||||
zuul_event_id=uuid.uuid4().hex,
|
||||
span_info=None,
|
||||
)
|
||||
for _ in iterate_timeout(
|
||||
10, "nodeset request to be fulfilled"):
|
||||
result_events = list(result_queue)
|
||||
if result_events:
|
||||
for event in result_events:
|
||||
# Remove event(s) from queue
|
||||
result_queue.ack(event)
|
||||
break
|
||||
|
||||
self.assertEqual(len(result_events), 1)
|
||||
for event in result_queue:
|
||||
self.assertIsInstance(event, model.NodesProvisionedEvent)
|
||||
|
||||
client = LauncherClient(self.zk_client, None)
|
||||
request = client.getRequest(event.request_id)
|
||||
request = self._requestNodes(["debian-normal"])
|
||||
|
||||
provider_nodes = []
|
||||
for node_id in request.provider_nodes:
|
||||
provider_nodes.append(model.ProviderNode.fromZK(
|
||||
ctx, path=model.ProviderNode._getPath(node_id)))
|
||||
|
||||
client.deleteRequest(request)
|
||||
request.delete(ctx)
|
||||
self.waitUntilSettled()
|
||||
|
||||
with testtools.ExpectedException(NoNodeError):
|
||||
@ -426,6 +373,29 @@ class TestLauncher(ZuulTestCase):
|
||||
except NoNodeError:
|
||||
break
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
@okay_tracebacks('_getQuotaForInstanceType')
|
||||
def test_failed_node(self):
|
||||
ctx = self.createZKContext(None)
|
||||
request = self._requestNodes(["debian-invalid"])
|
||||
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
|
||||
self.assertEqual(len(request.provider_nodes), 1)
|
||||
|
||||
provider_nodes = []
|
||||
for node_id in request.provider_nodes:
|
||||
provider_nodes.append(model.ProviderNode.fromZK(
|
||||
ctx, path=model.ProviderNode._getPath(node_id)))
|
||||
|
||||
request.delete(ctx)
|
||||
self.waitUntilSettled()
|
||||
|
||||
for pnode in provider_nodes:
|
||||
for _ in iterate_timeout(60, "node to be deleted"):
|
||||
try:
|
||||
pnode.refresh(ctx)
|
||||
except NoNodeError:
|
||||
break
|
||||
|
||||
|
||||
class TestLauncherImagePermissions(ZuulTestCase):
|
||||
config_file = 'zuul-connections-nodepool.conf'
|
||||
|
@ -75,6 +75,10 @@ class CapacityException(Exception):
|
||||
statsd_key = 'error.capacity'
|
||||
|
||||
|
||||
class RuntimeConfigurationException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
# Authentication Exceptions
|
||||
class AuthTokenException(Exception):
|
||||
defaultMsg = 'Unknown Error'
|
||||
|
@ -396,10 +396,9 @@ class Launcher:
|
||||
self._checkNode(node, log)
|
||||
if node.state == model.ProviderNode.State.READY:
|
||||
node.releaseLock()
|
||||
except ProviderNodeError as err:
|
||||
except Exception:
|
||||
state = model.ProviderNode.State.FAILED
|
||||
log.exception("Marking node %s as %s: %s", node,
|
||||
state, err)
|
||||
log.exception("Marking node %s as %s", node, state)
|
||||
with self.createZKContext(node._lock, self.log) as ctx:
|
||||
node.updateAttributes(ctx, state=state)
|
||||
|
||||
|
Reference in New Issue
Block a user