Improve error handling on deleted nodes

This change:
* catches exceptions in the node cleanup method so they don't
  stop the main run method
* implements a timeout on node deletion so that if we can't delete
  an instance from the cloud, we still eventually delete the node
* switches from monotonic to regular time on the state machine
  start time initializer (since unlike nodepool, this value will
  be used by multiple components).

Change-Id: I17d99dea5f0d165579fc7d804615a0ad09023d44
This commit is contained in:
James E. Blair
2024-10-19 08:53:25 -07:00
parent 1b13bf6ec9
commit c53129c79f
3 changed files with 58 additions and 7 deletions
+35
View File
@@ -595,6 +595,41 @@ class TestLauncher(ZuulTestCase):
except NoNodeError:
break
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
@mock.patch(
'zuul.driver.aws.awsendpoint.AwsCreateStateMachine.advance',
side_effect=Exception("Fake error"))
@mock.patch(
'zuul.driver.aws.awsendpoint.AwsDeleteStateMachine.advance',
side_effect=Exception("Fake error"))
@mock.patch('zuul.launcher.server.Launcher.DELETE_TIMEOUT', 1)
@okay_tracebacks('_execute_mock_call')
def test_failed_node3(self, mock_create, mock_delete):
# Test a node failure inside both the create and delete state
# machines
ctx = self.createZKContext(None)
request = self.requestNodes(["debian-normal"])
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
provider_nodes = request.provider_nodes[0]
self.assertEqual(len(provider_nodes), 2)
self.assertEqual(len(request.nodes), 1)
self.assertEqual(provider_nodes[-1], request.nodes[-1])
provider_nodes = []
for node_id in request.nodes:
provider_nodes.append(model.ProviderNode.fromZK(
ctx, path=model.ProviderNode._getPath(node_id)))
request.delete(ctx)
self.waitUntilSettled()
for pnode in provider_nodes:
for _ in iterate_timeout(60, "node to be deleted"):
try:
pnode.refresh(ctx)
except NoNodeError:
break
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
@return_data(
'build-debian-local-image',
+22 -6
View File
@@ -177,6 +177,7 @@ class Launcher:
log = logging.getLogger("zuul.Launcher")
# Max. time the main event loop is allowed to sleep
MAX_SLEEP = 1
DELETE_TIMEOUT = 600
def __init__(self, config, connections):
self._running = True
@@ -479,9 +480,14 @@ class Launcher:
# * deallocate from request here
# * re-allocated similar to min-ready
if not request or node.state in node.State.FAILED:
self._cleanupNode(node, log)
try:
self._cleanupNode(node, log)
except Exception:
log.exception("Error in node cleanup")
self.wake_event.set()
def _checkNode(self, node, log):
# TODO: check timeout
with self.createZKContext(node._lock, self.log) as ctx:
with node.activeContext(ctx):
if not node.create_state_machine:
@@ -521,11 +527,21 @@ class Launcher:
old_state = node.delete_state_machine.state
log.debug("Checking node %s cleanup", node)
node.delete_state_machine.advance()
new_state = node.delete_state_machine.state
if old_state != new_state:
log.debug("Node %s advanced from %s to %s",
node, old_state, new_state)
now = time.time()
if (now - node.delete_state_machine.start_time >
self.DELETE_TIMEOUT):
log.error("Timeout deleting node %s", node)
node.delete_state_machine.state =\
node.delete_state_machine.COMPLETE
node.delete_state_machine.complete = True
if not node.delete_state_machine.complete:
node.delete_state_machine.advance()
new_state = node.delete_state_machine.state
if old_state != new_state:
log.debug("Node %s advanced from %s to %s",
node, old_state, new_state)
if not node.delete_state_machine.complete:
self.wake_event.set()
+1 -1
View File
@@ -21,7 +21,7 @@ class StateMachine:
def __init__(self, initial_state):
self.state = initial_state.get("state", self.START)
self.complete = False
self.start_time = initial_state.get("start_time", time.monotonic())
self.start_time = initial_state.get("start_time", time.time())
def advance(self):
pass