Improve error handling on deleted nodes
This change: * catches exceptions in the node cleanup method so they don't stop the main run method * implements a timeout on node deletion so that if we can't delete an instance from the cloud, we still eventually delete the node * switches from monotonic to regular time on the state machine start time initializer (since unlike nodepool, this value will be used by multiple components). Change-Id: I17d99dea5f0d165579fc7d804615a0ad09023d44
This commit is contained in:
@@ -595,6 +595,41 @@ class TestLauncher(ZuulTestCase):
|
||||
except NoNodeError:
|
||||
break
|
||||
|
||||
@simple_layout('layouts/nodepool.yaml', enable_nodepool=True)
|
||||
@mock.patch(
|
||||
'zuul.driver.aws.awsendpoint.AwsCreateStateMachine.advance',
|
||||
side_effect=Exception("Fake error"))
|
||||
@mock.patch(
|
||||
'zuul.driver.aws.awsendpoint.AwsDeleteStateMachine.advance',
|
||||
side_effect=Exception("Fake error"))
|
||||
@mock.patch('zuul.launcher.server.Launcher.DELETE_TIMEOUT', 1)
|
||||
@okay_tracebacks('_execute_mock_call')
|
||||
def test_failed_node3(self, mock_create, mock_delete):
|
||||
# Test a node failure inside both the create and delete state
|
||||
# machines
|
||||
ctx = self.createZKContext(None)
|
||||
request = self.requestNodes(["debian-normal"])
|
||||
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
|
||||
provider_nodes = request.provider_nodes[0]
|
||||
self.assertEqual(len(provider_nodes), 2)
|
||||
self.assertEqual(len(request.nodes), 1)
|
||||
self.assertEqual(provider_nodes[-1], request.nodes[-1])
|
||||
|
||||
provider_nodes = []
|
||||
for node_id in request.nodes:
|
||||
provider_nodes.append(model.ProviderNode.fromZK(
|
||||
ctx, path=model.ProviderNode._getPath(node_id)))
|
||||
|
||||
request.delete(ctx)
|
||||
self.waitUntilSettled()
|
||||
|
||||
for pnode in provider_nodes:
|
||||
for _ in iterate_timeout(60, "node to be deleted"):
|
||||
try:
|
||||
pnode.refresh(ctx)
|
||||
except NoNodeError:
|
||||
break
|
||||
|
||||
@simple_layout('layouts/nodepool-image.yaml', enable_nodepool=True)
|
||||
@return_data(
|
||||
'build-debian-local-image',
|
||||
|
||||
+22
-6
@@ -177,6 +177,7 @@ class Launcher:
|
||||
log = logging.getLogger("zuul.Launcher")
|
||||
# Max. time the main event loop is allowed to sleep
|
||||
MAX_SLEEP = 1
|
||||
DELETE_TIMEOUT = 600
|
||||
|
||||
def __init__(self, config, connections):
|
||||
self._running = True
|
||||
@@ -479,9 +480,14 @@ class Launcher:
|
||||
# * deallocate from request here
|
||||
# * re-allocated similar to min-ready
|
||||
if not request or node.state in node.State.FAILED:
|
||||
self._cleanupNode(node, log)
|
||||
try:
|
||||
self._cleanupNode(node, log)
|
||||
except Exception:
|
||||
log.exception("Error in node cleanup")
|
||||
self.wake_event.set()
|
||||
|
||||
def _checkNode(self, node, log):
|
||||
# TODO: check timeout
|
||||
with self.createZKContext(node._lock, self.log) as ctx:
|
||||
with node.activeContext(ctx):
|
||||
if not node.create_state_machine:
|
||||
@@ -521,11 +527,21 @@ class Launcher:
|
||||
|
||||
old_state = node.delete_state_machine.state
|
||||
log.debug("Checking node %s cleanup", node)
|
||||
node.delete_state_machine.advance()
|
||||
new_state = node.delete_state_machine.state
|
||||
if old_state != new_state:
|
||||
log.debug("Node %s advanced from %s to %s",
|
||||
node, old_state, new_state)
|
||||
|
||||
now = time.time()
|
||||
if (now - node.delete_state_machine.start_time >
|
||||
self.DELETE_TIMEOUT):
|
||||
log.error("Timeout deleting node %s", node)
|
||||
node.delete_state_machine.state =\
|
||||
node.delete_state_machine.COMPLETE
|
||||
node.delete_state_machine.complete = True
|
||||
|
||||
if not node.delete_state_machine.complete:
|
||||
node.delete_state_machine.advance()
|
||||
new_state = node.delete_state_machine.state
|
||||
if old_state != new_state:
|
||||
log.debug("Node %s advanced from %s to %s",
|
||||
node, old_state, new_state)
|
||||
|
||||
if not node.delete_state_machine.complete:
|
||||
self.wake_event.set()
|
||||
|
||||
@@ -21,7 +21,7 @@ class StateMachine:
|
||||
def __init__(self, initial_state):
|
||||
self.state = initial_state.get("state", self.START)
|
||||
self.complete = False
|
||||
self.start_time = initial_state.get("start_time", time.monotonic())
|
||||
self.start_time = initial_state.get("start_time", time.time())
|
||||
|
||||
def advance(self):
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user