Fix relaunch attempts when hitting quota errors

The quota calculations in nodepool can never be perfectly accurate,
because there can still be races with other launchers in the tenant
or with other workloads sharing the tenant. So we must be able to
gracefully handle the case where the cloud refuses a launch because
of quota. Currently we just invalidate the quota cache and
immediately try to launch the node again. Of course that will fail
again in most cases, which makes the node request, and thus the zuul
job, fail with NODE_FAILURE.

Instead we need to pause the handler, just as we do when our own
quota calculation says we are over quota. We mark the node as
aborted, which is not a failure indicator, and pause the handler so
it automatically reschedules a new node launch as soon as the quota
calculation allows it.
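
To illustrate the idea, here is a minimal, self-contained sketch with
made-up names (QuotaError, launch_node, poll_launches are illustrative
stand-ins, not the real nodepool API; the actual changes are in the
diffs below):

    # Hypothetical stand-ins for the launcher and handler logic.
    READY, FAILED, ABORTED = 'ready', 'failed', 'aborted'

    class QuotaError(Exception):
        """The cloud refused the launch because of quota."""

    def launch_node(create_server):
        # An overquota refusal becomes ABORTED, not FAILED, so it
        # does not count as a failed launch attempt.
        try:
            create_server()
            return READY
        except QuotaError:
            return ABORTED

    def poll_launches(states):
        # Aborted launches pause the handler for a later relaunch
        # instead of failing the request with NODE_FAILURE.
        paused = any(s == ABORTED for s in states)
        failed = any(s == FAILED for s in states)
        return paused, failed

    # One launch hit the quota limit: pause, don't fail.
    assert poll_launches([READY, ABORTED]) == (True, False)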

Change-Id: I122268dc7723901c04a58fa6f5c1688a3fdab227
Author: Tobias Henkel
Date:   2018-01-23 19:43:10 +01:00
Commit: 31c276c234 (parent: eb52394c8c)

7 changed files with 77 additions and 25 deletions

View File

@@ -300,6 +300,15 @@ class LabelRecorder(object):
         self.data.remove({'label': label, 'node_id': node_id})
         return node_id
 
+    def removeNode(self, id):
+        '''
+        Remove the node with the specified ID.
+        '''
+        for d in self.data:
+            if d['node_id'] == id:
+                self.data.remove(d)
+                return
+
 
 class NodeRequestHandlerNotifications(object):
     """
@@ -645,9 +654,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
             return False
 
         # Launches are complete, so populate ready_nodes and failed_nodes.
-        for node in self.nodeset:
+        aborted_nodes = []
+        for node in self.nodeset[:]:
             if node.state == zk.READY:
                 self.ready_nodes.append(node)
+            elif node.state == zk.ABORTED:
+                # ABORTED is a transient error triggered by overquota. In order
+                # to handle this gracefully don't count this as failed so the
+                # node is relaunched within this provider. Unlock the node so
+                # the DeletedNodeWorker cleans up the zNode.
+                aborted_nodes.append(node)
+                self.nodeset.remove(node)
+                self.zk.unlockNode(node)
             else:
                 self.failed_nodes.append(node)
@@ -674,6 +692,14 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
             self.log.debug("Declining node request %s because nodes failed",
                            self.request.id)
             self.decline_request()
+        elif aborted_nodes:
+            # Because nodes are added to the satisfied types list before they
+            # are ready we need to remove the aborted nodes again so they can
+            # be created again.
+            for node in aborted_nodes:
+                self._satisfied_types.removeNode(node.id)
+            self.paused = True
+            return False
         else:
             # The assigned nodes must be added to the request in the order
             # in which they were requested.

View File

@@ -246,10 +246,12 @@ class OpenStackNodeLauncher(NodeLauncher):
                     self.zk.storeNode(self.node)
                 if attempts == self._retries:
                     raise
-                # Invalidate the quota cache if we encountered a quota error.
                 if 'quota exceeded' in str(e).lower():
+                    # A quota exception is not directly recoverable so bail
+                    # out immediately with a specific exception.
                     self.log.info("Quota exceeded, invalidating quota cache")
                     self.handler.manager.invalidateQuotaCache()
+                    raise exceptions.QuotaException("Quota exceeded")
                 attempts += 1
 
         self.node.state = zk.READY
@@ -380,8 +382,8 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
         '''
         Check if all launch requests have completed.
 
-        When all of the Node objects have reached a final state (READY or
-        FAILED), we'll know all threads have finished the launch process.
+        When all of the Node objects have reached a final state (READY, FAILED
+        or ABORTED), we'll know all threads have finished the launch process.
         '''
         if not self._threads:
             return True
@@ -392,9 +394,10 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
         node_states = [node.state for node in self.nodeset]
 
-        # NOTE: It very important that NodeLauncher always sets one of
+        # NOTE: It's very important that NodeLauncher always sets one of
         # these states, no matter what.
-        if not all(s in (zk.READY, zk.FAILED) for s in node_states):
+        if not all(s in (zk.READY, zk.FAILED, zk.ABORTED)
+                   for s in node_states):
             return False
 
         return True

View File

@@ -22,6 +22,7 @@ import time
 
 from kazoo import exceptions as kze
 
+from nodepool import exceptions
 from nodepool import stats
 from nodepool import zk
@@ -69,6 +70,16 @@ class NodeLauncher(threading.Thread,
                               self.node.id)
                 self.node.state = zk.FAILED
                 statsd_key = 'error.zksession'
+            except exceptions.QuotaException:
+                # We encountered a quota error when trying to launch a
+                # node. In this case we need to abort the launch. The upper
+                # layers will take care of this and reschedule a new node once
+                # the quota is ok again.
+                self.log.info("Aborting node %s due to quota failure" %
+                              self.node.id)
+                self.node.state = zk.ABORTED
+                self.zk.storeNode(self.node)
+                statsd_key = 'error.quota'
             except Exception as e:
                 self.log.exception("Launch failed for node %s:", self.node.id)
                 self.node.state = zk.FAILED

View File

@@ -45,6 +45,10 @@ class DibFailedError(BuilderError):
     pass
 
 
+class QuotaException(Exception):
+    pass
+
+
 class TimeoutException(Exception):
     pass

View File

@@ -224,6 +224,8 @@ class PoolWorker(threading.Thread):
                 try:
                     if not r.poll():
                         active_handlers.append(r)
+                        if r.paused:
+                            self.paused_handler = r
                     else:
                         self.log.debug("Removing handler for request %s",
                                        r.request.id)
@@ -662,7 +664,7 @@ class DeletedNodeWorker(BaseCleanupWorker):
         Delete instances from providers and nodes entries from ZooKeeper.
         '''
         cleanup_states = (zk.USED, zk.IN_USE, zk.BUILDING, zk.FAILED,
-                          zk.DELETING)
+                          zk.DELETING, zk.ABORTED)
 
         zk_conn = self._nodepool.getZK()
         for node in zk_conn.nodeIterator():

View File

@@ -297,32 +297,35 @@ class TestLauncher(tests.DBTestCase):
         # (according to nodepool's quota estimate) fails.
         client.max_instances = 1
 
-        # Request a second node; this request should fail.
+        # Request a second node; this request should pause the handler.
         req2 = zk.NodeRequest()
         req2.state = zk.REQUESTED
         req2.node_types.append('fake-label')
         self.log.debug("Adding second request")
         self.zk.storeNodeRequest(req2)
-        req2 = self.waitForNodeRequest(req2)
-        self.assertEqual(req2.state, zk.FAILED)
-
-        # After the second request failed, the internal quota estimate
-        # should be reset, so the next request should pause to wait
-        # for more quota to become available.
-        req3 = zk.NodeRequest()
-        req3.state = zk.REQUESTED
-        req3.node_types.append('fake-label')
-        self.log.debug("Adding third request")
-        self.zk.storeNodeRequest(req3)
-        req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
-        self.assertEqual(req3.state, zk.PENDING)
 
         # Wait until there is a paused request handler and verify that
         # there is still only one server built (from the first
        # request).
         pool_worker = pool.getPoolWorkers('fake-provider')
         while not pool_worker[0].paused_handler:
+            # self.log.debug("tick")
             time.sleep(0.1)
+        self.log.debug("finished waiting")
+
+        # The handler is paused now and the request should be in state PENDING
+        req2 = self.waitForNodeRequest(req2, zk.PENDING)
+        self.assertEqual(req2.state, zk.PENDING)
+
+        # Now free up the first node
+        self.log.debug("Marking first node as used %s", req1.id)
+        req1_node.state = zk.USED
+        self.zk.storeNode(req1_node)
+        self.zk.unlockNode(req1_node)
+        self.waitForNodeDeletion(req1_node)
+
+        # After the first node is cleaned up the second request should be
+        # able to fulfill now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertEqual(req2.state, zk.FULFILLED)
+
         self.assertEqual(len(client._server_list), 1)
 
     def test_fail_request_on_launch_failure(self):

View File

@@ -52,6 +52,9 @@ USED = 'used'
 HOLD = 'hold'
 # Initial node state
 INIT = 'init'
+# Aborted due to a transient error like overquota that should not count as a
+# failed launch attempt
+ABORTED = 'aborted'
 
 # NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -484,7 +487,7 @@ class Node(BaseModel):
     Class representing a launched node.
     '''
     VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
-                        HOLD, DELETING, FAILED, INIT])
+                        HOLD, DELETING, FAILED, INIT, ABORTED])
 
     def __init__(self, id=None):
         super(Node, self).__init__(id)