Only fail requests if no cloud can service them

Previously, if we hit cloud errors while attempting to handle a request
in a particular cloud, we treated that node request as failed even if
other clouds had not yet attempted to fulfill it. This is problematic
because clouds have outages, which is exactly why nodepool supports
speaking to more than one cloud at a time.

We fix this by marking the request failed only once every registered
launcher (provider) has declined it. Otherwise, we put it back in the
requested state and let other providers have a go.

Change-Id: I519cdd2401f77cad97d60329e207dddafc8cd5a4
This commit is contained in:
Clark Boylan
2018-01-13 14:32:47 -08:00
parent 393ddcfaba
commit c4d047ab21
2 changed files with 16 additions and 22 deletions

View File

@@ -134,6 +134,17 @@ class NodeRequestHandler(object):
self.nodeset = []
def decline_request(self):
    """Record this launcher's decline and choose the request's next state.

    Appends this handler's launcher id to the request's ``declined_by``
    list, then compares that list against the launchers currently
    registered in ZooKeeper. The request is set to ``zk.FAILED`` when
    every registered launcher appears in ``declined_by``; otherwise it
    is set back to ``zk.REQUESTED``.

    Only the in-memory request object is modified here; this method
    does not store or unlock the request.
    """
    request = self.request
    request.declined_by.append(self.launcher_id)

    registered = set(self.zk.getRegisteredLaunchers())
    not_yet_declined = registered - set(request.declined_by)
    if not_yet_declined:
        # At least one registered launcher has not declined the request,
        # so hand it back for another launcher to pick up.
        request.state = zk.REQUESTED
        return

    # Every registered launcher has declined it
    self.log.debug("Failing declined node request %s", request.id)
    request.state = zk.FAILED
def run(self):
'''
Execute node request handling.
@@ -147,10 +158,10 @@ class NodeRequestHandler(object):
self.run_handler()
except Exception:
self.log.exception(
"Exception in NodeRequestHandler for request %s:",
self.request.id)
"Declining node request %s due to exception in "
"NodeRequestHandler:", self.request.id)
self.decline_request()
self.unlockNodeSet(clear_allocation=True)
self.request.state = zk.FAILED
self.zk.storeNodeRequest(self.request)
self.zk.unlockNodeRequest(self.request)
self.done = True
@@ -196,15 +207,7 @@ class NodeRequestHandler(object):
if self.launch_manager.failed_nodes:
self.log.debug("Declining node request %s because nodes failed",
self.request.id)
self.request.declined_by.append(self.launcher_id)
launchers = set(self.zk.getRegisteredLaunchers())
if launchers.issubset(set(self.request.declined_by)):
# All launchers have declined it
self.log.debug("Failing declined node request %s",
self.request.id)
self.request.state = zk.FAILED
else:
self.request.state = zk.REQUESTED
self.decline_request()
else:
# The assigned nodes must be added to the request in the order
# in which they were requested.

View File

@@ -563,22 +563,13 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
if declined_reasons:
self.log.debug("Declining node request %s because %s",
self.request.id, ', '.join(declined_reasons))
self.request.declined_by.append(self.launcher_id)
launchers = set(self.zk.getRegisteredLaunchers())
if launchers.issubset(set(self.request.declined_by)):
self.log.debug("Failing declined node request %s",
self.request.id)
# All launchers have declined it
self.request.state = zk.FAILED
self.decline_request()
self.unlockNodeSet(clear_allocation=True)
# If conditions have changed for a paused request to now cause us
# to decline it, we need to unpause so we don't keep trying it
if self.paused:
self.paused = False
# If we didn't mark the request as failed above, reset it.
if self.request.state != zk.FAILED:
self.request.state = zk.REQUESTED
self.zk.storeNodeRequest(self.request)
self.zk.unlockNodeRequest(self.request)