Fix node failures when at volume quota

When creating instances with boot from volume, we don't get
quota-related information in the exception raised by wait_for_server.
The fault information is also missing from the server munch that is
returned. This causes node failures when we run into the volume
quota. This can be fixed by explicitly fetching the server, if we got
one, and inspecting its fault information, which contains more
detail about the fault reason [1].

[1] Example fault reason:
Build of instance 4628f079-26a9-4a1d-aaa0-881ba4c7b9cb aborted:
VolumeSizeExceedsAvailableQuota: Requested volume or snapshot exceeds
allowed gigabytes quota. Requested 500G, quota is 10240G and 10050G
has been consumed.

Change-Id: I6d832d4dbe348646cd4fb49ee7cb5f6a6ad343cf
This commit is contained in:
Tobias Henkel 2019-07-19 12:33:57 +02:00 committed by David Shrewsbury
parent 83c7c1a3a6
commit 8678b34398
4 changed files with 85 additions and 1 deletions

View File

@ -259,6 +259,9 @@ class FakeOpenStackCloud(object):
result = self._get(name_or_id, self._server_list)
return result
def get_server_by_id(self, server_id):
    """Look up a fake server by its ID.

    Thin alias: forwards ``server_id`` unchanged to ``get_server`` and
    returns whatever that call returns.
    """
    server = self.get_server(server_id)
    return server
def _clean_floating_ip(self, server):
server.public_v4 = ''
server.public_v6 = ''
@ -326,6 +329,19 @@ class FakeUploadFailCloud(FakeOpenStackCloud):
return super(FakeUploadFailCloud, self).create_image(**kwargs)
class FakeLaunchAndGetFaultCloud(FakeOpenStackCloud):
    """Fake cloud whose ``wait_for_server`` always fails with a quota fault.

    The failure is reported twice: a ``fault`` dict is stored on the server
    object that was passed in, and a generic exception is raised.
    """
    log = logging.getLogger("nodepool.FakeLaunchAndGetFaultCloud")

    def __init__(self):
        super(FakeLaunchAndGetFaultCloud, self).__init__()

    def wait_for_server(self, server, **kwargs):
        """Record a quota fault on ``server`` and then raise.

        :param server: Server object that receives the ``fault`` attribute.
        :param kwargs: Accepted and ignored.
        :raises Exception: Always, with a message that does not mention
            the quota.
        """
        # The OpenStack provider launch code specifically looks for
        # 'quota' in the failure message.
        fault_info = dict(message='quota server fault')
        server.fault = fault_info
        raise Exception("wait_for_server failure")
class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
log = logging.getLogger("nodepool.FakeLaunchAndDeleteFailCloud")

View File

@ -258,6 +258,28 @@ class OpenStackNodeLauncher(NodeLauncher):
"Request %s: Launch attempt %d/%d failed for node %s:",
self.handler.request.id, attempts,
self._retries, self.node.id)
# If we got an external id we need to fetch the server info
# again in order to retrieve the fault reason, as this is not
# included in the server object we already have.
quota_exceeded = False
if self.node.external_id:
    try:
        # A missing server is treated like a server without fault info.
        server = self.handler.manager.getServerById(
            self.node.external_id) or {}
        # 'fault' may be absent or explicitly None; both mean that no
        # detailed fault information is available.
        fault = (server.get('fault') or {}).get('message')
        if fault:
            self.log.error(
                'Request %s: Detailed error for node %s: %s',
                self.handler.request.id, self.node.external_id,
                fault)
            # A fault mentioning 'quota' marks this launch failure as a
            # quota problem for the handling further below.
            if 'quota' in fault:
                quota_exceeded = True
    except Exception:
        # Best effort only: failing to fetch the fault details must not
        # mask the original launch failure. Both placeholders need an
        # argument, otherwise the logging call itself fails to format.
        self.log.exception(
            'Request %s: Failed to retrieve error information '
            'for node %s',
            self.handler.request.id, self.node.external_id)
# If we created an instance, delete it.
if self.node.external_id:
deleting_node = zk.Node()
@ -278,6 +300,9 @@ class OpenStackNodeLauncher(NodeLauncher):
if attempts == self._retries:
raise
if 'quota exceeded' in str(e).lower():
quota_exceeded = True
if quota_exceeded:
# A quota exception is not directly recoverable so bail
# out immediately with a specific exception.
self.log.info("Quota exceeded, invalidating quota cache")

View File

@ -363,6 +363,9 @@ class OpenStackProvider(Provider):
def getServer(self, server_id):
    """Return the result of the client's ``get_server`` for ``server_id``."""
    client = self._client
    return client.get_server(server_id)
def getServerById(self, server_id):
    """Return the result of the client's ``get_server_by_id`` lookup.

    ``server_id`` is passed through unchanged and the client's return
    value is handed back as-is.
    """
    client = self._client
    return client.get_server_by_id(server_id)
def getServerConsole(self, server_id):
try:
return self._client.get_server_console(server_id)

View File

@ -1721,10 +1721,50 @@ class TestLauncher(tests.DBTestCase):
while self.zk.countPoolNodes('fake-provider', 'main'):
time.sleep(0)
@mock.patch('nodepool.driver.openstack.provider.'
            'OpenStackProvider.invalidateQuotaCache')
def test_launchNode_node_fault_message(self, mock_invalidatequotacache):
    '''
    Verify a failed launch picks up the detailed server fault, if any.
    '''
    cloud = fakeprovider.FakeLaunchAndGetFaultCloud()

    def provide_cloud(*args, **kwargs):
        # Always hand out the same fake cloud instance.
        return cloud

    client_patch = fixtures.MockPatchObject(
        fakeprovider.FakeProvider, '_getClient', provide_cloud)
    self.useFixture(client_patch)

    configfile = self.setup_config('node_launch_retry.yaml')
    self.useBuilder(configfile)
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.cleanup_interval = 60
    pool.start()
    self.waitForImage('fake-provider', 'fake-image')

    request = zk.NodeRequest()
    request.state = zk.REQUESTED
    request.node_types.append('fake-label')
    self.zk.storeNodeRequest(request)

    # We expect the request to go PENDING and pause here because the
    # wait_for_server() defined in FakeLaunchAndGetFaultCloud should fail
    # and set the fault.message attribute on the server. When the code in
    # launch() catches this failure, it looks for the string 'quota' inside
    # this server attribute and makes the call to invalidateQuotaCache()
    # based on the presence of that string and a QuotaException is raised,
    # causing request handling to pause.
    self.waitForNodeRequest(request, (zk.PENDING,))
    workers = pool.getPoolWorkers('fake-provider')
    while not workers[0].paused_handler:
        time.sleep(0.1)
    self.assertTrue(mock_invalidatequotacache.called)
def test_launchNode_delete_error(self):
'''
Test that the launcher keeps trying to spawn a node in case of a
delete error
delete error
'''
fake_client = fakeprovider.FakeLaunchAndDeleteFailCloud(
times_to_fail=1)