Demote launch/delete timeouts to warnings

If we hit the internal timeout while launching or deleting a server,
we raise an exception and then log the traceback.  This is a
not-unexpected occurrence, and the traceback is not useful since it's
just one stack frame within the same class, so instead, let's log
these timeouts at warning level without the traceback.

Change-Id: Id4806d8ea2d0a232504e5a75d69cec239bcac670
James E. Blair 2024-04-16 10:13:38 -07:00
parent 6a71686b6b
commit eb17811484
4 changed files with 60 additions and 16 deletions
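As a minimal standalone sketch of the pattern the commit adopts (the logger
and class names here are stand-ins; the real classes appear in the diffs
below):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger("example")


    class TimeoutException(Exception):
        """Stand-in for nodepool's TimeoutException base class."""


    def report(e):
        if isinstance(e, TimeoutException):
            # Expected condition: one line at warning level, no traceback.
            log.warning("Launch failed: %s", str(e))
        else:
            # Unexpected failure: keep the full traceback for debugging.
            log.exception("Launch failed:")


    try:
        raise TimeoutException("Timeout waiting for instance creation")
    except Exception as e:
        report(e)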

View File

@@ -128,6 +128,7 @@ class FakeOpenStackCloud(object):
         self.compute = FakeSession(self)
         self.pause_creates = False
         self._image_list = images
+        self._create_server_timeout = 0
         if self._image_list is None:
             self._image_list = [
                 Dummy(
@@ -265,10 +266,11 @@ class FakeOpenStackCloud(object):
             volumes=[],
             _kw=kw)
         instance_list.append(s)
-        t = threading.Thread(target=self._finish,
-                             name='FakeProvider create',
-                             args=(s, 0.1, done_status))
-        t.start()
+        if not kw.get('_test_timeout'):
+            t = threading.Thread(target=self._finish,
+                                 name='FakeProvider create',
+                                 args=(s, 0.1, done_status))
+            t.start()
         return s.copy()

     def _delete(self, name_or_id, instance_list):
@@ -330,6 +332,9 @@ class FakeOpenStackCloud(object):
         return server

     def create_server(self, **kw):
+        if self._create_server_timeout:
+            self._create_server_timeout -= 1
+            kw['_test_timeout'] = True
         return self._create(self._server_list, **kw)

     def get_server(self, name_or_id):
@@ -472,10 +477,10 @@ class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):

     def delete_server(self, *args, **kwargs):
         if self.times_to_fail_delete is None:
-            raise exceptions.ServerDeleteException("Test fail server delete.")
+            raise Exception("Test fail server delete.")
         if self.times_failed_delete < self.times_to_fail_delete:
             self.times_failed_delete += 1
-            raise exceptions.ServerDeleteException("Test fail server delete.")
+            raise Exception("Test fail server delete.")
         else:
             self.delete_success = True
             return super().delete_server(*args, **kwargs)
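The hook above lets a test arrange for the next N create calls to stall:
while the counter is nonzero, the fake skips the background thread that
would move a server to ACTIVE, so the launcher's own deadline is what ends
the wait. A self-contained sketch of that mechanism (simplified names, not
the fake's actual code):

    import threading
    import time


    class FakeCloudSketch:
        """Illustrative stand-in for the fake cloud's timeout hook."""

        def __init__(self):
            self._create_server_timeout = 0
            self.servers = []

        def _finish(self, server):
            server['status'] = 'ACTIVE'

        def create_server(self, name):
            server = {'name': name, 'status': 'BUILD'}
            self.servers.append(server)
            if self._create_server_timeout:
                # Consume one "stuck" create: skip the completion step so
                # the server stays in BUILD and the caller's deadline fires.
                self._create_server_timeout -= 1
            else:
                threading.Timer(0.1, self._finish, args=(server,)).start()
            return server


    cloud = FakeCloudSketch()
    cloud._create_server_timeout = 1
    s = cloud.create_server('node0')
    time.sleep(0.2)
    print(s['status'])  # still BUILD; a real launcher would time out waiting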

View File

@@ -273,7 +273,8 @@ class StateMachineNodeLauncher(stats.StatsReporter):
             now = time.monotonic()
             if (now - state_machine.start_time >
                 self.manager.provider.launch_timeout):
-                raise Exception("Timeout waiting for instance creation")
+                raise exceptions.ServerCreateTimeoutException(
+                    "Timeout waiting for instance creation")

             if not self.runDeleteStateMachine():
                 return
@@ -321,8 +322,13 @@ class StateMachineNodeLauncher(stats.StatsReporter):
                 self.manager.nodescan_worker.removeRequest(self.nodescan_request)
                 self.nodescan_request = None
         except Exception as e:
-            self.log.exception("Launch attempt %d/%d for node %s, failed:",
-                               self.attempts, self.retries, node.id)
+            if isinstance(e, exceptions.TimeoutException):
+                self.log.warning(
+                    "Launch attempt %d/%d for node %s, failed: %s",
+                    self.attempts, self.retries, node.id, str(e))
+            else:
+                self.log.exception("Launch attempt %d/%d for node %s, failed:",
+                                   self.attempts, self.retries, node.id)
             if self.state_machine and self.state_machine.external_id:
                 # If we're deleting, don't overwrite the node external
                 # id, because we may make another delete state machine
@@ -429,7 +435,8 @@ class StateMachineNodeDeleter:
         try:
             now = time.monotonic()
             if now - state_machine.start_time > self.DELETE_TIMEOUT:
-                raise Exception("Timeout waiting for instance deletion")
+                raise exceptions.ServerDeleteTimeoutException(
+                    "Timeout waiting for instance deletion")

             if state_machine.state == state_machine.START:
                 node.state = zk.DELETING
@@ -451,9 +458,13 @@ class StateMachineNodeDeleter:
         except exceptions.NotFound:
             self.log.info("Instance %s not found in provider %s",
                           node.external_id, node.provider)
-        except Exception:
-            self.log.exception("Exception deleting instance %s from %s:",
-                               node.external_id, node.provider)
+        except Exception as e:
+            if isinstance(e, exceptions.TimeoutException):
+                self.log.warning("Failure deleting instance %s from %s: %s",
+                                 node.external_id, node.provider, str(e))
+            else:
+                self.log.exception("Exception deleting instance %s from %s:",
+                                   node.external_id, node.provider)
             # Don't delete the ZK node in this case, but do unlock it
             if node_exists:
                 self.zk.unlockNode(node)
@@ -1321,7 +1332,7 @@ class NodescanRequest:
     def _checkTimeout(self):
         now = time.monotonic()
         if now - self.start_time > self.timeout:
-            raise Exception(
+            raise exceptions.ConnectionTimeoutException(
                 f"Timeout connecting to {self.ip} on port {self.port}")

     def _checkTransport(self):
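All three deadline checks in this file follow the same monotonic-clock
pattern; a standalone sketch under assumed names:

    import time


    class ConnectionTimeoutException(Exception):
        """Stand-in for the real exception class."""


    class DeadlineSketch:
        def __init__(self, timeout):
            self.timeout = timeout
            # monotonic() is unaffected by wall-clock adjustments, which
            # makes it the right clock for measuring elapsed time.
            self.start_time = time.monotonic()

        def check(self):
            if time.monotonic() - self.start_time > self.timeout:
                raise ConnectionTimeoutException("Timeout waiting")


    d = DeadlineSketch(0.05)
    time.sleep(0.1)
    try:
        d.check()
    except ConnectionTimeoutException as e:
        print("caught:", e)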

View File

@@ -69,8 +69,12 @@ class IPAddTimeoutException(TimeoutException):
     statsd_key = 'error.ipadd'


 class ServerDeleteException(TimeoutException):
     statsd_key = 'error.serverdelete'


+class ServerCreateTimeoutException(TimeoutException):
+    statsd_key = 'error.servercreatetimeout'
+
+
+class ServerDeleteTimeoutException(TimeoutException):
+    statsd_key = 'error.serverdeletetimeout'


 class ImageCreateException(TimeoutException):
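Since both new classes derive from TimeoutException, the single isinstance
check in the launcher and deleter demotes them (and any future timeout
subclass) without enumerating each type. A quick standalone illustration
(the base class here is a stand-in):

    class TimeoutException(Exception):
        """Stand-in for nodepool's base timeout exception."""


    class ServerCreateTimeoutException(TimeoutException):
        statsd_key = 'error.servercreatetimeout'


    class ServerDeleteTimeoutException(TimeoutException):
        statsd_key = 'error.serverdeletetimeout'


    for exc in (ServerCreateTimeoutException("create timed out"),
                ServerDeleteTimeoutException("delete timed out"),
                RuntimeError("something unexpected")):
        # Only the timeout types are demoted to a warning.
        demoted = isinstance(exc, TimeoutException)
        print(type(exc).__name__, "-> warning" if demoted else "-> traceback")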

View File

@@ -625,6 +625,30 @@ class TestLauncher(tests.DBTestCase):
         self.assertEqual(req.state, zk.FAILED)
         self.assertNotEqual(req.declined_by, [])

+    def test_fail_request_on_launch_timeout(self):
+        '''
+        Test that provider launch timeout fails the request.
+        '''
+        configfile = self.setup_config('node_launch_timeout.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(pool)
+        manager = pool.getProviderManager('fake-provider')
+        client = pool.getProviderManager('fake-provider').adapter._getClient()
+        client._create_server_timeout = 2
+
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(0, manager.adapter.createServer_fails)
+        self.assertEqual(req.state, zk.FAILED)
+        self.assertNotEqual(req.declined_by, [])
+
     def test_az_change_recover(self):
         '''
         Test that nodepool recovers from az change in the cloud.