Demote launch/delete timeouts to warnings

If we hit the internal timeout while launching or deleting a server,
we raise an exception and then log the traceback.  This is not an
unexpected occurrence, and the traceback is not useful since it is
just one stack frame within the same class, so instead, log these
timeouts at warning level without the traceback.

Change-Id: Id4806d8ea2d0a232504e5a75d69cec239bcac670
James E. Blair 2024-04-16 10:13:38 -07:00
parent 6a71686b6b
commit 64452f1a26
6 changed files with 113 additions and 18 deletions
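For reference, the heart of the change is the error-handling pattern sketched below. This is a condensed, illustrative sketch only; the logger name, the launch() stand-in, and the try/except wrapper are not from the commit, while TimeoutException and the warning/exception split mirror the statemachine.py hunks further down.

import logging

log = logging.getLogger("nodepool.example")


class TimeoutException(Exception):
    # Mirrors the base class in nodepool's exceptions module.
    pass


def launch():
    # Stand-in for a launch that exceeds the provider's launch-timeout.
    raise TimeoutException("Timeout waiting for instance creation")


try:
    launch()
except Exception as e:
    if isinstance(e, TimeoutException):
        # Expected condition: log the message only, at warning level.
        log.warning("Launch failed: %s", str(e))
    else:
        # Unexpected condition: keep the full traceback.
        log.exception("Launch failed:")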


@@ -128,6 +128,7 @@ class FakeOpenStackCloud(object):
         self.compute = FakeSession(self)
         self.pause_creates = False
         self._image_list = images
+        self._create_server_timeout = 0
         if self._image_list is None:
             self._image_list = [
                 Dummy(
@@ -265,10 +266,11 @@ class FakeOpenStackCloud(object):
                   volumes=[],
                   _kw=kw)
         instance_list.append(s)
-        t = threading.Thread(target=self._finish,
-                             name='FakeProvider create',
-                             args=(s, 0.1, done_status))
-        t.start()
+        if not kw.get('_test_timeout'):
+            t = threading.Thread(target=self._finish,
+                                 name='FakeProvider create',
+                                 args=(s, 0.1, done_status))
+            t.start()
         return s.copy()
 
     def _delete(self, name_or_id, instance_list):
@@ -330,6 +332,9 @@ class FakeOpenStackCloud(object):
         return server
 
     def create_server(self, **kw):
+        if self._create_server_timeout:
+            self._create_server_timeout -= 1
+            kw['_test_timeout'] = True
         return self._create(self._server_list, **kw)
 
     def get_server(self, name_or_id):
@@ -472,10 +477,10 @@ class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
     def delete_server(self, *args, **kwargs):
         if self.times_to_fail_delete is None:
-            raise exceptions.ServerDeleteException("Test fail server delete.")
+            raise Exception("Test fail server delete.")
         if self.times_failed_delete < self.times_to_fail_delete:
             self.times_failed_delete += 1
-            raise exceptions.ServerDeleteException("Test fail server delete.")
+            raise Exception("Test fail server delete.")
         else:
             self.delete_success = True
             return super().delete_server(*args, **kwargs)
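The _create_server_timeout counter added above is a test hook: while it is non-zero, each create_server() call decrements it and tags the request so the fake never flips that server to ACTIVE, which lets the launcher's launch-timeout fire. A self-contained sketch of that behavior follows; FakeCloud here is a stand-in, not the real FakeOpenStackCloud, and the launcher test later in this commit arms the real hook the same way.

class FakeCloud:
    def __init__(self):
        self._create_server_timeout = 0

    def create_server(self, **kw):
        if self._create_server_timeout:
            self._create_server_timeout -= 1
            kw['_test_timeout'] = True
        # The real fake skips the background thread that would mark the
        # server ACTIVE when _test_timeout is set, so it stays in BUILD.
        return {'status': 'BUILD' if kw.get('_test_timeout') else 'ACTIVE'}


cloud = FakeCloud()
cloud._create_server_timeout = 2
assert cloud.create_server(name='n1')['status'] == 'BUILD'
assert cloud.create_server(name='n2')['status'] == 'BUILD'
assert cloud.create_server(name='n3')['status'] == 'ACTIVE'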


@@ -273,7 +273,8 @@ class StateMachineNodeLauncher(stats.StatsReporter):
             now = time.monotonic()
             if (now - state_machine.start_time >
                 self.manager.provider.launch_timeout):
-                raise Exception("Timeout waiting for instance creation")
+                raise exceptions.ServerCreateTimeoutException(
+                    "Timeout waiting for instance creation")
 
             if not self.runDeleteStateMachine():
                 return
@@ -321,8 +322,13 @@ class StateMachineNodeLauncher(stats.StatsReporter):
                 self.manager.nodescan_worker.removeRequest(self.nodescan_request)
                 self.nodescan_request = None
         except Exception as e:
-            self.log.exception("Launch attempt %d/%d for node %s, failed:",
-                               self.attempts, self.retries, node.id)
+            if isinstance(e, exceptions.TimeoutException):
+                self.log.warning(
+                    "Launch attempt %d/%d for node %s, failed: %s",
+                    self.attempts, self.retries, node.id, str(e))
+            else:
+                self.log.exception("Launch attempt %d/%d for node %s, failed:",
+                                   self.attempts, self.retries, node.id)
             if self.state_machine and self.state_machine.external_id:
                 # If we're deleting, don't overwrite the node external
                 # id, because we may make another delete state machine
@@ -429,7 +435,8 @@ class StateMachineNodeDeleter:
         try:
             now = time.monotonic()
             if now - state_machine.start_time > self.DELETE_TIMEOUT:
-                raise Exception("Timeout waiting for instance deletion")
+                raise exceptions.ServerDeleteTimeoutException(
+                    "Timeout waiting for instance deletion")
 
             if state_machine.state == state_machine.START:
                 node.state = zk.DELETING
@@ -451,9 +458,13 @@ class StateMachineNodeDeleter:
         except exceptions.NotFound:
             self.log.info("Instance %s not found in provider %s",
                           node.external_id, node.provider)
-        except Exception:
-            self.log.exception("Exception deleting instance %s from %s:",
-                               node.external_id, node.provider)
+        except Exception as e:
+            if isinstance(e, exceptions.TimeoutException):
+                self.log.warning("Failure deleting instance %s from %s: %s",
+                                 node.external_id, node.provider, str(e))
+            else:
+                self.log.exception("Exception deleting instance %s from %s:",
+                                   node.external_id, node.provider)
             # Don't delete the ZK node in this case, but do unlock it
             if node_exists:
                 self.zk.unlockNode(node)
@@ -1321,7 +1332,7 @@ class NodescanRequest:
     def _checkTimeout(self):
         now = time.monotonic()
         if now - self.start_time > self.timeout:
-            raise Exception(
+            raise exceptions.ConnectionTimeoutException(
                 f"Timeout connecting to {self.ip} on port {self.port}")
 
     def _checkTransport(self):


@@ -69,8 +69,12 @@ class IPAddTimeoutException(TimeoutException):
     statsd_key = 'error.ipadd'
 
 
 class ServerDeleteException(TimeoutException):
     statsd_key = 'error.serverdelete'
 
 
+class ServerCreateTimeoutException(TimeoutException):
+    statsd_key = 'error.servercreatetimeout'
+
+
+class ServerDeleteTimeoutException(TimeoutException):
+    statsd_key = 'error.serverdeletetimeout'
+
+
 class ImageCreateException(TimeoutException):
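Because both new classes derive from TimeoutException, the isinstance() checks added in the state machine hunks above catch them without naming each subclass, and each class carries its own statsd_key for metrics. A small sketch of that relationship follows; the subclass and its key mirror this file, while the bare base class and the classify() helper are stand-ins, not real nodepool API.

class TimeoutException(Exception):
    "Base class for expected timeout conditions (mirrors nodepool.exceptions)."


class ServerCreateTimeoutException(TimeoutException):
    statsd_key = 'error.servercreatetimeout'


def classify(e):
    # Timeouts are demoted to warnings; anything else keeps the traceback.
    if isinstance(e, TimeoutException):
        return ('warning', getattr(e, 'statsd_key', None))
    return ('exception', None)


assert classify(ServerCreateTimeoutException("timed out")) == \
    ('warning', 'error.servercreatetimeout')
assert classify(RuntimeError("boom")) == ('exception', None)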


@@ -0,0 +1,48 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+labels:
+  - name: fake-label
+    min-ready: 0
+
+providers:
+  - name: fake-provider
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    launch-retries: 2
+    launch-timeout: 1
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+    pools:
+      - name: main
+        max-servers: 96
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    dib-cmd: nodepool/tests/fake-image-create
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2


@@ -625,6 +625,30 @@ class TestLauncher(tests.DBTestCase):
         self.assertEqual(req.state, zk.FAILED)
         self.assertNotEqual(req.declined_by, [])
 
+    def test_fail_request_on_launch_timeout(self):
+        '''
+        Test that provider launch timeout fails the request.
+        '''
+        configfile = self.setup_config('node_launch_timeout.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(pool)
+        manager = pool.getProviderManager('fake-provider')
+        client = pool.getProviderManager('fake-provider').adapter._getClient()
+        client._create_server_timeout = 2
+
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(0, manager.adapter.createServer_fails)
+        self.assertEqual(req.state, zk.FAILED)
+        self.assertNotEqual(req.declined_by, [])
+
     def test_az_change_recover(self):
         '''
         Test that nodepool recovers from az change in the cloud.


@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from nodepool import exceptions
 from nodepool import tests
 from nodepool.nodeutils import iterate_timeout
 from nodepool.zk.zookeeper import Node
@@ -143,7 +144,8 @@ class TestNodescanWorker(tests.BaseTestCase):
         for _ in iterate_timeout(30, Exception, 'nodescan'):
             if request.complete:
                 break
-        with testtools.ExpectedException(Exception):
+        with testtools.ExpectedException(
+                exceptions.ConnectionTimeoutException):
             request.result()
         worker.stop()
         worker.join()
@@ -170,7 +172,8 @@ class TestNodescanWorker(tests.BaseTestCase):
         for _ in iterate_timeout(30, Exception, 'nodescan'):
             if request.complete:
                 break
-        with testtools.ExpectedException(Exception):
+        with testtools.ExpectedException(
+                exceptions.ConnectionTimeoutException):
             request.result()
         worker.stop()
         worker.join()