From 64452f1a263b3457ea6c948376158fe61c5e9a77 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 16 Apr 2024 10:13:38 -0700 Subject: [PATCH] Demote launch/delete timeeouts to warnings If we hit the internal timeout while launching or deleting a server, we raise an exception and then log the traceback. This is a not- unexpected occurance, and the traceback is not useful since it's just one stack frame within the same class, so instead, let's log these timeouts at warning level without the traceback. Change-Id: Id4806d8ea2d0a232504e5a75d69cec239bcac670 --- nodepool/driver/fake/adapter.py | 17 ++++--- nodepool/driver/statemachine.py | 27 +++++++---- nodepool/exceptions.py | 8 +++- .../tests/fixtures/node_launch_timeout.yaml | 48 +++++++++++++++++++ nodepool/tests/unit/test_launcher.py | 24 ++++++++++ nodepool/tests/unit/test_nodescan.py | 7 ++- 6 files changed, 113 insertions(+), 18 deletions(-) create mode 100644 nodepool/tests/fixtures/node_launch_timeout.yaml diff --git a/nodepool/driver/fake/adapter.py b/nodepool/driver/fake/adapter.py index 040559471..011d73ece 100644 --- a/nodepool/driver/fake/adapter.py +++ b/nodepool/driver/fake/adapter.py @@ -128,6 +128,7 @@ class FakeOpenStackCloud(object): self.compute = FakeSession(self) self.pause_creates = False self._image_list = images + self._create_server_timeout = 0 if self._image_list is None: self._image_list = [ Dummy( @@ -265,10 +266,11 @@ class FakeOpenStackCloud(object): volumes=[], _kw=kw) instance_list.append(s) - t = threading.Thread(target=self._finish, - name='FakeProvider create', - args=(s, 0.1, done_status)) - t.start() + if not kw.get('_test_timeout'): + t = threading.Thread(target=self._finish, + name='FakeProvider create', + args=(s, 0.1, done_status)) + t.start() return s.copy() def _delete(self, name_or_id, instance_list): @@ -330,6 +332,9 @@ class FakeOpenStackCloud(object): return server def create_server(self, **kw): + if self._create_server_timeout: + self._create_server_timeout -= 1 + kw['_test_timeout'] = True return self._create(self._server_list, **kw) def get_server(self, name_or_id): @@ -472,10 +477,10 @@ class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud): def delete_server(self, *args, **kwargs): if self.times_to_fail_delete is None: - raise exceptions.ServerDeleteException("Test fail server delete.") + raise Exception("Test fail server delete.") if self.times_failed_delete < self.times_to_fail_delete: self.times_failed_delete += 1 - raise exceptions.ServerDeleteException("Test fail server delete.") + raise Exception("Test fail server delete.") else: self.delete_success = True return super().delete_server(*args, **kwargs) diff --git a/nodepool/driver/statemachine.py b/nodepool/driver/statemachine.py index 6b53e5aeb..fa312abef 100644 --- a/nodepool/driver/statemachine.py +++ b/nodepool/driver/statemachine.py @@ -273,7 +273,8 @@ class StateMachineNodeLauncher(stats.StatsReporter): now = time.monotonic() if (now - state_machine.start_time > self.manager.provider.launch_timeout): - raise Exception("Timeout waiting for instance creation") + raise exceptions.ServerCreateTimeoutException( + "Timeout waiting for instance creation") if not self.runDeleteStateMachine(): return @@ -321,8 +322,13 @@ class StateMachineNodeLauncher(stats.StatsReporter): self.manager.nodescan_worker.removeRequest(self.nodescan_request) self.nodescan_request = None except Exception as e: - self.log.exception("Launch attempt %d/%d for node %s, failed:", - self.attempts, self.retries, node.id) + if isinstance(e, exceptions.TimeoutException): + self.log.warning( + "Launch attempt %d/%d for node %s, failed: %s", + self.attempts, self.retries, node.id, str(e)) + else: + self.log.exception("Launch attempt %d/%d for node %s, failed:", + self.attempts, self.retries, node.id) if self.state_machine and self.state_machine.external_id: # If we're deleting, don't overwrite the node external # id, because we may make another delete state machine @@ -429,7 +435,8 @@ class StateMachineNodeDeleter: try: now = time.monotonic() if now - state_machine.start_time > self.DELETE_TIMEOUT: - raise Exception("Timeout waiting for instance deletion") + raise exceptions.ServerDeleteTimeoutException( + "Timeout waiting for instance deletion") if state_machine.state == state_machine.START: node.state = zk.DELETING @@ -451,9 +458,13 @@ class StateMachineNodeDeleter: except exceptions.NotFound: self.log.info("Instance %s not found in provider %s", node.external_id, node.provider) - except Exception: - self.log.exception("Exception deleting instance %s from %s:", - node.external_id, node.provider) + except Exception as e: + if isinstance(e, exceptions.TimeoutException): + self.log.warning("Failure deleting instance %s from %s: %s", + node.external_id, node.provider, str(e)) + else: + self.log.exception("Exception deleting instance %s from %s:", + node.external_id, node.provider) # Don't delete the ZK node in this case, but do unlock it if node_exists: self.zk.unlockNode(node) @@ -1321,7 +1332,7 @@ class NodescanRequest: def _checkTimeout(self): now = time.monotonic() if now - self.start_time > self.timeout: - raise Exception( + raise exceptions.ConnectionTimeoutException( f"Timeout connecting to {self.ip} on port {self.port}") def _checkTransport(self): diff --git a/nodepool/exceptions.py b/nodepool/exceptions.py index 1fe822663..c3b38feb1 100644 --- a/nodepool/exceptions.py +++ b/nodepool/exceptions.py @@ -69,8 +69,12 @@ class IPAddTimeoutException(TimeoutException): statsd_key = 'error.ipadd' -class ServerDeleteException(TimeoutException): - statsd_key = 'error.serverdelete' +class ServerCreateTimeoutException(TimeoutException): + statsd_key = 'error.servercreatetimeout' + + +class ServerDeleteTimeoutException(TimeoutException): + statsd_key = 'error.serverdeletetimeout' class ImageCreateException(TimeoutException): diff --git a/nodepool/tests/fixtures/node_launch_timeout.yaml b/nodepool/tests/fixtures/node_launch_timeout.yaml new file mode 100644 index 000000000..415149516 --- /dev/null +++ b/nodepool/tests/fixtures/node_launch_timeout.yaml @@ -0,0 +1,48 @@ +elements-dir: . +images-dir: '{images_dir}' +build-log-dir: '{build_log_dir}' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +labels: + - name: fake-label + min-ready: 0 + +providers: + - name: fake-provider + cloud: fake + driver: fake + region-name: fake-region + launch-retries: 2 + launch-timeout: 1 + rate: 0.0001 + diskimages: + - name: fake-image + pools: + - name: main + max-servers: 96 + labels: + - name: fake-label + diskimage: fake-image + min-ram: 8192 + +diskimages: + - name: fake-image + elements: + - fedora + - vm + release: 21 + dib-cmd: nodepool/tests/fake-image-create + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py index fc11d247f..b366855ab 100644 --- a/nodepool/tests/unit/test_launcher.py +++ b/nodepool/tests/unit/test_launcher.py @@ -625,6 +625,30 @@ class TestLauncher(tests.DBTestCase): self.assertEqual(req.state, zk.FAILED) self.assertNotEqual(req.declined_by, []) + def test_fail_request_on_launch_timeout(self): + ''' + Test that provider launch timeout fails the request. + ''' + configfile = self.setup_config('node_launch_timeout.yaml') + self.useBuilder(configfile) + self.waitForImage('fake-provider', 'fake-image') + + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + manager = pool.getProviderManager('fake-provider') + client = pool.getProviderManager('fake-provider').adapter._getClient() + client._create_server_timeout = 2 + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + req = self.waitForNodeRequest(req) + self.assertEqual(0, manager.adapter.createServer_fails) + self.assertEqual(req.state, zk.FAILED) + self.assertNotEqual(req.declined_by, []) + def test_az_change_recover(self): ''' Test that nodepool recovers from az change in the cloud. diff --git a/nodepool/tests/unit/test_nodescan.py b/nodepool/tests/unit/test_nodescan.py index 9a1bdc6b2..f1cb593bb 100644 --- a/nodepool/tests/unit/test_nodescan.py +++ b/nodepool/tests/unit/test_nodescan.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nodepool import exceptions from nodepool import tests from nodepool.nodeutils import iterate_timeout from nodepool.zk.zookeeper import Node @@ -143,7 +144,8 @@ class TestNodescanWorker(tests.BaseTestCase): for _ in iterate_timeout(30, Exception, 'nodescan'): if request.complete: break - with testtools.ExpectedException(Exception): + with testtools.ExpectedException( + exceptions.ConnectionTimeoutException): request.result() worker.stop() worker.join() @@ -170,7 +172,8 @@ class TestNodescanWorker(tests.BaseTestCase): for _ in iterate_timeout(30, Exception, 'nodescan'): if request.complete: break - with testtools.ExpectedException(Exception): + with testtools.ExpectedException( + exceptions.ConnectionTimeoutException): request.result() worker.stop() worker.join()