From b8035de65f563b6b4c31a2de3439fcdfcdca5a84 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Fri, 14 Jan 2022 14:42:46 -0800
Subject: [PATCH] Improve handling of errors in provider manager startup

If a provider (or its configuration) is sufficiently broken that the
provider manager is unable to start, then the launcher will go into a
loop where it attempts to restart all providers in the system until it
succeeds.  During this time, no pool managers are running, which means
all requests are ignored by this launcher.

Nodepool continuously reloads its configuration file, and in case of an
error, the expected behavior is to continue running and allow the user
to correct the configuration and retry after a short delay.  We also
expect providers on a launcher to be independent of each other, so that
if one fails, the others continue working.  However, since we neither
exit nor process node requests if a provider manager fails to start, an
error with one provider can cause all providers to stop handling
requests with very little feedback to the operator.

To address this, if a provider manager fails to start, the launcher
will now behave as if the provider were absent from the config file.
It will still emit the error to the log, and it will continuously
attempt to start the provider so that if the error condition abates,
the provider will start.

If there are no providers on-line for a label, then as long as any
provider in the system is running, node requests will be handled,
declined, and possibly failed while the broken provider is offline.
If the system contains only a single provider and it is broken, then
no requests will be handled (or failed), which is the current behavior
and still likely to be the most desirable in that case.

Change-Id: If652e8911993946cee67c4dba5e6f88e55ac7099
---
 nodepool/driver/fake/provider.py              |  5 ++
 nodepool/launcher.py                          |  3 ++
 nodepool/provider_manager.py                  |  9 +++-
 .../fixtures/broken_provider_config.yaml      | 49 +++++++++++++++++++
 nodepool/tests/unit/test_launcher.py          | 30 ++++++++++++
 5 files changed, 94 insertions(+), 2 deletions(-)
 create mode 100644 nodepool/tests/fixtures/broken_provider_config.yaml

diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index 1bbdfddb7..e38376a1d 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -421,3 +421,8 @@ class FakeProvider(OpenStackProvider):
 
     def getRequestHandler(self, poolworker, request):
         return FakeNodeRequestHandler(poolworker, request)
+
+    def start(self, zk_conn):
+        if self.provider.region_name == 'broken-region':
+            raise Exception("Broken cloud config")
+        super().start(zk_conn)
diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index c6f619f6e..5377f12b3 100644
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -957,6 +957,9 @@ class NodePool(threading.Thread):
         self.reconfigureZooKeeper(config)
         provider_manager.ProviderManager.reconfigure(self.config, config,
                                                      self.getZK())
+        for provider_name in list(config.providers.keys()):
+            if provider_name not in config.provider_managers:
+                del config.providers[provider_name]
         self.setConfig(config)
 
     def removeCompletedRequests(self):
diff --git a/nodepool/provider_manager.py b/nodepool/provider_manager.py
index 72a7bece1..06674e894 100644
--- a/nodepool/provider_manager.py
+++ b/nodepool/provider_manager.py
@@ -57,8 +57,13 @@ class ProviderManager(object):
             else:
                 ProviderManager.log.debug("Creating new ProviderManager object"
                                           " for %s" % p.name)
-                new_config.provider_managers[p.name] = get_provider(p)
-                new_config.provider_managers[p.name].start(zk_conn)
+                try:
+                    pm = get_provider(p)
+                    pm.start(zk_conn)
+                    new_config.provider_managers[p.name] = pm
+                except Exception:
+                    ProviderManager.log.exception(
+                        "Error starting provider %s", p.name)
 
         for stop_manager in stop_managers:
             stop_manager.stop()
diff --git a/nodepool/tests/fixtures/broken_provider_config.yaml b/nodepool/tests/fixtures/broken_provider_config.yaml
new file mode 100644
index 000000000..500a55658
--- /dev/null
+++ b/nodepool/tests/fixtures/broken_provider_config.yaml
@@ -0,0 +1,49 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+labels:
+  - name: broken-label
+    min-ready: 0
+  - name: good-label
+    min-ready: 0
+
+providers:
+  - name: good-provider
+    cloud: fake
+    driver: fake
+    region-name: fake
+    rate: 0.0001
+    cloud-images:
+      - name: good-image
+    pools:
+      - name: main
+        labels:
+          - name: good-label
+            cloud-image: good-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+  - name: broken-provider
+    cloud: fake
+    driver: fake
+    region-name: broken-region
+    rate: 0.0001
+    cloud-images:
+      - name: broken-image
+    pools:
+      - name: main
+        labels:
+          - name: broken-label
+            cloud-image: broken-image
+            min-ram: 8192
+            flavor-name: 'Fake'
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 3a1fd51a0..7d7f763ad 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -1797,6 +1797,36 @@ class TestLauncher(tests.DBTestCase):
         req = self.waitForNodeRequest(req)
         self.assertEqual(req.state, zk.FAILED)
 
+    def test_broken_provider(self):
+        '''
+        If a provider has a broken config, it should not be started, and
+        any requests for it should be declined/failed.  Other
+        providers should be started and should be able to fulfill
+        requests.
+        '''
+        configfile = self.setup_config('broken_provider_config.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+        self.wait_for_config(pool)
+
+        manager = pool.getProviderManager('good-provider')
+        manager._client.create_image(name="good-image")
+
+        good_req = zk.NodeRequest()
+        good_req.state = zk.REQUESTED
+        good_req.node_types.append('good-label')
+        self.zk.storeNodeRequest(good_req)
+
+        broken_req = zk.NodeRequest()
+        broken_req.state = zk.REQUESTED
+        broken_req.node_types.append('broken-label')
+        self.zk.storeNodeRequest(broken_req)
+
+        good_req = self.waitForNodeRequest(good_req)
+        broken_req = self.waitForNodeRequest(broken_req)
+        self.assertEqual(good_req.state, zk.FULFILLED)
+        self.assertEqual(broken_req.state, zk.FAILED)
+
     def test_provider_wont_wedge(self):
         '''
         A provider should not wedge itself when it is at (1) maximum capacity