Improve handling of errors in provider manager startup

If a provider (or its configuration) is sufficiently broken that
the provider manager is unable to start, then the launcher will
go into a loop where it attempts to restart all providers in the
system until it succeeds.  During this time, no pool managers are
running, which means all requests are ignored by this launcher.

Nodepool continuously reloads its configuration file, and in case
of an error, the expected behavior is to continue running and allow
the user to correct the configuration and retry after a short delay.
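
As a rough sketch of that reload-and-retry pattern (illustrative only;
load_config and apply_config are hypothetical stand-ins, not nodepool
functions):

import logging
import time

log = logging.getLogger("nodepool.config")


def run_config_loop(load_config, apply_config, path, interval=10):
    """Reload the config file forever; log errors and retry later."""
    while True:
        try:
            apply_config(load_config(path))
        except Exception:
            # A broken config must not stop the daemon; the operator
            # can fix the file and the next pass picks up the fix.
            log.exception("Error loading config %s; will retry", path)
        time.sleep(interval)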

We also expect providers on a launcher to be independent of each
other, so that if one fails, the others continue working.

However since we neither exit, nor process node requests if a
provider manager fails to start, an error with one provider can
cause all providers to stop handling requests with very little
feedback to the operator.

To address this, if a provider manager fails to start, the launcher
will now behave as if the provider were absent from the config file.
It will still emit the error to the log, and it will continuously
attempt to start the provider so that if the error condition abates,
the provider will start.
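
The net effect can be sketched like this (a simplified illustration,
not the actual launcher code; start_provider and the shape of the
config object are invented for the example):

import logging

log = logging.getLogger("nodepool.launcher")


def start_providers(config, start_provider):
    """Start each configured provider; drop the ones that fail to start.

    Providers whose managers fail to start are removed from
    config.providers, so the rest of the launcher treats them as if
    they were not configured.  Because the config is reloaded
    periodically, a provider whose error condition clears is started
    on a later pass.
    """
    config.provider_managers = {}
    for name, provider in list(config.providers.items()):
        try:
            manager = start_provider(provider)
        except Exception:
            log.exception("Error starting provider %s", name)
            del config.providers[name]
        else:
            config.provider_managers[name] = manager
    return config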

If there are no providers online for a label, then as long as any
provider in the system is running, node requests will still be
handled: they will be declined, and possibly failed, while the broken
provider is offline.

If the system contains only a single provider and it is broken, then
no requests will be handled (or failed), which is the current
behavior, and still likely to be the most desirable in that case.

Change-Id: If652e8911993946cee67c4dba5e6f88e55ac7099
James E. Blair 2022-01-14 14:42:46 -08:00
parent e95b146d10
commit b8035de65f
5 changed files with 94 additions and 2 deletions

@@ -421,3 +421,8 @@ class FakeProvider(OpenStackProvider):
    def getRequestHandler(self, poolworker, request):
        return FakeNodeRequestHandler(poolworker, request)

    def start(self, zk_conn):
        if self.provider.region_name == 'broken-region':
            raise Exception("Broken cloud config")
        super().start(zk_conn)

@@ -957,6 +957,9 @@ class NodePool(threading.Thread):
        self.reconfigureZooKeeper(config)
        provider_manager.ProviderManager.reconfigure(self.config, config,
                                                     self.getZK())
        for provider_name in list(config.providers.keys()):
            if provider_name not in config.provider_managers:
                del config.providers[provider_name]
        self.setConfig(config)

    def removeCompletedRequests(self):
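
The filtering above can be demonstrated in isolation (ToyConfig is a
made-up stand-in for nodepool's real config object):

class ToyConfig:
    def __init__(self):
        # Both providers appear in the parsed config file ...
        self.providers = {'good-provider': object(),
                          'broken-provider': object()}
        # ... but only the one whose manager started is registered.
        self.provider_managers = {'good-provider': object()}


config = ToyConfig()
for provider_name in list(config.providers.keys()):
    if provider_name not in config.provider_managers:
        del config.providers[provider_name]

# The launcher now behaves as if 'broken-provider' were absent.
assert list(config.providers) == ['good-provider']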

@@ -57,8 +57,13 @@ class ProviderManager(object):
             else:
                 ProviderManager.log.debug("Creating new ProviderManager object"
                                           " for %s" % p.name)
-                new_config.provider_managers[p.name] = get_provider(p)
-                new_config.provider_managers[p.name].start(zk_conn)
+                try:
+                    pm = get_provider(p)
+                    pm.start(zk_conn)
+                    new_config.provider_managers[p.name] = pm
+                except Exception:
+                    ProviderManager.log.exception(
+                        "Error starting provider %s", p.name)

         for stop_manager in stop_managers:
             stop_manager.stop()
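
Because reconfigure() runs on every periodic config reload, a provider
that failed to start is retried on later passes.  Roughly
(FlakyProvider is a made-up example class, not part of nodepool):

import logging

log = logging.getLogger("demo")


class FlakyProvider:
    """Fails to start a couple of times, then succeeds."""

    def __init__(self, failures):
        self.failures = failures

    def start(self):
        if self.failures > 0:
            self.failures -= 1
            raise Exception("Broken cloud config")


provider_managers = {}
provider = FlakyProvider(failures=2)
for _ in range(4):  # stands in for successive reconfigure passes
    if 'flaky-provider' in provider_managers:
        continue  # already running, nothing to do this pass
    try:
        provider.start()
        provider_managers['flaky-provider'] = provider
    except Exception:
        log.exception("Error starting provider %s", 'flaky-provider')

# Once the error condition abates, the provider finally starts.
assert 'flaky-provider' in provider_managers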

@@ -0,0 +1,49 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'

zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

zookeeper-tls:
  ca: {zookeeper_ca}
  cert: {zookeeper_cert}
  key: {zookeeper_key}

labels:
  - name: broken-label
    min-ready: 0
  - name: good-label
    min-ready: 0

providers:
  - name: good-provider
    cloud: fake
    driver: fake
    region-name: fake
    rate: 0.0001
    cloud-images:
      - name: good-image
    pools:
      - name: main
        labels:
          - name: good-label
            cloud-image: good-image
            min-ram: 8192
            flavor-name: 'Fake'
  - name: broken-provider
    cloud: fake
    driver: fake
    region-name: broken-region
    rate: 0.0001
    cloud-images:
      - name: broken-image
    pools:
      - name: main
        labels:
          - name: broken-label
            cloud-image: broken-image
            min-ram: 8192
            flavor-name: 'Fake'
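
The curly-brace placeholders are filled in by the test harness when
the fixture is loaded; roughly (a simplified sketch, not the real
setup_config implementation):

def render_fixture(template_text, **params):
    # Substitute {images_dir}, {zookeeper_host}, etc. with test values.
    return template_text.format(**params)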

@@ -1797,6 +1797,36 @@ class TestLauncher(tests.DBTestCase):
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FAILED)

    def test_broken_provider(self):
        '''
        If a provider has a broken config, it should not be started, and
        any requests for it should be declined/failed.  Other
        providers should be started and should be able to fulfill
        requests.
        '''
        configfile = self.setup_config('broken_provider_config.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.wait_for_config(pool)

        manager = pool.getProviderManager('good-provider')
        manager._client.create_image(name="good-image")

        good_req = zk.NodeRequest()
        good_req.state = zk.REQUESTED
        good_req.node_types.append('good-label')
        self.zk.storeNodeRequest(good_req)

        broken_req = zk.NodeRequest()
        broken_req.state = zk.REQUESTED
        broken_req.node_types.append('broken-label')
        self.zk.storeNodeRequest(broken_req)

        good_req = self.waitForNodeRequest(good_req)
        broken_req = self.waitForNodeRequest(broken_req)
        self.assertEqual(good_req.state, zk.FULFILLED)
        self.assertEqual(broken_req.state, zk.FAILED)

    def test_provider_wont_wedge(self):
        '''
        A provider should not wedge itself when it is at (1) maximum capacity