From 242f9cc3e67e53a9841ae5ef7afef5270daaacff Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 5 Nov 2024 13:07:43 -0800 Subject: [PATCH] Handle early AWS spot instance reclamations If an AWS spot instance is used as a metastatic backing node, an unexpected series of events can occur: * aws driver creates backing node instance * aws driver scans ssh keys and stores them on backing node * aws reclaims spot instance * aws re-uses IP from backing node * metastatic driver creates node * metastatic driver scans ssh keys and stores them on node Zuul would then use the wrong node (whether that succeeds depends on what else has happened to the node in the interim). To avoid this situation, we implement this change: * After scanning the metastatic node ssh keys, we compare them to the backing node ssh keys and if they differ, trigger an error in the metastatic node and mark the backing node as failed. In case the node is reclaimed one step early in the above sequence, we implement this change: * After completing the nodescan, the aws driver will double check that the instance is still running; if not, it will trigger an error. The above is still subject to a small race if the nodescan time takes less than the cache interval of the instance list, and the node is reclaimed after the nodescan and within the cache interval (currently 10 seconds). In the unlikely event that does happen, then the metastatic key check should still catch the issue as long as the replacement node also does not boot within those 10 seconds. (Technically possible, but the combination of all of these things should be very unlikely in practice.) 
Change-Id: I9ce1f6df04e9c49deceda99c8e4024dd98ea88f9 --- nodepool/driver/aws/adapter.py | 17 ++++ nodepool/driver/metastatic/adapter.py | 29 +++++-- nodepool/driver/statemachine.py | 19 +++-- nodepool/tests/fixtures/aws/aws-spot.yaml | 1 + nodepool/tests/unit/test_driver_aws.py | 38 +++++++++ nodepool/tests/unit/test_driver_metastatic.py | 85 +++++++++++++++++++ 6 files changed, 174 insertions(+), 15 deletions(-) diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py index f7c030958..11a4c4250 100644 --- a/nodepool/driver/aws/adapter.py +++ b/nodepool/driver/aws/adapter.py @@ -877,6 +877,23 @@ class AwsAdapter(statemachine.Adapter): iops=label.iops)) return quota + def notifyNodescanResult(self, label, external_id, success, keys): + instance_id = external_id.get('instance') + host_id = external_id.get('host') + + # Verify that the instance or host have not been deleted in + # the interim (i.e., due to spot instance reclamation). + if instance_id: + instance = {'InstanceId': instance_id, 'state': 'unknown'} + instance = self._refresh(instance) + if instance['State']['Name'].lower() != 'running': + raise Exception(f"Instance {instance_id} is not running") + if host_id: + host = {'HostId': host_id, 'state': 'unknown'} + host = self._refresh(host) + if host['State'].lower() != 'available': + raise Exception(f"Host {host_id} is not available") + def uploadImage(self, provider_image, image_name, filename, image_format, metadata, md5, sha256): self.log.debug(f"Uploading image {image_name}") diff --git a/nodepool/driver/metastatic/adapter.py b/nodepool/driver/metastatic/adapter.py index d9625cacd..9ca3c6830 100644 --- a/nodepool/driver/metastatic/adapter.py +++ b/nodepool/driver/metastatic/adapter.py @@ -352,20 +352,31 @@ class MetastaticAdapter(statemachine.Adapter): def getQuotaForLabel(self, label): return QuotaInformation(instances=1) - def notifyNodescanFailure(self, label, external_id): + def notifyNodescanResult(self, label, external_id, 
success, keys): + exc = None with self.allocation_lock: backing_node_records = self.backing_node_records.get( label.name, []) for bnr in backing_node_records: if bnr.backsNode(external_id): - self.log.info( - "Nodescan failure of %s on %s, failing backing node", - external_id, bnr.node_id) - bnr.failed = True - backing_node = self._getNode(bnr.node_id) - backing_node.user_data = self._makeBackingNodeUserData(bnr) - self.zk.storeNode(backing_node) - return + break + else: + raise Exception( + f"Unable to find backing node for {external_id}") + backing_node = self._getNode(bnr.node_id) + if success and backing_node.host_keys and keys: + if sorted(keys) != sorted(backing_node.host_keys): + exc = Exception(f"Key mismatch on {external_id}") + success = False + if not success: + self.log.info( + "Nodescan failure of %s on %s, failing backing node", + external_id, bnr.node_id) + bnr.failed = True + backing_node.user_data = self._makeBackingNodeUserData(bnr) + self.zk.storeNode(backing_node) + if exc: + raise exc # Local implementation below diff --git a/nodepool/driver/statemachine.py b/nodepool/driver/statemachine.py index ca14158b2..45a8eefe0 100644 --- a/nodepool/driver/statemachine.py +++ b/nodepool/driver/statemachine.py @@ -242,6 +242,7 @@ class StateMachineNodeLauncher(stats.StatsReporter): instance = None node = self.node statsd_key = 'ready' + label = self.handler.pool.labels[node.type[0]] try: if self.state_machine is None: @@ -261,8 +262,16 @@ class StateMachineNodeLauncher(stats.StatsReporter): self.log.warning("Error scanning keys: %s", str(e)) else: self.log.exception("Exception scanning keys:") + try: + self.manager.adapter.notifyNodescanResult( + label, node.external_id, False, None) + except Exception: + self.log.exception( + "Exception processing failed nodescan result:") raise exceptions.LaunchKeyscanException( "Can't scan instance %s key" % node.id) + self.manager.adapter.notifyNodescanResult( + label, node.external_id, True, keys) if keys: 
node.host_keys = keys self.log.debug(f"Node {node.id} is ready") @@ -297,7 +306,6 @@ class StateMachineNodeLauncher(stats.StatsReporter): self.updateNodeFromInstance(instance) self.log.debug("Submitting nodescan request for %s", node.interface_ip) - label = self.handler.pool.labels[self.node.type[0]] self.nodescan_request = NodescanRequest( node, label.host_key_checking, @@ -357,9 +365,6 @@ class StateMachineNodeLauncher(stats.StatsReporter): if isinstance(e, exceptions.LaunchKeyscanException): try: - label = self.handler.pool.labels[node.type[0]] - self.manager.adapter.notifyNodescanFailure( - label, node.external_id) console = self.manager.adapter.getConsoleLog( label, node.external_id) if console: @@ -1701,10 +1706,12 @@ class Adapter: """ raise NotImplementedError() - def notifyNodescanFailure(self, label, external_id): - """Notify the adapter of a nodescan failure + def notifyNodescanResult(self, label, external_id, success, keys): + """Notify the adapter of a nodescan result :param label ConfigLabel: The label config for the node :param external_id str or dict: The external id of the server + :param success bool: Whether the nodescan succeeded + :param keys list or None: The retrieved keys """ pass diff --git a/nodepool/tests/fixtures/aws/aws-spot.yaml b/nodepool/tests/fixtures/aws/aws-spot.yaml index 1bdc53651..0bebc0174 100644 --- a/nodepool/tests/fixtures/aws/aws-spot.yaml +++ b/nodepool/tests/fixtures/aws/aws-spot.yaml @@ -19,6 +19,7 @@ providers: - name: ec2-us-west-2 driver: aws region-name: us-west-2 + launch-retries: 1 cloud-images: - name: ubuntu1404 image-id: ami-1e749f67 diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py index 66e7dcf0b..759669c58 100644 --- a/nodepool/tests/unit/test_driver_aws.py +++ b/nodepool/tests/unit/test_driver_aws.py @@ -1326,6 +1326,44 @@ class TestDriverAws(tests.DBTestCase): self.assertTrue(node.node_properties['spot']) + def test_aws_provisioning_spot_early_reclaim(self): + # 
Test that if AWS reclaims the spot instance before we're + # done with the nodescan we fail the request. + + orig_notify = nodepool.driver.aws.adapter.AwsAdapter.\ + notifyNodescanResult + + def notify(*args, **kw): + adapter = self.pool.getProviderManager('ec2-us-west-2').adapter + for i in adapter._listInstances(): + self.ec2_client.terminate_instances( + InstanceIds=[i['InstanceId']]) + for _ in iterate_timeout(60, Exception, + "Instance list cache to update", + interval=1): + for i in adapter._listInstances(): + if i['State']['Name'].lower() == 'running': + break + else: + break + return orig_notify(*args, **kw) + + self.useFixture(fixtures.MonkeyPatch( + 'nodepool.driver.aws.adapter.AwsAdapter.notifyNodescanResult', + notify)) + + configfile = self.setup_config('aws/aws-spot.yaml') + self.pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(self.pool) + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.tenant_name = 'tenant-1' + req.node_types.append('ubuntu1404-spot') + self.zk.storeNodeRequest(req) + self.log.debug("Waiting for request %s", req.id) + self.waitForNodeRequest(req, states=(zk.FAILED,)) + def test_aws_dedicated_host(self): req = self.requestNode('aws/aws-dedicated-host.yaml', 'ubuntu') for _ in iterate_timeout(60, Exception, diff --git a/nodepool/tests/unit/test_driver_metastatic.py b/nodepool/tests/unit/test_driver_metastatic.py index 6f328d8dd..a7a39ba7e 100644 --- a/nodepool/tests/unit/test_driver_metastatic.py +++ b/nodepool/tests/unit/test_driver_metastatic.py @@ -309,6 +309,91 @@ class TestDriverMetastatic(tests.DBTestCase): nodes = self._getNodes() self.assertEqual(nodes, []) + def test_metastatic_nodescan_key_mismatch(self): + # Test that a nodescan key mismatch takes a backing node out of service + + # This tests a scenario where a keyscan on a metastatic node + # does not match the key from the backing node. This could + # happen if the cloud reclaimed the backing node and reused + # the IP. 
+ + counter = -1 + # bn1, node1, node2 + keys = [ + ['ssh-rsa bnkey'], + ['ssh-rsa otherkey'], + ['ssh-rsa bnkey'], + ] + orig_advance = nodepool.driver.statemachine.NodescanRequest.advance + + def handler(obj, *args, **kw): + nonlocal counter, keys + if counter >= len(keys): + return orig_advance(obj, *args, **kw) + ret = orig_advance(obj, *args, **kw) + obj.keys = keys[counter] + counter += 1 + return ret + + self.useFixture(fixtures.MonkeyPatch( + 'nodepool.driver.statemachine.NodescanRequest.advance', + handler)) + + configfile = self.setup_config('metastatic.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + manager = pool.getProviderManager('fake-provider') + manager.adapter._client.create_image(name="fake-image") + + # Launch one metastatic node on a backing node + node1 = self._requestNode() + nodes = self._getNodes() + self.assertEqual(len(nodes), 2) + bn1 = nodes[1] + self.assertEqual(bn1.provider, 'fake-provider') + self.assertEqual(bn1.id, node1.driver_data['backing_node']) + + # Launch a second one with a failed nodescan; should have a + # second backing node + node2 = self._requestNode() + nodes = self._getNodes() + bn2 = nodes[3] + # Reload bn1 since the userdata failed + self.assertEqual(bn1.id, nodes[1].id) + bn1 = nodes[1] + self.assertNotEqual(bn1.id, bn2.id) + self.assertEqual(nodes, [node1, bn1, node2, bn2]) + self.assertEqual(bn2.id, node2.driver_data['backing_node']) + + # Allocate a third node, should use the second backing node + node3 = self._requestNode() + nodes = self._getNodes() + self.assertEqual(nodes, [node1, bn1, node2, bn2, node3]) + self.assertEqual(bn2.id, node3.driver_data['backing_node']) + + # Delete node3, verify that both backing nodes exist + node3.state = zk.DELETING + self.zk.storeNode(node3) + self.waitForNodeDeletion(node3) + nodes = self._getNodes() + self.assertEqual(nodes, [node1, bn1, node2, bn2]) + + # Delete node2, verify that only the first backing node exists + node2.state 
= zk.DELETING + self.zk.storeNode(node2) + self.waitForNodeDeletion(node2) + self.waitForNodeDeletion(bn2) + nodes = self._getNodes() + self.assertEqual(nodes, [node1, bn1]) + + # Delete node1, verify that no nodes exist + node1.state = zk.DELETING + self.zk.storeNode(node1) + self.waitForNodeDeletion(node1) + self.waitForNodeDeletion(bn1) + nodes = self._getNodes() + self.assertEqual(nodes, []) + def test_metastatic_min_retention(self): # Test that the metastatic driver honors min-retention configfile = self.setup_config('metastatic.yaml')