diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py
index f7c030958..11a4c4250 100644
--- a/nodepool/driver/aws/adapter.py
+++ b/nodepool/driver/aws/adapter.py
@@ -877,6 +877,23 @@ class AwsAdapter(statemachine.Adapter):
                 iops=label.iops))
         return quota
 
+    def notifyNodescanResult(self, label, external_id, success, keys):
+        instance_id = external_id.get('instance')
+        host_id = external_id.get('host')
+
+        # Verify that the instance or host has not been deleted in
+        # the interim (e.g., due to spot instance reclamation).
+        if instance_id:
+            instance = {'InstanceId': instance_id, 'state': 'unknown'}
+            instance = self._refresh(instance)
+            if instance['State']['Name'].lower() != 'running':
+                raise Exception(f"Instance {instance_id} is not running")
+        if host_id:
+            host = {'HostId': host_id, 'state': 'unknown'}
+            host = self._refresh(host)
+            if host['State'].lower() != 'available':
+                raise Exception(f"Host {host_id} is not available")
+
     def uploadImage(self, provider_image, image_name, filename,
                     image_format, metadata, md5, sha256):
         self.log.debug(f"Uploading image {image_name}")
diff --git a/nodepool/driver/metastatic/adapter.py b/nodepool/driver/metastatic/adapter.py
index d9625cacd..9ca3c6830 100644
--- a/nodepool/driver/metastatic/adapter.py
+++ b/nodepool/driver/metastatic/adapter.py
@@ -352,20 +352,31 @@ class MetastaticAdapter(statemachine.Adapter):
     def getQuotaForLabel(self, label):
         return QuotaInformation(instances=1)
 
-    def notifyNodescanFailure(self, label, external_id):
+    def notifyNodescanResult(self, label, external_id, success, keys):
+        exc = None
         with self.allocation_lock:
             backing_node_records = self.backing_node_records.get(
                 label.name, [])
             for bnr in backing_node_records:
                 if bnr.backsNode(external_id):
-                    self.log.info(
-                        "Nodescan failure of %s on %s, failing backing node",
-                        external_id, bnr.node_id)
-                    bnr.failed = True
-                    backing_node = self._getNode(bnr.node_id)
-                    backing_node.user_data = self._makeBackingNodeUserData(bnr)
-                    self.zk.storeNode(backing_node)
-                    return
+                    break
+            else:
+                raise Exception(
+                    f"Unable to find backing node for {external_id}")
+            backing_node = self._getNode(bnr.node_id)
+            if success and backing_node.host_keys and keys:
+                if sorted(keys) != sorted(backing_node.host_keys):
+                    exc = Exception(f"Key mismatch on {external_id}")
+                    success = False
+            if not success:
+                self.log.info(
+                    "Nodescan failure of %s on %s, failing backing node",
+                    external_id, bnr.node_id)
+                bnr.failed = True
+                backing_node.user_data = self._makeBackingNodeUserData(bnr)
+                self.zk.storeNode(backing_node)
+        if exc:
+            raise exc
 
     # Local implementation below
 
diff --git a/nodepool/driver/statemachine.py b/nodepool/driver/statemachine.py
index ca14158b2..45a8eefe0 100644
--- a/nodepool/driver/statemachine.py
+++ b/nodepool/driver/statemachine.py
@@ -242,6 +242,7 @@ class StateMachineNodeLauncher(stats.StatsReporter):
         instance = None
         node = self.node
         statsd_key = 'ready'
+        label = self.handler.pool.labels[node.type[0]]
 
         try:
             if self.state_machine is None:
@@ -261,8 +262,16 @@ class StateMachineNodeLauncher(stats.StatsReporter):
                         self.log.warning("Error scanning keys: %s", str(e))
                     else:
                         self.log.exception("Exception scanning keys:")
+                    try:
+                        self.manager.adapter.notifyNodescanResult(
+                            label, node.external_id, False, None)
+                    except Exception:
+                        self.log.exception(
+                            "Exception processing failed nodescan result:")
                     raise exceptions.LaunchKeyscanException(
                         "Can't scan instance %s key" % node.id)
+                self.manager.adapter.notifyNodescanResult(
+                    label, node.external_id, True, keys)
                 if keys:
                     node.host_keys = keys
                 self.log.debug(f"Node {node.id} is ready")
@@ -297,7 +306,6 @@ class StateMachineNodeLauncher(stats.StatsReporter):
                 self.updateNodeFromInstance(instance)
                 self.log.debug("Submitting nodescan request for %s",
                                node.interface_ip)
-                label = self.handler.pool.labels[self.node.type[0]]
                 self.nodescan_request = NodescanRequest(
                     node,
                     label.host_key_checking,
@@ -357,9 +365,6 @@ class StateMachineNodeLauncher(stats.StatsReporter):
 
             if isinstance(e, exceptions.LaunchKeyscanException):
                 try:
-                    label = self.handler.pool.labels[node.type[0]]
-                    self.manager.adapter.notifyNodescanFailure(
-                        label, node.external_id)
                     console = self.manager.adapter.getConsoleLog(
                         label, node.external_id)
                     if console:
@@ -1701,10 +1706,12 @@ class Adapter:
         """
         raise NotImplementedError()
 
-    def notifyNodescanFailure(self, label, external_id):
-        """Notify the adapter of a nodescan failure
+    def notifyNodescanResult(self, label, external_id, success, keys):
+        """Notify the adapter of a nodescan result
 
         :param label ConfigLabel: The label config for the node
         :param external_id str or dict: The external id of the server
+        :param success bool: Whether the nodescan succeeded
+        :param keys str or None: The retrieved keys
         """
         pass
diff --git a/nodepool/tests/fixtures/aws/aws-spot.yaml b/nodepool/tests/fixtures/aws/aws-spot.yaml
index 1bdc53651..0bebc0174 100644
--- a/nodepool/tests/fixtures/aws/aws-spot.yaml
+++ b/nodepool/tests/fixtures/aws/aws-spot.yaml
@@ -19,6 +19,7 @@ providers:
   - name: ec2-us-west-2
     driver: aws
     region-name: us-west-2
+    launch-retries: 1
     cloud-images:
       - name: ubuntu1404
         image-id: ami-1e749f67
diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py
index 66e7dcf0b..759669c58 100644
--- a/nodepool/tests/unit/test_driver_aws.py
+++ b/nodepool/tests/unit/test_driver_aws.py
@@ -1326,6 +1326,44 @@ class TestDriverAws(tests.DBTestCase):
 
         self.assertTrue(node.node_properties['spot'])
 
+    def test_aws_provisioning_spot_early_reclaim(self):
+        # Test that if AWS reclaims the spot instance before we're
+        # done with the nodescan we fail the request.
+
+        orig_notify = nodepool.driver.aws.adapter.AwsAdapter.\
+            notifyNodescanResult
+
+        def notify(*args, **kw):
+            adapter = self.pool.getProviderManager('ec2-us-west-2').adapter
+            for i in adapter._listInstances():
+                self.ec2_client.terminate_instances(
+                    InstanceIds=[i['InstanceId']])
+            for _ in iterate_timeout(60, Exception,
+                                     "Instance list cache to update",
+                                     interval=1):
+                for i in adapter._listInstances():
+                    if i['State']['Name'].lower() == 'running':
+                        break
+                else:
+                    break
+            return orig_notify(*args, **kw)
+
+        self.useFixture(fixtures.MonkeyPatch(
+            'nodepool.driver.aws.adapter.AwsAdapter.notifyNodescanResult',
+            notify))
+
+        configfile = self.setup_config('aws/aws-spot.yaml')
+        self.pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(self.pool)
+
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.tenant_name = 'tenant-1'
+        req.node_types.append('ubuntu1404-spot')
+        self.zk.storeNodeRequest(req)
+        self.log.debug("Waiting for request %s", req.id)
+        self.waitForNodeRequest(req, states=(zk.FAILED,))
+
     def test_aws_dedicated_host(self):
         req = self.requestNode('aws/aws-dedicated-host.yaml', 'ubuntu')
         for _ in iterate_timeout(60, Exception,
diff --git a/nodepool/tests/unit/test_driver_metastatic.py b/nodepool/tests/unit/test_driver_metastatic.py
index 6f328d8dd..a7a39ba7e 100644
--- a/nodepool/tests/unit/test_driver_metastatic.py
+++ b/nodepool/tests/unit/test_driver_metastatic.py
@@ -309,6 +309,91 @@ class TestDriverMetastatic(tests.DBTestCase):
         nodes = self._getNodes()
         self.assertEqual(nodes, [])
 
+    def test_metastatic_nodescan_key_mismatch(self):
+        # Test that a nodescan key mismatch takes a backing node out of service
+
+        # This tests a scenario where a keyscan on a metastatic node
+        # does not match the key from the backing node. This could
+        # happen if the cloud reclaimed the backing node and reused
+        # the IP.
+
+        counter = -1
+        # bn1, node1, node2
+        keys = [
+            ['ssh-rsa bnkey'],
+            ['ssh-rsa otherkey'],
+            ['ssh-rsa bnkey'],
+        ]
+        orig_advance = nodepool.driver.statemachine.NodescanRequest.advance
+
+        def handler(obj, *args, **kw):
+            nonlocal counter, keys
+            if counter >= len(keys):
+                return orig_advance(obj, *args, **kw)
+            ret = orig_advance(obj, *args, **kw)
+            obj.keys = keys[counter]
+            counter += 1
+            return ret
+
+        self.useFixture(fixtures.MonkeyPatch(
+            'nodepool.driver.statemachine.NodescanRequest.advance',
+            handler))
+
+        configfile = self.setup_config('metastatic.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(pool)
+        manager = pool.getProviderManager('fake-provider')
+        manager.adapter._client.create_image(name="fake-image")
+
+        # Launch one metastatic node on a backing node
+        node1 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(len(nodes), 2)
+        bn1 = nodes[1]
+        self.assertEqual(bn1.provider, 'fake-provider')
+        self.assertEqual(bn1.id, node1.driver_data['backing_node'])
+
+        # Launch a second one with a failed nodescan; should have a
+        # second backing node
+        node2 = self._requestNode()
+        nodes = self._getNodes()
+        bn2 = nodes[3]
+        # Reload bn1 since the userdata failed
+        self.assertEqual(bn1.id, nodes[1].id)
+        bn1 = nodes[1]
+        self.assertNotEqual(bn1.id, bn2.id)
+        self.assertEqual(nodes, [node1, bn1, node2, bn2])
+        self.assertEqual(bn2.id, node2.driver_data['backing_node'])
+
+        # Allocate a third node, should use the second backing node
+        node3 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [node1, bn1, node2, bn2, node3])
+        self.assertEqual(bn2.id, node3.driver_data['backing_node'])
+
+        # Delete node3, verify that both backing nodes exist
+        node3.state = zk.DELETING
+        self.zk.storeNode(node3)
+        self.waitForNodeDeletion(node3)
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [node1, bn1, node2, bn2])
+
+        # Delete node2, verify that only the first backing node exists
+        node2.state = zk.DELETING
+        self.zk.storeNode(node2)
+        self.waitForNodeDeletion(node2)
+        self.waitForNodeDeletion(bn2)
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [node1, bn1])
+
+        # Delete node1, verify that no nodes exist
+        node1.state = zk.DELETING
+        self.zk.storeNode(node1)
+        self.waitForNodeDeletion(node1)
+        self.waitForNodeDeletion(bn1)
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [])
+
     def test_metastatic_min_retention(self):
        # Test that the metastatic driver honors min-retention
        configfile = self.setup_config('metastatic.yaml')
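
Reviewer note (not part of the patch): the change above turns the one-way notifyNodescanFailure() hook into a two-way notifyNodescanResult() hook that an adapter can use to veto a launch after the keyscan completes. The sketch below shows how an out-of-tree adapter might implement it; MyCloudAdapter and _getServer() are hypothetical stand-ins, and only the hook signature and the raise-to-fail behavior come from the diff.

    from nodepool.driver import statemachine


    class MyCloudAdapter(statemachine.Adapter):
        # Hypothetical adapter; only notifyNodescanResult() is sketched here.

        def notifyNodescanResult(self, label, external_id, success, keys):
            # Re-check that the server still exists after the (potentially
            # slow) nodescan; raising an exception here fails the launch,
            # mirroring what the AWS adapter does for reclaimed spot instances.
            server = self._getServer(external_id)  # hypothetical helper
            if server is None or server.get('status') != 'ACTIVE':
                raise Exception(f"Server {external_id} is no longer active")
            # On a failed scan (success=False, keys=None) an adapter may also
            # mark backing resources as failed, as the metastatic driver does.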
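
One detail of the metastatic change worth calling out: host keys are compared order-insensitively via sorted(), so a scan that returns the same keys in a different order is not a mismatch. A minimal illustration (the key strings are made up):

    # Keys stored on the backing node vs. keys returned by the nodescan.
    recorded = ['ssh-ed25519 AAAA', 'ssh-rsa BBBB']
    scanned = ['ssh-rsa BBBB', 'ssh-ed25519 AAAA']
    assert sorted(scanned) == sorted(recorded)           # same keys, different order: OK
    assert sorted(['ssh-rsa CCCC']) != sorted(recorded)  # different key: mismatch, backing node is failed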