Merge "Add host-key-checking to metastatic driver"
commit fe70068909
@@ -107,6 +107,23 @@ itself, which is "meta".
          to apply to all pools within that provider, or it can be
          overridden here for a specific pool.
 
+      .. attr:: host-key-checking
+         :type: bool
+         :default: False
+
+         Whether to validate SSH host keys. When true, this helps
+         ensure that nodes are ready to receive SSH connections
+         before they are supplied to the requestor. When set to
+         false, nodepool-launcher will not attempt to ssh-keyscan
+         nodes after they are booted. Unlike other drivers, this
+         defaults to false here because it is presumed that the
+         backing node has already been checked for connectivity.
+         Enabling it here will cause the launcher to check
+         connectivity each time it allocates a new slot on the
+         backing node, and if a check fails, it will mark the backing
+         node as failed and stop allocating any more slots on that
+         node.
+
       .. attr:: node-attributes
          :type: dict
 
@@ -175,3 +192,23 @@ itself, which is "meta".
             last requested node is deleted to see if new requests
             arrive for this label before deleting the backing node.
             Set this value to the amount of time in seconds to wait.
+
+         .. attr:: host-key-checking
+            :type: bool
+            :default: False
+
+            Whether to validate SSH host keys. When true, this helps
+            ensure that nodes are ready to receive SSH connections
+            before they are supplied to the requestor. When set to
+            false, nodepool-launcher will not attempt to ssh-keyscan
+            nodes after they are booted. Unlike other drivers, this
+            defaults to false here because it is presumed that the
+            backing node has already been checked for connectivity.
+            Enabling it here will cause the launcher to check
+            connectivity each time it allocates a new slot on the
+            backing node, and if a check fails, it will mark the backing
+            node as failed and stop allocating any more slots on that
+            node.
+
+            .. note:: This value will override the value for
+               :attr:`providers.[metastatic].pools.host-key-checking`.
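The precedence described in the note above amounts to a label-over-pool lookup; a minimal Python sketch (the dicts below are hypothetical stand-ins for the parsed pool and label configuration, not the driver's config objects):

def effective_host_key_checking(pool_cfg, label_cfg):
    # A label-level setting wins; otherwise the pool-level default applies.
    pool_default = pool_cfg.get('host-key-checking', False)
    return label_cfg.get('host-key-checking', pool_default)

pool = {'host-key-checking': True}
print(effective_host_key_checking(pool, {}))                            # True (inherited)
print(effective_host_key_checking(pool, {'host-key-checking': False}))  # False (overridden)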
@@ -220,6 +220,8 @@ class BackingNodeRecord:
         self.last_used = time.time()
 
     def hasAvailableSlot(self):
+        if self.failed:
+            return False
         return None in self.allocated_nodes
 
     def isEmpty(self):
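A minimal sketch (not the driver's actual class; allocateSlot here is an illustrative stand-in) of how the new failed flag interacts with slot bookkeeping: once a record is marked failed, hasAvailableSlot() is False even while free slots remain, so no further metastatic nodes land on that backing node.

class SketchBackingNodeRecord:
    def __init__(self, label_name, slot_count):
        self.label_name = label_name
        self.allocated_nodes = [None] * slot_count  # one entry per slot
        self.failed = False

    def hasAvailableSlot(self):
        if self.failed:
            return False
        return None in self.allocated_nodes

    def allocateSlot(self, node_id):
        idx = self.allocated_nodes.index(None)  # first free slot
        self.allocated_nodes[idx] = node_id
        return idx

record = SketchBackingNodeRecord('backing-label', slot_count=2)
record.allocateSlot('0000000042')
record.failed = True              # e.g. after a nodescan failure
print(record.hasAvailableSlot())  # False, despite one free slot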
@@ -297,6 +299,8 @@ class MetastaticAdapter(statemachine.Adapter):
                     # The label doesn't exist in our config any more,
                     # it must have been removed.
                     grace_time = 0
+                if bnr.failed:
+                    grace_time = 0
                 if (bnr.isEmpty() and
                     now - bnr.last_used > grace_time):
                     self.log.info("Backing node %s has been idle for "
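Distilled into a standalone helper (hypothetical, for illustration only), the rule above means a failed backing node loses its grace period and is released as soon as it is empty:

def should_release(is_empty, failed, idle_seconds, grace_time):
    if failed:
        grace_time = 0
    return is_empty and idle_seconds > grace_time

print(should_release(True, False, 10, 60))  # False: healthy node, still within grace period
print(should_release(True, True, 10, 60))   # True: failed node, release immediately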
@@ -325,6 +329,21 @@ class MetastaticAdapter(statemachine.Adapter):
     def getQuotaForLabel(self, label):
         return QuotaInformation(instances=1)
 
+    def notifyNodescanFailure(self, label, external_id):
+        with self.allocation_lock:
+            backing_node_records = self.backing_node_records.get(
+                label.name, [])
+            for bnr in backing_node_records:
+                if bnr.backsNode(external_id):
+                    self.log.info(
+                        "Nodescan failure of %s on %s, failing backing node",
+                        external_id, bnr.node_id)
+                    bnr.failed = True
+                    backing_node = self._getNode(bnr.node_id)
+                    backing_node.user_data = self._makeBackingNodeUserData(bnr)
+                    self.zk.storeNode(backing_node)
+                    return
+
     # Local implementation below
 
     def _init(self):
@@ -352,6 +371,7 @@ class MetastaticAdapter(statemachine.Adapter):
                     backing_node_record = BackingNodeRecord(user_data['label'],
                                                             user_data['slots'])
                     backing_node_record.node_id = node.id
+                    backing_node_record.failed = user_data.get('failed', False)
                     self.log.info("Found backing node %s for %s",
                                   node.id, user_data['label'])
                     self._addBackingNode(user_data['label'],
@@ -381,6 +401,9 @@ class MetastaticAdapter(statemachine.Adapter):
         # if we have room for the label, allocate and return existing slot
         # otherwise, make a new backing node
         with self.allocation_lock:
+            # First, find out if we're retrying a request for the same
+            # node id; if so, immediately deallocate the old one.
+            self._deallocateBackingNodeInner(node_id)
             backing_node_record = None
             for bnr in self.backing_node_records.get(label.name, []):
                 if bnr.hasAvailableSlot():
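The retry handling added here can be pictured with a toy slot table (hypothetical structures, not the adapter's own): any slot already held by the same node id from an earlier, failed attempt is freed before a new slot is chosen, so retries cannot leak slots.

def deallocate_everywhere(backing_records, node_id):
    # Free any slot already holding this node id (a retry of the same request).
    for record in backing_records:
        if node_id in record['slots']:
            record['slots'][record['slots'].index(node_id)] = None
            return

records = [{'slots': ['0000000042', None]}]
deallocate_everywhere(records, '0000000042')
print(records)  # [{'slots': [None, None]}]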
@@ -411,15 +434,26 @@ class MetastaticAdapter(statemachine.Adapter):
     def _deallocateBackingNode(self, node_id):
         self._init()
         with self.allocation_lock:
-            for label_name, backing_node_records in \
-                    self.backing_node_records.items():
-                for bn in backing_node_records:
-                    if bn.backsNode(node_id):
-                        slot = bn.deallocateSlot(node_id)
-                        self.log.info(
-                            "Unassigned node %s from backing node %s slot %s",
-                            node_id, bn.node_id, slot)
-                        return
+            self._deallocateBackingNodeInner(node_id)
+
+    def _deallocateBackingNodeInner(self, node_id):
+        for label_name, backing_node_records in \
+                self.backing_node_records.items():
+            for bnr in backing_node_records:
+                if bnr.backsNode(node_id):
+                    slot = bnr.deallocateSlot(node_id)
+                    self.log.info(
+                        "Unassigned node %s from backing node %s slot %s",
+                        node_id, bnr.node_id, slot)
+                    return
+
+    def _makeBackingNodeUserData(self, bnr):
+        return json.dumps({
+            'owner': self.my_id,
+            'label': bnr.label_name,
+            'slots': bnr.slot_count,
+            'failed': bnr.failed,
+        })
 
     def _checkBackingNodeRequests(self):
         self._init()
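The failed flag is written into the backing node's user_data so that it survives a launcher restart or adoption by another launcher. A small round-trip sketch of the JSON produced by _makeBackingNodeUserData (the owner value is a placeholder for self.my_id; readers default failed to False for records written before this change, as the adapter does above):

import json

user_data = json.dumps({
    'owner': 'example-launcher-id',  # placeholder for self.my_id
    'label': 'user-label',
    'slots': 2,
    'failed': True,
})

record = json.loads(user_data)
print(record.get('failed', False))  # True; a missing key would default to False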
@@ -452,11 +486,7 @@ class MetastaticAdapter(statemachine.Adapter):
                 node = self._getNode(node_id)
                 self.zk.lockNode(node, blocking=True, timeout=30,
                                  ephemeral=False, identifier=self.my_id)
-                node.user_data = json.dumps({
-                    'owner': self.my_id,
-                    'label': bnr.label_name,
-                    'slots': bnr.slot_count,
-                })
+                node.user_data = self._makeBackingNodeUserData(bnr)
                 node.state = zk.IN_USE
                 self.zk.storeNode(node)
                 self.zk.deleteNodeRequest(request)
@@ -45,7 +45,8 @@ class MetastaticLabel(ConfigValue):
         self.cloud_image = MetastaticCloudImage()
         self.max_parallel_jobs = label.get('max-parallel-jobs', 1)
         self.grace_time = label.get('grace-time', 60)
-        self.host_key_checking = self.pool.host_key_checking
+        self.host_key_checking = label.get('host-key-checking',
+                                           self.pool.host_key_checking)
 
     @staticmethod
     def getSchema():
@@ -54,6 +55,7 @@ class MetastaticLabel(ConfigValue):
             v.Required('backing-label'): str,
             'max-parallel-jobs': int,
             'grace-time': int,
+            'host-key-checking': bool,
         }
 
     def isBackingConfigEqual(self, other):
@@ -74,7 +76,9 @@ class MetastaticPool(ConfigPool):
         self.labels = {}
         # We will just use the interface_ip of the backing node
         self.use_internal_ip = False
-        self.host_key_checking = False
+        # Allow extra checking of the backing node to detect failure
+        # after it's already running.
+        self.host_key_checking = pool_config.get('host-key-checking', False)
 
         self.load(pool_config)
 
@@ -95,6 +99,7 @@ class MetastaticPool(ConfigPool):
             v.Required('name'): str,
             v.Required('labels'): [label],
             'max-servers': int,
+            'host-key-checking': bool,
         })
         return pool
 
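For reference, the added schema entries behave like any other voluptuous bool and reject non-boolean values at config validation time. A simplified sketch using voluptuous directly (only the keys relevant here, not the full pool schema):

import voluptuous as v

pool_schema = v.Schema({
    v.Required('name'): str,
    'max-servers': int,
    'host-key-checking': bool,
}, extra=v.ALLOW_EXTRA)

pool_schema({'name': 'main', 'host-key-checking': True})       # validates
try:
    pool_schema({'name': 'main', 'host-key-checking': 'yes'})  # wrong type
except v.Invalid as e:
    print("rejected:", e)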
@@ -344,6 +344,8 @@ class StateMachineNodeLauncher(stats.StatsReporter):
             if isinstance(e, exceptions.LaunchKeyscanException):
                 try:
                     label = self.handler.pool.labels[node.type[0]]
+                    self.manager.adapter.notifyNodescanFailure(
+                        label, node.external_id)
                     console = self.manager.adapter.getConsoleLog(
                         label, node.external_id)
                     if console:
@@ -1651,7 +1653,7 @@ class Adapter:
         """
         raise NotImplementedError()
 
-    # The following method is optional
+    # The following methods are optional
     def getConsoleLog(self, label, external_id):
         """Return the console log from the specified server
 
@@ -1659,3 +1661,11 @@ class Adapter:
         :param external_id str: The external id of the server
         """
         raise NotImplementedError()
+
+    def notifyNodescanFailure(self, label, external_id):
+        """Notify the adapter of a nodescan failure
+
+        :param label ConfigLabel: The label config for the node
+        :param external_id str: The external id of the server
+        """
+        pass
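Because the base implementation is a no-op, existing statemachine drivers need no changes; a driver opts in by overriding the hook. A hypothetical sketch (ExampleAdapter is not a real driver):

import logging

from nodepool.driver.statemachine import Adapter


class ExampleAdapter(Adapter):
    log = logging.getLogger("nodepool.ExampleAdapter")

    def notifyNodescanFailure(self, label, external_id):
        # React to the failed keyscan, e.g. stop scheduling work onto the
        # affected backing resource (details are driver-specific).
        self.log.warning("Keyscan failed for server %s", external_id)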
nodepool/tests/fixtures/metastatic.yaml
@@ -49,6 +49,7 @@ providers:
       - name: main
         max-servers: 10
         priority: 1
+        host-key-checking: true
         node-attributes:
           metaattr: meta
           testattr: metastatic
@@ -57,3 +58,4 @@ providers:
             backing-label: backing-label
             max-parallel-jobs: 2
             grace-time: 2
+            host-key-checking: true
@@ -17,11 +17,13 @@ import json
 import logging
 import os
 
+import fixtures
 import testtools
 
 from nodepool import tests
 from nodepool.zk import zookeeper as zk
 from nodepool.driver.statemachine import StateMachineProvider
+import nodepool.driver.statemachine
 from nodepool.cmd.config_validator import ConfigValidator
 
 
@@ -228,3 +230,79 @@ class TestDriverMetastatic(tests.DBTestCase):
         self.waitForNodeDeletion(bn1)
         nodes = self._getNodes()
         self.assertEqual(nodes, [])
+
+    def test_metastatic_nodescan(self):
+        # Test that a nodescan failure takes a backing node out of service
+
+        counter = -1
+        # bn1, node1, node2
+        fail = [False, False, True]
+        orig_advance = nodepool.driver.statemachine.NodescanRequest.advance
+
+        def handler(*args, **kw):
+            nonlocal counter, fail
+            counter += 1
+            if counter >= len(fail):
+                return orig_advance(*args, **kw)
+            if fail[counter]:
+                raise nodepool.exceptions.ConnectionTimeoutException()
+            return orig_advance(*args, **kw)
+
+        self.useFixture(fixtures.MonkeyPatch(
+            'nodepool.driver.statemachine.NodescanRequest.advance',
+            handler))
+
+        configfile = self.setup_config('metastatic.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(pool)
+        manager = pool.getProviderManager('fake-provider')
+        manager.adapter._client.create_image(name="fake-image")
+
+        # Launch one metastatic node on a backing node
+        node1 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(len(nodes), 2)
+        bn1 = nodes[1]
+        self.assertEqual(bn1.provider, 'fake-provider')
+        self.assertEqual(bn1.id, node1.driver_data['backing_node'])
+
+        # Launch a second one with a failed nodescan; should have a
+        # second backing node
+        node2 = self._requestNode()
+        nodes = self._getNodes()
+        bn2 = nodes[3]
+        # Reload bn1 since the userdata failed
+        self.assertEqual(bn1.id, nodes[1].id)
+        bn1 = nodes[1]
+        self.assertNotEqual(bn1.id, bn2.id)
+        self.assertEqual(nodes, [node1, bn1, node2, bn2])
+        self.assertEqual(bn2.id, node2.driver_data['backing_node'])
+
+        # Allocate a third node, should use the second backing node
+        node3 = self._requestNode()
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [node1, bn1, node2, bn2, node3])
+        self.assertEqual(bn2.id, node3.driver_data['backing_node'])
+
+        # Delete node3, verify that both backing nodes exist
+        node3.state = zk.DELETING
+        self.zk.storeNode(node3)
+        self.waitForNodeDeletion(node3)
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [node1, bn1, node2, bn2])
+
+        # Delete node2, verify that only the first backing node exists
+        node2.state = zk.DELETING
+        self.zk.storeNode(node2)
+        self.waitForNodeDeletion(node2)
+        self.waitForNodeDeletion(bn2)
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [node1, bn1])
+
+        # Delete node1, verify that no nodes exist
+        node1.state = zk.DELETING
+        self.zk.storeNode(node1)
+        self.waitForNodeDeletion(node1)
+        self.waitForNodeDeletion(bn1)
+        nodes = self._getNodes()
+        self.assertEqual(nodes, [])
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    The metastatic driver now includes the
+    :attr:`providers.[metastatic].pools.host-key-checking` and
+    :attr:`providers.[metastatic].pools.labels.host-key-checking`
+    options. These can be used to verify that a backing node is still
+    functioning correctly before allocating new metastatic nodes to
+    it.