From a59528c8e873ee7a0b09ab7de588d19a38559a3d Mon Sep 17 00:00:00 2001
From: mhuin
Date: Mon, 22 Jan 2018 11:26:38 +0100
Subject: [PATCH] Clean held nodes automatically after configurable timeout

Introduce a new configuration setting, "max-hold-age", that specifies
the maximum time, in seconds, an instance may remain in the "hold"
state. If set to 0, held instances are kept until manually deleted. A
custom value can be provided per node at the rpcclient level.

Change-Id: I9a09728e5728c537ee44721f5d5e774dc0dcefa7
---
 doc/source/configuration.rst                |  11 ++
 nodepool/cmd/config_validator.py            |   1 +
 nodepool/config.py                          |   4 +
 nodepool/launcher.py                        |  63 ++++++++
 .../tests/fixtures/node_max_hold_age.yaml   |  50 +++++++
 .../tests/fixtures/node_max_hold_age_2.yaml |  50 +++++++
 nodepool/tests/test_launcher.py             | 137 ++++++++++++++++++
 nodepool/zk.py                              |   6 +-
 8 files changed, 321 insertions(+), 1 deletion(-)
 create mode 100644 nodepool/tests/fixtures/node_max_hold_age.yaml
 create mode 100644 nodepool/tests/fixtures/node_max_hold_age_2.yaml

diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 23b48d9ec..83f3acb92 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -146,6 +146,17 @@ written to run on nodes of a certain label. Example::
   to -1 to have the label considered disabled, so that no nodes will
   be created at all.
 
+.. _maxholdage:
+
+max-hold-age
+------------
+
+Maximum number of seconds a node may remain in the "hold" state. If
+this is exceeded, the node will be deleted. A value of 0 disables this
+check. Defaults to 0.
+
+This setting is applied to all nodes, regardless of label or provider.
+
 .. _diskimages:
 
 diskimages
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index 310c534cc..1562aa554 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -68,6 +68,7 @@ class ConfigValidator:
             'providers': list,
             'labels': [label],
             'diskimages': [diskimage],
+            'max-hold-age': int,
         }
 
         log.info("validating %s" % self.config_file)
diff --git a/nodepool/config.py b/nodepool/config.py
index 98fa2c41f..84e86117f 100755
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -16,6 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 import time
 
 import yaml
@@ -115,6 +116,9 @@ def loadConfig(config_path):
     newconfig.provider_managers = {}
     newconfig.zookeeper_servers = {}
     newconfig.diskimages = {}
+    newconfig.max_hold_age = config.get('max-hold-age', math.inf)
+    if newconfig.max_hold_age <= 0:
+        newconfig.max_hold_age = math.inf
 
     for server in config.get('zookeeper-servers', []):
         z = zk.ZooKeeperConnectionConfig(server['host'],
diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index 75cfd641c..66dad9222 100755
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -17,6 +17,7 @@
 # limitations under the License.
 
 import logging
+import math
 import os
 import os.path
 import socket
@@ -538,6 +539,62 @@ class CleanupWorker(BaseCleanupWorker):
             finally:
                 zk_conn.unlockNode(node)
 
+    def _cleanupMaxHoldAge(self):
+        '''
+        Delete any held nodes that have exceeded their max hold age.
+
+        Remove any nodes that have been held longer than max-hold-age.
+        '''
+        self.log.debug('Cleaning up held nodes...')
+
+        zk_conn = self._nodepool.getZK()
+        held_nodes = [n for n in zk_conn.nodeIterator() if n.state == zk.HOLD]
+        for node in held_nodes:
+            # Can't do anything if we aren't configured for this provider.
+            if node.provider not in self._nodepool.config.providers:
+                continue
+
+            if node.hold_expiration is not None and node.hold_expiration > 0:
+                expiration = node.hold_expiration
+            else:
+                expiration = math.inf
+            max_uptime = min(expiration, self._nodepool.config.max_hold_age)
+            if math.isinf(max_uptime):
+                continue
+
+            # Check the time spent in the hold state against the limit.
+            now = int(time.time())
+            if (now - node.state_time) < max_uptime:
+                continue
+
+            try:
+                zk_conn.lockNode(node, blocking=False)
+            except exceptions.ZKLockException:
+                continue
+
+            # Double check the state now that we have a lock since it
+            # may have changed on us.
+            if node.state != zk.HOLD:
+                zk_conn.unlockNode(node)
+                continue
+
+            self.log.debug("Node %s exceeds max hold age (%s): %s >= %s",
+                           node.id,
+                           ("manual setting"
+                            if node.hold_expiration == max_uptime
+                            else "configuration setting"),
+                           now - node.state_time,
+                           max_uptime)
+
+            try:
+                node.state = zk.DELETING
+                zk_conn.storeNode(node)
+            except Exception:
+                self.log.exception(
+                    "Failure marking aged node %s for delete:", node.id)
+            finally:
+                zk_conn.unlockNode(node)
+
     def _run(self):
         '''
         Catch exceptions individually so that other cleanup routines may
@@ -567,6 +624,12 @@ class CleanupWorker(BaseCleanupWorker):
             self.log.exception(
                 "Exception in CleanupWorker (max ready age cleanup):")
 
+        try:
+            self._cleanupMaxHoldAge()
+        except Exception:
+            self.log.exception(
+                "Exception in CleanupWorker (max hold age cleanup):")
+
 
 class DeletedNodeWorker(BaseCleanupWorker):
     '''
diff --git a/nodepool/tests/fixtures/node_max_hold_age.yaml b/nodepool/tests/fixtures/node_max_hold_age.yaml
new file mode 100644
index 000000000..8d27b6d22
--- /dev/null
+++ b/nodepool/tests/fixtures/node_max_hold_age.yaml
@@ -0,0 +1,50 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+
+max-hold-age: 2
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+labels:
+  - name: fake-label
+    min-ready: 1
+
+providers:
+  - name: fake-provider
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+            key: value
+            key2: value
+    pools:
+      - name: main
+        max-servers: 96
+        availability-zones:
+          - az1
+        networks:
+          - net-name
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/fixtures/node_max_hold_age_2.yaml b/nodepool/tests/fixtures/node_max_hold_age_2.yaml
new file mode 100644
index 000000000..651d69cb2
--- /dev/null
+++ b/nodepool/tests/fixtures/node_max_hold_age_2.yaml
@@ -0,0 +1,50 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+
+max-hold-age: 10
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+labels:
+  - name: fake-label
+    min-ready: 2
+
+providers:
+  - name: fake-provider
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+            key: value
+            key2: value
+    pools:
+      - name: main
+        max-servers: 96
+        availability-zones:
+          - az1
+        networks:
+          - net-name
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py
index 6e121f7d6..e3d33c981 100644
--- a/nodepool/tests/test_launcher.py
+++ b/nodepool/tests/test_launcher.py
@@ -660,6 +660,143 @@ class TestLauncher(tests.DBTestCase):
         manager = pool.getProviderManager('fake-provider')
         self.waitForInstanceDeletion(manager, nodes[0].external_id)
 
+    def test_max_hold_age(self):
+        """Test a held node with exceeded max-hold-age is deleted"""
+        configfile = self.setup_config('node_max_hold_age.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.useBuilder(configfile)
+        pool.start()
+        self.waitForImage('fake-provider', 'fake-image')
+        self.log.debug("Waiting for initial pool...")
+        nodes = self.waitForNodes('fake-label')
+        self.log.debug("...done waiting for initial pool.")
+        node = nodes[0]
+        self.log.debug("Holding node %s..." % node.id)
+        # hold the node
+        node.state = zk.HOLD
+        node.comment = 'testing'
+        self.zk.lockNode(node, blocking=False)
+        self.zk.storeNode(node)
+        self.zk.unlockNode(node)
+        znode = self.zk.getNode(node.id)
+        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
+        # Wait for the instance to be cleaned up
+        manager = pool.getProviderManager('fake-provider')
+        self.waitForInstanceDeletion(manager, node.external_id)
+
+    def test_hold_expiration_no_default(self):
+        """Test a held node is deleted when past its operator-specified TTL,
+        no max-hold-age set"""
+        configfile = self.setup_config('node_max_ready_age.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.useBuilder(configfile)
+        pool.start()
+        self.waitForImage('fake-provider', 'fake-image')
+        self.log.debug("Waiting for initial pool...")
+        nodes = self.waitForNodes('fake-label')
+        self.log.debug("...done waiting for initial pool.")
+        node = nodes[0]
+        self.log.debug("Holding node %s..." % node.id)
+        # hold the node
+        node.state = zk.HOLD
+        node.comment = 'testing'
+        node.hold_expiration = 5
+        self.zk.lockNode(node, blocking=False)
+        self.zk.storeNode(node)
+        self.zk.unlockNode(node)
+        znode = self.zk.getNode(node.id)
+        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
+        # Wait for the instance to be cleaned up
+        manager = pool.getProviderManager('fake-provider')
+        self.waitForInstanceDeletion(manager, node.external_id)
+
+    def test_hold_expiration_lower_than_default(self):
+        """Test a held node is deleted when past its operator-specified TTL,
+        with max-hold-age set in the configuration"""
+        configfile = self.setup_config('node_max_hold_age_2.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.useBuilder(configfile)
+        pool.start()
+        self.waitForImage('fake-provider', 'fake-image')
+        self.log.debug("Waiting for initial pool...")
+        nodes = self.waitForNodes('fake-label', 2)
+        self.log.debug("...done waiting for initial pool.")
+        node_custom = nodes[0]
+        # TODO make it a fraction of fixture's max-hold-age
+        hold_expiration = 2
+        node = nodes[1]
+        self.log.debug("Holding node %s... (default)" % node.id)
+        self.log.debug("Holding node %s...(%s seconds)" % (node_custom.id,
+                                                           hold_expiration))
+        # hold the nodes
+        node.state = zk.HOLD
+        node.comment = 'testing'
+        node_custom.state = zk.HOLD
+        node_custom.comment = 'testing hold_expiration'
+        node_custom.hold_expiration = hold_expiration
+        self.zk.lockNode(node, blocking=False)
+        self.zk.storeNode(node)
+        self.zk.unlockNode(node)
+        self.zk.lockNode(node_custom, blocking=False)
+        self.zk.storeNode(node_custom)
+        self.zk.unlockNode(node_custom)
+        znode = self.zk.getNode(node.id)
+        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
+        znode_custom = self.zk.getNode(node_custom.id)
+        self.log.debug("Node %s in state '%s'" % (znode_custom.id,
+                                                  znode_custom.state))
+        # Wait for the instance to be cleaned up
+        manager = pool.getProviderManager('fake-provider')
+        self.waitForInstanceDeletion(manager, node_custom.external_id)
+        # control node should still be held
+        held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
+        self.assertTrue(any(n.id == node.id for n in held_nodes),
+                        held_nodes)
+        # finally, control node gets deleted
+        self.waitForInstanceDeletion(manager, node.external_id)
+
+    def test_hold_expiration_higher_than_default(self):
+        """Test a held node is deleted after max-hold-age seconds if the
+        operator specifies a larger TTL"""
+        configfile = self.setup_config('node_max_hold_age_2.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.useBuilder(configfile)
+        pool.start()
+        self.waitForImage('fake-provider', 'fake-image')
+        self.log.debug("Waiting for initial pool...")
+        nodes = self.waitForNodes('fake-label', 2)
+        self.log.debug("...done waiting for initial pool.")
+        node_custom = nodes[0]
+        # TODO make it a multiple of fixture's max-hold-age
+        hold_expiration = 20
+        node = nodes[1]
+        self.log.debug("Holding node %s... (default)" % node.id)
+        self.log.debug("Holding node %s...(%s seconds)" % (node_custom.id,
+                                                           hold_expiration))
+        # hold the nodes
+        node.state = zk.HOLD
+        node.comment = 'testing'
+        node_custom.state = zk.HOLD
+        node_custom.comment = 'testing hold_expiration'
+        node_custom.hold_expiration = hold_expiration
+        self.zk.lockNode(node, blocking=False)
+        self.zk.storeNode(node)
+        self.zk.unlockNode(node)
+        self.zk.lockNode(node_custom, blocking=False)
+        self.zk.storeNode(node_custom)
+        self.zk.unlockNode(node_custom)
+        znode = self.zk.getNode(node.id)
+        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
+        znode_custom = self.zk.getNode(node_custom.id)
+        self.log.debug("Node %s in state '%s'" % (znode_custom.id,
+                                                  znode_custom.state))
+        # Wait for the instance to be cleaned up
+        manager = pool.getProviderManager('fake-provider')
+        self.waitForInstanceDeletion(manager, node.external_id)
+        # custom node should be deleted as well
+        held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
+        self.assertEqual(0, len(held_nodes), held_nodes)
+
     def test_label_provider(self):
         """Test that only providers listed in the label satisfy the request"""
         configfile = self.setup_config('node_label_provider.yaml')
diff --git a/nodepool/zk.py b/nodepool/zk.py
index 6e123ccde..9b223e544 100755
--- a/nodepool/zk.py
+++ b/nodepool/zk.py
@@ -447,6 +447,7 @@ class Node(BaseModel):
         self.username = None
         self.connection_type = None
         self.host_keys = []
+        self.hold_expiration = None
 
     def __repr__(self):
         d = self.toDict()
@@ -479,7 +480,8 @@ class Node(BaseModel):
                     self.hold_job == other.hold_job and
                     self.username == other.username and
                     self.connection_type == other.connection_type and
-                    self.host_keys == other.host_keys)
+                    self.host_keys == other.host_keys and
+                    self.hold_expiration == other.hold_expiration)
         else:
             return False
 
@@ -513,6 +515,7 @@ class Node(BaseModel):
         d['host_keys'] = self.host_keys
         d['username'] = self.username
         d['connection_type'] = self.connection_type
+        d['hold_expiration'] = self.hold_expiration
        return d
 
     @staticmethod
@@ -549,6 +552,7 @@ class Node(BaseModel):
         o.username = d.get('username', 'zuul')
         o.connection_type = d.get('connection_type')
         o.host_keys = d.get('host_keys', [])
+        o.hold_expiration = d.get('hold_expiration')
         return o
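
For illustration only (not part of the patch): the precedence between a
node's own hold_expiration (set through the RPC client) and the global
max-hold-age setting can be sketched as a small standalone function. The
helper name below is hypothetical; the authoritative logic is in
CleanupWorker._cleanupMaxHoldAge above.

    import math

    def effective_hold_timeout(hold_expiration, max_hold_age):
        # Per-node value; 0 or None means "not set".
        expiration = hold_expiration if hold_expiration else math.inf
        # The smaller limit wins; math.inf means the node is kept forever.
        return min(expiration, max_hold_age)

    # With max-hold-age: 10, as in node_max_hold_age_2.yaml:
    assert effective_hold_timeout(2, 10) == 2     # operator TTL is lower
    assert effective_hold_timeout(20, 10) == 10   # configuration cap applies
    assert effective_hold_timeout(None, math.inf) == math.inf  # keep forever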