Clean held nodes automatically after configurable timeout
Introduce a new configuration setting, "max-hold-age", that specifies the maximum time, in seconds, that an instance may remain in the "hold" state before it is automatically deleted. If set to 0, held instances are kept until manually deleted. A custom, per-node value can be provided at the rpcclient level.

Change-Id: I9a09728e5728c537ee44721f5d5e774dc0dcefa7
parent: afa66c7f58
commit: a59528c8e8
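
At its core, the cleanup rule added by this change reduces to taking the stricter of two limits. A minimal sketch of that logic (illustrative only; `effective_limit`, `should_delete`, and `held_for` are hypothetical names standing in for the inline code in the patch):

    import math

    def effective_limit(hold_expiration, max_hold_age):
        """The stricter (smaller) of the per-node TTL and the global limit."""
        if hold_expiration is not None and hold_expiration > 0:
            per_node = hold_expiration
        else:
            per_node = math.inf      # no per-node limit requested
        # max_hold_age is normalized to math.inf when the option is 0 or absent.
        return min(per_node, max_hold_age)

    def should_delete(held_for, hold_expiration, max_hold_age):
        # held_for stands in for "now - node.state_time" in the patch.
        # Comparing against math.inf is always False: "keep until deleted manually".
        return held_for >= effective_limit(hold_expiration, max_hold_age)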
@@ -146,6 +146,17 @@ written to run on nodes of a certain label. Example::

to -1 to have the label considered disabled, so that no nodes will
be created at all.

.. _maxholdage:

max-hold-age
------------

Maximum number of seconds a node may remain in the "hold" state. If
this is exceeded, the node will be deleted. A value of 0 disables this.
Defaults to 0.

This setting is applied to all nodes, regardless of label or provider.
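
For example, to delete held nodes after one hour (an illustrative value)::

  max-hold-age: 3600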

.. _diskimages:

diskimages
@@ -68,6 +68,7 @@ class ConfigValidator:
            'providers': list,
            'labels': [label],
            'diskimages': [diskimage],
            'max-hold-age': int,
        }

        log.info("validating %s" % self.config_file)
@@ -16,6 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import time
import yaml
@@ -115,6 +116,9 @@ def loadConfig(config_path):
    newconfig.provider_managers = {}
    newconfig.zookeeper_servers = {}
    newconfig.diskimages = {}
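    # A missing or non-positive max-hold-age disables the limit; normalizing
    # to math.inf keeps the min()/isinf() checks in the cleanup worker simple.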
    newconfig.max_hold_age = config.get('max-hold-age', math.inf)
    if newconfig.max_hold_age <= 0:
        newconfig.max_hold_age = math.inf

    for server in config.get('zookeeper-servers', []):
        z = zk.ZooKeeperConnectionConfig(server['host'],

@@ -17,6 +17,7 @@
# limitations under the License.

import logging
import math
import os
import os.path
import socket
@@ -538,6 +539,62 @@ class CleanupWorker(BaseCleanupWorker):
        finally:
            zk_conn.unlockNode(node)

    def _cleanupMaxHoldAge(self):
        '''
        Delete any held servers past their max-hold-age.

        Remove any servers that have been in the hold state longer than
        max-hold-age (or their per-node hold_expiration, if shorter).
        '''
        self.log.debug('Cleaning up held nodes...')

        zk_conn = self._nodepool.getZK()
        held_nodes = [n for n in zk_conn.nodeIterator() if n.state == zk.HOLD]
        for node in held_nodes:
            # Can't do anything if we aren't configured for this provider.
            if node.provider not in self._nodepool.config.providers:
                continue
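
            # The effective limit is the stricter (smaller) of the node's own
            # hold_expiration and the global max-hold-age; if both are
            # math.inf, the node is held until it is deleted manually.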
            if node.hold_expiration is not None and node.hold_expiration > 0:
                expiration = node.hold_expiration
            else:
                expiration = math.inf
            max_uptime = min(expiration, self._nodepool.config.max_hold_age)
            if math.isinf(max_uptime):
                continue

            # check state time against now
            now = int(time.time())
            if (now - node.state_time) < max_uptime:
                continue

            try:
                zk_conn.lockNode(node, blocking=False)
            except exceptions.ZKLockException:
                continue

            # Double check the state now that we have a lock since it
            # may have changed on us.
            if node.state != zk.HOLD:
                zk_conn.unlockNode(node)
                continue

            self.log.debug("Node %s exceeds max hold age (%s): %s >= %s",
                           node.id,
                           ("manual setting"
                            if node.hold_expiration == max_uptime
                            else "configuration setting"),
                           now - node.state_time,
                           max_uptime)

            try:
                node.state = zk.DELETING
                zk_conn.storeNode(node)
            except Exception:
                self.log.exception(
                    "Failure marking aged node %s for delete:", node.id)
            finally:
                zk_conn.unlockNode(node)

    def _run(self):
        '''
        Catch exceptions individually so that other cleanup routines may
@@ -567,6 +624,12 @@ class CleanupWorker(BaseCleanupWorker):
            self.log.exception(
                "Exception in CleanupWorker (max ready age cleanup):")

        try:
            self._cleanupMaxHoldAge()
        except Exception:
            self.log.exception(
                "Exception in CleanupWorker (max hold age cleanup):")


class DeletedNodeWorker(BaseCleanupWorker):
    '''
nodepool/tests/fixtures/node_max_hold_age.yaml (new file)
@@ -0,0 +1,50 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'

max-hold-age: 2
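# small on purpose: the held node should age out almost immediately in tests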

zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

labels:
  - name: fake-label
    min-ready: 1

providers:
  - name: fake-provider
    cloud: fake
    driver: fake
    region-name: fake-region
    rate: 0.0001
    diskimages:
      - name: fake-image
        meta:
          key: value
          key2: value
    pools:
      - name: main
        max-servers: 96
        availability-zones:
          - az1
        networks:
          - net-name
        labels:
          - name: fake-label
            diskimage: fake-image
            min-ram: 8192
            flavor-name: 'Fake'

diskimages:
  - name: fake-image
    elements:
      - fedora
      - vm
    release: 21
    env-vars:
      TMPDIR: /opt/dib_tmp
      DIB_IMAGE_CACHE: /opt/dib_cache
      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
nodepool/tests/fixtures/node_max_hold_age_2.yaml (new file)
@@ -0,0 +1,50 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'

max-hold-age: 10
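# tests pair this with per-node hold_expiration values of 2 (lower) and 20 (higher)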

zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

labels:
  - name: fake-label
    min-ready: 2

providers:
  - name: fake-provider
    cloud: fake
    driver: fake
    region-name: fake-region
    rate: 0.0001
    diskimages:
      - name: fake-image
        meta:
          key: value
          key2: value
    pools:
      - name: main
        max-servers: 96
        availability-zones:
          - az1
        networks:
          - net-name
        labels:
          - name: fake-label
            diskimage: fake-image
            min-ram: 8192
            flavor-name: 'Fake'

diskimages:
  - name: fake-image
    elements:
      - fedora
      - vm
    release: 21
    env-vars:
      TMPDIR: /opt/dib_tmp
      DIB_IMAGE_CACHE: /opt/dib_cache
      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
@@ -660,6 +660,143 @@ class TestLauncher(tests.DBTestCase):
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, nodes[0].external_id)

    def test_max_hold_age(self):
        """Test that a held node exceeding max-hold-age is deleted"""
        configfile = self.setup_config('node_max_hold_age.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")
        node = nodes[0]
        self.log.debug("Holding node %s..." % node.id)
        # hold the node
        node.state = zk.HOLD
        node.comment = 'testing'
        self.zk.lockNode(node, blocking=False)
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_no_default(self):
        """Test that a held node is deleted once past its operator-specified
        TTL, with no max-hold-age set"""
        configfile = self.setup_config('node_max_ready_age.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")
        node = nodes[0]
        self.log.debug("Holding node %s..." % node.id)
        # hold the node
        node.state = zk.HOLD
        node.comment = 'testing'
        node.hold_expiration = 5
        self.zk.lockNode(node, blocking=False)
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_lower_than_default(self):
        """Test that a held node is deleted once past its operator-specified
        TTL, with max-hold-age set in the configuration"""
        configfile = self.setup_config('node_max_hold_age_2.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label', 2)
        self.log.debug("...done waiting for initial pool.")
        node_custom = nodes[0]
        # TODO make it a fraction of fixture's max-hold-age
        hold_expiration = 2
        node = nodes[1]
        self.log.debug("Holding node %s... (default)" % node.id)
        self.log.debug("Holding node %s... (%s seconds)" % (node_custom.id,
                                                            hold_expiration))
        # hold the nodes
        node.state = zk.HOLD
        node.comment = 'testing'
        node_custom.state = zk.HOLD
        node_custom.comment = 'testing hold_expiration'
        node_custom.hold_expiration = hold_expiration
        self.zk.lockNode(node, blocking=False)
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        self.zk.lockNode(node_custom, blocking=False)
        self.zk.storeNode(node_custom)
        self.zk.unlockNode(node_custom)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
        znode_custom = self.zk.getNode(node_custom.id)
        self.log.debug("Node %s in state '%s'" % (znode_custom.id,
                                                  znode_custom.state))
        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node_custom.external_id)
        # control node should still be held
        held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
        self.assertTrue(any(n.id == node.id for n in held_nodes),
                        held_nodes)
        # finally, control node gets deleted
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_higher_than_default(self):
        """Test that a held node is deleted after max-hold-age seconds when
        the operator specifies a larger TTL"""
        configfile = self.setup_config('node_max_hold_age_2.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label', 2)
        self.log.debug("...done waiting for initial pool.")
        node_custom = nodes[0]
        # TODO make it a multiple of fixture's max-hold-age
        hold_expiration = 20
        node = nodes[1]
        self.log.debug("Holding node %s... (default)" % node.id)
        self.log.debug("Holding node %s... (%s seconds)" % (node_custom.id,
                                                            hold_expiration))
        # hold the nodes
        node.state = zk.HOLD
        node.comment = 'testing'
        node_custom.state = zk.HOLD
        node_custom.comment = 'testing hold_expiration'
        node_custom.hold_expiration = hold_expiration
        self.zk.lockNode(node, blocking=False)
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        self.zk.lockNode(node_custom, blocking=False)
        self.zk.storeNode(node_custom)
        self.zk.unlockNode(node_custom)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
        znode_custom = self.zk.getNode(node_custom.id)
        self.log.debug("Node %s in state '%s'" % (znode_custom.id,
                                                  znode_custom.state))
        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)
        # custom node should be deleted as well
        held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
        self.assertEqual(0, len(held_nodes), held_nodes)

    def test_label_provider(self):
        """Test that only providers listed in the label satisfy the request"""
        configfile = self.setup_config('node_label_provider.yaml')
@@ -447,6 +447,7 @@ class Node(BaseModel):
        self.username = None
        self.connection_type = None
        self.host_keys = []
        self.hold_expiration = None

    def __repr__(self):
        d = self.toDict()

@@ -479,7 +480,8 @@ class Node(BaseModel):
                self.hold_job == other.hold_job and
                self.username == other.username and
                self.connection_type == other.connection_type and
                self.host_keys == other.host_keys and
                self.hold_expiration == other.hold_expiration)
        else:
            return False
@@ -513,6 +515,7 @@ class Node(BaseModel):
        d['host_keys'] = self.host_keys
        d['username'] = self.username
        d['connection_type'] = self.connection_type
        d['hold_expiration'] = self.hold_expiration
        return d

    @staticmethod

@@ -549,6 +552,7 @@ class Node(BaseModel):
        o.username = d.get('username', 'zuul')
        o.connection_type = d.get('connection_type')
        o.host_keys = d.get('host_keys', [])
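        # Older znodes may lack this key; d.get() then yields None, i.e. no
        # per-node limit.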
        o.hold_expiration = d.get('hold_expiration')
        return o