From 0011255c80984cbda76a0f4201ea841e59409ff1 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 1 Feb 2012 00:37:52 +0000 Subject: [PATCH] Abandon devstack vms that launch slowly. If a vm takes more than 900 seconds to launch, abandon it (set it to error state and let the reaper come along and delete it in 24 hours when things have hopefully settled down). Addresses bug 921738. Set the number of vms in the pool to 10 (increase from 5) to paper over more spurious operational errors from cloud providers. Change-Id: Ib000e420377f4d19871bd42fb360016a519e4b2f --- slave_scripts/devstack-vm-launch.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/slave_scripts/devstack-vm-launch.py b/slave_scripts/devstack-vm-launch.py index da4a5a66..3ca55181 100755 --- a/slave_scripts/devstack-vm-launch.py +++ b/slave_scripts/devstack-vm-launch.py @@ -37,7 +37,9 @@ CLOUD_SERVERS_API_KEY = os.environ['CLOUD_SERVERS_API_KEY'] IMAGE_NAME = os.environ.get('IMAGE_NAME', 'devstack-oneiric') MIN_RAM = 1024 -MIN_READY_MACHINES = 5 +MIN_READY_MACHINES = 10 # keep this number of machine in the pool +ABANDON_TIMEOUT = 900 # assume a machine will never boot if it hasn't + # after this amount of time db = vmdatabase.VMDatabase() @@ -96,15 +98,13 @@ if CLOUD_SERVERS_DRIVER == 'rackspace': # TODO: The vmdatabase is (probably) ready, but this needs reworking to # actually support multiple providers start = time.time() - timeout = 600 to_ignore = [] - finished = False - while (time.time()-start) < timeout: + error = False + while True: building_machines = [x for x in db.getMachines() if x['state'] == vmdatabase.BUILDING] if not building_machines: print "Finished" - finished = True break provider_nodes = conn.list_nodes() print "Waiting on %s machines" % len(building_machines) @@ -119,7 +119,7 @@ if CLOUD_SERVERS_DRIVER == 'rackspace': if (p_node.public_ips and p_node.state == NodeState.RUNNING): print "Node %s is ready" % my_node['id'] 
db.setMachineState(my_node['uuid'], vmdatabase.READY) - if (p_node.public_ips and p_node.state in + elif (p_node.public_ips and p_node.state in [NodeState.UNKNOWN, NodeState.REBOOTING, NodeState.TERMINATED]): @@ -130,7 +130,14 @@ if CLOUD_SERVERS_DRIVER == 'rackspace': p_node.state, count) if count >= 5: + print "Abandoning node %s due to too many errors" % (my_node['id']) db.setMachineState(my_node['uuid'], vmdatabase.ERROR) + error = True + else: + if time.time()-my_node['state_time'] >= ABANDON_TIMEOUT: + print "Abandoning node %s due to timeout" % (my_node['id']) + db.setMachineState(my_node['uuid'], vmdatabase.ERROR) + error = True time.sleep(3) -if not finished: +if error: sys.exit(1)