Files
devstack-gate/slave_scripts/devstack-vm-launch.py
James E. Blair 0011255c80 Abandon devstack vms that launch slowly.
If a vm takes more than 900 seconds to launch, abandon it (set it
to error state and let the reaper come along and delete it in 24 hours
when things have hopefully settled down).  Addresses bug 921738.

Set the number of vms in the pool to 10 (increase from 5) to paper
over more spurious operational errors from cloud providers.

Change-Id: Ib000e420377f4d19871bd42fb360016a519e4b2f
2012-02-01 00:37:52 +00:00

144 lines
5.7 KiB
Python
Executable File

#!/usr/bin/env python
# Make sure there are always a certain number of VMs launched and
# ready for use by devstack.
# Copyright (C) 2011 OpenStack LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
from libcloud.compute.base import NodeImage, NodeSize, NodeLocation
from libcloud.compute.types import Provider, NodeState
from libcloud.compute.providers import get_driver
from libcloud.compute.deployment import MultiStepDeployment, ScriptDeployment, SSHKeyDeployment
import libcloud
import os, sys
import getopt
import time
import vmdatabase
# Provider selection and credentials are read from the environment.  The
# username and API key are mandatory: a missing variable raises KeyError.
CLOUD_SERVERS_DRIVER = os.getenv('CLOUD_SERVERS_DRIVER', 'rackspace')
CLOUD_SERVERS_USERNAME = os.environ['CLOUD_SERVERS_USERNAME']
CLOUD_SERVERS_API_KEY = os.environ['CLOUD_SERVERS_API_KEY']

# Name of the image to boot from; may be overridden via the environment.
IMAGE_NAME = os.getenv('IMAGE_NAME', 'devstack-oneiric')

# Lower bound applied to the `ram` attribute of the provider's node sizes.
MIN_RAM = 1024

# Number of machines to keep in the pool.
MIN_READY_MACHINES = 10

# Seconds after which a machine that still has not booted is assumed
# never to boot.
ABANDON_TIMEOUT = 900
# Take stock of the pool as recorded in the VM database.
db = vmdatabase.VMDatabase()

ready_machines = []
for machine in db.getMachines():
    if machine['state'] == vmdatabase.READY:
        ready_machines.append(machine)

building_machines = []
for machine in db.getMachines():
    if machine['state'] == vmdatabase.BUILDING:
        building_machines.append(machine)

num_ready = len(ready_machines)
num_building = len(building_machines)

# Machines that are still building count toward the target as well, so
# that a very slow provider does not cause tons of machines to be queued.
num_to_launch = MIN_READY_MACHINES - (num_ready + num_building)
print("%s ready, %s building, need to launch %s"
      % (num_ready, num_building, num_to_launch))

# Nothing to launch and nothing still building: there is no work to do.
if num_to_launch <= 0 and not building_machines:
    sys.exit(0)
# Connect to the cloud provider and choose the node size and image that
# new machines will be built from.  Only the 'rackspace' driver is
# handled; any other value is rejected before anything else happens.
if CLOUD_SERVERS_DRIVER != 'rackspace':
    raise Exception("Driver not supported")

Driver = get_driver(Provider.RACKSPACE)
conn = Driver(CLOUD_SERVERS_USERNAME, CLOUD_SERVERS_API_KEY)

# Pick the smallest size that still offers at least MIN_RAM.  Fail with a
# clear message rather than an IndexError when no size qualifies.
sizes = [sz for sz in conn.list_sizes() if sz.ram >= MIN_RAM]
if not sizes:
    raise Exception("No sizes found")
sizes.sort(key=lambda sz: sz.ram)
size = sizes[0]

# Candidate images are those whose name starts with IMAGE_NAME.  The list
# is fetched once (the earlier unfiltered fetch was discarded unused).
images = [img for img in conn.list_images()
          if img.name.startswith(IMAGE_NAME)]
if not images:
    raise Exception("No images found")
# Sort on the name so the choice is deterministic instead of depending on
# how the image objects happen to compare with each other.
# NOTE(review): this selects the lexicographically greatest name; confirm
# that this is the desired image (e.g. names ending in a timestamp).
images.sort(key=lambda img: img.name)
image = images[-1]
# Launch as many new nodes as are needed to top the pool back up and
# record each of them in the VM database.
if CLOUD_SERVERS_DRIVER == 'rackspace':
    last_name = ''
    error_counts = {}  # per-node error tally, used by the wait loop below
    for i in range(num_to_launch):
        # Node names embed the current time in whole seconds.  If that
        # would repeat the name given to the previous node, wait for the
        # clock to advance before trying again.
        while True:
            node_name = 'devstack-%s.slave.openstack.org' % int(time.time())
            if node_name != last_name:
                break
            time.sleep(1)
        # Remember the name so the next iteration can detect a repeat;
        # without this the check above could never trigger.
        last_name = node_name

        node = conn.create_node(name=node_name, image=image, size=size)
        db.addMachine(CLOUD_SERVERS_DRIVER, node.id, IMAGE_NAME,
                      node_name, node.public_ip[0], node.uuid)
        print("Started building node %s:" % node.id)
        print(" name: %s [%s]" % (node_name, node.public_ip[0]))
        print(" uuid: %s" % (node.uuid))
        print("")
# Wait for nodes
# TODO: The vmdatabase is (probably) ready, but this needs reworking to
# actually support multiple providers
#
# Poll the provider every 3 seconds until no machine recorded as BUILDING
# is left to wait on.  Each pass, every building machine is matched to the
# provider's node list by UUID and then either:
#   * marked READY      - it has public IPs and is RUNNING;
#   * counted as failing - it has public IPs but is in an
#                          UNKNOWN/REBOOTING/TERMINATED state; after 5 such
#                          observations it is marked ERROR;
#   * marked ERROR      - it has been in its current state for at least
#                          ABANDON_TIMEOUT seconds;
#   * ignored           - the provider reports zero or several nodes with
#                          its UUID; it is not looked at again in this run.
# The script exits with status 1 if any machine was abandoned.
to_ignore = set()    # UUIDs this run has stopped waiting on
error_counts = {}    # machine id -> number of error-state observations
error = False
while True:
    # Ignored machines stay BUILDING in the database, so they are filtered
    # out here; otherwise the loop could never finish because of them.
    building_machines = [x for x in db.getMachines()
                         if x['state'] == vmdatabase.BUILDING
                         and x['uuid'] not in to_ignore]
    if not building_machines:
        print("Finished")
        break

    provider_nodes = conn.list_nodes()
    print("Waiting on %s machines" % len(building_machines))
    for my_node in building_machines:
        p_nodes = [x for x in provider_nodes if x.uuid == my_node['uuid']]
        if len(p_nodes) != 1:
            print("Incorrect number of nodes (%s) from provider matching UUID %s" % (len(p_nodes), my_node['uuid']))
            # Store the UUID (not the whole row): membership is tested
            # against UUIDs.
            to_ignore.add(my_node['uuid'])
            continue

        p_node = p_nodes[0]
        if p_node.public_ips and p_node.state == NodeState.RUNNING:
            print("Node %s is ready" % my_node['id'])
            db.setMachineState(my_node['uuid'], vmdatabase.READY)
        elif (p_node.public_ips and p_node.state in
              [NodeState.UNKNOWN,
               NodeState.REBOOTING,
               NodeState.TERMINATED]):
            count = error_counts.get(my_node['id'], 0) + 1
            error_counts[my_node['id']] = count
            print("Node %s is in error %s (%s/5)" % (my_node['id'],
                                                     p_node.state,
                                                     count))
            if count >= 5:
                print("Abandoning node %s due to too many errors" % (my_node['id']))
                db.setMachineState(my_node['uuid'], vmdatabase.ERROR)
                error = True
        elif time.time() - my_node['state_time'] >= ABANDON_TIMEOUT:
            print("Abandoning node %s due to timeout" % (my_node['id']))
            db.setMachineState(my_node['uuid'], vmdatabase.ERROR)
            error = True
    time.sleep(3)

# Report failure to the caller if any machine had to be abandoned.
if error:
    sys.exit(1)