diff --git a/slave_scripts/devstack-vm-delete.py b/slave_scripts/devstack-vm-delete.py index 9860e96e..4c4c1380 100755 --- a/slave_scripts/devstack-vm-delete.py +++ b/slave_scripts/devstack-vm-delete.py @@ -33,13 +33,9 @@ CLOUD_SERVERS_DRIVER = os.environ.get('CLOUD_SERVERS_DRIVER','rackspace') CLOUD_SERVERS_USERNAME = os.environ['CLOUD_SERVERS_USERNAME'] CLOUD_SERVERS_API_KEY = os.environ['CLOUD_SERVERS_API_KEY'] -CHANGE = os.environ['GERRIT_CHANGE_NUMBER'] -PATCH = os.environ['GERRIT_PATCHSET_NUMBER'] -BUILD = os.environ['BUILD_NUMBER'] - +node_uuid = sys.argv[1] db = vmdatabase.VMDatabase() -machine = db.getMachine(CHANGE, PATCH, BUILD) -node_name = machine['name'] +machine = db.getMachine(node_uuid) if CLOUD_SERVERS_DRIVER == 'rackspace': Driver = get_driver(Provider.RACKSPACE) @@ -47,4 +43,4 @@ if CLOUD_SERVERS_DRIVER == 'rackspace': node = [n for n in conn.list_nodes() if n.id==str(machine['id'])][0] node.destroy() -db.delMachine(machine['id']) +db.delMachine(node_uuid) diff --git a/slave_scripts/devstack-vm-fetch.py b/slave_scripts/devstack-vm-fetch.py new file mode 100644 index 00000000..cc7ee516 --- /dev/null +++ b/slave_scripts/devstack-vm-fetch.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +# Fetch a ready VM for use by devstack. + +# Copyright (C) 2011 OpenStack LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import vmdatabase + +db = vmdatabase.VMDatabase() +node = db.getMachineForUse() + +if not node: + raise Exception("No ready nodes") + +print "NODE_IP_ADDR=%s\n" % node['ip'] +print "NODE_UUID=%s\n" % node['uuid'] diff --git a/slave_scripts/devstack-vm-gate.sh b/slave_scripts/devstack-vm-gate.sh index 47c4f00c..02217b7f 100755 --- a/slave_scripts/devstack-vm-gate.sh +++ b/slave_scripts/devstack-vm-gate.sh @@ -62,31 +62,29 @@ do cd $WORKSPACE done -python $CI_SCRIPT_DIR/devstack-vm-launch.py || exit $? -. $HOSTNAME.node.sh -rm $HOSTNAME.node.sh +NODE_ENV=`python $CI_SCRIPT_DIR/devstack-vm-fetch.py` && eval "$NODE_ENV" || exit $? -scp -C $CI_SCRIPT_DIR/devstack-vm-gate-host.sh $ipAddr: +scp -C $CI_SCRIPT_DIR/devstack-vm-gate-host.sh $NODE_IP_ADDR: RETVAL=$? if [ $RETVAL != 0 ]; then echo "Deleting host" - python $CI_SCRIPT_DIR/devstack-vm-delete.py + python $CI_SCRIPT_DIR/devstack-vm-delete.py $NODE_UUID fi -scp -C -q -r $WORKSPACE/ $ipAddr:workspace +scp -C -q -r $WORKSPACE/ $NODE_IP_ADDR:workspace RETVAL=$? if [ $RETVAL != 0 ]; then echo "Deleting host" - python $CI_SCRIPT_DIR/devstack-vm-delete.py + python $CI_SCRIPT_DIR/devstack-vm-delete.py $NODE_UUID fi -ssh $ipAddr ./devstack-vm-gate-host.sh +ssh $NODE_IP_ADDR ./devstack-vm-gate-host.sh RETVAL=$? 
if [ $RETVAL = 0 ] && [ $ALWAYS_KEEP = 0 ]; then echo "Deleting host" - python $CI_SCRIPT_DIR/devstack-vm-delete.py + python $CI_SCRIPT_DIR/devstack-vm-delete.py $NODE_UUID else #echo "Giving host to developer" - #python $CI_SCRIPT_DIR/devstack-vm-give.py + #python $CI_SCRIPT_DIR/devstack-vm-give.py $NODE_UUID exit $RETVAL fi diff --git a/slave_scripts/devstack-vm-give.py b/slave_scripts/devstack-vm-give.py index 83511534..8e68cc1d 100755 --- a/slave_scripts/devstack-vm-give.py +++ b/slave_scripts/devstack-vm-give.py @@ -28,12 +28,9 @@ import tempfile import vmdatabase -CHANGE = os.environ['GERRIT_CHANGE_NUMBER'] -PATCH = os.environ['GERRIT_PATCHSET_NUMBER'] -BUILD = os.environ['BUILD_NUMBER'] - +node_uuid = sys.argv[1] db = vmdatabase.VMDatabase() -machine = db.getMachine(CHANGE, PATCH, BUILD) +machine = db.getMachine(node_uuid) stat, out = commands.getstatusoutput("ssh -p 29418 review.openstack.org gerrit query --format=JSON change:%s" % os.environ['GERRIT_CHANGE_NUMBER']) diff --git a/slave_scripts/devstack-vm-launch.py b/slave_scripts/devstack-vm-launch.py index 540567a5..8222eda9 100755 --- a/slave_scripts/devstack-vm-launch.py +++ b/slave_scripts/devstack-vm-launch.py @@ -1,6 +1,7 @@ #!/usr/bin/env python -# Launch a VM for use by devstack. +# Make sure there are always a certain number of VMs launched and +# ready for use by devstack. # Copyright (C) 2011 OpenStack LLC. # @@ -19,6 +20,7 @@ # limitations under the License. 
from libcloud.base import NodeImage, NodeSize, NodeLocation +from libcloud.compute.types import NodeState from libcloud.types import Provider from libcloud.providers import get_driver from libcloud.deployment import MultiStepDeployment, ScriptDeployment, SSHKeyDeployment @@ -32,17 +34,29 @@ import vmdatabase CLOUD_SERVERS_DRIVER = os.environ.get('CLOUD_SERVERS_DRIVER','rackspace') CLOUD_SERVERS_USERNAME = os.environ['CLOUD_SERVERS_USERNAME'] CLOUD_SERVERS_API_KEY = os.environ['CLOUD_SERVERS_API_KEY'] -CLOUD_SERVERS_HOST = os.environ.get('CLOUD_SERVERS_HOST', None) -CLOUD_SERVERS_PATH = os.environ.get('CLOUD_SERVERS_PATH', None) IMAGE_NAME = 'devstack-oneiric' MIN_RAM = 1024 - -CHANGE = os.environ['GERRIT_CHANGE_NUMBER'] -PATCH = os.environ['GERRIT_PATCHSET_NUMBER'] -BUILD = os.environ['BUILD_NUMBER'] +MIN_READY_MACHINES = 5 db = vmdatabase.VMDatabase() -node_name = 'devstack-%s-%s-%s.slave.openstack.org' % (CHANGE, PATCH, BUILD) + +ready_machines = [x for x in db.getMachines() + if x['state'] == vmdatabase.READY] +building_machines = [x for x in db.getMachines() + if x['state'] == vmdatabase.BUILDING] + +# Count machines that are ready and machines that are building, +# so that if the provider is very slow, we aren't queueing up tons +# of machines to be built. 
+num_to_launch = MIN_READY_MACHINES - (len(ready_machines) + + len(building_machines)) + +print "%s ready, %s building, need to launch %s" % (len(ready_machines), + len(building_machines), + num_to_launch) + +if num_to_launch <= 0: + sys.exit(0) if CLOUD_SERVERS_DRIVER == 'rackspace': Driver = get_driver(Provider.RACKSPACE) @@ -55,22 +69,56 @@ if CLOUD_SERVERS_DRIVER == 'rackspace': images = [img for img in conn.list_images() if img.name.startswith(IMAGE_NAME)] images.sort() + if not len(images): + raise Exception("No images found") image = images[-1] else: raise Exception ("Driver not supported") if CLOUD_SERVERS_DRIVER == 'rackspace': - node = conn.create_node(name=node_name, image=image, size=size) - # A private method, Tomaz Muraus says he's thinking of making it public - node = conn._wait_until_running(node=node, wait_period=3, - timeout=600) - -print "Node ID:", node.id -print "Node IP:", node.public_ip[0] - -db.addMachine(node.id, node_name, node.public_ip[0], CHANGE, PATCH, BUILD) - -with open("%s.node.sh" % node_name,"w") as node_file: - node_file.write("ipAddr=%s\n" % node.public_ip[0]) - node_file.write("nodeId=%s\n" % node.id) + last_name = '' + for i in range(num_to_launch): + while True: + node_name = 'devstack-%s.slave.openstack.org' % int(time.time()) + if node_name != last_name: last_name = node_name; break + time.sleep(1) + node = conn.create_node(name=node_name, image=image, size=size) + db.addMachine(CLOUD_SERVERS_DRIVER, node.id, IMAGE_NAME, + node_name, node.public_ip[0], node.uuid) + print "Started building node %s:" % node.id + print " name: %s [%s]" % (node_name, node.public_ip[0]) + print " uuid: %s" % (node.uuid) + print + # Wait for nodes + # TODO: The vmdatabase is (probably) ready, but this needs reworking to + # actually support multiple providers + start = time.time() + timeout = 600 + to_ignore = [] + while (time.time()-start) < timeout: + building_machines = [x for x in db.getMachines() + if x['state'] == vmdatabase.BUILDING] + if not building_machines: 
+ print "Finished" + break + provider_nodes = conn.list_nodes() + print "Waiting on %s machines" % len(building_machines) + for my_node in building_machines: + if my_node['uuid'] in to_ignore: continue + p_nodes = [x for x in provider_nodes if x.uuid == my_node['uuid']] + if len(p_nodes) != 1: + print "Incorrect number of nodes (%s) from provider matching UUID %s" % (len(p_nodes), my_node['uuid']) + to_ignore.append(my_node['uuid']) + else: + p_node = p_nodes[0] + if (p_node.public_ips and p_node.state == NodeState.RUNNING): + print "Node %s is ready" % my_node['id'] + db.setMachineState(my_node['uuid'], vmdatabase.READY) + if (p_node.public_ips and p_node.state in + [NodeState.UNKNOWN, + NodeState.REBOOTING, + NodeState.TERMINATED]): + print "Node %s is in error" % my_node['id'] + db.setMachineState(my_node['uuid'], vmdatabase.ERROR) + time.sleep(3) diff --git a/slave_scripts/devstack-vm-reap.py b/slave_scripts/devstack-vm-reap.py index 7824e149..cee76e2c 100755 --- a/slave_scripts/devstack-vm-reap.py +++ b/slave_scripts/devstack-vm-reap.py @@ -32,6 +32,7 @@ import vmdatabase CLOUD_SERVERS_DRIVER = os.environ.get('CLOUD_SERVERS_DRIVER','rackspace') CLOUD_SERVERS_USERNAME = os.environ['CLOUD_SERVERS_USERNAME'] CLOUD_SERVERS_API_KEY = os.environ['CLOUD_SERVERS_API_KEY'] +MACHINE_LIFETIME = 24*60*60 # Amount of time after being used db = vmdatabase.VMDatabase() @@ -53,18 +54,21 @@ def delete(machine): node = [n for n in conn.list_nodes() if n.id==str(machine['id'])] if not node: print ' Machine id %s not found' % machine['id'] - db.delMachine(machine['id']) + db.delMachine(machine['uuid']) return node = node[0] node.destroy() - db.delMachine(machine['id']) + db.delMachine(machine['uuid']) now = time.time() for machine in db.getMachines(): - if REAP_ALL or (now-machine['created'] > 24*60*60): + # Normally, reap machines that have sat in their current state + # for 24 hours, unless that state is READY. 
+ if REAP_ALL or (machine['state']!=vmdatabase.READY and + now-machine['state_time'] > MACHINE_LIFETIME): print 'Deleting', machine['name'] delete(machine) - + print print 'Known machines (end):' for machine in db.getMachines(): diff --git a/slave_scripts/vmdatabase.py b/slave_scripts/vmdatabase.py index a20aa1b4..1a4fa02b 100644 --- a/slave_scripts/vmdatabase.py +++ b/slave_scripts/vmdatabase.py @@ -2,56 +2,105 @@ import sqlite3 import os import time +# States: +# The cloud provider is building this machine. We have an ID, but it's +# not ready for use. +BUILDING=1 +# The machine is ready for use. +READY=2 +# This can mean in-use, or used but complete. We don't actually need to +# distinguish between those states -- we'll just delete a machine 24 hours +# after it transitions into the USED state. +USED=3 +# An error state, should just try to delete it. +ERROR=4 + +# Columns: +# state: one of the above values +# state_time: the time of transition into that state +# user: set if the machine is given to a user +# id: identifier from cloud provider +# name: machine name +# ip: machine ip +# uuid: uuid from libcloud +# provider: libcloud driver for this server +# image: name of image this server is based on + class VMDatabase(object): def __init__(self, path=os.path.expanduser("~/vm.db")): + # Set isolation_level = None, which means "autocommit" mode + # but more importantly lets you manage transactions manually + # without the isolation emulation getting in your way. + # Most of our writes can be autocommitted, and the one(s) + # that can't, we'll set up the transaction around the critical + # section. 
if not os.path.exists(path): - conn = sqlite3.connect(path) - c = conn.cursor() - c.execute('''create table machines -(id int, name text, ip text, change_number, patch_number, build_number, created int, user text)''') - conn.commit() - c.close() - self.conn = sqlite3.connect(path) + conn = sqlite3.connect(path, isolation_level=None) + conn.execute("""create table machines + (provider text, id int, image text, + name text, ip text, uuid text, + state_time int, state int, user text)""") + del conn + self.conn = sqlite3.connect(path, isolation_level = None) + # This turns the returned rows into objects that are like lists + # and dicts at the same time: + self.conn.row_factory = sqlite3.Row - def addMachine(self, mid, name, ip, change, patch, build): - c = self.conn.cursor() - c.execute("insert into machines (id, name, ip, change_number, patch_number, build_number, created) values (?, ?, ?, ?, ?, ?, ?)", - (mid, name, ip, change, patch, build, int(time.time()))) - self.conn.commit() - c.close() + def addMachine(self, provider, mid, image, name, ip, uuid): + self.conn.execute("""insert into machines + (provider, id, image, name, ip, + uuid, state_time, state) + values (?, ?, ?, ?, ?, ?, ?, ?)""", + (provider, mid, image, name, ip, uuid, + int(time.time()), BUILDING)) - def delMachine(self, mid): - c = self.conn.cursor() - c.execute("delete from machines where id=?", (mid,)) - self.conn.commit() - c.close() + def delMachine(self, uuid): + self.conn.execute("delete from machines where uuid=?", (uuid,)) - def setMachineUser(self, mid, user): - c = self.conn.cursor() - c.execute("update machines set user=? where id=?", (user, mid)) - self.conn.commit() - c.close() + def setMachineUser(self, uuid, user): + self.conn.execute("update machines set user=? where uuid=?", + (user, uuid)) + + def setMachineState(self, uuid, state): + self.conn.execute("""update machines set state=?, state_time=? 
+ where uuid=?""", + (state, int(time.time()), uuid)) def getMachines(self): - c = self.conn.cursor() - c.execute("select * from machines") - names = [col[0] for col in c.description] - data = [dict(zip(names, row)) for row in c] - c.close() - return data + return self.conn.execute("select * from machines order by state_time") - def getMachine(self, change, patch, build): - c = self.conn.cursor() - c.execute("select * from machines where change_number=? and patch_number=? and build_number=?", (change, patch, build)) - names = [col[0] for col in c.description] - data = [row for row in c] - c.close() - return dict(zip(names, data[0])) + def getMachine(self, uuid): + for x in self.conn.execute("select * from machines where uuid=?", + (uuid,)): + return x + + def getMachineForUse(self): + """Atomically claim a READY machine: mark it USED and return its row + (as fetched before the update), or None if no machine is READY.""" + self.conn.execute("begin exclusive transaction") + ret = None + for m in self.getMachines(): + if m['state']==READY: + self.setMachineState(m['uuid'], USED) + ret = m + break + self.conn.execute("commit") + return ret if __name__=='__main__': - db = VMDatabase() - db.addMachine(1, 'foo', '1.2.3.4', 88, 2, 1) - db.setMachineUser(1, 'jeblair') + db = VMDatabase("/tmp/vm.db") + db.addMachine('rackspace', 1, 'devstack', 'foo', '1.2.3.4', 'uuid1') + db.setMachineState('uuid1', READY) + db.addMachine('rackspace', 2, 'devstack', 'foo2', '1.2.3.4', 'uuid2') + db.setMachineState('uuid2', READY) + m = db.getMachineForUse() + print 'got machine' + print m + db.setMachineUser(m['uuid'], 'jeblair') print db.getMachines() - print db.getMachine(88,2,1) - db.delMachine(1) + print db.getMachine(m['uuid']) + print 'waiting to delete' + time.sleep(2) + db.delMachine('uuid1') + db.delMachine('uuid2') +