Collect request handling implementation in an OpenStack driver
This change moves OpenStack-related code into a driver. To avoid a circular import, this change also moves the StatsReporter to the stats module so that the handlers don't have to import the launcher. Change-Id: I319ce8780aa7e81b079c3f31d546b89eca6cf5f4 Story: 2001044 Task: 4614
parent 27b600ee2c
commit 4d201328f5
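The dependency direction after the refactor is sketched below. Module and class names follow the diff; the example class body is illustrative only and not part of the change.

# Illustrative sketch: handlers now mix in stats.StatsReporter directly, so
# nothing under nodepool.driver.* needs to import nodepool.launcher and no
# import cycle is created.
import threading

from nodepool import stats   # StatsReporter lives here after this change


class ExampleLauncher(threading.Thread, stats.StatsReporter):
    def __init__(self, node_id):
        threading.Thread.__init__(self, name="ExampleLauncher-%s" % node_id)
        stats.StatsReporter.__init__(self)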
@@ -266,7 +266,7 @@ class NodePoolCmd(NodepoolApp):
         if self.args.now:
             manager = provider_manager.get_provider(provider, True)
             manager.start()
-            launcher.InstanceDeleter.delete(self.zk, manager, node)
+            launcher.NodeDeleter.delete(self.zk, manager, node)
             manager.stop()
         else:
             node.state = zk.DELETING
@@ -0,0 +1,34 @@
# Copyright (C) 2011-2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.

from nodepool import fakeprovider
from nodepool.driver.openstack.provider import OpenStackProvider


class FakeProvider(OpenStackProvider):
    def __init__(self, provider, use_taskmanager):
        self.createServer_fails = 0
        self.__client = fakeprovider.FakeOpenStackCloud()
        super(FakeProvider, self).__init__(provider, use_taskmanager)

    def _getClient(self):
        return self.__client

    def createServer(self, *args, **kwargs):
        while self.createServer_fails:
            self.createServer_fails -= 1
            raise Exception("Expected createServer exception")
        return super(FakeProvider, self).createServer(*args, **kwargs)
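A hypothetical test-side use of the fake driver above; fake_provider_config is a stand-in name for whatever provider config object the test framework supplies.

# Hypothetical usage in a test: make the first two createServer() calls fail,
# then let the third one succeed against the fake cloud.
provider = FakeProvider(fake_provider_config, use_taskmanager=False)
provider.start()
provider.createServer_fails = 2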
@@ -0,0 +1,506 @@
# Copyright (C) 2011-2014 OpenStack Foundation
# Copyright 2017 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import collections
import logging
import pprint
import random
import threading
import time

from nodepool import exceptions
from nodepool import nodeutils as utils
from nodepool import stats
from nodepool import zk
from nodepool.driver import NodeLaunchManager
from nodepool.driver import NodeRequestHandler


class NodeLauncher(threading.Thread, stats.StatsReporter):
    log = logging.getLogger("nodepool.driver.openstack."
                            "NodeLauncher")

    def __init__(self, zk, provider_label, provider_manager, requestor,
                 node, retries):
        '''
        Initialize the launcher.

        :param ZooKeeper zk: A ZooKeeper object.
        :param ProviderLabel provider: A config ProviderLabel object.
        :param ProviderManager provider_manager: The manager object used to
            interact with the selected provider.
        :param str requestor: Identifier for the request originator.
        :param Node node: The node object.
        :param int retries: Number of times to retry failed launches.
        '''
        threading.Thread.__init__(self, name="NodeLauncher-%s" % node.id)
        stats.StatsReporter.__init__(self)
        self.log = logging.getLogger("nodepool.NodeLauncher-%s" % node.id)
        self._zk = zk
        self._label = provider_label
        self._manager = provider_manager
        self._node = node
        self._retries = retries
        self._image_name = None
        self._requestor = requestor

        self._pool = self._label.pool
        self._provider = self._pool.provider
        if self._label.diskimage:
            self._diskimage = self._provider.diskimages[
                self._label.diskimage.name]
        else:
            self._diskimage = None
            self._cloud_image = self._provider.cloud_images.get(
                self._label.cloud_image, None)

    def logConsole(self, server_id, hostname):
        if not self._label.console_log:
            return
        console = self._manager.getServerConsole(server_id)
        if console:
            self.log.debug('Console log from hostname %s:' % hostname)
            for line in console.splitlines():
                self.log.debug(line.rstrip())

    def _launchNode(self):
        if self._label.diskimage:
            # launch using diskimage
            cloud_image = self._zk.getMostRecentImageUpload(
                self._diskimage.name, self._provider.name)

            if not cloud_image:
                raise exceptions.LaunchNodepoolException(
                    "Unable to find current cloud image %s in %s" %
                    (self._diskimage.name, self._provider.name)
                )

            config_drive = self._diskimage.config_drive
            image_external = dict(id=cloud_image.external_id)
            image_id = "{path}/{upload_id}".format(
                path=self._zk._imageUploadPath(cloud_image.image_name,
                                               cloud_image.build_id,
                                               cloud_image.provider_name),
                upload_id=cloud_image.id)
            image_name = self._diskimage.name

        else:
            # launch using unmanaged cloud image
            config_drive = self._cloud_image.config_drive

            # These are different values for zk, but it's all the same
            # for cloud-images.
            # image_external is what we use for OpenStack.
            # image_id is what we record in the node for zk.
            # image_name is what we log, so matches the config.
            image_external = self._cloud_image.name
            if self._cloud_image.image_id:
                image_external = dict(id=self._cloud_image.image_id)
            elif self._cloud_image.image_name:
                image_external = self._cloud_image.image_name
            else:
                image_external = self._cloud_image.name
            image_id = self._cloud_image.name
            image_name = self._cloud_image.name

        hostname = self._provider.hostname_format.format(
            label=self._label, provider=self._provider, node=self._node
        )

        self.log.info("Creating server with hostname %s in %s from image %s "
                      "for node id: %s" % (hostname, self._provider.name,
                                           image_name,
                                           self._node.id))

        # NOTE: We store the node ID in the server metadata to use for leaked
        # instance detection. We cannot use the external server ID for this
        # because that isn't available in ZooKeeper until after the server is
        # active, which could cause a race in leak detection.

        server = self._manager.createServer(
            hostname,
            image=image_external,
            min_ram=self._label.min_ram,
            flavor_name=self._label.flavor_name,
            key_name=self._label.key_name,
            az=self._node.az,
            config_drive=config_drive,
            nodepool_node_id=self._node.id,
            nodepool_image_name=image_name,
            networks=self._pool.networks,
            boot_from_volume=self._label.boot_from_volume,
            volume_size=self._label.volume_size)

        self._node.external_id = server.id
        self._node.hostname = hostname
        self._node.image_id = image_id

        # Checkpoint save the updated node info
        self._zk.storeNode(self._node)

        self.log.debug("Waiting for server %s for node id: %s" %
                       (server.id, self._node.id))
        server = self._manager.waitForServer(
            server, self._provider.launch_timeout)

        if server.status != 'ACTIVE':
            raise exceptions.LaunchStatusException(
                "Server %s for node id: %s "
                "status: %s" %
                (server.id, self._node.id, server.status))

        # If we didn't specify an AZ, set it to the one chosen by Nova.
        # Do this after we are done waiting since AZ may not be available
        # immediately after the create request.
        if not self._node.az:
            self._node.az = server.location.zone

        interface_ip = server.interface_ip
        if not interface_ip:
            self.log.debug(
                "Server data for failed IP: %s" % pprint.pformat(
                    server))
            raise exceptions.LaunchNetworkException(
                "Unable to find public IP of server")

        self._node.interface_ip = interface_ip
        self._node.public_ipv4 = server.public_v4
        self._node.public_ipv6 = server.public_v6
        self._node.private_ipv4 = server.private_v4
        # devstack-gate multi-node depends on private_v4 being populated
        # with something. On clouds that don't have a private address, use
        # the public.
        if not self._node.private_ipv4:
            self._node.private_ipv4 = server.public_v4

        # Checkpoint save the updated node info
        self._zk.storeNode(self._node)

        self.log.debug(
            "Node %s is running [region: %s, az: %s, ip: %s ipv4: %s, "
            "ipv6: %s]" %
            (self._node.id, self._node.region, self._node.az,
             self._node.interface_ip, self._node.public_ipv4,
             self._node.public_ipv6))

        # Get the SSH public keys for the new node and record in ZooKeeper
        try:
            self.log.debug("Gathering host keys for node %s", self._node.id)
            host_keys = utils.keyscan(
                interface_ip, timeout=self._provider.boot_timeout)
            if not host_keys:
                raise exceptions.LaunchKeyscanException(
                    "Unable to gather host keys")
        except exceptions.SSHTimeoutException:
            self.logConsole(self._node.external_id, self._node.hostname)
            raise

        self._node.host_keys = host_keys
        self._zk.storeNode(self._node)

    def _run(self):
        attempts = 1
        while attempts <= self._retries:
            try:
                self._launchNode()
                break
            except Exception:
                if attempts <= self._retries:
                    self.log.exception(
                        "Launch attempt %d/%d failed for node %s:",
                        attempts, self._retries, self._node.id)
                # If we created an instance, delete it.
                if self._node.external_id:
                    self._manager.cleanupNode(self._node.external_id)
                    self._manager.waitForNodeCleanup(self._node.external_id)
                    self._node.external_id = None
                    self._node.public_ipv4 = None
                    self._node.public_ipv6 = None
                    self._node.interface_ip = None
                    self._zk.storeNode(self._node)
                if attempts == self._retries:
                    raise
                attempts += 1

        self._node.state = zk.READY
        self._zk.storeNode(self._node)
        self.log.info("Node id %s is ready", self._node.id)

    def run(self):
        start_time = time.time()
        statsd_key = 'ready'

        try:
            self._run()
        except Exception as e:
            self.log.exception("Launch failed for node %s:",
                               self._node.id)
            self._node.state = zk.FAILED
            self._zk.storeNode(self._node)

            if hasattr(e, 'statsd_key'):
                statsd_key = e.statsd_key
            else:
                statsd_key = 'error.unknown'

        try:
            dt = int((time.time() - start_time) * 1000)
            self.recordLaunchStats(statsd_key, dt, self._image_name,
                                   self._node.provider, self._node.az,
                                   self._requestor)
            self.updateNodeStats(self._zk, self._provider)
        except Exception:
            self.log.exception("Exception while reporting stats:")


class OpenStackNodeLaunchManager(NodeLaunchManager):
    def launch(self, node):
        '''
        Launch a new node as described by the supplied Node.

        We expect each NodeLauncher thread to directly modify the node that
        is passed to it. The poll() method will expect to see the node.state
        attribute to change as the node is processed.

        :param Node node: The node object.
        '''
        self._nodes.append(node)
        provider_label = self._pool.labels[node.type]
        t = NodeLauncher(self._zk, provider_label, self._manager,
                         self._requestor, node, self._retries)
        t.start()
        self._threads.append(t)


class OpenStackNodeRequestHandler(NodeRequestHandler):
    log = logging.getLogger("nodepool.driver.openstack."
                            "OpenStackNodeRequestHandler")

    def __init__(self, pw, request):
        super(OpenStackNodeRequestHandler, self).__init__(pw, request)
        self.chosen_az = None

    def _imagesAvailable(self):
        '''
        Determine if the requested images are available for this provider.

        ZooKeeper is queried for an image uploaded to the provider that is
        in the READY state.

        :returns: True if it is available, False otherwise.
        '''
        for label in self.request.node_types:
            if self.pool.labels[label].cloud_image:
                img = self.pool.labels[label].cloud_image
                if not self.manager.labelReady(img):
                    return False
            else:
                img = self.pool.labels[label].diskimage.name
                if not self.zk.getMostRecentImageUpload(
                        img, self.provider.name):
                    return False
        return True

    def _invalidNodeTypes(self):
        '''
        Return any node types that are invalid for this provider.

        :returns: A list of node type names that are invalid, or an empty
            list if all are valid.
        '''
        invalid = []
        for ntype in self.request.node_types:
            if ntype not in self.pool.labels:
                invalid.append(ntype)
        return invalid

    def _countNodes(self):
        '''
        Query ZooKeeper to determine the number of provider nodes launched.

        :returns: An integer for the number launched for this provider.
        '''
        count = 0
        for node in self.zk.nodeIterator():
            if (node.provider == self.provider.name and
                node.pool == self.pool.name):
                count += 1
        return count

    def _waitForNodeSet(self):
        '''
        Fill node set for the request.

        Obtain nodes for the request, pausing all new request handling for
        this provider until the node set can be filled.

        We attempt to group the node set within the same provider availability
        zone. For this to work properly, the provider entry in the nodepool
        config must list the availability zones. Otherwise, new nodes will be
        put in random AZs at nova's whim. The exception being if there is an
        existing node in the READY state that we can select for this node set.
        Its AZ will then be used for new nodes, as well as any other READY
        nodes.

        note:: This code is a bit racey in its calculation of the number of
            nodes in use for quota purposes. It is possible for multiple
            launchers to be doing this calculation at the same time. Since we
            currently have no locking mechanism around the "in use"
            calculation, if we are at the edge of the quota, one of the
            launchers could attempt to launch a new node after the other
            launcher has already started doing so. This would cause an
            expected failure from the underlying library, which is ok for now.
        '''
        if not self.launch_manager:
            self.launch_manager = OpenStackNodeLaunchManager(
                self.zk, self.pool, self.manager,
                self.request.requestor, retries=self.provider.launch_retries)

        # Since this code can be called more than once for the same request,
        # we need to calculate the difference between our current node set
        # and what was requested. We cannot use set operations here since a
        # node type can appear more than once in the requested types.
        saved_types = collections.Counter([n.type for n in self.nodeset])
        requested_types = collections.Counter(self.request.node_types)
        diff = requested_types - saved_types
        needed_types = list(diff.elements())

        ready_nodes = self.zk.getReadyNodesOfTypes(needed_types)

        for ntype in needed_types:
            # First try to grab from the list of already available nodes.
            got_a_node = False
            if self.request.reuse and ntype in ready_nodes:
                for node in ready_nodes[ntype]:
                    # Only interested in nodes from this provider and
                    # pool, and within the selected AZ.
                    if node.provider != self.provider.name:
                        continue
                    if node.pool != self.pool.name:
                        continue
                    if self.chosen_az and node.az != self.chosen_az:
                        continue

                    try:
                        self.zk.lockNode(node, blocking=False)
                    except exceptions.ZKLockException:
                        # It's already locked so skip it.
                        continue
                    else:
                        if self.paused:
                            self.log.debug("Unpaused request %s", self.request)
                            self.paused = False

                        self.log.debug(
                            "Locked existing node %s for request %s",
                            node.id, self.request.id)
                        got_a_node = True
                        node.allocated_to = self.request.id
                        self.zk.storeNode(node)
                        self.nodeset.append(node)

                        # If we haven't already chosen an AZ, select the
                        # AZ from this ready node. This will cause new nodes
                        # to share this AZ, as well.
                        if not self.chosen_az and node.az:
                            self.chosen_az = node.az
                        break

            # Could not grab an existing node, so launch a new one.
            if not got_a_node:
                # Select grouping AZ if we didn't set AZ from a selected,
                # pre-existing node
                if not self.chosen_az:
                    self.chosen_az = random.choice(
                        self.pool.azs or self.manager.getAZs())

                # If we calculate that we're at capacity, pause until nodes
                # are released by Zuul and removed by the DeletedNodeWorker.
                if self._countNodes() >= self.pool.max_servers:
                    if not self.paused:
                        self.log.debug(
                            "Pausing request handling to satisfy request %s",
                            self.request)
                    self.paused = True
                    return

                if self.paused:
                    self.log.debug("Unpaused request %s", self.request)
                    self.paused = False

                node = zk.Node()
                node.state = zk.INIT
                node.type = ntype
                node.provider = self.provider.name
                node.pool = self.pool.name
                node.az = self.chosen_az
                node.region = self.provider.region_name
                node.launcher = self.launcher_id
                node.allocated_to = self.request.id

                # Note: It should be safe (i.e., no race) to lock the node
                # *after* it is stored since nodes in INIT state are not
                # locked anywhere.
                self.zk.storeNode(node)
                self.zk.lockNode(node, blocking=False)
                self.log.debug("Locked building node %s for request %s",
                               node.id, self.request.id)

                # Set state AFTER lock so that it isn't accidentally cleaned
                # up (unlocked BUILDING nodes will be deleted).
                node.state = zk.BUILDING
                self.zk.storeNode(node)

                self.nodeset.append(node)
                self.launch_manager.launch(node)

    def run_handler(self):
        '''
        Main body for the OpenStackNodeRequestHandler.
        '''
        self._setFromPoolWorker()
        declined_reasons = []
        invalid_types = self._invalidNodeTypes()
        if invalid_types:
            declined_reasons.append('node type(s) [%s] not available' %
                                    ','.join(invalid_types))
        elif not self._imagesAvailable():
            declined_reasons.append('images are not available')
        if len(self.request.node_types) > self.pool.max_servers:
            declined_reasons.append('it would exceed quota')

        if declined_reasons:
            self.log.debug("Declining node request %s because %s",
                           self.request.id, ', '.join(declined_reasons))
            self.request.declined_by.append(self.launcher_id)
            launchers = set(self.zk.getRegisteredLaunchers())
            if launchers.issubset(set(self.request.declined_by)):
                self.log.debug("Failing declined node request %s",
                               self.request.id)
                # All launchers have declined it
                self.request.state = zk.FAILED
                self.unlockNodeSet(clear_allocation=True)
                self.zk.storeNodeRequest(self.request)
                self.zk.unlockNodeRequest(self.request)
            self.done = True
            return

        if self.paused:
            self.log.debug("Retrying node request %s", self.request.id)
        else:
            self.log.debug("Accepting node request %s", self.request.id)
            self.request.state = zk.PENDING
            self.zk.storeNodeRequest(self.request)

        self._waitForNodeSet()
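The Counter arithmetic in _waitForNodeSet() above is worth a worked example; the label names here are illustrative.

# If the node set already holds one 'ubuntu-xenial' and the request asks for
# two 'ubuntu-xenial' plus one 'centos-7', only the difference is launched.
import collections

saved_types = collections.Counter(['ubuntu-xenial'])
requested_types = collections.Counter(
    ['ubuntu-xenial', 'ubuntu-xenial', 'centos-7'])
needed_types = list((requested_types - saved_types).elements())
# needed_types == ['ubuntu-xenial', 'centos-7']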
@@ -0,0 +1,344 @@
# Copyright (C) 2011-2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from contextlib import contextmanager
import operator

import shade

from nodepool import exceptions
from nodepool.driver import Provider
from nodepool.nodeutils import iterate_timeout
from nodepool.task_manager import ManagerStoppedException
from nodepool.task_manager import TaskManager


IPS_LIST_AGE = 5  # How long to keep a cached copy of the ip list


@contextmanager
def shade_inner_exceptions():
    try:
        yield
    except shade.OpenStackCloudException as e:
        e.log_error()
        raise


class OpenStackProvider(Provider):
    log = logging.getLogger("nodepool.driver.openstack.OpenStackProvider")

    def __init__(self, provider, use_taskmanager):
        self.provider = provider
        self._images = {}
        self._networks = {}
        self.__flavors = {}
        self.__azs = None
        self._use_taskmanager = use_taskmanager
        self._taskmanager = None

    def start(self):
        if self._use_taskmanager:
            self._taskmanager = TaskManager(None, self.provider.name,
                                            self.provider.rate)
            self._taskmanager.start()
        self.resetClient()

    def stop(self):
        if self._taskmanager:
            self._taskmanager.stop()

    def join(self):
        if self._taskmanager:
            self._taskmanager.join()

    @property
    def _flavors(self):
        if not self.__flavors:
            self.__flavors = self._getFlavors()
        return self.__flavors

    def _getClient(self):
        if self._use_taskmanager:
            manager = self._taskmanager
        else:
            manager = None
        return shade.OpenStackCloud(
            cloud_config=self.provider.cloud_config,
            manager=manager,
            **self.provider.cloud_config.config)

    def resetClient(self):
        self._client = self._getClient()
        if self._use_taskmanager:
            self._taskmanager.setClient(self._client)

    def _getFlavors(self):
        flavors = self.listFlavors()
        flavors.sort(key=operator.itemgetter('ram'))
        return flavors

    # TODO(mordred): These next three methods duplicate logic that is in
    #                shade, but we can't defer to shade until we're happy
    #                with using shade's resource caching facility. We have
    #                not yet proven that to our satisfaction, but if/when
    #                we do, these should be able to go away.
    def _findFlavorByName(self, flavor_name):
        for f in self._flavors:
            if flavor_name in (f['name'], f['id']):
                return f
        raise Exception("Unable to find flavor: %s" % flavor_name)

    def _findFlavorByRam(self, min_ram, flavor_name):
        for f in self._flavors:
            if (f['ram'] >= min_ram
                    and (not flavor_name or flavor_name in f['name'])):
                return f
        raise Exception("Unable to find flavor with min ram: %s" % min_ram)

    def findFlavor(self, flavor_name, min_ram):
        # Note: this will throw an error if the provider is offline
        # but all the callers are in threads (they call in via CreateServer) so
        # the mainloop won't be affected.
        if min_ram:
            return self._findFlavorByRam(min_ram, flavor_name)
        else:
            return self._findFlavorByName(flavor_name)

    def findImage(self, name):
        if name in self._images:
            return self._images[name]

        with shade_inner_exceptions():
            image = self._client.get_image(name)
        self._images[name] = image
        return image

    def findNetwork(self, name):
        if name in self._networks:
            return self._networks[name]

        with shade_inner_exceptions():
            network = self._client.get_network(name)
        self._networks[name] = network
        return network

    def deleteImage(self, name):
        if name in self._images:
            del self._images[name]

        with shade_inner_exceptions():
            return self._client.delete_image(name)

    def createServer(self, name, image,
                     flavor_name=None, min_ram=None,
                     az=None, key_name=None, config_drive=True,
                     nodepool_node_id=None, nodepool_node_label=None,
                     nodepool_image_name=None,
                     networks=None, boot_from_volume=False, volume_size=50):
        if not networks:
            networks = []
        if not isinstance(image, dict):
            # if it's a dict, we already have the cloud id. If it's not,
            # we don't know if it's name or ID so need to look it up
            image = self.findImage(image)
        flavor = self.findFlavor(flavor_name=flavor_name, min_ram=min_ram)
        create_args = dict(name=name,
                           image=image,
                           flavor=flavor,
                           config_drive=config_drive)
        if boot_from_volume:
            create_args['boot_from_volume'] = boot_from_volume
            create_args['volume_size'] = volume_size
            # NOTE(pabelanger): Always cleanup volumes when we delete a server.
            create_args['terminate_volume'] = True
        if key_name:
            create_args['key_name'] = key_name
        if az:
            create_args['availability_zone'] = az
        nics = []
        for network in networks:
            net_id = self.findNetwork(network)['id']
            nics.append({'net-id': net_id})
        if nics:
            create_args['nics'] = nics
        # Put provider.name and image_name in as groups so that ansible
        # inventory can auto-create groups for us based on each of those
        # qualities
        # Also list each of those values directly so that non-ansible
        # consumption programs don't need to play a game of knowing that
        # groups[0] is the image name or anything silly like that.
        groups_list = [self.provider.name]

        if nodepool_image_name:
            groups_list.append(nodepool_image_name)
        if nodepool_node_label:
            groups_list.append(nodepool_node_label)
        meta = dict(
            groups=",".join(groups_list),
            nodepool_provider_name=self.provider.name,
        )
        if nodepool_node_id:
            meta['nodepool_node_id'] = nodepool_node_id
        if nodepool_image_name:
            meta['nodepool_image_name'] = nodepool_image_name
        if nodepool_node_label:
            meta['nodepool_node_label'] = nodepool_node_label
        create_args['meta'] = meta

        with shade_inner_exceptions():
            return self._client.create_server(wait=False, **create_args)

    def getServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.get_server(server_id)

    def getServerConsole(self, server_id):
        try:
            with shade_inner_exceptions():
                return self._client.get_server_console(server_id)
        except shade.OpenStackCloudException:
            return None

    def waitForServer(self, server, timeout=3600):
        with shade_inner_exceptions():
            return self._client.wait_for_server(
                server=server, auto_ip=True, reuse=False,
                timeout=timeout)

    def waitForNodeCleanup(self, server_id, timeout=600):
        for count in iterate_timeout(
                timeout, exceptions.ServerDeleteException,
                "server %s deletion" % server_id):
            if not self.getServer(server_id):
                return

    def waitForImage(self, image_id, timeout=3600):
        last_status = None
        for count in iterate_timeout(
                timeout, exceptions.ImageCreateException, "image creation"):
            try:
                image = self.getImage(image_id)
            except exceptions.NotFound:
                continue
            except ManagerStoppedException:
                raise
            except Exception:
                self.log.exception('Unable to list images while waiting for '
                                   '%s will retry' % (image_id))
                continue

            # shade returns None when not found
            if not image:
                continue

            status = image['status']
            if (last_status != status):
                self.log.debug(
                    'Status of image in {provider} {id}: {status}'.format(
                        provider=self.provider.name,
                        id=image_id,
                        status=status))
                if status == 'ERROR' and 'fault' in image:
                    self.log.debug(
                        'ERROR in {provider} on {id}: {reason}'.format(
                            provider=self.provider.name,
                            id=image_id,
                            reason=image['fault']['message']))
            last_status = status
            # Glance client returns lower case statuses - but let's be sure
            if status.lower() in ['active', 'error']:
                return image

    def createImage(self, server, image_name, meta):
        with shade_inner_exceptions():
            return self._client.create_image_snapshot(
                image_name, server, **meta)

    def getImage(self, image_id):
        with shade_inner_exceptions():
            return self._client.get_image(image_id)

    def labelReady(self, image_id):
        return self.getImage(image_id)

    def uploadImage(self, image_name, filename, image_type=None, meta=None,
                    md5=None, sha256=None):
        # configure glance and upload image. Note the meta flags
        # are provided as custom glance properties
        # NOTE: we have wait=True set here. This is not how we normally
        # do things in nodepool, preferring to poll ourselves thankyouverymuch.
        # However - two things to note:
        #  - PUT has no async mechanism, so we have to handle it anyway
        #  - v2 w/task waiting is very strange and complex - but we have to
        #    block for our v1 clouds anyway, so we might as well
        #    have the interface be the same and treat faking-out
        #    a shade-level fake-async interface later
        if not meta:
            meta = {}
        if image_type:
            meta['disk_format'] = image_type
        with shade_inner_exceptions():
            image = self._client.create_image(
                name=image_name,
                filename=filename,
                is_public=False,
                wait=True,
                md5=md5,
                sha256=sha256,
                **meta)
        return image.id

    def listImages(self):
        with shade_inner_exceptions():
            return self._client.list_images()

    def listFlavors(self):
        with shade_inner_exceptions():
            return self._client.list_flavors(get_extra=False)

    def listNodes(self):
        # shade list_servers carries the nodepool server list caching logic
        with shade_inner_exceptions():
            return self._client.list_servers()

    def deleteServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.delete_server(server_id, delete_ips=True)

    def cleanupNode(self, server_id):
        server = self.getServer(server_id)
        if not server:
            raise exceptions.NotFound()

        self.log.debug('Deleting server %s' % server_id)
        self.deleteServer(server_id)

    def cleanupLeakedFloaters(self):
        with shade_inner_exceptions():
            self._client.delete_unattached_floating_ips()

    def getAZs(self):
        if self.__azs is None:
            self.__azs = self._client.list_availability_zone_names()
            if not self.__azs:
                # If there are no zones, return a list containing None so that
                # random.choice can pick None and pass that to Nova. If this
                # feels dirty, please direct your ire to policy.json and the
                # ability to turn off random portions of the OpenStack API.
                self.__azs = [None]
        return self.__azs
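A minimal sketch of the driver lifecycle implied by the Provider interface above; provider_config is a placeholder for a parsed nodepool provider config, and the server arguments are illustrative.

# Minimal sketch, assuming provider_config has cloud_config, name and rate
# attributes like a loaded nodepool provider section.
manager = OpenStackProvider(provider_config, use_taskmanager=False)
manager.start()                       # builds the shade client
server = manager.createServer('example-node', image='ubuntu-xenial',
                              min_ram=8000, config_drive=True)
manager.waitForServer(server, timeout=1800)
manager.cleanupNode(server.id)
manager.waitForNodeCleanup(server.id)
manager.stop()
manager.join()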
@@ -13,6 +13,26 @@
 # under the License.
 
 
+class NotFound(Exception):
+    pass
+
+
+class LaunchNodepoolException(Exception):
+    statsd_key = 'error.nodepool'
+
+
+class LaunchStatusException(Exception):
+    statsd_key = 'error.status'
+
+
+class LaunchNetworkException(Exception):
+    statsd_key = 'error.network'
+
+
+class LaunchKeyscanException(Exception):
+    statsd_key = 'error.keyscan'
+
+
 class BuilderError(RuntimeError):
     pass
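These exception classes carry a statsd_key attribute that NodeLauncher.run() (in the handler above) reads when reporting launch failures; a minimal illustration of that lookup:

# getattr() mirrors the hasattr()/else fallback used in NodeLauncher.run().
try:
    raise LaunchNetworkException("Unable to find public IP of server")
except Exception as e:
    statsd_key = getattr(e, 'statsd_key', 'error.unknown')
# statsd_key == 'error.network'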
@@ -16,24 +16,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import collections
 import logging
 import os
 import os.path
-import pprint
-import random
 import socket
 import threading
 import time
 
 from nodepool import exceptions
-from nodepool import nodeutils as utils
 from nodepool import provider_manager
 from nodepool import stats
 from nodepool import config as nodepool_config
 from nodepool import zk
-from nodepool.driver import NodeRequestHandler
-from nodepool.driver import NodeLaunchManager
+from nodepool.driver.openstack.handler import OpenStackNodeRequestHandler
 
 MINS = 60
 HOURS = 60 * MINS
@@ -44,124 +40,13 @@ SUSPEND_WAIT_TIME = 30  # How long to wait between checks for ZooKeeper
 # connectivity if it disappears.
 
 
-class LaunchNodepoolException(Exception):
-    statsd_key = 'error.nodepool'
-
-
-class LaunchStatusException(Exception):
-    statsd_key = 'error.status'
-
-
-class LaunchNetworkException(Exception):
-    statsd_key = 'error.network'
-
-
-class LaunchKeyscanException(Exception):
-    statsd_key = 'error.keyscan'
-
-
-class StatsReporter(object):
-    '''
-    Class adding statsd reporting functionality.
-    '''
-    def __init__(self):
-        super(StatsReporter, self).__init__()
-        self._statsd = stats.get_client()
-
-    def recordLaunchStats(self, subkey, dt, image_name,
-                          provider_name, node_az, requestor):
-        '''
-        Record node launch statistics.
-
-        :param str subkey: statsd key
-        :param int dt: Time delta in milliseconds
-        :param str image_name: Name of the image used
-        :param str provider_name: Name of the provider
-        :param str node_az: AZ of the launched node
-        :param str requestor: Identifier for the request originator
-        '''
-        if not self._statsd:
-            return
-
-        keys = [
-            'nodepool.launch.provider.%s.%s' % (provider_name, subkey),
-            'nodepool.launch.image.%s.%s' % (image_name, subkey),
-            'nodepool.launch.%s' % (subkey,),
-        ]
-
-        if node_az:
-            keys.append('nodepool.launch.provider.%s.%s.%s' %
-                        (provider_name, node_az, subkey))
-
-        if requestor:
-            # Replace '.' which is a graphite hierarchy, and ':' which is
-            # a statsd delimeter.
-            requestor = requestor.replace('.', '_')
-            requestor = requestor.replace(':', '_')
-            keys.append('nodepool.launch.requestor.%s.%s' %
-                        (requestor, subkey))
-
-        for key in keys:
-            self._statsd.timing(key, dt)
-            self._statsd.incr(key)
-
-    def updateNodeStats(self, zk_conn, provider):
-        '''
-        Refresh statistics for all known nodes.
-
-        :param ZooKeeper zk_conn: A ZooKeeper connection object.
-        :param Provider provider: A config Provider object.
-        '''
-        if not self._statsd:
-            return
-
-        states = {}
-
-        # Initialize things we know about to zero
-        for state in zk.Node.VALID_STATES:
-            key = 'nodepool.nodes.%s' % state
-            states[key] = 0
-            key = 'nodepool.provider.%s.nodes.%s' % (provider.name, state)
-            states[key] = 0
-
-        for node in zk_conn.nodeIterator():
-            #nodepool.nodes.STATE
-            key = 'nodepool.nodes.%s' % node.state
-            states[key] += 1
-
-            #nodepool.label.LABEL.nodes.STATE
-            key = 'nodepool.label.%s.nodes.%s' % (node.type, node.state)
-            # It's possible we could see node types that aren't in our config
-            if key in states:
-                states[key] += 1
-            else:
-                states[key] = 1
-
-            #nodepool.provider.PROVIDER.nodes.STATE
-            key = 'nodepool.provider.%s.nodes.%s' % (node.provider, node.state)
-            # It's possible we could see providers that aren't in our config
-            if key in states:
-                states[key] += 1
-            else:
-                states[key] = 1
-
-        for key, count in states.items():
-            self._statsd.gauge(key, count)
-
-        #nodepool.provider.PROVIDER.max_servers
-        key = 'nodepool.provider.%s.max_servers' % provider.name
-        max_servers = sum([p.max_servers for p in provider.pools.values()])
-        self._statsd.gauge(key, max_servers)
-
-
-class InstanceDeleter(threading.Thread, StatsReporter):
-    log = logging.getLogger("nodepool.InstanceDeleter")
+class NodeDeleter(threading.Thread, stats.StatsReporter):
+    log = logging.getLogger("nodepool.NodeDeleter")
 
     def __init__(self, zk, manager, node):
-        threading.Thread.__init__(self, name='InstanceDeleter for %s %s' %
+        threading.Thread.__init__(self, name='NodeDeleter for %s %s' %
                                   (node.provider, node.external_id))
-        StatsReporter.__init__(self)
+        stats.StatsReporter.__init__(self)
         self._zk = zk
         self._manager = manager
         self._node = node
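For reference, the key names recordLaunchStats() builds for a single launch, using illustrative values:

# Illustrative values; the construction mirrors the removed StatsReporter
# code (which now lives in the stats module per the commit message).
subkey, provider_name, image_name = 'ready', 'rax', 'ubuntu-xenial'
node_az, requestor = 'iad-1', 'zuul:executor'

keys = [
    'nodepool.launch.provider.%s.%s' % (provider_name, subkey),
    'nodepool.launch.image.%s.%s' % (image_name, subkey),
    'nodepool.launch.%s' % (subkey,),
    'nodepool.launch.provider.%s.%s.%s' % (provider_name, node_az, subkey),
    'nodepool.launch.requestor.%s.%s' % (
        requestor.replace('.', '_').replace(':', '_'), subkey),
]
# Each key gets both a timing sample and a counter increment.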
@@ -188,11 +73,11 @@ class InstanceDeleter(threading.Thread, StatsReporter):
             if node.external_id:
                 manager.cleanupNode(node.external_id)
                 manager.waitForNodeCleanup(node.external_id)
-        except provider_manager.NotFound:
-            InstanceDeleter.log.info("Instance %s not found in provider %s",
+        except exceptions.NotFound:
+            NodeDeleter.log.info("Instance %s not found in provider %s",
                                  node.external_id, node.provider)
         except Exception:
-            InstanceDeleter.log.exception(
+            NodeDeleter.log.exception(
                 "Exception deleting instance %s from %s:",
                 node.external_id, node.provider)
             # Don't delete the ZK node in this case, but do unlock it
@@ -201,7 +86,7 @@ class InstanceDeleter(threading.Thread, StatsReporter):
             return
 
         if node_exists:
-            InstanceDeleter.log.info(
+            NodeDeleter.log.info(
                 "Deleting ZK node id=%s, state=%s, external_id=%s",
                 node.id, node.state, node.external_id)
             # This also effectively releases the lock
@@ -223,480 +108,6 @@ class InstanceDeleter(threading.Thread, StatsReporter):
             self.log.exception("Exception while reporting stats:")
 
 
-class NodeLauncher(threading.Thread, StatsReporter):
-    def __init__(self, zk, provider_label, provider_manager, requestor,
-                 node, retries):
-        '''
-        Initialize the launcher.
-
-        :param ZooKeeper zk: A ZooKeeper object.
-        :param ProviderLabel provider: A config ProviderLabel object.
-        :param ProviderManager provider_manager: The manager object used to
-            interact with the selected provider.
-        :param str requestor: Identifier for the request originator.
-        :param Node node: The node object.
-        :param int retries: Number of times to retry failed launches.
-        '''
-        threading.Thread.__init__(self, name="NodeLauncher-%s" % node.id)
-        StatsReporter.__init__(self)
-        self.log = logging.getLogger("nodepool.NodeLauncher-%s" % node.id)
-        self._zk = zk
-        self._label = provider_label
-        self._manager = provider_manager
-        self._node = node
-        self._retries = retries
-        self._image_name = None
-        self._requestor = requestor
-
-        self._pool = self._label.pool
-        self._provider = self._pool.provider
-        if self._label.diskimage:
-            self._diskimage = self._provider.diskimages[
-                self._label.diskimage.name]
-        else:
-            self._diskimage = None
-            self._cloud_image = self._provider.cloud_images.get(
-                self._label.cloud_image, None)
-
-    def logConsole(self, server_id, hostname):
-        if not self._label.console_log:
-            return
-        console = self._manager.getServerConsole(server_id)
-        if console:
-            self.log.debug('Console log from hostname %s:' % hostname)
-            for line in console.splitlines():
-                self.log.debug(line.rstrip())
-
-    def _launchNode(self):
-        if self._label.diskimage:
-            # launch using diskimage
-            cloud_image = self._zk.getMostRecentImageUpload(
-                self._diskimage.name, self._provider.name)
-
-            if not cloud_image:
-                raise LaunchNodepoolException(
-                    "Unable to find current cloud image %s in %s" %
-                    (self._diskimage.name, self._provider.name)
-                )
-
-            config_drive = self._diskimage.config_drive
-            image_external = dict(id=cloud_image.external_id)
-            image_id = "{path}/{upload_id}".format(
-                path=self._zk._imageUploadPath(cloud_image.image_name,
-                                               cloud_image.build_id,
-                                               cloud_image.provider_name),
-                upload_id=cloud_image.id)
-            image_name = self._diskimage.name
-
-        else:
-            # launch using unmanaged cloud image
-            config_drive = self._cloud_image.config_drive
-
-            # These are different values for zk, but it's all the same
-            # for cloud-images.
-            # image_external is what we use for OpenStack.
-            # image_id is what we record in the node for zk.
-            # image_name is what we log, so matches the config.
-            if self._cloud_image.image_id:
-                image_external = dict(id=self._cloud_image.image_id)
-            elif self._cloud_image.image_name:
-                image_external = self._cloud_image.image_name
-            else:
-                image_external = self._cloud_image.name
-            image_id = self._cloud_image.name
-            image_name = self._cloud_image.name
-
-        hostname = self._provider.hostname_format.format(
-            label=self._label, provider=self._provider, node=self._node
-        )
-
-        self.log.info("Creating server with hostname %s in %s from image %s "
-                      "for node id: %s" % (hostname, self._provider.name,
-                                           image_name,
-                                           self._node.id))
-
-        # NOTE: We store the node ID in the server metadata to use for leaked
-        # instance detection. We cannot use the external server ID for this
-        # because that isn't available in ZooKeeper until after the server is
-        # active, which could cause a race in leak detection.
-
-        server = self._manager.createServer(
-            hostname,
-            image=image_external,
-            min_ram=self._label.min_ram,
-            flavor_name=self._label.flavor_name,
-            key_name=self._label.key_name,
-            az=self._node.az,
-            config_drive=config_drive,
-            nodepool_node_id=self._node.id,
-            nodepool_node_label=self._node.type,
-            nodepool_image_name=image_name,
-            networks=self._pool.networks,
-            boot_from_volume=self._label.boot_from_volume,
-            volume_size=self._label.volume_size)
-
-        self._node.external_id = server.id
-        self._node.hostname = hostname
-        self._node.image_id = image_id
-
-        # Checkpoint save the updated node info
-        self._zk.storeNode(self._node)
-
-        self.log.debug("Waiting for server %s for node id: %s" %
-                       (server.id, self._node.id))
-        server = self._manager.waitForServer(
-            server, self._provider.launch_timeout)
-
-        if server.status != 'ACTIVE':
-            raise LaunchStatusException("Server %s for node id: %s "
-                                        "status: %s" %
-                                        (server.id, self._node.id,
-                                         server.status))
-
-        # If we didn't specify an AZ, set it to the one chosen by Nova.
-        # Do this after we are done waiting since AZ may not be available
-        # immediately after the create request.
-        if not self._node.az:
-            self._node.az = server.location.zone
-
-        interface_ip = server.interface_ip
-        if not interface_ip:
-            self.log.debug(
-                "Server data for failed IP: %s" % pprint.pformat(
-                    server))
-            raise LaunchNetworkException("Unable to find public IP of server")
-
-        self._node.interface_ip = interface_ip
-        self._node.public_ipv4 = server.public_v4
-        self._node.public_ipv6 = server.public_v6
-        self._node.private_ipv4 = server.private_v4
-        # devstack-gate multi-node depends on private_v4 being populated
-        # with something. On clouds that don't have a private address, use
-        # the public.
-        if not self._node.private_ipv4:
-            self._node.private_ipv4 = server.public_v4
-
-        # Checkpoint save the updated node info
-        self._zk.storeNode(self._node)
-
-        self.log.debug(
-            "Node %s is running [region: %s, az: %s, ip: %s ipv4: %s, "
-            "ipv6: %s]" %
-            (self._node.id, self._node.region, self._node.az,
-             self._node.interface_ip, self._node.public_ipv4,
-             self._node.public_ipv6))
-
-        # Get the SSH public keys for the new node and record in ZooKeeper
-        try:
-            self.log.debug("Gathering host keys for node %s", self._node.id)
-            host_keys = utils.keyscan(
-                interface_ip, timeout=self._provider.boot_timeout)
-            if not host_keys:
-                raise LaunchKeyscanException("Unable to gather host keys")
-        except exceptions.SSHTimeoutException:
-            self.logConsole(self._node.external_id, self._node.hostname)
-            raise
-
-        self._node.host_keys = host_keys
-        self._zk.storeNode(self._node)
-
-    def _run(self):
-        attempts = 1
-        while attempts <= self._retries:
-            try:
-                self._launchNode()
-                break
-            except Exception:
-                if attempts <= self._retries:
-                    self.log.exception(
-                        "Launch attempt %d/%d failed for node %s:",
-                        attempts, self._retries, self._node.id)
-                # If we created an instance, delete it.
-                if self._node.external_id:
-                    self._manager.cleanupNode(self._node.external_id)
-                    self._manager.waitForNodeCleanup(self._node.external_id)
-                    self._node.external_id = None
-                    self._node.public_ipv4 = None
-                    self._node.public_ipv6 = None
-                    self._node.inerface_ip = None
-                    self._zk.storeNode(self._node)
-                if attempts == self._retries:
-                    raise
-                attempts += 1
-
-        self._node.state = zk.READY
-        self._zk.storeNode(self._node)
-        self.log.info("Node id %s is ready", self._node.id)
-
-    def run(self):
-        start_time = time.time()
-        statsd_key = 'ready'
-
-        try:
-            self._run()
-        except Exception as e:
-            self.log.exception("Launch failed for node %s:",
-                               self._node.id)
-            self._node.state = zk.FAILED
-            self._zk.storeNode(self._node)
-
-            if hasattr(e, 'statsd_key'):
-                statsd_key = e.statsd_key
-            else:
-                statsd_key = 'error.unknown'
-
-        try:
-            dt = int((time.time() - start_time) * 1000)
-            self.recordLaunchStats(statsd_key, dt, self._image_name,
-                                   self._node.provider, self._node.az,
-                                   self._requestor)
-            self.updateNodeStats(self._zk, self._provider)
-        except Exception:
-            self.log.exception("Exception while reporting stats:")
-
-
-class OpenStackNodeLaunchManager(NodeLaunchManager):
-    def launch(self, node):
-        '''
-        Launch a new node as described by the supplied Node.
-
-        We expect each NodeLauncher thread to directly modify the node that
-        is passed to it. The poll() method will expect to see the node.state
-        attribute to change as the node is processed.
-
-        :param Node node: The node object.
-        '''
-        self._nodes.append(node)
-        provider_label = self._pool.labels[node.type]
-        t = NodeLauncher(self._zk, provider_label, self._manager,
-                         self._requestor, node, self._retries)
-        t.start()
-        self._threads.append(t)
-
-
-class OpenStackNodeRequestHandler(NodeRequestHandler):
-    log = logging.getLogger("nodepool.OpenStackNodeRequestHandler")
-
-    def __init__(self, pw, request):
-        super(OpenStackNodeRequestHandler, self).__init__(pw, request)
-        self.chosen_az = None
-
-    def _imagesAvailable(self):
-        '''
-        Determines if the requested images are available for this provider.
-
-        ZooKeeper is queried for an image uploaded to the provider that is
-        in the READY state.
-
-        :returns: True if it is available, False otherwise.
-        '''
-        for label in self.request.node_types:
-            if self.pool.labels[label].cloud_image:
-                img = self.pool.labels[label].cloud_image
-                if not self.manager.labelReady(img):
-                    return False
-            else:
-                img = self.pool.labels[label].diskimage.name
-                if not self.zk.getMostRecentImageUpload(
-                        img, self.provider.name):
-                    return False
-        return True
-
-    def _invalidNodeTypes(self):
-        '''
-        Return any node types that are invalid for this provider.
-
-        :returns: A list of node type names that are invalid, or an empty
-            list if all are valid.
-        '''
-        invalid = []
-        for ntype in self.request.node_types:
-            if ntype not in self.pool.labels:
-                invalid.append(ntype)
-        return invalid
-
-    def _countNodes(self):
-        '''
-        Query ZooKeeper to determine the number of provider nodes launched.
-
-        :returns: An integer for the number launched for this provider.
-        '''
-        count = 0
-        for node in self.zk.nodeIterator():
-            if (node.provider == self.provider.name and
-                node.pool == self.pool.name):
-                count += 1
-        return count
-
-    def _waitForNodeSet(self):
-        '''
-        Fill node set for the request.
-
-        Obtain nodes for the request, pausing all new request handling for
-        this provider until the node set can be filled.
-
-        We attempt to group the node set within the same provider availability
-        zone. For this to work properly, the provider entry in the nodepool
-        config must list the availability zones. Otherwise, new nodes will be
-        put in random AZs at nova's whim. The exception being if there is an
-        existing node in the READY state that we can select for this node set.
-        Its AZ will then be used for new nodes, as well as any other READY
-        nodes.
-
-        note:: This code is a bit racey in its calculation of the number of
-            nodes in use for quota purposes. It is possible for multiple
-            launchers to be doing this calculation at the same time. Since we
-            currently have no locking mechanism around the "in use"
-            calculation, if we are at the edge of the quota, one of the
-            launchers could attempt to launch a new node after the other
-            launcher has already started doing so. This would cause an
-            expected failure from the underlying library, which is ok for now.
-        '''
-        if not self.launch_manager:
-            self.launch_manager = OpenStackNodeLaunchManager(
-                self.zk, self.pool, self.manager,
-                self.request.requestor, retries=self.provider.launch_retries)
-
-        # Since this code can be called more than once for the same request,
-        # we need to calculate the difference between our current node set
-        # and what was requested. We cannot use set operations here since a
-        # node type can appear more than once in the requested types.
-        saved_types = collections.Counter([n.type for n in self.nodeset])
-        requested_types = collections.Counter(self.request.node_types)
-        diff = requested_types - saved_types
-        needed_types = list(diff.elements())
-
-        ready_nodes = self.zk.getReadyNodesOfTypes(needed_types)
-
-        for ntype in needed_types:
-            # First try to grab from the list of already available nodes.
-            got_a_node = False
|
|
||||||
if self.request.reuse and ntype in ready_nodes:
|
|
||||||
for node in ready_nodes[ntype]:
|
|
||||||
# Only interested in nodes from this provider and
|
|
||||||
# pool, and within the selected AZ.
|
|
||||||
if node.provider != self.provider.name:
|
|
||||||
continue
|
|
||||||
if node.pool != self.pool.name:
|
|
||||||
continue
|
|
||||||
if self.chosen_az and node.az != self.chosen_az:
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.zk.lockNode(node, blocking=False)
|
|
||||||
except exceptions.ZKLockException:
|
|
||||||
# It's already locked so skip it.
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
if self.paused:
|
|
||||||
self.log.debug("Unpaused request %s", self.request)
|
|
||||||
self.paused = False
|
|
||||||
|
|
||||||
self.log.debug(
|
|
||||||
"Locked existing node %s for request %s",
|
|
||||||
node.id, self.request.id)
|
|
||||||
got_a_node = True
|
|
||||||
node.allocated_to = self.request.id
|
|
||||||
self.zk.storeNode(node)
|
|
||||||
self.nodeset.append(node)
|
|
||||||
|
|
||||||
# If we haven't already chosen an AZ, select the
|
|
||||||
# AZ from this ready node. This will cause new nodes
|
|
||||||
# to share this AZ, as well.
|
|
||||||
if not self.chosen_az and node.az:
|
|
||||||
self.chosen_az = node.az
|
|
||||||
break
|
|
||||||
|
|
||||||
# Could not grab an existing node, so launch a new one.
|
|
||||||
if not got_a_node:
|
|
||||||
# Select grouping AZ if we didn't set AZ from a selected,
|
|
||||||
# pre-existing node
|
|
||||||
if not self.chosen_az:
|
|
||||||
self.chosen_az = random.choice(
|
|
||||||
self.pool.azs or self.manager.getAZs())
|
|
||||||
|
|
||||||
# If we calculate that we're at capacity, pause until nodes
|
|
||||||
# are released by Zuul and removed by the DeletedNodeWorker.
|
|
||||||
if self._countNodes() >= self.pool.max_servers:
|
|
||||||
if not self.paused:
|
|
||||||
self.log.debug(
|
|
||||||
"Pausing request handling to satisfy request %s",
|
|
||||||
self.request)
|
|
||||||
self.paused = True
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.paused:
|
|
||||||
self.log.debug("Unpaused request %s", self.request)
|
|
||||||
self.paused = False
|
|
||||||
|
|
||||||
node = zk.Node()
|
|
||||||
node.state = zk.INIT
|
|
||||||
node.type = ntype
|
|
||||||
node.provider = self.provider.name
|
|
||||||
node.pool = self.pool.name
|
|
||||||
node.az = self.chosen_az
|
|
||||||
node.region = self.provider.region_name
|
|
||||||
node.launcher = self.launcher_id
|
|
||||||
node.allocated_to = self.request.id
|
|
||||||
|
|
||||||
# Note: It should be safe (i.e., no race) to lock the node
|
|
||||||
# *after* it is stored since nodes in INIT state are not
|
|
||||||
# locked anywhere.
|
|
||||||
self.zk.storeNode(node)
|
|
||||||
self.zk.lockNode(node, blocking=False)
|
|
||||||
self.log.debug("Locked building node %s for request %s",
|
|
||||||
node.id, self.request.id)
|
|
||||||
|
|
||||||
# Set state AFTER lock so sthat it isn't accidentally cleaned
|
|
||||||
# up (unlocked BUILDING nodes will be deleted).
|
|
||||||
node.state = zk.BUILDING
|
|
||||||
self.zk.storeNode(node)
|
|
||||||
|
|
||||||
self.nodeset.append(node)
|
|
||||||
self.launch_manager.launch(node)
|
|
||||||
|
|
||||||
def run_handler(self):
|
|
||||||
'''
|
|
||||||
Main body for the NodeRequestHandler.
|
|
||||||
'''
|
|
||||||
self._setFromPoolWorker()
|
|
||||||
|
|
||||||
declined_reasons = []
|
|
||||||
invalid_types = self._invalidNodeTypes()
|
|
||||||
if invalid_types:
|
|
||||||
declined_reasons.append('node type(s) [%s] not available' %
|
|
||||||
','.join(invalid_types))
|
|
||||||
elif not self._imagesAvailable():
|
|
||||||
declined_reasons.append('images are not available')
|
|
||||||
if len(self.request.node_types) > self.pool.max_servers:
|
|
||||||
declined_reasons.append('it would exceed quota')
|
|
||||||
|
|
||||||
if declined_reasons:
|
|
||||||
self.log.debug("Declining node request %s because %s",
|
|
||||||
self.request.id, ', '.join(declined_reasons))
|
|
||||||
self.request.declined_by.append(self.launcher_id)
|
|
||||||
launchers = set(self.zk.getRegisteredLaunchers())
|
|
||||||
if launchers.issubset(set(self.request.declined_by)):
|
|
||||||
self.log.debug("Failing declined node request %s",
|
|
||||||
self.request.id)
|
|
||||||
# All launchers have declined it
|
|
||||||
self.request.state = zk.FAILED
|
|
||||||
self.unlockNodeSet(clear_allocation=True)
|
|
||||||
self.zk.storeNodeRequest(self.request)
|
|
||||||
self.zk.unlockNodeRequest(self.request)
|
|
||||||
self.done = True
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.paused:
|
|
||||||
self.log.debug("Retrying node request %s", self.request.id)
|
|
||||||
else:
|
|
||||||
self.log.debug("Accepting node request %s", self.request.id)
|
|
||||||
self.request.state = zk.PENDING
|
|
||||||
self.zk.storeNodeRequest(self.request)
|
|
||||||
|
|
||||||
self._waitForNodeSet()
|
|
||||||
|
|
||||||
|
|
||||||
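The Counter arithmetic in _waitForNodeSet above deserves a concrete illustration, since ordinary set operations would collapse duplicate labels in a request. A minimal, standalone sketch with invented label names:

import collections

# Labels already held in the node set versus labels the request asks for.
saved_types = collections.Counter(['ubuntu-xenial'])
requested_types = collections.Counter(
    ['ubuntu-xenial', 'ubuntu-xenial', 'centos-7'])

# Counter subtraction keeps per-label multiplicity, so a request for two
# ubuntu-xenial nodes with one already satisfied still yields one to launch.
needed_types = list((requested_types - saved_types).elements())
print(needed_types)  # e.g. ['ubuntu-xenial', 'centos-7']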
 class PoolWorker(threading.Thread):
     '''
     Class that manages node requests for a single provider pool.

@@ -893,7 +304,7 @@ class BaseCleanupWorker(threading.Thread):
         self.log.info("Deleting %s instance %s from %s",
                       node.state, node.external_id, node.provider)
         try:
-            t = InstanceDeleter(
+            t = NodeDeleter(
                 self._nodepool.getZK(),
                 self._nodepool.getProviderManager(node.provider),
                 node)

@@ -1083,7 +494,7 @@ class CleanupWorker(BaseCleanupWorker):
                     zk_conn.unlockNode(node)
                     continue

-                # The InstanceDeleter thread will unlock and remove the
+                # The NodeDeleter thread will unlock and remove the
                 # node from ZooKeeper if it succeeds.
                 self._deleteInstance(node)

@@ -1169,7 +580,7 @@ class DeletedNodeWorker(BaseCleanupWorker):
                     zk_conn.unlockNode(node)
                     continue

-                # The InstanceDeleter thread will unlock and remove the
+                # The NodeDeleter thread will unlock and remove the
                 # node from ZooKeeper if it succeeds.
                 self._deleteInstance(node)
@@ -17,33 +17,9 @@
 # limitations under the License.

 import logging
-from contextlib import contextmanager
-import operator
-
-import shade
-
-from nodepool import exceptions
-from nodepool import fakeprovider
-from nodepool.driver import Provider
-from nodepool.nodeutils import iterate_timeout
-from nodepool.task_manager import ManagerStoppedException
-from nodepool.task_manager import TaskManager
-
-
-IPS_LIST_AGE = 5  # How long to keep a cached copy of the ip list
-
-
-@contextmanager
-def shade_inner_exceptions():
-    try:
-        yield
-    except shade.OpenStackCloudException as e:
-        e.log_error()
-        raise
-
-
-class NotFound(Exception):
-    pass
+from nodepool.driver.fake.provider import FakeProvider
+from nodepool.driver.openstack.provider import OpenStackProvider


 def get_provider(provider, use_taskmanager):
@@ -83,324 +59,3 @@ class ProviderManager(object):
        for m in config.provider_managers.values():
            m.stop()
            m.join()


class OpenStackProvider(Provider):
    log = logging.getLogger("nodepool.OpenStackProvider")

    def __init__(self, provider, use_taskmanager):
        self.provider = provider
        self._images = {}
        self._networks = {}
        self.__flavors = {}
        self.__azs = None
        self._use_taskmanager = use_taskmanager
        self._taskmanager = None

    def start(self):
        if self._use_taskmanager:
            self._taskmanager = TaskManager(None, self.provider.name,
                                            self.provider.rate)
            self._taskmanager.start()
        self.resetClient()

    def stop(self):
        if self._taskmanager:
            self._taskmanager.stop()

    def join(self):
        if self._taskmanager:
            self._taskmanager.join()

    @property
    def _flavors(self):
        if not self.__flavors:
            self.__flavors = self._getFlavors()
        return self.__flavors

    def _getClient(self):
        if self._use_taskmanager:
            manager = self._taskmanager
        else:
            manager = None
        return shade.OpenStackCloud(
            cloud_config=self.provider.cloud_config,
            manager=manager,
            **self.provider.cloud_config.config)

    def resetClient(self):
        self._client = self._getClient()
        if self._use_taskmanager:
            self._taskmanager.setClient(self._client)

    def _getFlavors(self):
        flavors = self.listFlavors()
        flavors.sort(key=operator.itemgetter('ram'))
        return flavors

    # TODO(mordred): These next three methods duplicate logic that is in
    #                shade, but we can't defer to shade until we're happy
    #                with using shade's resource caching facility. We have
    #                not yet proven that to our satisfaction, but if/when
    #                we do, these should be able to go away.
    def _findFlavorByName(self, flavor_name):
        for f in self._flavors:
            if flavor_name in (f['name'], f['id']):
                return f
        raise Exception("Unable to find flavor: %s" % flavor_name)

    def _findFlavorByRam(self, min_ram, flavor_name):
        for f in self._flavors:
            if (f['ram'] >= min_ram
                    and (not flavor_name or flavor_name in f['name'])):
                return f
        raise Exception("Unable to find flavor with min ram: %s" % min_ram)

    def findFlavor(self, flavor_name, min_ram):
        # Note: this will throw an error if the provider is offline
        # but all the callers are in threads (they call in via CreateServer) so
        # the mainloop won't be affected.
        if min_ram:
            return self._findFlavorByRam(min_ram, flavor_name)
        else:
            return self._findFlavorByName(flavor_name)

    def findImage(self, name):
        if name in self._images:
            return self._images[name]

        with shade_inner_exceptions():
            image = self._client.get_image(name)
        self._images[name] = image
        return image

    def findNetwork(self, name):
        if name in self._networks:
            return self._networks[name]

        with shade_inner_exceptions():
            network = self._client.get_network(name)
        self._networks[name] = network
        return network

    def deleteImage(self, name):
        if name in self._images:
            del self._images[name]

        with shade_inner_exceptions():
            return self._client.delete_image(name)

    def createServer(self, name, image,
                     flavor_name=None, min_ram=None,
                     az=None, key_name=None, config_drive=True,
                     nodepool_node_id=None, nodepool_node_label=None,
                     nodepool_image_name=None,
                     networks=None, boot_from_volume=False, volume_size=50):
        if not networks:
            networks = []
        if not isinstance(image, dict):
            # if it's a dict, we already have the cloud id. If it's not,
            # we don't know if it's name or ID so need to look it up
            image = self.findImage(image)
        flavor = self.findFlavor(flavor_name=flavor_name, min_ram=min_ram)
        create_args = dict(name=name,
                           image=image,
                           flavor=flavor,
                           config_drive=config_drive)
        if boot_from_volume:
            create_args['boot_from_volume'] = boot_from_volume
            create_args['volume_size'] = volume_size
            # NOTE(pabelanger): Always cleanup volumes when we delete a server.
            create_args['terminate_volume'] = True
        if key_name:
            create_args['key_name'] = key_name
        if az:
            create_args['availability_zone'] = az
        nics = []
        for network in networks:
            net_id = self.findNetwork(network)['id']
            nics.append({'net-id': net_id})
        if nics:
            create_args['nics'] = nics
        # Put provider.name and image_name in as groups so that ansible
        # inventory can auto-create groups for us based on each of those
        # qualities
        # Also list each of those values directly so that non-ansible
        # consumption programs don't need to play a game of knowing that
        # groups[0] is the image name or anything silly like that.
        groups_list = [self.provider.name]

        if nodepool_image_name:
            groups_list.append(nodepool_image_name)
        if nodepool_node_label:
            groups_list.append(nodepool_node_label)
        meta = dict(
            groups=",".join(groups_list),
            nodepool_provider_name=self.provider.name,
        )
        if nodepool_node_id:
            meta['nodepool_node_id'] = nodepool_node_id
        if nodepool_image_name:
            meta['nodepool_image_name'] = nodepool_image_name
        if nodepool_node_label:
            meta['nodepool_node_label'] = nodepool_node_label
        create_args['meta'] = meta

        with shade_inner_exceptions():
            return self._client.create_server(wait=False, **create_args)

    def getServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.get_server(server_id)

    def getServerConsole(self, server_id):
        try:
            with shade_inner_exceptions():
                return self._client.get_server_console(server_id)
        except shade.OpenStackCloudException:
            return None

    def waitForServer(self, server, timeout=3600):
        with shade_inner_exceptions():
            return self._client.wait_for_server(
                server=server, auto_ip=True, reuse=False,
                timeout=timeout)

    def waitForNodeCleanup(self, server_id, timeout=600):
        for count in iterate_timeout(
                timeout, exceptions.ServerDeleteException,
                "server %s deletion" % server_id):
            if not self.getServer(server_id):
                return

    def waitForImage(self, image_id, timeout=3600):
        last_status = None
        for count in iterate_timeout(
                timeout, exceptions.ImageCreateException, "image creation"):
            try:
                image = self.getImage(image_id)
            except NotFound:
                continue
            except ManagerStoppedException:
                raise
            except Exception:
                self.log.exception('Unable to list images while waiting for '
                                   '%s will retry' % (image_id))
                continue

            # shade returns None when not found
            if not image:
                continue

            status = image['status']
            if (last_status != status):
                self.log.debug(
                    'Status of image in {provider} {id}: {status}'.format(
                        provider=self.provider.name,
                        id=image_id,
                        status=status))
                if status == 'ERROR' and 'fault' in image:
                    self.log.debug(
                        'ERROR in {provider} on {id}: {reason}'.format(
                            provider=self.provider.name,
                            id=image_id,
                            reason=image['fault']['message']))
            last_status = status
            # Glance client returns lower case statuses - but let's be sure
            if status.lower() in ['active', 'error']:
                return image

    def createImage(self, server, image_name, meta):
        with shade_inner_exceptions():
            return self._client.create_image_snapshot(
                image_name, server, **meta)

    def getImage(self, image_id):
        with shade_inner_exceptions():
            return self._client.get_image(image_id)

    def labelReady(self, image_id):
        return self.getImage(image_id)

    def uploadImage(self, image_name, filename, image_type=None, meta=None,
                    md5=None, sha256=None):
        # configure glance and upload image. Note the meta flags
        # are provided as custom glance properties
        # NOTE: we have wait=True set here. This is not how we normally
        # do things in nodepool, preferring to poll ourselves thankyouverymuch.
        # However - two things to note:
        #  - PUT has no async mechanism, so we have to handle it anyway
        #  - v2 w/task waiting is very strange and complex - but we have to
        #    block for our v1 clouds anyway, so we might as well
        #    have the interface be the same and treat faking-out
        #    a shade-level fake-async interface later
        if not meta:
            meta = {}
        if image_type:
            meta['disk_format'] = image_type
        with shade_inner_exceptions():
            image = self._client.create_image(
                name=image_name,
                filename=filename,
                is_public=False,
                wait=True,
                md5=md5,
                sha256=sha256,
                **meta)
        return image.id

    def listImages(self):
        with shade_inner_exceptions():
            return self._client.list_images()

    def listFlavors(self):
        with shade_inner_exceptions():
            return self._client.list_flavors(get_extra=False)

    def listNodes(self):
        # shade list_servers carries the nodepool server list caching logic
        with shade_inner_exceptions():
            return self._client.list_servers()

    def deleteServer(self, server_id):
        with shade_inner_exceptions():
            return self._client.delete_server(server_id, delete_ips=True)

    def cleanupNode(self, server_id):
        server = self.getServer(server_id)
        if not server:
            raise NotFound()

        self.log.debug('Deleting server %s' % server_id)
        self.deleteServer(server_id)

    def cleanupLeakedFloaters(self):
        with shade_inner_exceptions():
            self._client.delete_unattached_floating_ips()

    def getAZs(self):
        if self.__azs is None:
            self.__azs = self._client.list_availability_zone_names()
            if not self.__azs:
                # If there are no zones, return a list containing None so that
                # random.choice can pick None and pass that to Nova. If this
                # feels dirty, please direct your ire to policy.json and the
                # ability to turn off random portions of the OpenStack API.
                self.__azs = [None]
        return self.__azs


class FakeProvider(OpenStackProvider):
    def __init__(self, provider, use_taskmanager):
        self.createServer_fails = 0
        self.__client = fakeprovider.FakeOpenStackCloud()
        super(FakeProvider, self).__init__(provider, use_taskmanager)

    def _getClient(self):
        return self.__client

    def createServer(self, *args, **kwargs):
        while self.createServer_fails:
            self.createServer_fails -= 1
            raise Exception("Expected createServer exception")
        return super(FakeProvider, self).createServer(*args, **kwargs)
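As a rough illustration of the instance metadata assembled by createServer above, here is the dict it would build for a hypothetical node; the provider, image, label, and node id values are invented for the example:

# Hypothetical inputs mirroring the createServer logic above.
provider_name = 'example-cloud'          # self.provider.name
nodepool_image_name = 'ubuntu-xenial'    # image used for the boot
nodepool_node_label = 'ubuntu-xenial'    # label that requested the node
nodepool_node_id = '0000000042'          # ZooKeeper node id

groups_list = [provider_name]
if nodepool_image_name:
    groups_list.append(nodepool_image_name)
if nodepool_node_label:
    groups_list.append(nodepool_node_label)

meta = dict(
    groups=','.join(groups_list),
    nodepool_provider_name=provider_name,
    nodepool_node_id=nodepool_node_id,
    nodepool_image_name=nodepool_image_name,
    nodepool_node_label=nodepool_node_label,
)
# meta['groups'] == 'example-cloud,ubuntu-xenial,ubuntu-xenial', which lets
# an ansible inventory auto-create groups as described in the comments above.
print(meta)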
@@ -20,6 +20,8 @@ import os
 import logging
 import statsd

+from nodepool import zk
+
 log = logging.getLogger("nodepool.stats")


 def get_client():

@@ -38,3 +40,98 @@ def get_client():
        return statsd.StatsClient(**statsd_args)
    else:
        return None


class StatsReporter(object):
    '''
    Class adding statsd reporting functionality.
    '''
    def __init__(self):
        super(StatsReporter, self).__init__()
        self._statsd = get_client()

    def recordLaunchStats(self, subkey, dt, image_name,
                          provider_name, node_az, requestor):
        '''
        Record node launch statistics.

        :param str subkey: statsd key
        :param int dt: Time delta in milliseconds
        :param str image_name: Name of the image used
        :param str provider_name: Name of the provider
        :param str node_az: AZ of the launched node
        :param str requestor: Identifier for the request originator
        '''
        if not self._statsd:
            return

        keys = [
            'nodepool.launch.provider.%s.%s' % (provider_name, subkey),
            'nodepool.launch.image.%s.%s' % (image_name, subkey),
            'nodepool.launch.%s' % (subkey,),
        ]

        if node_az:
            keys.append('nodepool.launch.provider.%s.%s.%s' %
                        (provider_name, node_az, subkey))

        if requestor:
            # Replace '.' which is a graphite hierarchy, and ':' which is
            # a statsd delimiter.
            requestor = requestor.replace('.', '_')
            requestor = requestor.replace(':', '_')
            keys.append('nodepool.launch.requestor.%s.%s' %
                        (requestor, subkey))

        for key in keys:
            self._statsd.timing(key, dt)
            self._statsd.incr(key)

    def updateNodeStats(self, zk_conn, provider):
        '''
        Refresh statistics for all known nodes.

        :param ZooKeeper zk_conn: A ZooKeeper connection object.
        :param Provider provider: A config Provider object.
        '''
        if not self._statsd:
            return

        states = {}

        # Initialize things we know about to zero
        for state in zk.Node.VALID_STATES:
            key = 'nodepool.nodes.%s' % state
            states[key] = 0
            key = 'nodepool.provider.%s.nodes.%s' % (provider.name, state)
            states[key] = 0

        for node in zk_conn.nodeIterator():
            # nodepool.nodes.STATE
            key = 'nodepool.nodes.%s' % node.state
            states[key] += 1

            # nodepool.label.LABEL.nodes.STATE
            key = 'nodepool.label.%s.nodes.%s' % (node.type, node.state)
            # It's possible we could see node types that aren't in our config
            if key in states:
                states[key] += 1
            else:
                states[key] = 1

            # nodepool.provider.PROVIDER.nodes.STATE
            key = 'nodepool.provider.%s.nodes.%s' % (node.provider, node.state)
            # It's possible we could see providers that aren't in our config
            if key in states:
                states[key] += 1
            else:
                states[key] = 1

        for key, count in states.items():
            self._statsd.gauge(key, count)

        # nodepool.provider.PROVIDER.max_servers
        key = 'nodepool.provider.%s.max_servers' % provider.name
        max_servers = sum([p.max_servers for p in provider.pools.values()])
        self._statsd.gauge(key, max_servers)
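To make the statsd key layout concrete, here is a short sketch of the keys recordLaunchStats above would touch for one successful launch; the provider, image, AZ, and requestor names are invented for the example:

subkey = 'ready'
provider_name = 'example-cloud'
image_name = 'ubuntu-xenial'
node_az = 'nova'
requestor = 'zuul.example.org:executor'

keys = [
    'nodepool.launch.provider.%s.%s' % (provider_name, subkey),
    'nodepool.launch.image.%s.%s' % (image_name, subkey),
    'nodepool.launch.%s' % (subkey,),
    'nodepool.launch.provider.%s.%s.%s' % (provider_name, node_az, subkey),
    # '.' and ':' are replaced so the requestor does not add graphite levels.
    'nodepool.launch.requestor.%s.%s' % (
        requestor.replace('.', '_').replace(':', '_'), subkey),
]
# Each of these keys receives both a timer (launch duration in ms) and a
# counter increment.
for key in keys:
    print(key)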
@@ -198,7 +198,8 @@ class BaseTestCase(testtools.TestCase):
             return fake_client

         self.useFixture(fixtures.MonkeyPatch(
-            'nodepool.provider_manager.OpenStackProvider._getClient',
+            'nodepool.driver.openstack.provider.OpenStackProvider.'
+            '_getClient',
             get_fake_client))
         self.useFixture(fixtures.MonkeyPatch(
             'nodepool.launcher._get_one_cloud',

@@ -105,7 +105,7 @@ class TestNodePoolBuilder(tests.DBTestCase):
             return fake_client

         self.useFixture(fixtures.MonkeyPatch(
-            'nodepool.provider_manager.FakeProvider._getClient',
+            'nodepool.driver.fake.provider.FakeProvider._getClient',
             get_fake_client))
         self.useFixture(fixtures.MonkeyPatch(
             'nodepool.launcher._get_one_cloud',

@@ -400,7 +400,7 @@ class TestLauncher(tests.DBTestCase):
         def fail_delete(self, name):
             raise RuntimeError('Fake Error')

-        fake_delete = 'nodepool.provider_manager.FakeProvider.deleteServer'
+        fake_delete = 'nodepool.driver.fake.provider.FakeProvider.deleteServer'
         self.useFixture(fixtures.MonkeyPatch(fake_delete, fail_delete))

         configfile = self.setup_config('node.yaml')

@@ -412,7 +412,7 @@ class TestLauncher(tests.DBTestCase):
         self.assertEqual(len(nodes), 1)

         self.zk.lockNode(nodes[0], blocking=False)
-        nodepool.launcher.InstanceDeleter.delete(
+        nodepool.launcher.NodeDeleter.delete(
             self.zk, pool.getProviderManager('fake-provider'), nodes[0])

         # Make sure our old node is in delete state, even though delete failed

@@ -21,7 +21,7 @@ from nodepool import builder
 from nodepool import provider_manager
 from nodepool import tests
 from nodepool import zk
-from nodepool.launcher import OpenStackNodeLaunchManager
+from nodepool.driver.openstack.handler import OpenStackNodeLaunchManager


 class TestNodeLaunchManager(tests.DBTestCase):

@@ -62,7 +62,7 @@ class TestNodeLaunchManager(tests.DBTestCase):
         self.assertEqual(len(mgr.ready_nodes), 1)
         self.assertEqual(len(mgr.failed_nodes), 0)

-    @mock.patch('nodepool.launcher.NodeLauncher._launchNode')
+    @mock.patch('nodepool.driver.openstack.handler.NodeLauncher._launchNode')
     def test_failed_launch(self, mock_launch):
         configfile = self.setup_config('node.yaml')
         self._setup(configfile)

@@ -79,7 +79,7 @@ class TestNodeLaunchManager(tests.DBTestCase):
         self.assertEqual(len(mgr.failed_nodes), 1)
         self.assertEqual(len(mgr.ready_nodes), 0)

-    @mock.patch('nodepool.launcher.NodeLauncher._launchNode')
+    @mock.patch('nodepool.driver.openstack.handler.NodeLauncher._launchNode')
     def test_mixed_launch(self, mock_launch):
         configfile = self.setup_config('node.yaml')
         self._setup(configfile)

@@ -24,7 +24,7 @@ import yaml
 from nodepool import config as nodepool_config
 from nodepool import provider_manager
 from nodepool import tests
-from nodepool.provider_manager import shade_inner_exceptions
+from nodepool.driver.openstack.provider import shade_inner_exceptions


 class TestShadeIntegration(tests.IntegrationTestCase):