# Copyright (C) 2011-2014 OpenStack Foundation
# Copyright 2017 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import math
import pprint
import random

from kazoo import exceptions as kze
import openstack

from nodepool import exceptions
from nodepool import nodeutils as utils
from nodepool import zk
from nodepool.driver.utils import NodeLauncher, QuotaInformation
from nodepool.driver import NodeRequestHandler


class OpenStackNodeLauncher(NodeLauncher):
    def __init__(self, handler, node, provider_config, provider_label):
        '''
        Initialize the launcher.

        :param OpenStackNodeRequestHandler handler: The handler object.
        :param Node node: A Node object describing the node to launch.
        :param ProviderConfig provider_config: A ProviderConfig object
            describing the provider launching this node.
        :param ProviderLabel provider_label: A ProviderLabel object
            describing the label to use for the node.
        '''
        super().__init__(handler.zk, node, provider_config)

        # Number of times to retry failed launches.
        self._retries = provider_config.launch_retries

        self.label = provider_label
        self.pool = provider_label.pool
        self.handler = handler
        self.zk = handler.zk

    def _logConsole(self, server_id, hostname):
        if not self.label.console_log:
            return
        console = self.handler.manager.getServerConsole(server_id)
        if console:
            self.log.debug('Console log from hostname %s:' % hostname)
            for line in console.splitlines():
                self.log.debug(line.rstrip())

    def _launchNode(self):
        if self.label.diskimage:
            diskimage = self.provider_config.diskimages[
                self.label.diskimage.name]
        else:
            diskimage = None

        if diskimage:
            # launch using diskimage
            cloud_image = self.handler.zk.getMostRecentImageUpload(
                diskimage.name, self.provider_config.name)

            if not cloud_image:
                raise exceptions.LaunchNodepoolException(
                    "Unable to find current cloud image %s in %s" %
                    (diskimage.name, self.provider_config.name)
                )

            config_drive = diskimage.config_drive
            # Using a dict with the ID bypasses an image search during
            # server creation.
            image_external = dict(id=cloud_image.external_id)
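            # (Sketch of the assumed openstacksdk behavior, not from this
            # file: a name or ID string makes the SDK search its image
            # list first, while a dict is treated as an already-resolved
            # image record, e.g.:
            #   image_external = dict(id="3f6a...")  # no lookup
            #   image_external = "ubuntu-xenial"     # triggers a search
            # )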
            image_id = "{path}/{upload_id}".format(
                path=self.handler.zk._imageUploadPath(
                    cloud_image.image_name,
                    cloud_image.build_id,
                    cloud_image.provider_name),
                upload_id=cloud_image.id)
            image_name = diskimage.name
            username = cloud_image.username
            connection_type = diskimage.connection_type
            connection_port = diskimage.connection_port

        else:
            # launch using unmanaged cloud image
            config_drive = self.label.cloud_image.config_drive

            if self.label.cloud_image.image_id:
                # Using a dict with the ID bypasses an image search during
                # server creation.
                image_external = dict(id=self.label.cloud_image.image_id)
            else:
                image_external = self.label.cloud_image.external_name

            image_id = self.label.cloud_image.name
            image_name = self.label.cloud_image.name
            username = self.label.cloud_image.username
            connection_type = self.label.cloud_image.connection_type
            connection_port = self.label.cloud_image.connection_port

        hostname = self.provider_config.hostname_format.format(
            label=self.label, provider=self.provider_config, node=self.node
        )
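        # For illustration (an assumed, typical configuration rather than
        # anything defined in this file): a hostname_format of
        # "{label.name}-{provider.name}-{node.id}" would render as
        # something like "ubuntu-bionic-rax-0000000123".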
self.log.info("Creating server with hostname %s in %s from image %s "
|
|
"for node id: %s" % (hostname,
|
|
self.provider_config.name,
|
|
image_name,
|
|
self.node.id))
|
|
|
|
# NOTE: We store the node ID in the server metadata to use for leaked
|
|
# instance detection. We cannot use the external server ID for this
|
|
# because that isn't available in ZooKeeper until after the server is
|
|
# active, which could cause a race in leak detection.
|
|
|
|
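        # (The nodepool_node_id keyword below is what carries that node ID;
        # the exact metadata key it becomes on the instance is set by the
        # provider manager's createServer implementation, which this file
        # does not show.)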

        try:
            server = self.handler.manager.createServer(
                hostname,
                image=image_external,
                min_ram=self.label.min_ram,
                flavor_name=self.label.flavor_name,
                key_name=self.label.key_name,
                az=self.node.az,
                config_drive=config_drive,
                nodepool_node_id=self.node.id,
                nodepool_node_label=self.node.type[0],
                nodepool_image_name=image_name,
                networks=self.pool.networks,
                security_groups=self.pool.security_groups,
                boot_from_volume=self.label.boot_from_volume,
                volume_size=self.label.volume_size,
                instance_properties=self.label.instance_properties,
                userdata=self.label.userdata)
        except openstack.cloud.exc.OpenStackCloudCreateException as e:
            if e.resource_id:
                self.node.external_id = e.resource_id
                # The outer exception handler will handle storing the
                # node immediately after this.
            raise

        self.node.external_id = server.id
        self.node.hostname = hostname
        self.node.image_id = image_id

        pool = self.handler.provider.pools.get(self.node.pool)
        resources = self.handler.manager.quotaNeededByNodeType(
            self.node.type[0], pool)
        self.node.resources = resources.quota['compute']
        if username:
            self.node.username = username
        self.node.connection_type = connection_type
        self.node.connection_port = connection_port

        # Checkpoint save the updated node info
        self.zk.storeNode(self.node)

        self.log.debug("Waiting for server %s for node id: %s" %
                       (server.id, self.node.id))
        server = self.handler.manager.waitForServer(
            server, self.provider_config.launch_timeout,
            auto_ip=self.pool.auto_floating_ip)

        if server.status != 'ACTIVE':
            raise exceptions.LaunchStatusException("Server %s for node id: %s "
                                                   "status: %s" %
                                                   (server.id, self.node.id,
                                                    server.status))

        # If we didn't specify an AZ, set it to the one chosen by Nova.
        # Do this after we are done waiting, since the AZ may not be
        # available immediately after the create request.
        if not self.node.az:
            self.node.az = server.location.zone

        interface_ip = server.interface_ip
        if not interface_ip:
            self.log.debug(
                "Server data for failed IP: %s" % pprint.pformat(
                    server))
            raise exceptions.LaunchNetworkException(
                "Unable to find public IP of server")

        self.node.host_id = server.host_id
        self.node.interface_ip = interface_ip
        self.node.public_ipv4 = server.public_v4
        self.node.public_ipv6 = server.public_v6
        self.node.private_ipv4 = server.private_v4
        # devstack-gate multi-node depends on private_v4 being populated
        # with something. On clouds that don't have a private address, use
        # the public one.
        if not self.node.private_ipv4:
            self.node.private_ipv4 = server.public_v4

        # Checkpoint save the updated node info
        self.zk.storeNode(self.node)

        self.log.debug(
            "Node %s is running [region: %s, az: %s, ip: %s ipv4: %s, "
            "ipv6: %s, hostid: %s]" %
            (self.node.id, self.node.region, self.node.az,
             self.node.interface_ip, self.node.public_ipv4,
             self.node.public_ipv6, self.node.host_id))

        # Wait for the new node to come up, scan its host keys, and record
        # the result in ZooKeeper.
        host_keys = []
        if self.pool.host_key_checking:
            try:
                self.log.debug(
                    "Gathering host keys for node %s", self.node.id)
                # Only gather host keys if the connection type is ssh.
                gather_host_keys = connection_type == 'ssh'
                host_keys = utils.nodescan(
                    interface_ip,
                    timeout=self.provider_config.boot_timeout,
                    gather_hostkeys=gather_host_keys,
                    port=connection_port)

                if gather_host_keys and not host_keys:
                    raise exceptions.LaunchKeyscanException(
                        "Unable to gather host keys")
            except exceptions.ConnectionTimeoutException:
                self._logConsole(self.node.external_id, self.node.hostname)
                raise

        self.node.host_keys = host_keys
        self.zk.storeNode(self.node)

    def launch(self):
        attempts = 1
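        # Illustrative note (not from the original file): with the retry
        # counting below, launch_retries bounds the total number of
        # _launchNode() attempts; e.g. launch_retries = 3 allows at most
        # three attempts before the last exception is re-raised.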
        while attempts <= self._retries:
            try:
                self._launchNode()
                break
            except kze.SessionExpiredError:
                # If we lost our ZooKeeper session, we've lost our node
                # lock, so there's no need to continue.
                raise
            except Exception as e:
                if attempts <= self._retries:
                    self.log.exception(
                        "Request %s: Launch attempt %d/%d failed for node %s:",
                        self.handler.request.id, attempts,
                        self._retries, self.node.id)
                # If we created an instance, delete it.
                if self.node.external_id:
                    deleting_node = zk.Node()
                    deleting_node.provider = self.node.provider
                    deleting_node.pool = self.node.pool
                    deleting_node.type = self.node.type
                    deleting_node.external_id = self.node.external_id
                    deleting_node.state = zk.DELETING
                    self.zk.storeNode(deleting_node)
                    self.log.info(
                        "Request %s: Node %s scheduled for cleanup",
                        self.handler.request.id, deleting_node.external_id)
                    self.node.external_id = None
                    self.node.public_ipv4 = None
                    self.node.public_ipv6 = None
                    self.node.interface_ip = None
                    self.zk.storeNode(self.node)
                if attempts == self._retries:
                    raise
                if 'quota exceeded' in str(e).lower():
                    # A quota exception is not directly recoverable, so bail
                    # out immediately with a specific exception.
                    self.log.info("Quota exceeded, invalidating quota cache")
                    self.handler.manager.invalidateQuotaCache()
                    raise exceptions.QuotaException("Quota exceeded")
                attempts += 1

        self.node.state = zk.READY
        self.zk.storeNode(self.node)
        self.log.info("Node id %s is ready", self.node.id)


class OpenStackNodeRequestHandler(NodeRequestHandler):

    def __init__(self, pw, request):
        super().__init__(pw, request)
        self.chosen_az = None
        self._threads = []

    @property
    def alive_thread_count(self):
        count = 0
        for t in self._threads:
            # is_alive() is the modern spelling; Thread.isAlive() was
            # removed in Python 3.9.
            if t.is_alive():
                count += 1
        return count

    def imagesAvailable(self):
        '''
        Determines if the requested images are available for this provider.

        ZooKeeper is queried for an image uploaded to the provider that is
        in the READY state.

        :returns: True if they are available, False otherwise.
        '''
        if self.provider.manage_images:
            for label in self.request.node_types:
                if self.pool.labels[label].cloud_image:
                    if not self.manager.labelReady(self.pool.labels[label]):
                        return False
                else:
                    if not self.zk.getMostRecentImageUpload(
                            self.pool.labels[label].diskimage.name,
                            self.provider.name):
                        return False
        return True

    def hasRemainingQuota(self, ntype):
        needed_quota = self.manager.quotaNeededByNodeType(ntype, self.pool)

        if not self.pool.ignore_provider_quota:
            # Calculate the remaining quota:
            #   quota = <total nodepool quota> - <used quota> - <quota for node>
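            # As an illustrative sketch (QuotaInformation semantics assumed
            # from its use in this file: per-resource add/subtract plus a
            # non-negativity check):
            #   q = QuotaInformation(cores=16, instances=4, ram=32768)
            #   q.subtract(QuotaInformation(cores=8, instances=3, ram=16384))
            #   q.non_negative()  # True; False once any resource dips below 0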
            cloud_quota = self.manager.estimatedNodepoolQuota()
            cloud_quota.subtract(
                self.manager.estimatedNodepoolQuotaUsed())
            cloud_quota.subtract(needed_quota)
            self.log.debug("Predicted remaining provider quota: %s",
                           cloud_quota)

            if not cloud_quota.non_negative():
                return False

        # Now calculate pool-specific quota. Values indicating no quota
        # default to math.inf, representing an infinity that can be
        # calculated with.
        pool_quota = QuotaInformation(cores=self.pool.max_cores,
                                      instances=self.pool.max_servers,
                                      ram=self.pool.max_ram,
                                      default=math.inf)
        pool_quota.subtract(
            self.manager.estimatedNodepoolQuotaUsed(self.pool))
        self.log.debug("Current pool quota: %s" % pool_quota)
        pool_quota.subtract(needed_quota)
        self.log.debug("Predicted remaining pool quota: %s", pool_quota)

        return pool_quota.non_negative()

    def hasProviderQuota(self, node_types):
        needed_quota = QuotaInformation()

        for ntype in node_types:
            needed_quota.add(
                self.manager.quotaNeededByNodeType(ntype, self.pool))

        if not self.pool.ignore_provider_quota:
            cloud_quota = self.manager.estimatedNodepoolQuota()
            cloud_quota.subtract(needed_quota)

            if not cloud_quota.non_negative():
                return False

        # Now calculate pool-specific quota. Values indicating no quota
        # default to math.inf, representing an infinity that can be
        # calculated with.
        pool_quota = QuotaInformation(cores=self.pool.max_cores,
                                      instances=self.pool.max_servers,
                                      ram=self.pool.max_ram,
                                      default=math.inf)
        pool_quota.subtract(needed_quota)
        return pool_quota.non_negative()

    def checkReusableNode(self, node):
        if self.chosen_az and node.az != self.chosen_az:
            return False
        return True

    def nodeReusedNotification(self, node):
        """
        We attempt to group the node set within the same provider
        availability zone.

        For this to work properly, the provider entry in the nodepool
        config must list the availability zones. Otherwise, new nodes will
        be put in random AZs at nova's whim. The exception is when there is
        an existing node in the READY state that we can select for this
        node set. Its AZ will then be used for new nodes, as well as any
        other READY nodes.
        """
        # If we haven't already chosen an AZ, select the AZ from this ready
        # node. This will cause new nodes to share this AZ, as well.
        if not self.chosen_az and node.az:
            self.chosen_az = node.az

    def setNodeMetadata(self, node):
        """
        Select a grouping AZ if we didn't set the AZ from a selected,
        pre-existing node.
        """
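        # Prefer the AZs pinned in the pool config; otherwise fall back to
        # whatever AZs the cloud reports. Note that random.choice raises
        # IndexError on an empty sequence, so this assumes at least one AZ
        # is available from one of the two sources.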
        if not self.chosen_az:
            self.chosen_az = random.choice(
                self.pool.azs or self.manager.getAZs())
        node.az = self.chosen_az
        node.cloud = self.provider.cloud_config.name
        node.region = self.provider.region_name

    def launchesComplete(self):
        '''
        Check if all launch requests have completed.

        When all of the Node objects have reached a final state (READY,
        FAILED or ABORTED), we'll know all threads have finished the launch
        process.
        '''
        if not self._threads:
            return True

        # Give the NodeLauncher threads time to finish.
        if self.alive_thread_count:
            return False

        node_states = [node.state for node in self.nodeset]

        # NOTE: It's very important that NodeLauncher always sets one of
        # these states, no matter what.
        if not all(s in (zk.READY, zk.FAILED, zk.ABORTED)
                   for s in node_states):
            return False

        return True

    def launch(self, node):
        label = self.pool.labels[node.type[0]]
        thd = OpenStackNodeLauncher(self, node, self.provider, label)
        thd.start()
        self._threads.append(thd)