# tripleo-common/tripleo_common/actions/baremetal_deploy.py
#
# NOTE: header captured from the repository viewer (638 lines, 23 KiB,
# Python); it is not part of the module itself.

# Copyright 2018 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import jsonschema
import metalsmith
from metalsmith import instance_config
from metalsmith import sources
from mistral_lib import actions
from openstack import exceptions as sdk_exc
import six
from tripleo_common.actions import base
from tripleo_common.utils import keystone
# Module-level logger, named after this module, used by the actions and
# helpers defined below.
LOG = logging.getLogger(__name__)
def _provisioner(context):
    """Create a metalsmith provisioner for the given action context.

    A keystone session is obtained from ``context`` and passed to
    ``metalsmith.Provisioner``.
    """
    return metalsmith.Provisioner(session=keystone.get_session(context))
# Image description of an instance; only ``href`` is mandatory.
_IMAGE_SCHEMA = {
    'type': 'object',
    'properties': {
        'href': {'type': 'string'},
        'checksum': {'type': 'string'},
        'kernel': {'type': 'string'},
        'ramdisk': {'type': 'string'},
    },
    'required': ['href'],
    'additionalProperties': False,
}

# A single NIC request of an instance; every field is optional.
_NIC_SCHEMA = {
    'type': 'object',
    'properties': {
        'network': {'type': 'string'},
        'port': {'type': 'string'},
        'fixed_ip': {'type': 'string'},
        'subnet': {'type': 'string'},
    },
    'additionalProperties': False,
}

# One instance request. No property is required by the schema itself.
_INSTANCE_SCHEMA = {
    'type': 'object',
    'properties': {
        'capabilities': {'type': 'object'},
        'hostname': {
            'type': 'string',
            'minLength': 2,
            'maxLength': 255,
        },
        'image': _IMAGE_SCHEMA,
        'name': {'type': 'string'},
        'nics': {
            'type': 'array',
            'items': _NIC_SCHEMA,
        },
        'profile': {'type': 'string'},
        'provisioned': {'type': 'boolean'},
        'resource_class': {'type': 'string'},
        'root_size_gb': {'type': 'integer', 'minimum': 4},
        'swap_size_mb': {'type': 'integer', 'minimum': 64},
        'traits': {
            'type': 'array',
            'items': {'type': 'string'},
        },
    },
    'additionalProperties': False,
}

_INSTANCES_SCHEMA = {
    'type': 'array',
    'items': _INSTANCE_SCHEMA,
}
"""JSON schema of the instances list."""

# Each role needs a ``name``; ``defaults`` reuses the instance schema and
# ``instances`` reuses the instances list schema.
_ROLES_INPUT_SCHEMA = {
    'type': 'array',
    'items': {
        'type': 'object',
        'properties': {
            'name': {'type': 'string'},
            'hostname_format': {'type': 'string'},
            'count': {'type': 'integer', 'minimum': 0},
            'defaults': _INSTANCE_SCHEMA,
            'instances': _INSTANCES_SCHEMA,
        },
        'additionalProperties': False,
        'required': ['name'],
    },
}
"""JSON schema of the roles list."""
class CheckExistingInstancesAction(base.TripleOAction):
    """Detect which requested instances have already been provisioned.

    :param instances: list of instance requests; validated against the
        instances JSON schema when the action runs. Every request must
        carry a ``hostname``.
    :param default_resource_class: resource class used for the allocation
        record when a request does not specify ``resource_class``.
    """

    def __init__(self, instances, default_resource_class='baremetal'):
        super(CheckExistingInstancesAction, self).__init__()
        self.instances = instances
        self.default_resource_class = default_resource_class

    def run(self, context):
        """Split the requested instances into existing and missing ones.

        :param context: action context used to build the metalsmith
            provisioner and the baremetal client.
        :returns: a dict with ``not_found`` (requests that have no matching
            instance) and ``instances`` (dicts describing the instances
            that were found); an ``actions.Result`` with ``error`` set when
            validation fails, a lookup fails unexpectedly or a hostname
            conflict is detected.
        """
        try:
            _validate_instances(self.instances)
        except Exception as exc:
            LOG.error('Failed to validate provided instances. %s', exc)
            return actions.Result(error=six.text_type(exc))

        # The schema does not make hostname mandatory, but every step below
        # reads it. Reject such requests before touching any node, instead
        # of failing with a KeyError half way through the list.
        without_hostname = [request for request in self.instances
                            if 'hostname' not in request]
        if without_hostname:
            error = ('Cannot check existing instances: hostname is missing '
                     'in request(s) %s' % without_hostname)
            LOG.error(error)
            return actions.Result(error=error)

        provisioner = _provisioner(context)

        not_found = []
        found = []
        for request in self.instances:
            hostname = request['hostname']
            # Prefer the node name when it is given, otherwise look the
            # instance up by its hostname.
            ident = request.get('name', hostname)

            try:
                instance = provisioner.show_instance(ident)
            # TODO(dtantsur): replace Error with a specific exception
            except (sdk_exc.ResourceNotFound, metalsmith.exceptions.Error):
                not_found.append(request)
            except Exception as exc:
                message = ('Failed to request instance information for %s'
                           % ident)
                LOG.exception(message)
                return actions.Result(
                    error="%s. %s: %s" % (message, type(exc).__name__, exc)
                )
            else:
                # NOTE(dtantsur): metalsmith can match instances by node
                # names, provide a safeguard to avoid conflicts.
                if instance.hostname and instance.hostname != hostname:
                    error = ("Requested hostname %s was not found, but the "
                             "deployed node %s has a matching name. Refusing "
                             "to proceed to avoid confusing results. Please "
                             "either rename the node or use a different "
                             "hostname") % (hostname, instance.uuid)
                    return actions.Result(error=error)

                if (not instance.allocation
                        and instance.state == metalsmith.InstanceState.ACTIVE
                        and 'name' in request):
                    # Existing node is missing an allocation record,
                    # so create one without triggering allocation
                    LOG.debug('Reserving existing %s', request['name'])
                    self.get_baremetal_client(context).allocation.create(
                        resource_class=request.get('resource_class') or
                        self.default_resource_class,
                        name=hostname,
                        node=request['name']
                    )

                found.append(_instance_to_dict(provisioner.connection,
                                               instance))

        if found:
            LOG.info('Found existing instances: %s',
                     ', '.join(r['hostname'] for r in found))
        if not_found:
            LOG.info('Instance(s) %s do not exist',
                     ', '.join(r['hostname'] for r in not_found))

        return {
            'not_found': not_found,
            'instances': found
        }
class ReserveNodesAction(base.TripleOAction):
    """Reserve nodes for requested instances.

    :param instances: list of instance requests, validated against the
        instances JSON schema when the action runs.
    :param default_resource_class: resource class used for requests that
        do not specify ``resource_class``.
    """

    def __init__(self, instances, default_resource_class='baremetal'):
        super(ReserveNodesAction, self).__init__()
        self.instances = instances
        self.default_resource_class = default_resource_class

    def run(self, context):
        """Reserve one node per requested instance.

        :returns: ``{'reservations': [...]}`` where every entry pairs the
            reserved node id with its instance request, or an
            ``actions.Result`` with ``error`` set. On a reservation failure
            all nodes reserved so far are released again.
        """
        try:
            _validate_instances(self.instances)
        except Exception as exc:
            LOG.error('Failed to validate provided instances. %s', exc)
            return actions.Result(error=six.text_type(exc))

        provisioner = _provisioner(context)

        # TODO(dtantsur): looping over instances is not very optimal, change
        # it to metalsmith plan deployment API when it's available.
        reservations = []
        reserved_nodes = []
        try:
            for request in self.instances:
                LOG.debug('Trying to reserve a node for instance %s',
                          request)

                # NOTE(dtantsur): metalsmith accepts list of nodes to pick
                # from. We implement a simplest case when a user can pick a
                # node by its name (actually, UUID will also work).
                node_name = request.get('name')
                candidates = [node_name] if node_name else None

                profile = request.get('profile')
                if profile:
                    # TODO(dtantsur): change to traits?
                    capabilities = request.setdefault('capabilities', {})
                    capabilities['profile'] = profile

                resource_class = (request.get('resource_class')
                                  or self.default_resource_class)
                node = provisioner.reserve_node(
                    resource_class=resource_class,
                    capabilities=request.get('capabilities'),
                    candidates=candidates,
                    traits=request.get('traits'))
                LOG.info('Reserved node %s for instance %s', node, request)

                reserved_nodes.append(node)
                reservations.append({'node': node.id, 'instance': request})
        except Exception as exc:
            LOG.exception('Provisioning failed, cleaning up')
            # Remove all reservations on failure
            _release_nodes(provisioner, reserved_nodes)
            return actions.Result(
                error="%s: %s" % (type(exc).__name__, exc)
            )

        return {'reservations': reservations}
class DeployNodeAction(base.TripleOAction):
    """Provision instance on a previously reserved node.

    :param instance: the instance request to deploy.
    :param node: the node reserved for this instance.
    :param ssh_keys: SSH keys passed to the cloud-init configuration.
    :param ssh_user_name: admin user added to the instance configuration.
    :param default_network: network used when the request has no ``nics``.
    :param default_root_size: root partition size (GiB) used when the
        request has no ``root_size_gb``.
    """

    def __init__(self, instance, node, ssh_keys=None,
                 # For compatibility with deployment based on heat+nova
                 ssh_user_name='heat-admin',
                 default_network='ctlplane',
                 # 50 is the default for old flavors, subtracting 1G to
                 # account for partitioning and configdrive.
                 default_root_size=49):
        super(DeployNodeAction, self).__init__()
        self.instance = instance
        self.node = node
        self.config = instance_config.CloudInitConfig(ssh_keys=ssh_keys)
        self.config.add_user(ssh_user_name, admin=True, sudo=True)
        self.default_network = default_network
        self.default_root_size = default_root_size

    def run(self, context):
        """Start provisioning of the instance on the reserved node.

        :returns: a dict describing the instance being provisioned, or an
            ``actions.Result`` with ``error`` set. When provisioning cannot
            be started the node reservation is released.
        """
        try:
            _validate_instances([self.instance])
        except Exception as exc:
            LOG.error('Failed to validate the request. %s', exc)
            return actions.Result(error=six.text_type(exc))

        provisioner = _provisioner(context)

        LOG.debug('Starting provisioning of %s on node %s',
                  self.instance, self.node)
        try:
            # Everything that reads the request stays inside the ``try`` so
            # that any failure also releases the node.
            source = _get_source(self.instance)
            nics = self.instance.get(
                'nics', [{'network': self.default_network}])
            root_size_gb = self.instance.get(
                'root_size_gb', self.default_root_size)
            deployed = provisioner.provision_node(
                self.node,
                config=self.config,
                hostname=self.instance['hostname'],
                image=source,
                nics=nics,
                root_size_gb=root_size_gb,
                swap_size_mb=self.instance.get('swap_size_mb'),
            )
        except Exception as exc:
            LOG.exception('Provisioning of %s on node %s failed',
                          self.instance, self.node)
            _release_nodes(provisioner, [self.node])
            return actions.Result(
                error="%s: %s" % (type(exc).__name__, exc)
            )

        LOG.info('Started provisioning of %s on node %s',
                 self.instance, self.node)
        return _instance_to_dict(provisioner.connection, deployed)
class WaitForDeploymentAction(base.TripleOAction):
    """Wait for the instance to be deployed.

    :param instance: dict describing the instance; its ``uuid`` and
        ``hostname`` keys are used.
    :param timeout: timeout passed to metalsmith's
        ``wait_for_provisioning``.
    """

    def __init__(self, instance, timeout=3600):
        super(WaitForDeploymentAction, self).__init__()
        self.instance = instance
        self.timeout = timeout

    def run(self, context):
        """Block until the instance is provisioned.

        :returns: a dict describing the provisioned instance, or an
            ``actions.Result`` with ``error`` set on failure or timeout.
        """
        provisioner = _provisioner(context)
        hostname = self.instance['hostname']

        LOG.debug('Waiting for instance %s to provision', hostname)
        try:
            finished = provisioner.wait_for_provisioning(
                [self.instance['uuid']], timeout=self.timeout)
            deployed = finished[0]
        except Exception as exc:
            LOG.exception('Provisioning of instance %s failed or timed out',
                          hostname)
            # Do not tear down, leave it up for the calling code to handle.
            return actions.Result(
                error="%s: %s" % (type(exc).__name__, exc)
            )

        LOG.info('Successfully provisioned instance %s', hostname)
        return _instance_to_dict(provisioner.connection, deployed)
class UndeployInstanceAction(base.TripleOAction):
    """Undeploy a previously deployed instance.

    :param instance: either a dict with a ``hostname`` key or an
        identifier passed to metalsmith as is.
    :param timeout: passed as ``wait`` to metalsmith's
        ``unprovision_node``.
    """

    def __init__(self, instance, timeout=1800):
        super(UndeployInstanceAction, self).__init__()
        self.instance = instance
        self.timeout = timeout

    def run(self, context):
        """Unprovision the instance; a failed lookup is treated as done."""
        provisioner = _provisioner(context)

        ident = (self.instance['hostname']
                 if isinstance(self.instance, dict) else self.instance)

        try:
            found = provisioner.show_instance(ident)
        except Exception:
            # Best effort: any lookup failure is treated as "already gone".
            LOG.warning('Cannot get instance %s, assuming already deleted',
                        self.instance)
            return

        LOG.debug('Unprovisioning instance %s', found.hostname)
        provisioner.unprovision_node(found.node, wait=self.timeout)
        LOG.info('Successfully unprovisioned %s', found.hostname)
class ExpandRolesAction(base.TripleOAction):
    """Convert a baremetal_deployment file to list of instances.

    :param roles: list of role definitions (see the roles JSON schema).
    :param stackname: value substituted for ``%stackname%`` in hostnames.
    :param default_image: image ``href`` used when a role's defaults do not
        specify an image.
    :param provisioned: which instances to return - those that should be
        provisioned (``True``) or those marked as not provisioned.
    """

    def __init__(self, roles, stackname='overcloud',
                 default_image='overcloud-full', provisioned=True):
        super(ExpandRolesAction, self).__init__()
        self.roles = roles
        self.stackname = stackname
        self.default_image = default_image
        self.provisioned = provisioned

    def run(self, context):
        """Expand the roles into instance requests and an environment.

        :returns: ``{'instances': [...], 'environment': {...}}`` or an
            ``actions.Result`` with ``error`` set when validation fails.
            The environment is empty when unprovisioned instances were
            requested.
        """
        # Apply role defaults to the explicitly listed instances first so
        # that validation sees the effective values.
        for role in self.roles:
            defaults = role.setdefault('defaults', {})
            defaults.setdefault('image', {'href': self.default_image})
            for inst in role.get('instances', []):
                for key, value in defaults.items():
                    inst.setdefault(key, value)

                # Set the default hostname now for duplicate hostname
                # detection during validation
                if 'hostname' not in inst and 'name' in inst:
                    inst['hostname'] = inst['name']

        try:
            _validate_roles(self.roles, stackname=self.stackname)
        except Exception as exc:
            LOG.error('Failed to validate the request. %s', exc)
            return actions.Result(error=six.text_type(exc))

        instances = []
        hostname_map = {}
        parameter_defaults = {'HostnameMap': hostname_map}
        for role in self.roles:
            name = role['name']
            hostname_format = _hostname_format(
                role.get('hostname_format'), name)
            count = role.get('count', 1)
            listed = role.get('instances', [])
            unprovisioned_indexes = []

            # build a map of all potential generated names
            # with the index number which generates the name
            potential_gen_names = {
                _build_hostname(hostname_format, idx, self.stackname): idx
                for idx in range(count + len(listed))
            }

            # build a list of instances from the specified
            # instances list
            role_instances = []
            for listed_inst in listed:
                inst = dict(listed_inst)
                # create a hostname map entry now if the specified hostname
                # is a valid generated name
                if inst.get('hostname') in potential_gen_names:
                    hostname_map[inst['hostname']] = inst['hostname']
                role_instances.append(inst)

            # add generated instance entries until the desired count of
            # provisioned instances is reached. Entries copied from the
            # defaults always count as provisioned because validation
            # rejects a 'provisioned' key in defaults.
            already_provisioned = sum(
                1 for i in role_instances if i.get('provisioned', True))
            for _ in range(count - already_provisioned):
                role_instances.append(dict(role['defaults']))

            # NOTE(dtantsur): our hostname format may differ from THT
            # defaults, so override it in the resulting environment
            parameter_defaults['%sHostnameFormat' % name] = hostname_format

            # ensure each instance has a unique non-empty hostname
            # and a hostname map entry. Also build a list of indexes
            # for unprovisioned instances
            index = 0
            for inst in role_instances:
                provisioned = inst.get('provisioned', True)
                gen_name = None
                hostname = inst.get('hostname')

                if hostname not in hostname_map:
                    # take the next generated name that is still free
                    while not gen_name or gen_name in hostname_map:
                        gen_name = _build_hostname(
                            hostname_format, index, self.stackname)
                        index += 1
                    inst.setdefault('hostname', gen_name)
                    hostname = inst.get('hostname')
                    hostname_map[gen_name] = inst['hostname']

                if provisioned:
                    continue
                if gen_name:
                    unprovisioned_indexes.append(
                        potential_gen_names[gen_name])
                elif hostname in potential_gen_names:
                    unprovisioned_indexes.append(
                        potential_gen_names[hostname])

            if unprovisioned_indexes:
                parameter_defaults['%sRemovalPolicies' % name] = [{
                    'resource_list': unprovisioned_indexes
                }]

            provisioned_count = 0
            for inst in role_instances:
                provisioned = inst.pop('provisioned', True)
                if provisioned:
                    provisioned_count += 1

                # Only add instances which match the desired provisioned
                # state
                if provisioned == self.provisioned:
                    instances.append(inst)

            parameter_defaults['%sCount' % name] = provisioned_count

        try:
            _validate_instances(instances)
        except Exception as exc:
            LOG.error('Failed to validate the request. %s', exc)
            return actions.Result(error=six.text_type(exc))

        env = {'parameter_defaults': parameter_defaults} \
            if self.provisioned else {}
        return {'instances': instances, 'environment': env}
class PopulateEnvironmentAction(base.TripleOAction):
    """Populate the resulting environment file.

    Fills in DeployedServerPortMap with the IP addresses of the nodes.

    :param environment: environment dict to update (modified in place).
    :param port_map: mapping of hostname to per-network port information.
    :param ctlplane_network: name of the network whose ports are recorded.
    """

    def __init__(self, environment, port_map, ctlplane_network='ctlplane'):
        super(PopulateEnvironmentAction, self).__init__()
        self.environment = environment
        self.port_map = port_map
        self.ctlplane_network = ctlplane_network

    def run(self, context):
        """Add the port registry entry and port map to the environment.

        :returns: the updated environment dict.
        """
        # Only these keys are copied from the network/subnet descriptions.
        network_keys = ('mtu', 'tags')
        subnet_keys = (
            'cidr',
            'gateway_ip',
            'host_routes',
            'dns_nameservers',
        )

        registry = self.environment.setdefault('resource_registry', {})
        registry.setdefault(
            'OS::TripleO::DeployedServer::ControlPlanePort',
            '/usr/share/openstack-tripleo-heat-templates'
            '/deployed-server/deployed-neutron-port.yaml')

        parameter_defaults = self.environment.setdefault(
            'parameter_defaults', {})
        port_map = parameter_defaults.setdefault('DeployedServerPortMap', {})

        for hostname, nets in self.port_map.items():
            net_ports = nets.get(self.ctlplane_network)
            if not net_ports:
                LOG.warning('No ctlplane ports information for %s', hostname)
                continue

            fixed_ips = net_ports.get('fixed_ips', [])
            network = {key: value
                       for key, value in net_ports.get('network', {}).items()
                       if key in network_keys}
            subnets = [
                {key: value for key, value in subnet.items()
                 if key in subnet_keys}
                for subnet in net_ports.get('subnets', [])
            ]

            entry_name = '%s-%s' % (hostname, self.ctlplane_network)
            port_map[entry_name] = {
                'fixed_ips': fixed_ips,
                'network': network,
                'subnets': subnets
            }

        return self.environment
def _validate_instances(instances):
    """Validate a list of instance requests.

    The list is checked against the instances JSON schema, the image
    parameters of every instance are checked via ``_get_source`` and
    duplicate hostnames or node names are rejected.

    :raises ValueError: when a hostname or a node name is used twice.
    """
    jsonschema.validate(instances, _INSTANCES_SCHEMA)

    seen_hostnames = set()
    seen_names = set()
    for inst in instances:
        # NOTE(dtantsur): validate image parameters
        _get_source(inst)

        hostname = inst.get('hostname')
        if hostname:
            if hostname in seen_hostnames:
                raise ValueError('Hostname %s is used more than once' %
                                 hostname)
            seen_hostnames.add(hostname)

        node_name = inst.get('name')
        if node_name:
            if node_name in seen_names:
                raise ValueError('Node %s is requested more than once' %
                                 node_name)
            seen_names.add(node_name)
def _validate_roles(roles, stackname='overcloud'):
    """Validate the list of role definitions.

    Besides the roles JSON schema this checks that a role does not list
    more provisioned instances than its ``count``, that the role defaults
    do not set per-instance keys, and that the listed instances are valid.

    :param stackname: accepted but not used by the validation.
    :raises ValueError: when one of the checks above fails.
    """
    jsonschema.validate(roles, _ROLES_INPUT_SCHEMA)

    # Keys that only make sense on an individual instance, paired with the
    # error raised when they show up in a role's defaults.
    forbidden_defaults = (
        ('hostname', "%s: cannot specify hostname in defaults"),
        ('name', "%s: cannot specify name in defaults"),
        ('provisioned', "%s: cannot specify provisioned in defaults"),
    )

    for role in roles:
        count = role.get('count', 1)
        provisioned = [inst for inst in role.get('instances', [])
                       if inst.get('provisioned', True)]
        name = role.get('name')
        if len(provisioned) > count:
            raise ValueError(
                "%s: number of instance entries %s "
                "cannot be greater than count %s" %
                (name, len(provisioned), count)
            )

        defaults = role.get('defaults', {})
        for key, message in forbidden_defaults:
            if key in defaults:
                raise ValueError(message % name)

        if 'instances' in role:
            _validate_instances(role['instances'])
def _release_nodes(provisioner, nodes):
    """Remove the reservation from every node, logging failures.

    A node that cannot be released is logged and skipped so that the
    remaining nodes are still processed.
    """
    for reserved in nodes:
        LOG.debug('Removing reservation from node %s', reserved)
        try:
            provisioner.unprovision_node(reserved)
        except Exception:
            LOG.exception('Unable to release node %s, moving on', reserved)
            continue
        LOG.info('Removed reservation from node %s', reserved)
def _get_source(instance):
    """Build a metalsmith image source from the instance's ``image``.

    Missing image fields are passed to ``sources.detect`` as ``None``.
    """
    image = instance.get('image', {})
    href = image.get('href')
    kernel = image.get('kernel')
    ramdisk = image.get('ramdisk')
    checksum = image.get('checksum')
    return sources.detect(image=href, kernel=kernel, ramdisk=ramdisk,
                          checksum=checksum)
def _instance_to_dict(connection, instance):
    """Convert an instance to a dict, adding ports information.

    The result of ``instance.to_dict()`` gets a ``port_map`` entry keyed by
    network name (or network id when the name is empty). Each entry holds
    the network description plus the fixed IPs and subnets found on the
    instance's NICs.
    """
    result = instance.to_dict()
    port_map = {}
    result['port_map'] = port_map

    for nic in instance.nics():
        for fixed_ip in nic.fixed_ips:
            net_name = getattr(nic.network, 'name', None) or nic.network.id
            subnet = connection.network.get_subnet(fixed_ip['subnet_id'])

            fresh_entry = {'network': nic.network.to_dict(),
                           'fixed_ips': [], 'subnets': []}
            net_info = port_map.setdefault(net_name, fresh_entry)
            net_info['fixed_ips'].append(
                {'ip_address': fixed_ip['ip_address']})
            net_info['subnets'].append(subnet.to_dict())

    return result
def _hostname_format(hostname_format, role_name):
    """Return the hostname format of a role, building a default if needed.

    An explicit non-empty format is returned untouched. Otherwise the
    default is ``%stackname%-<role>-%index%`` where ``<role>`` is the
    lower-cased role name, except for ``Compute`` which becomes
    ``novacompute``.
    """
    if hostname_format:
        return hostname_format

    if role_name == 'Compute':
        role_part = 'novacompute'
    else:
        role_part = role_name.lower()
    return '%stackname%-{}-%index%'.format(role_part)
def _build_hostname(hostname_format, index, stack):
    """Fill ``%index%`` and then ``%stackname%`` into a hostname format."""
    return (hostname_format
            .replace('%index%', str(index))
            .replace('%stackname%', stack))