134270f240
Currently, we leverage Ansible to handle the baremetal configure process. As part of our efforts to support work on Tripleo.Next, these Ansible workflows will need to be migrated to tripleoclient. This change consolidates Python methods from tripleo-ansible into tripleoclient. Change-Id: I1dbb3e4864688ec931bb19a8f1891d2822632f5a
527 lines
19 KiB
Python
527 lines
19 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import logging
|
|
from typing import Dict
|
|
from typing import List
|
|
|
|
from concurrent import futures
|
|
from openstack import connect as sdkclient
|
|
from openstack import exceptions
|
|
from openstack.utils import iterate_timeout
|
|
from oslo_utils import units
|
|
from tripleoclient import exceptions as ooo_exceptions
|
|
from tripleo_common.utils import nodes as node_utils
|
|
|
|
|
|
class TripleoBaremetal(object):
    """Common foundation for the TripleO baremetal helper classes.

    Instances hold the pieces the helpers share: an SDK connection to the
    ``undercloud`` cloud, a timeout value and a logger.

    :param timeout: How long to wait until we consider this job to have
                    timed out
    :type timeout: integer

    :param verbosity: How verbose should we be. Currently, any value
                      greater than zero switches the logger to DEBUG.
    :type verbosity: integer
    """

    def __init__(self, timeout: int = 1200, verbosity: int = 1):
        # The connection is created first, exactly as before, so a failure
        # to connect surfaces before any other attribute is set.
        self.conn = sdkclient(cloud='undercloud')
        self.timeout = timeout
        self.log = logging.getLogger(__name__)

        debug_requested = verbosity > 0
        if debug_requested:
            self.log.setLevel(logging.DEBUG)

    def all_manageable_nodes(self):
        """Return the IDs of all manageable nodes known to Ironic.

        No arguments are taken. The baremetal service is queried for nodes
        whose provision state is ``manageable`` and which are NOT in
        maintenance, and the IDs of those nodes are collected.

        Raises:
            NoNodeFound: If the query yields no node at all.

        Returns:
            nodes: The list of IDs of the manageable nodes that are not
                   currently in maintenance.
        """
        matching = self.conn.baremetal.nodes(
            provision_state='manageable', is_maintenance=False)

        node_ids = []
        for entry in matching:
            node_ids.append(entry.id)

        if node_ids:
            return node_ids

        raise ooo_exceptions.NoNodeFound
|
|
|
|
|
|
class TripleoProvide(TripleoBaremetal):
    """TripleoProvide handles state transition of baremetal nodes.

    The TripleoProvide class handles the transition of nodes between the
    manageable and available states.

    :param wait_for_bridge_mappings: Bool to determine whether or not we are
                                     waiting for the bridge mapping to be
                                     active in ironic-neutron-agent
    :type wait_for_bridge_mappings: bool

    :param verbosity: How verbose should we be; forwarded to the base class.
    :type verbosity: integer
    """

    def __init__(self, wait_for_bridge_mappings: bool = False,
                 verbosity: int = 1):
        # verbosity must be passed by keyword: the first positional
        # parameter of the base initializer is ``timeout``, so a positional
        # call would silently use the verbosity value as the timeout.
        super().__init__(verbosity=verbosity)
        self.wait_for_bridge_mappings = wait_for_bridge_mappings

    def _wait_for_unlocked(self, node: str, timeout: int):
        """Poll a node until its ``reservation`` field is None.

        :param node: The node UUID or name to poll
        :type node: String

        :param timeout: Timeout handed to ``iterate_timeout``
        :type timeout: integer
        """
        timeout_msg = f'Timeout waiting for node {node} to be unlocked'

        # NOTE(review): iterate_timeout is expected to raise once the
        # timeout elapses (openstacksdk behaviour, not visible here).
        for _ in iterate_timeout(timeout, timeout_msg):
            node_info = self.conn.baremetal.get_node(
                node,
                fields=['reservation']
            )

            if node_info.reservation is None:
                return

    def _wait_for_bridge_mapping(self, node: str):
        """Poll until the node's ironic-neutron-agent has bridge_mappings.

        :param node: The node UUID or name, used as the agent ``host``
        :type node: String
        """
        client = self.conn.network
        timeout_msg = (f'Timeout waiting for node {node} to have '
                       'bridge_mappings set in the ironic-neutron-agent '
                       'entry')

        # default agent polling period is 30s, so wait 60s
        timeout = 60

        for _ in iterate_timeout(timeout, timeout_msg):
            agents = list(
                client.agents(host=node, binary='ironic-neutron-agent'))

            if agents and agents[0].configuration.get('bridge_mappings'):
                return

    def provide(self, nodes: List[str]):
        """Transition nodes to the Available state.

        provide handles the state transition from the nodes current state
        to the available state. The "provide" action is requested for each
        node in turn, then we wait for all nodes whose request was accepted
        to reach the ``available`` state.

        A node whose request cannot be started is logged and skipped; the
        remaining nodes are still processed.

        :param nodes: The node UUIDs or names that we will be working on.
                      A single UUID/name given as a string is also accepted.
        :type nodes: List
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(nodes, str):
            nodes = [nodes]

        client = self.conn.baremetal
        node_timeout = self.timeout
        # Separate list: nodes that fail to start are dropped from the wait.
        nodes_wait = list(nodes)

        for node in nodes:
            self.log.info('Providing node: {}'.format(node))
            self._wait_for_unlocked(node, node_timeout)

            if self.wait_for_bridge_mappings:
                self._wait_for_bridge_mapping(node)

            try:
                client.set_node_provision_state(
                    node,
                    "provide",
                    wait=False)
            except Exception as e:
                nodes_wait.remove(node)
                self.log.error(
                    "Can not start providing for node {}: {}".format(
                        node, e))
                # Keep going so one bad node does not abandon the others.
                continue

        if not nodes_wait:
            # Nothing was started, so there is nothing to wait for.
            return

        try:
            self.log.info(
                "Waiting for available state: {}".format(nodes_wait))

            client.wait_for_nodes_provision_state(
                nodes=nodes_wait,
                expected_state='available',
                timeout=self.timeout,
                fail=False
            )
        except exceptions.ResourceFailure as e:
            self.log.error("Failed providing nodes due to failure: {}".format(
                e))
        except exceptions.ResourceTimeout as e:
            self.log.error("Failed providing nodes due to timeout: {}".format(
                e))

    def provide_manageable_nodes(self):
        """Provide every node returned by ``all_manageable_nodes``."""
        self.provide(self.all_manageable_nodes())
|
|
|
|
|
|
class TripleoClean(TripleoBaremetal):
    """TripleoClean manages the Ironic node cleaning process.

    :param all_manageable: Should we work on all nodes in the manageable state
    :type all_manageable: bool

    :param provide: Should we also set the nodes back to the available state.
                    The value is only stored on the instance by this class.
    :type provide: bool

    :param timeout: How long should we wait before we consider the nodes to
                    have failed.
    :type timeout: integer

    :param raid_config: The raid configuration we should configure on the
                        node. ``None`` (the default) means no raid
                        configuration.
    :type raid_config: Dictionary

    :param concurrency: How many nodes should we do at once
    :type concurrency: integer

    :param clean_steps: The Ironic cleaning steps that should be executed on
                        the nodes. ``None`` (the default) selects the
                        ``erase_devices_metadata`` deploy step.
    :type clean_steps: List
    """

    log = logging.getLogger(__name__)

    def __init__(self, all_manageable: bool = False, provide: bool = False,
                 timeout: int = 60, raid_config: Dict = None,
                 concurrency: int = 1, verbosity: int = 0,
                 clean_steps: List = None):
        super().__init__(verbosity=verbosity, timeout=timeout)
        self.all_manageable = all_manageable
        self.provide = provide
        # Defaults are built per instance instead of using mutable default
        # arguments, which would be shared between all instances.
        self.raid_config = {} if raid_config is None else raid_config
        if clean_steps is None:
            clean_steps = [{'interface': 'deploy',
                            'step': 'erase_devices_metadata'}]
        self.clean_steps = clean_steps
        self.concurrency = concurrency

    @staticmethod
    def _classify_wait_result(result):
        """Split a provision-state wait result into (failed, succeeded).

        Handles both return shapes of ``wait_for_nodes_provision_state``:
        a named tuple with ``success``/``failure``/``timeout`` lists
        (returned when ``fail=False``) or a plain iterable of nodes, which
        is split on ``last_error``. Nodes are identified by name, falling
        back to the id when the name is empty.
        """
        if result is None:
            return [], []

        def ident(node):
            return node.name or node.id

        if hasattr(result, 'success') and hasattr(result, 'failure'):
            failed = list(result.failure) + list(
                getattr(result, 'timeout', []))
            succeeded = list(result.success)
        else:
            node_list = list(result)
            failed = [n for n in node_list if n.last_error]
            succeeded = [n for n in node_list if not n.last_error]

        return [ident(n) for n in failed], [ident(n) for n in succeeded]

    def _parallel_nodes_cleaning(self, nodes: List):
        """Clean the given nodes, up to ``concurrency`` at a time.

        When a raid configuration is set it is applied to each node first;
        nodes rejecting it are recorded as failed and are not cleaned.

        :param nodes: A list of node UUIDs or names to clean
        :type nodes: List

        :returns: A tuple ``(failed_nodes, success_nodes)`` of sets
        """
        client = self.conn.baremetal
        node_timeout = self.timeout
        clean_steps = self.clean_steps
        failed_nodes = []
        success_nodes = []
        # Work on a copy so the caller's list is never mutated.
        nodes = list(nodes)

        if self.raid_config:
            # Build a new list instead of popping from the list being
            # iterated, which would skip the element after each failure.
            remaining = []
            for node in nodes:
                try:
                    client.update_node(
                        node,
                        target_raid_config=self.raid_config
                    )
                except exceptions.BadRequestException as err:
                    self.log.error("Setting raid configuration "
                                   "for node {} failed. Error: {}".format(
                                       node, err
                                   ))
                    failed_nodes.append(node)
                else:
                    success_nodes.append(node)
                    self.log.info("Setting the raid configuration "
                                  "for node {} succeeded.".format(node))
                    remaining.append(node)
            nodes = remaining

        # ThreadPoolExecutor rejects max_workers < 1.
        workers = max(1, min(len(nodes), self.concurrency))
        with futures.ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_node = {
                executor.submit(
                    client.set_node_provision_state,
                    node,
                    "clean",
                    clean_steps=clean_steps,
                    wait=True
                ): node for node in nodes
            }
            done, _not_done = futures.wait(
                future_to_node,
                timeout=node_timeout,
                return_when=futures.ALL_COMPLETED
            )
            # Surface errors raised inside the workers; they would be lost
            # otherwise. Final classification is left to the wait below.
            for future in done:
                error = future.exception()
                if error is not None:
                    self.log.error(
                        "Cleaning request for node {} failed: {}".format(
                            future_to_node[future], error))

        result = None
        try:
            self.log.info(
                "Waiting for manageable state: {}".format(nodes))
            result = client.wait_for_nodes_provision_state(
                nodes=nodes,
                expected_state='manageable',
                timeout=self.timeout,
                fail=False
            )
        except exceptions.ResourceFailure as e:
            self.log.error("Failed cleaning nodes due to failure: {}".format(
                e))
            # Outcome per node is unknown; do not report them as cleaned.
            failed_nodes.extend(nodes)
        except exceptions.ResourceTimeout as e:
            self.log.error("Failed cleaning nodes due to timeout: {}".format(
                e))
            failed_nodes.extend(nodes)

        failed, succeeded = self._classify_wait_result(result)
        failed_nodes.extend(failed)
        success_nodes.extend(succeeded)

        return set(failed_nodes), set(success_nodes)

    def clean_manageable_nodes(self):
        """Clean every node returned by ``all_manageable_nodes``."""
        self.clean(nodes=self.all_manageable_nodes())

    def clean(self, nodes: List):
        """clean manages the cleaning process for the Ironic nodes.

        Using the configured clean steps, this method will clean the
        provided baremetal nodes and log a summary of the outcome.
        ``OpenStackCloudException`` errors are logged, not raised.

        :param nodes: A list of nodes to clean
        :type nodes: List
        """
        if not nodes:
            self.log.error("Provide either UUID or names of nodes!")
            # Nothing to clean; do not go on to report a bogus success.
            return

        try:
            failed_nodes, success_nodes = self._parallel_nodes_cleaning(
                nodes)
            if failed_nodes:
                msg = ("Cleaning completed with failures. "
                       f"{failed_nodes} node(s) failed.")
                self.log.error(msg)
            else:
                msg = ("Cleaning completed "
                       f"successfully: {len(success_nodes)} nodes")
                self.log.info(msg)
        except exceptions.OpenStackCloudException as err:
            self.log.error(str(err))
|
|
|
|
|
|
class TripleoConfigure(TripleoBaremetal):
    """TripleoConfigure handles properties for the ironic nodes.

    We use this class to set the properties for each node such as the
    kernel, ramdisk, boot device, root_device.

    :param kernel_name: The name of the kernel image we will deploy
    :type kernel_name: String

    :param ramdisk_name: The name of the ramdisk image we will deploy
    :type ramdisk_name: String

    :param instance_boot_option: Should the node boot from local disks or
                                 something else
    :type instance_boot_option: String

    :param boot_mode: Is this node using BIOS or UEFI
    :type boot_mode: String

    :param root_device: Root device strategy for this node: ``smallest``,
                        ``largest`` or a comma separated list of device
                        names without the ``/dev/`` prefix, eg ``sda,sdb``
    :type root_device: String

    :param root_device_minimum_size: What is the smallest disk (in GiB) we
                                     should consider acceptable for
                                     deployment
    :type root_device_minimum_size: Integer

    :param overwrite_root_device_hints: Should we overwrite existing root
                                        device hints when root_device is used.
    :type overwrite_root_device_hints: Boolean
    """

    log = logging.getLogger(__name__)

    def __init__(self, kernel_name: str = None, ramdisk_name: str = None,
                 instance_boot_option: str = None, boot_mode: str = None,
                 root_device: str = None, verbosity: int = 0,
                 root_device_minimum_size: int = 4,
                 overwrite_root_device_hints: bool = False):

        super().__init__(verbosity=verbosity)
        self.kernel_name = kernel_name
        self.ramdisk_name = ramdisk_name
        self.instance_boot_option = instance_boot_option
        self.boot_mode = boot_mode
        self.root_device = root_device
        self.root_device_minimum_size = root_device_minimum_size
        self.overwrite_root_device_hints = overwrite_root_device_hints

    def _apply_root_device_strategy(self, node_uuid: str,
                                    strategy: str, minimum_size: int = 4,
                                    overwrite: bool = False):
        """Pick a root device from introspection data and set its hint.

        :param node_uuid: UUID or name of the node to update
        :param strategy: ``smallest``, ``largest`` or a comma separated
                         list of device names (without ``/dev/``)
        :param minimum_size: Minimum acceptable disk size in GiB
        :param overwrite: Replace root device hints that are already set

        :raises RootDeviceDetectionError: when introspection data is missing
            or malformed, no disk qualifies, or the chosen disk has neither
            a WWN nor a serial number.
        """
        baremetal_client = self.conn.baremetal
        # ignore_missing=False: raise instead of returning None for an
        # unknown node, which would fail below with an AttributeError.
        node = baremetal_client.find_node(node_uuid, ignore_missing=False)

        if node.properties.get('root_device') and not overwrite:
            # This is a correct situation, we still want to allow people to
            # fine-tune the root device setting for a subset of nodes.
            # However, issue a warning, so that they know which nodes were not
            # updated during this run.
            self.log.warning('Root device hints are already set for node '
                             '{} and overwriting is not requested,'
                             ' skipping'.format(node.id))
            self.log.warning('You may unset them by running $ ironic '
                             'node-update {} remove '
                             'properties/root_device'.format(node.id))
            return

        inspector_client = self.conn.baremetal_introspection

        # RootDeviceDetectionError is a tripleoclient exception; the
        # openstack ``exceptions`` module does not define it.
        try:
            data = inspector_client.get_introspection_data(node.id)
        except Exception as err:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'No introspection data found for node {node.id}, '
                'root device cannot be detected') from err
        try:
            disks = data['inventory']['disks']
        except (KeyError, TypeError) as err:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'Malformed introspection data for node {node.id}: '
                'disks list is missing') from err

        minimum_size *= units.Gi
        # ``or 0`` also covers disks reported with an explicit size of None.
        disks = [d for d in disks if (d.get('size') or 0) >= minimum_size]

        if not disks:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'No suitable disks found for node {node.id}')

        if strategy == 'smallest':
            disks.sort(key=lambda d: d['size'])
            root_device = disks[0]
        elif strategy == 'largest':
            disks.sort(key=lambda d: d['size'], reverse=True)
            root_device = disks[0]
        else:
            # Otherwise the strategy is an ordered list of candidate device
            # names; the first one present on the node wins.
            disk_names = [x.strip() for x in strategy.split(',')]
            disks = {d['name']: d for d in disks}
            for candidate in disk_names:
                try:
                    root_device = disks['/dev/%s' % candidate]
                except KeyError:
                    continue
                else:
                    break
            else:
                raise ooo_exceptions.RootDeviceDetectionError(
                    f'Cannot find a disk with any of names {strategy} '
                    f'for node {node.id}')

        hint = None

        for hint_name in ('wwn_with_extension', 'wwn', 'serial'):
            if root_device.get(hint_name):
                hint = {hint_name: root_device[hint_name]}
                break

        if hint is None:
            # I don't think it might actually happen, but just in case
            raise ooo_exceptions.RootDeviceDetectionError(
                f"Neither WWN nor serial number are known for device "
                f"{root_device['name']} "
                f"on node {node.id}; root device hints cannot be used")

        # During the introspection process we got local_gb assigned according
        # to the default strategy. Now we need to update it.
        # Floor division keeps local_gb an integer number of GiB.
        new_size = root_device['size'] // units.Gi
        # This -1 is what we always do to account for partitioning
        new_size -= 1

        # A JSON patch must go through patch_node; update_node takes
        # attributes as keywords and its second positional argument is
        # retry_on_conflict.
        baremetal_client.patch_node(
            node.id,
            [{'op': 'add', 'path': '/properties/root_device', 'value': hint},
             {'op': 'add', 'path': '/properties/local_gb', 'value': new_size}])
        self.log.info('Updated root device for node %s, new device '
                      'is %s, new local_gb is %s',
                      node.id, root_device, new_size
                      )

    def _configure_boot(self, node_uuid: str,
                        kernel_name: str = None,
                        ramdisk_name: str = None,
                        instance_boot_option: str = None,
                        boot_mode: str = None):
        """Set capabilities and deploy/rescue images on a single node.

        :param node_uuid: UUID or name of the node to update
        :param kernel_name: Value for the deploy and rescue kernel
        :param ramdisk_name: Value for the deploy and rescue ramdisk
        :param instance_boot_option: ``boot_option`` capability; left
                                     untouched when None
        :param boot_mode: ``boot_mode`` capability; left untouched when None
        """
        baremetal_client = self.conn.baremetal

        image_ids = {'kernel': kernel_name, 'ramdisk': ramdisk_name}
        # Raise for an unknown node instead of failing on None below.
        node = baremetal_client.find_node(node_uuid, ignore_missing=False)
        capabilities = node.properties.get('capabilities', {})
        capabilities = node_utils.capabilities_to_dict(capabilities)

        if instance_boot_option is not None:
            capabilities['boot_option'] = instance_boot_option
        if boot_mode is not None:
            capabilities['boot_mode'] = boot_mode

        capabilities = node_utils.dict_to_capabilities(capabilities)
        # patch_node applies the JSON patch; update_node would treat the
        # list as its retry_on_conflict argument and change nothing.
        baremetal_client.patch_node(node.id, [
            {
                'op': 'add',
                'path': '/properties/capabilities',
                'value': capabilities,
            },
            {
                'op': 'add',
                'path': '/driver_info/deploy_ramdisk',
                'value': image_ids['ramdisk'],
            },
            {
                'op': 'add',
                'path': '/driver_info/deploy_kernel',
                'value': image_ids['kernel'],
            },
            {
                'op': 'add',
                'path': '/driver_info/rescue_ramdisk',
                'value': image_ids['ramdisk'],
            },
            {
                'op': 'add',
                'path': '/driver_info/rescue_kernel',
                'value': image_ids['kernel'],
            },
        ])

    def configure(self, node_uuids: List):
        """Configure Node boot options.

        Boot settings are applied to every node; the root device strategy
        is applied as well when ``root_device`` was given.

        :param node_uuids: List of instance UUID(s).
        :type node_uuids: List
        """
        for node_uuid in node_uuids:
            self._configure_boot(node_uuid, self.kernel_name,
                                 self.ramdisk_name, self.instance_boot_option,
                                 self.boot_mode)
            if self.root_device:
                self._apply_root_device_strategy(
                    node_uuid,
                    strategy=self.root_device,
                    minimum_size=self.root_device_minimum_size,
                    overwrite=self.overwrite_root_device_hints)

        self.log.info('Successfully configured the nodes.')

    def configure_manageable_nodes(self):
        """Configure every node returned by ``all_manageable_nodes``."""
        self.configure(node_uuids=self.all_manageable_nodes())
|