# -*- coding: utf-8 -*-

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import logging

from concurrent import futures
from typing import Dict
from typing import List

from openstack import connect as sdkclient
from openstack import exceptions
from openstack.utils import iterate_timeout
from oslo_utils import units

from tripleo_common.utils import nodes as node_utils
from tripleoclient import exceptions as ooo_exceptions


class TripleoBaremetal(object):
    """Base class for TripleO Baremetal operations.

    The TripleoBaremetal class provides access to commonly used elements
    required to interact with and perform baremetal operations for TripleO.

    :param timeout: How long to wait until we consider this job to have
                    timed out
    :type timeout: integer

    :param verbosity: How verbose should we be. Currently, this just sets
                      DEBUG for any non-zero value provided.
    :type verbosity: integer
    """

    def __init__(self, timeout: int = 1200, verbosity: int = 1):
        # Connect to the 'undercloud' cloud defined in the local
        # clouds.yaml via the OpenStack SDK.
        self.conn = sdkclient(
            cloud='undercloud'
        )
        self.timeout = timeout
        self.log = logging.getLogger(__name__)
        if verbosity > 0:
            self.log.setLevel(logging.DEBUG)

    def all_manageable_nodes(self) -> List:
        """Return a list of manageable node IDs from Ironic.

        We take no arguments and instead create a list of nodes that
        are in the manageable state and NOT in maintenance. We return the
        subsequent list.

        :raises tripleoclient.exceptions.NoNodeFound: if no nodes match
            the above description.

        :returns: the list of manageable nodes that are not currently in
                  maintenance.
        :rtype: List
        """
        nodes = [n.id for n in self.conn.baremetal.nodes(
            provision_state='manageable', is_maintenance=False)]

        if not nodes:
            raise ooo_exceptions.NoNodeFound

        return nodes

class TripleoProvide(TripleoBaremetal):
    """TripleoProvide handles state transition of baremetal nodes.

    The TripleoProvide class handles the transition of nodes between the
    manageable and available states.

    :param wait_for_bridge_mappings: Bool to determine whether or not we
                                     are waiting for the bridge mapping to
                                     be active in ironic-neutron-agent
    :type wait_for_bridge_mappings: bool
    """

    def __init__(self, wait_for_bridge_mappings: bool = False,
                 timeout: int = 60, verbosity: int = 1):
        super().__init__(timeout=timeout, verbosity=verbosity)
        self.wait_for_bridge_mappings = wait_for_bridge_mappings

    def _wait_for_unlocked(self, node: str, timeout: int):
        """Wait until the node no longer holds a conductor reservation.

        :param node: The node UUID or name to poll.
        :type node: string
        :param timeout: How many seconds to wait before giving up.
        :type timeout: integer
        """
        timeout_msg = f'Timeout waiting for node {node} to be unlocked'

        # iterate_timeout raises on expiry, so returning is the success
        # path here.
        for count in iterate_timeout(timeout, timeout_msg):
            node_info = self.conn.baremetal.get_node(
                node,
                fields=['reservation']
            )

            # A null reservation means no conductor currently locks it.
            if node_info.reservation is None:
                return

    def _wait_for_bridge_mapping(self, node: str):
        """Wait for the node's ironic-neutron-agent bridge_mappings.

        :param node: The node UUID or name the agent entry is keyed on.
        :type node: string
        """
        client = self.conn.network
        timeout_msg = (f'Timeout waiting for node {node} to have '
                       'bridge_mappings set in the ironic-neutron-agent '
                       'entry')

        # default agent polling period is 30s, so wait 60s
        timeout = 60

        for count in iterate_timeout(timeout, timeout_msg):
            agents = list(
                client.agents(host=node, binary='ironic-neutron-agent'))

            if agents:
                if agents[0].configuration.get('bridge_mappings'):
                    return

    def provide(self, nodes: List):
        """Transition nodes to the Available state.

        provide handles the state transition from the nodes current state
        to the available state.

        :param nodes: The node UUIDs or names that we will be working on.
        :type nodes: List
        """
        client = self.conn.baremetal
        node_timeout = self.timeout
        # Work on a copy so failed nodes can be dropped from the wait set
        # without mutating the caller's list.
        nodes_wait = nodes[:]

        for node in nodes:
            self.log.info('Providing node: {}'.format(node))
            self._wait_for_unlocked(node, node_timeout)

            if self.wait_for_bridge_mappings:
                self._wait_for_bridge_mapping(node)

            try:
                client.set_node_provision_state(
                    node,
                    "provide",
                    wait=False)
            except Exception as e:
                # Drop only the failing node and keep going: one bad node
                # must not abort providing of the remaining nodes (the
                # previous code returned here, and logged the whole list
                # instead of the failing node).
                nodes_wait.remove(node)
                self.log.error(
                    "Can not start providing for node {}: {}".format(
                        node, e))
                continue

        if not nodes_wait:
            # Every node failed to start providing; nothing to wait for.
            return

        try:
            self.log.info(
                "Waiting for available state: {}".format(nodes_wait))

            client.wait_for_nodes_provision_state(
                nodes=nodes_wait,
                expected_state='available',
                timeout=self.timeout,
                fail=False
            )
        except exceptions.ResourceFailure as e:
            self.log.error("Failed providing nodes due to failure: {}".format(
                e))
            return
        except exceptions.ResourceTimeout as e:
            self.log.error("Failed providing nodes due to timeout: {}".format(
                e))

    def provide_manageable_nodes(self):
        """Provide every node currently in the manageable state."""
        self.provide(self.all_manageable_nodes())

class TripleoClean(TripleoBaremetal):
    """TripleoClean manages the Ironic node cleaning process.

    :param all_manageable: Should we work on all nodes in the manageable
                           state
    :type all_manageable: bool

    :param provide: Should we also set the nodes back to the available
                    state
    :type provide: bool

    :param timeout: How long should we wait before we consider the nodes
                    to have failed.
    :type timeout: integer

    :param raid_config: The raid configuration we should configure on the
                        node
    :type raid_config: Dictionary

    :param concurrency: How many nodes should we do at once
    :type concurrency: integer

    :param clean_steps: The Ironic cleaning steps that should be executed
                        on the nodes. Defaults to erasing the devices
                        metadata.
    :type clean_steps: List
    """

    log = logging.getLogger(__name__)

    def __init__(self, all_manageable: bool = False, provide: bool = False,
                 timeout: int = 60, raid_config: Dict = None,
                 concurrency: int = 1, verbosity: int = 0,
                 clean_steps: List = None):
        super().__init__(verbosity=verbosity, timeout=timeout)
        self.all_manageable = all_manageable
        self.provide = provide
        # None sentinels instead of mutable default arguments: literal
        # {} / [...] defaults would be shared across every instance.
        self.raid_config = raid_config if raid_config is not None else {}
        if clean_steps is None:
            clean_steps = [{'interface': 'deploy',
                            'step': 'erase_devices_metadata'}]
        self.clean_steps = clean_steps
        self.concurrency = concurrency

    def _parallel_nodes_cleaning(self, nodes: List):
        """Run cleaning on the given nodes concurrently.

        When a raid configuration was supplied it is applied first; nodes
        that reject it are excluded from cleaning. Cleaning requests are
        then fanned out over a thread pool and we wait for all nodes to
        come back to the manageable state.

        :param nodes: node UUIDs or names to clean
        :type nodes: List

        :returns: a tuple of (failed node names, successful node names)
        :rtype: tuple(set, set)
        """
        client = self.conn.baremetal
        node_timeout = self.timeout
        clean_steps = self.clean_steps
        failed_nodes = []
        success_nodes = []

        if self.raid_config:
            # Iterate over a copy: nodes whose raid config is rejected are
            # removed from `nodes`, and removing items from a list while
            # iterating over it skips elements.
            for node in list(nodes):
                try:
                    client.update_node(
                        node,
                        target_raid_config=self.raid_config
                    )
                    # NOTE: do not add the node to success_nodes here -
                    # a successful raid update does not mean cleaning
                    # succeeded; doing so could report a node as both
                    # failed and successful.
                    self.log.info("Setting the raid configuration "
                                  "for node {} succeeded.".format(node))
                except exceptions.BadRequestException as err:
                    self.log.error("Setting raid configuration "
                                   "for node {} failed. Error: {}".format(
                                       node, err
                                   ))
                    failed_nodes.append(node)
                    nodes.remove(node)

        # ThreadPoolExecutor requires max_workers >= 1, hence the `or 1`
        # guard for the case where every node was rejected above.
        workers = min(len(nodes), self.concurrency) or 1
        with futures.ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_build = {
                executor.submit(
                    client.set_node_provision_state,
                    node,
                    "clean",
                    clean_steps=clean_steps,
                    wait=True
                ): node for node in nodes
            }
            done, not_done = futures.wait(
                future_to_build,
                timeout=node_timeout,
                return_when=futures.ALL_COMPLETED
            )

        # Pre-initialise so the `finally` block cannot raise a NameError
        # when wait_for_nodes_provision_state fails before assignment.
        res = []
        try:
            self.log.info(
                "Waiting for manageable state: {}".format(nodes))
            res = client.wait_for_nodes_provision_state(
                nodes=nodes,
                expected_state='manageable',
                timeout=self.timeout,
                fail=False
            )
        except exceptions.ResourceFailure as e:
            self.log.error("Failed cleaning nodes due to failure: {}".format(
                e))
        except exceptions.ResourceTimeout as e:
            self.log.error("Failed cleaning nodes due to timeout: {}".format(
                e))
        finally:
            # Nodes that kept a last_error failed cleaning; the others
            # made it back to manageable successfully.
            failed_nodes.extend(n.name for n in res if n.last_error)
            success_nodes.extend(n.name for n in res if not n.last_error)

        return set(failed_nodes), set(success_nodes)

    def clean_manageable_nodes(self):
        """Clean every node currently in the manageable state."""
        self.clean(nodes=self.all_manageable_nodes())

    def clean(self, nodes: List):
        """clean manages the cleaning process for the Ironic nodes.

        Using the provided clean steps, this method will clean the
        provided baremetal nodes.

        :param nodes: A list of nodes to clean
        :type nodes: List
        """
        if not nodes:
            # Bail out instead of invoking the cleaning machinery with an
            # empty list (the previous code logged and then fell through).
            self.log.error("Provide either UUID or names of nodes!")
            return
        try:
            failed_nodes, success_nodes = self._parallel_nodes_cleaning(
                nodes)
            if failed_nodes:
                msg = ("Cleaning completed with failures. "
                       f"{failed_nodes} node(s) failed.")
                self.log.error(msg)
            else:
                msg = ("Cleaning completed "
                       f"successfully: {len(success_nodes)} nodes")
                self.log.info(msg)
        except exceptions.OpenStackCloudException as err:
            self.log.error(str(err))

class TripleoConfigure(TripleoBaremetal):
    """TripleoConfigure handles properties for the ironic nodes.

    We use this class to set the properties for each node such as the
    kernel, ramdisk, boot device, root_device.

    :param kernel_name: The name of the kernel image we will deploy
    :type kernel_name: String

    :param ramdisk_name: The name of the ramdisk image we will deploy
    :type ramdisk_name: String

    :param instance_boot_option: Should the node boot from local disks or
                                 something else
    :type instance_boot_option: String

    :param boot_mode: Is this node using BIOS or UEFI
    :type boot_mode: String

    :param root_device: The root device strategy for this node: either
                        'smallest', 'largest' or a comma separated list of
                        device names (without /dev/). eg sda
    :type root_device: String

    :param root_device_minimum_size: What is the smallest disk (in GiB)
                                     we should consider acceptable for
                                     deployment
    :type root_device_minimum_size: Integer

    :param overwrite_root_device_hints: Should we overwrite existing root
                                        device hints when root_device is
                                        used.
    :type overwrite_root_device_hints: Boolean
    """

    log = logging.getLogger(__name__)

    def __init__(self, kernel_name: str = None, ramdisk_name: str = None,
                 instance_boot_option: str = None, boot_mode: str = None,
                 root_device: str = None, verbosity: int = 0,
                 root_device_minimum_size: int = 4,
                 overwrite_root_device_hints: bool = False):
        super().__init__(verbosity=verbosity)
        self.kernel_name = kernel_name
        self.ramdisk_name = ramdisk_name
        self.instance_boot_option = instance_boot_option
        self.boot_mode = boot_mode
        self.root_device = root_device
        self.root_device_minimum_size = root_device_minimum_size
        self.overwrite_root_device_hints = overwrite_root_device_hints

    def _apply_root_device_strategy(self, node_uuid: str,
                                    strategy: str, minimum_size: int = 4,
                                    overwrite: bool = False):
        """Set root device hints on a node from its introspection data.

        :param node_uuid: node UUID or name to update
        :param strategy: 'smallest', 'largest' or a comma separated list
                         of device names (without the /dev/ prefix)
        :param minimum_size: smallest acceptable disk size in GiB
        :param overwrite: replace existing root device hints when True

        :raises tripleoclient.exceptions.RootDeviceDetectionError: when
            there is no introspection data, no suitable disk, or no usable
            hint for the chosen disk.
        """
        clients = self.conn
        node = clients.baremetal.find_node(node_uuid)

        if node.properties.get('root_device') and not overwrite:
            # This is a correct situation, we still want to allow people to
            # fine-tune the root device setting for a subset of nodes.
            # However, issue a warning, so that they know which nodes were not
            # updated during this run.
            self.log.warning('Root device hints are already set for node '
                             '{} and overwriting is not requested,'
                             ' skipping'.format(node.id))
            self.log.warning('You may unset them by running $ ironic '
                             'node-update {} remove '
                             'properties/root_device'.format(node.id))
            return

        inspector_client = self.conn.baremetal_introspection
        baremetal_client = self.conn.baremetal

        # NOTE: RootDeviceDetectionError is defined in tripleoclient's
        # exceptions module (ooo_exceptions), not in openstack.exceptions;
        # raising it through the SDK module was an AttributeError waiting
        # to happen.
        try:
            data = inspector_client.get_introspection_data(node.id)
        except Exception:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'No introspection data found for node {node.id}, '
                'root device cannot be detected')
        try:
            disks = data['inventory']['disks']
        except KeyError:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'Malformed introspection data for node {node.id}: '
                'disks list is missing')

        # minimum_size arrives in GiB; disk sizes are in bytes.
        minimum_size *= units.Gi
        disks = [d for d in disks if d.get('size', 0) >= minimum_size]

        if not disks:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'No suitable disks found for node {node.id}')

        if strategy == 'smallest':
            disks.sort(key=lambda d: d['size'])
            root_device = disks[0]
        elif strategy == 'largest':
            disks.sort(key=lambda d: d['size'], reverse=True)
            root_device = disks[0]
        else:
            # The strategy is a comma separated list of preferred device
            # names; take the first one that exists on the node.
            disk_names = [x.strip() for x in strategy.split(',')]
            disks = {d['name']: d for d in disks}
            for candidate in disk_names:
                try:
                    root_device = disks['/dev/%s' % candidate]
                except KeyError:
                    continue
                else:
                    break
            else:
                raise ooo_exceptions.RootDeviceDetectionError(
                    f'Cannot find a disk with any of names {strategy} '
                    f'for node {node.id}')

        hint = None

        # Prefer the most specific stable identifier available.
        for hint_name in ('wwn_with_extension', 'wwn', 'serial'):
            if root_device.get(hint_name):
                hint = {hint_name: root_device[hint_name]}
                break

        if hint is None:
            # I don't think it might actually happen, but just in case
            raise ooo_exceptions.RootDeviceDetectionError(
                f"Neither WWN nor serial number are known for device "
                f"{root_device['name']} "
                f"on node {node.id}; root device hints cannot be used")

        # During the introspection process we got local_gb assigned according
        # to the default strategy. Now we need to update it.
        # local_gb is a whole GiB count, so floor-divide instead of true
        # division (which would store a float in the node properties).
        new_size = root_device['size'] // units.Gi
        # This -1 is what we always do to account for partitioning
        new_size -= 1

        baremetal_client.update_node(
            node.id,
            [{'op': 'add', 'path': '/properties/root_device', 'value': hint},
             {'op': 'add', 'path': '/properties/local_gb',
              'value': new_size}])
        self.log.info('Updated root device for node %s, new device '
                      'is %s, new local_gb is %s',
                      node.id, root_device, new_size)

    def _configure_boot(self, node_uuid: str,
                        kernel_name: str = None,
                        ramdisk_name: str = None,
                        instance_boot_option: str = None,
                        boot_mode: str = None):
        """Update a node's boot capabilities and deploy/rescue images.

        :param node_uuid: node UUID or name to update
        :param kernel_name: deploy/rescue kernel image name
        :param ramdisk_name: deploy/rescue ramdisk image name
        :param instance_boot_option: boot_option capability to set, if any
        :param boot_mode: boot_mode capability to set, if any
        """
        baremetal_client = self.conn.baremetal

        image_ids = {'kernel': kernel_name, 'ramdisk': ramdisk_name}
        node = baremetal_client.find_node(node_uuid)
        capabilities = node.properties.get('capabilities', {})
        capabilities = node_utils.capabilities_to_dict(capabilities)

        # Only touch the capabilities that were explicitly requested.
        if instance_boot_option is not None:
            capabilities['boot_option'] = instance_boot_option
        if boot_mode is not None:
            capabilities['boot_mode'] = boot_mode

        capabilities = node_utils.dict_to_capabilities(capabilities)
        baremetal_client.update_node(node.id, [
            {
                'op': 'add',
                'path': '/properties/capabilities',
                'value': capabilities,
            },
            {
                'op': 'add',
                'path': '/driver_info/deploy_ramdisk',
                'value': image_ids['ramdisk'],
            },
            {
                'op': 'add',
                'path': '/driver_info/deploy_kernel',
                'value': image_ids['kernel'],
            },
            {
                'op': 'add',
                'path': '/driver_info/rescue_ramdisk',
                'value': image_ids['ramdisk'],
            },
            {
                'op': 'add',
                'path': '/driver_info/rescue_kernel',
                'value': image_ids['kernel'],
            },
        ])

    def configure(self, node_uuids: List):
        """Configure Node boot options.

        Applies boot image/capability settings to each node, and the root
        device strategy when one was configured.

        :param node_uuids: List of instance UUID(s).
        :type node_uuids: List
        """
        for node_uuid in node_uuids:
            self._configure_boot(node_uuid, self.kernel_name,
                                 self.ramdisk_name, self.instance_boot_option,
                                 self.boot_mode)
            if self.root_device:
                self._apply_root_device_strategy(
                    node_uuid,
                    strategy=self.root_device,
                    minimum_size=self.root_device_minimum_size,
                    overwrite=self.overwrite_root_device_hints)

        self.log.info('Successfully configured the nodes.')

    def configure_manageable_nodes(self):
        """Configure every node currently in the manageable state."""
        self.configure(node_uuids=self.all_manageable_nodes())