python-tripleoclient/tripleoclient/workflows/tripleo_baremetal.py

# -*- coding: utf-8 -*-
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import logging
from typing import Dict
from typing import List

from concurrent import futures
from openstack import connect as sdkclient
from openstack import exceptions
from openstack.utils import iterate_timeout
from oslo_utils import units
from tripleoclient import exceptions as ooo_exceptions
from tripleo_common.utils import nodes as node_utils


class TripleoBaremetal(object):

    """Common foundation for the TripleO baremetal operation classes.

    Instances hold what the baremetal helpers share: an SDK connection to
    the 'undercloud' cloud, the operation timeout and a module logger.

    :param timeout: How long to wait until we consider this job to have
                    timed out
    :type timeout: integer

    :param verbosity: How verbose should we be. Any value greater than zero
                      switches the module logger to DEBUG.
    :type verbosity: integer
    """

    def __init__(self, timeout: int = 1200, verbosity: int = 1):
        # The connection is created first, before any other state is set.
        self.conn = sdkclient(cloud='undercloud')
        self.timeout = timeout

        logger = logging.getLogger(__name__)
        self.log = logger
        if verbosity > 0:
            logger.setLevel(logging.DEBUG)

    def all_manageable_nodes(self):
        """Collect the ids of every manageable node known to Ironic.

        Ironic is queried for nodes in the manageable provision state that
        are not in maintenance; no arguments are needed.

        Raises:
          NoNodeFound: If the query returns no node at all.

        Returns:
          nodes: The List of ids of the manageable nodes that are not
                 currently in maintenance.
        """
        matching = self.conn.baremetal.nodes(
            provision_state='manageable', is_maintenance=False)

        node_ids = []
        for entry in matching:
            node_ids.append(entry.id)

        if node_ids:
            return node_ids

        raise ooo_exceptions.NoNodeFound


class TripleoProvide(TripleoBaremetal):

    """TripleoProvide handles state transition of baremetal nodes.

    The TripleoProvide class handles the transition of nodes between the
    manageable and available states.

    :param wait_for_bridge_mappings: Bool to determine whether or not we are
                                     waiting for the bridge mapping to be
                                     active in ironic-neutron-agent
    :type wait_for_bridge_mappings: bool

    :param verbosity: How verbose should we be; forwarded to the base class.
    :type verbosity: integer

    :param timeout: How long to wait, in seconds, for a node to be unlocked
                    and for the nodes to reach the available state.
    :type timeout: integer
    """

    def __init__(self, wait_for_bridge_mappings: bool = False,
                 verbosity: int = 1, timeout: int = 1200):

        # Pass by keyword: the base class takes ``timeout`` as its first
        # parameter, so a positional ``verbosity`` would silently be used as
        # the timeout instead.
        super().__init__(timeout=timeout, verbosity=verbosity)
        self.wait_for_bridge_mappings = wait_for_bridge_mappings

    def _wait_for_unlocked(self, node: str, timeout: int):
        """Poll the node until it no longer carries a reservation.

        :param node: The node UUID or name to poll
        :type node: String

        :param timeout: How long to keep polling; once it expires
                        iterate_timeout gives up with the timeout message.
        :type timeout: integer
        """
        timeout_msg = f'Timeout waiting for node {node} to be unlocked'

        for _ in iterate_timeout(timeout, timeout_msg):
            node_info = self.conn.baremetal.get_node(
                node,
                fields=['reservation']
            )

            if node_info.reservation is None:
                return

    def _wait_for_bridge_mapping(self, node: str):
        """Poll until the node's ironic-neutron-agent has bridge_mappings.

        :param node: The node UUID or name, used as the agent host filter
        :type node: String
        """
        client = self.conn.network
        timeout_msg = (f'Timeout waiting for node {node} to have '
                       'bridge_mappings set in the ironic-neutron-agent '
                       'entry')

        # default agent polling period is 30s, so wait 60s
        timeout = 60

        for _ in iterate_timeout(timeout, timeout_msg):
            agents = list(
                client.agents(host=node, binary='ironic-neutron-agent'))

            # Only the first matching agent entry is inspected.
            if agents and agents[0].configuration.get('bridge_mappings'):
                return

    def provide(self, nodes: List):

        """Transition nodes to the Available state.

        provide handles the state transition from the nodes current state
        to the available state. A node for which the transition cannot be
        started is logged and skipped; the remaining nodes are still
        processed and waited for.

        :param nodes: The node UUIDs or names that we will be working on.
                      A single string is treated as a one element list.
        :type nodes: List
        """

        client = self.conn.baremetal
        node_timeout = self.timeout

        # Iterating a bare string would walk it character by character.
        if isinstance(nodes, str):
            nodes = [nodes]

        # Only nodes whose transition was actually started are waited for.
        nodes_wait = []

        for node in nodes:
            self.log.info('Providing node: %s', node)
            self._wait_for_unlocked(node, node_timeout)

            if self.wait_for_bridge_mappings:
                self._wait_for_bridge_mapping(node)

            try:
                client.set_node_provision_state(
                    node,
                    "provide",
                    wait=False)

            except Exception as e:
                # Best effort: report the node that failed and carry on
                # with the others.
                self.log.error(
                    "Can not start providing for node %s: %s", node, e)
                continue

            nodes_wait.append(node)

        if not nodes_wait:
            return

        try:
            self.log.info(
                "Waiting for available state: %s", nodes_wait)

            client.wait_for_nodes_provision_state(
                nodes=nodes_wait,
                expected_state='available',
                timeout=self.timeout,
                fail=False
            )

        except exceptions.ResourceFailure as e:
            self.log.error("Failed providing nodes due to failure: %s", e)

        except exceptions.ResourceTimeout as e:
            self.log.error("Failed providing nodes due to timeout: %s", e)

    def provide_manageable_nodes(self):
        """Provide every manageable node that is not in maintenance."""
        self.provide(self.all_manageable_nodes())


class TripleoClean(TripleoBaremetal):

    """TripleoClean manages the Ironic node cleaning process.

    :param all_manageable: Should we work on all nodes in the manageable state
    :type all_manageable: bool

    :param provide: Should we also set the nodes back to the available state.
                    NOTE(review): the flag is only stored on the instance;
                    nothing in this class acts on it.
    :type provide: bool

    :param timeout: How long should we wait, in seconds, before we consider
                    the nodes to have failed.
    :type timeout: integer

    :param raid_config: The raid configuration we should configure on the
                        node. Defaults to no raid configuration.
    :type raid_config: Dictionary

    :param concurrency: How many nodes should we do at once
    :type concurrency: integer

    :param verbosity: How verbose should we be; forwarded to the base class.
    :type verbosity: integer

    :param clean_steps: The Ironic cleaning steps that should be executed on
                        the nodes. Defaults to the deploy interface's
                        erase_devices_metadata step.
    :type clean_steps: List
    """
    log = logging.getLogger(__name__)

    def __init__(self, all_manageable: bool = False, provide: bool = False,
                 timeout: int = 60, raid_config: Dict = None,
                 concurrency: int = 1, verbosity: int = 0,
                 clean_steps: List = None):
        super().__init__(verbosity=verbosity, timeout=timeout)
        self.all_manageable = all_manageable
        self.provide = provide
        # Fresh objects per instance: mutable defaults in the signature
        # would be shared between every instance of the class.
        self.raid_config = {} if raid_config is None else raid_config
        if clean_steps is None:
            clean_steps = [{'interface': 'deploy',
                            'step': 'erase_devices_metadata'}]
        self.clean_steps = clean_steps
        self.concurrency = concurrency

    @staticmethod
    def _node_label(node):
        """Return the name of a node, falling back to its id or itself."""
        return getattr(node, 'name', None) or getattr(node, 'id', node)

    def _split_wait_result(self, result):
        """Sort the nodes of a wait result into failed and cleaned labels.

        NOTE(review): the openstacksdk documentation states that
        wait_for_nodes_provision_state returns a WaitResult named tuple
        (success, failure, timeout) when called with fail=False; confirm
        against the installed SDK. A plain iterable of nodes is accepted as
        well.

        :param result: The value returned by wait_for_nodes_provision_state
        :returns: A tuple of two sets, (failed labels, cleaned labels)
        """
        failed = set()
        cleaned = set()

        if hasattr(result, 'success') and hasattr(result, 'failure'):
            reached = result.success
            unfinished = list(result.failure) + list(
                getattr(result, 'timeout', []))
            for node in unfinished:
                failed.add(self._node_label(node))
        else:
            reached = result or []

        for node in reached:
            # A node that reports an error counts as failed even though it
            # reached the expected state.
            bucket = failed if getattr(node, 'last_error', None) else cleaned
            bucket.add(self._node_label(node))

        return failed, cleaned

    def _parallel_nodes_cleaning(self, nodes: List):
        """Clean the given nodes, up to ``concurrency`` at a time.

        When a raid configuration is set it is applied first; nodes that
        reject it are reported as failed and are not cleaned.

        :param nodes: A list of node UUIDs or names. The list is not
                      modified.
        :type nodes: List

        :returns: A tuple of two sets, (failed nodes, successfully cleaned
                  nodes).
        """
        client = self.conn.baremetal
        failed_nodes = set()
        success_nodes = set()
        # Work on a copy so that the caller's list is left alone and no
        # list is modified while it is being iterated.
        pending = list(nodes)

        if self.raid_config:
            configured = []
            for node in pending:
                try:
                    client.update_node(
                        node,
                        target_raid_config=self.raid_config
                    )
                except exceptions.BadRequestException as err:
                    self.log.error("Setting raid configuration "
                                   "for node %s failed. Error: %s",
                                   node, err)
                    failed_nodes.add(node)
                else:
                    # Not a cleaning success yet, only the raid update.
                    self.log.info("Setting the raid configuration "
                                  "for node %s succeeded.", node)
                    configured.append(node)
            pending = configured

        if not pending:
            return failed_nodes, success_nodes

        workers = min(len(pending), self.concurrency) or 1
        with futures.ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_node = {
                executor.submit(
                    client.set_node_provision_state,
                    node,
                    "clean",
                    clean_steps=self.clean_steps,
                    wait=True,
                    # Without a timeout the SDK call is not bounded.
                    timeout=self.timeout
                ): node for node in pending
            }

        # Leaving the ``with`` block waits for every worker, so all futures
        # are finished here and their outcome can be inspected.
        waiting = []
        for future, node in future_to_node.items():
            err = future.exception()
            if err is None:
                waiting.append(node)
            else:
                self.log.error("Cleaning node %s failed: %s", node, err)
                failed_nodes.add(node)

        if not waiting:
            return failed_nodes, success_nodes

        try:
            self.log.info("Waiting for manageable state: %s", waiting)
            result = client.wait_for_nodes_provision_state(
                nodes=waiting,
                expected_state='manageable',
                timeout=self.timeout,
                fail=False
            )
        except exceptions.ResourceFailure as e:
            self.log.error("Failed cleaning nodes due to failure: %s", e)
            failed_nodes.update(waiting)
        except exceptions.ResourceTimeout as e:
            self.log.error("Failed cleaning nodes due to timeout: %s", e)
            failed_nodes.update(waiting)
        else:
            failed, cleaned = self._split_wait_result(result)
            failed_nodes.update(failed)
            success_nodes.update(cleaned)

        return failed_nodes, success_nodes

    def clean_manageable_nodes(self):
        """Clean every manageable node that is not in maintenance."""
        self.clean(nodes=self.all_manageable_nodes())

    def clean(self, nodes: List):
        """clean manages the cleaning process for the Ironic nodes.

        Using the provided clean steps, this method will clean the provided
        baremetal nodes. The outcome is logged; nothing is returned.

        :param nodes: A list of nodes to clean
        :type nodes: List
        """
        if not nodes:
            self.log.error("Provide either UUID or names of nodes!")
            return

        try:
            failed_nodes, success_nodes = self._parallel_nodes_cleaning(
                nodes)
        except exceptions.OpenStackCloudException as err:
            self.log.error(str(err))
            return

        if failed_nodes:
            msg = ("Cleaning completed with failures. "
                   f"{failed_nodes} node(s) failed.")
            self.log.error(msg)
        else:
            msg = ("Cleaning completed "
                   f"successfully: {len(success_nodes)} nodes")
            self.log.info(msg)


class TripleoConfigure(TripleoBaremetal):

    """TripleoConfigure handles properties for the ironic nodes.

    We use this class to set the properties for each node such as the
    kernel, ramdisk, boot device, root_device.

    :param kernel_name: The name of the kernel image we will deploy
    :type kernel_name: String

    :param ramdisk_name: The name of the ramdisk image we will deploy
    :type ramdisk_name: String

    :param instance_boot_option: Should the node boot from local disks or
                                 something else
    :type instance_boot_option: String

    :param boot_mode: Is this node using BIOS or UEFI
    :type boot_mode: String

    :param root_device: What is the root device for this node. Either
                        'smallest', 'largest' or a comma separated list of
                        device names without the /dev/ prefix, eg sda
    :type root_device: String

    :param verbosity: How verbose should we be; forwarded to the base class.
    :type verbosity: integer

    :param root_device_minimum_size: What is the smallest disk, in GiB, we
                                     should consider acceptable for
                                     deployment
    :type root_device_minimum_size: Integer

    :param overwrite_root_device_hints: Should we overwrite existing root
                                        device hints when root_device is used.
    :type overwrite_root_device_hints: Boolean
    """

    log = logging.getLogger(__name__)

    def __init__(self, kernel_name: str = None, ramdisk_name: str = None,
                 instance_boot_option: str = None, boot_mode: str = None,
                 root_device: str = None, verbosity: int = 0,
                 root_device_minimum_size: int = 4,
                 overwrite_root_device_hints: bool = False):

        super().__init__(verbosity=verbosity)
        self.kernel_name = kernel_name
        self.ramdisk_name = ramdisk_name
        self.instance_boot_option = instance_boot_option
        self.boot_mode = boot_mode
        self.root_device = root_device
        self.root_device_minimum_size = root_device_minimum_size
        self.overwrite_root_device_hints = overwrite_root_device_hints

    @staticmethod
    def _select_root_device(disks: List, strategy: str, node_id: str):
        """Pick the disk matching the strategy from the candidate disks.

        :param disks: Disk dictionaries taken from the introspection data
        :param strategy: 'smallest', 'largest' or a comma separated list of
                         device names, tried in order
        :param node_id: The node id, only used in the error message
        :raises RootDeviceDetectionError: if none of the names matches
        :returns: The selected disk dictionary
        """
        if strategy == 'smallest':
            return min(disks, key=lambda d: d['size'])
        if strategy == 'largest':
            return max(disks, key=lambda d: d['size'])

        by_name = {d['name']: d for d in disks}
        for candidate in (x.strip() for x in strategy.split(',')):
            disk = by_name.get('/dev/%s' % candidate)
            if disk is not None:
                return disk

        raise ooo_exceptions.RootDeviceDetectionError(
            f'Cannot find a disk with any of names {strategy} '
            f'for node {node_id}')

    def _apply_root_device_strategy(self, node_uuid: str,
                                    strategy: str, minimum_size: int = 4,
                                    overwrite: bool = False):
        """Set the root device hint and local_gb of a node.

        The root device is chosen from the node's introspection data
        according to the strategy. Nodes that already have a root device
        hint are skipped unless overwrite is requested.

        :param node_uuid: The node UUID or name
        :type node_uuid: String

        :param strategy: 'smallest', 'largest' or a comma separated list of
                         device names
        :type strategy: String

        :param minimum_size: Disks smaller than this many GiB are ignored
        :type minimum_size: Integer

        :param overwrite: Replace root device hints that are already set
        :type overwrite: Boolean

        :raises RootDeviceDetectionError: if no root device can be detected
        """
        baremetal_client = self.conn.baremetal
        inspector_client = self.conn.baremetal_introspection
        # Fail with the SDK's not-found error rather than on a None node.
        node = baremetal_client.find_node(node_uuid, ignore_missing=False)

        if node.properties.get('root_device') and not overwrite:
            # This is a correct situation, we still want to allow people to
            # fine-tune the root device setting for a subset of nodes.
            # However, issue a warning, so that they know which nodes were not
            # updated during this run.
            self.log.warning('Root device hints are already set for node '
                             '%s and overwriting is not requested,'
                             ' skipping', node.id)
            self.log.warning('You may unset them by running $ ironic '
                             'node-update %s remove '
                             'properties/root_device', node.id)
            return

        try:
            data = inspector_client.get_introspection_data(node.id)
        except Exception as err:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'No introspection data found for node {node.id}, '
                'root device cannot be detected') from err
        try:
            disks = data['inventory']['disks']
        except KeyError as err:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'Malformed introspection data for node {node.id}: '
                'disks list is missing') from err

        min_bytes = minimum_size * units.Gi
        disks = [d for d in disks if d.get('size', 0) >= min_bytes]

        if not disks:
            raise ooo_exceptions.RootDeviceDetectionError(
                f'No suitable disks found for node {node.id}')

        root_device = self._select_root_device(disks, strategy, node.id)

        # Use the first identifier the disk actually reports.
        hint = None
        for hint_name in ('wwn_with_extension', 'wwn', 'serial'):
            if root_device.get(hint_name):
                hint = {hint_name: root_device[hint_name]}
                break

        if hint is None:
            # I don't think it might actually happen, but just in case
            raise ooo_exceptions.RootDeviceDetectionError(
                f"Neither WWN nor serial number are known for device "
                f"{root_device['name']} "
                f"on node {node.id}; root device hints cannot be used")

        # During the introspection process we got local_gb assigned according
        # to the default strategy. Now we need to update it. Floor division
        # keeps the value a whole number of GiB.
        new_size = root_device['size'] // units.Gi
        # This -1 is what we always do to account for partitioning
        new_size -= 1

        # A JSON patch document has to be applied with patch_node;
        # update_node expects the new values as keyword arguments.
        baremetal_client.patch_node(
            node.id,
            [{'op': 'add', 'path': '/properties/root_device', 'value': hint},
             {'op': 'add', 'path': '/properties/local_gb', 'value': new_size}])
        self.log.info('Updated root device for node %s, new device '
                      'is %s, new local_gb is %s',
                      node.id, root_device, new_size
                      )

    def _configure_boot(self, node_uuid: str,
                        kernel_name: str = None,
                        ramdisk_name: str = None,
                        instance_boot_option: str = None,
                        boot_mode: str = None):
        """Set the boot capabilities and the deploy/rescue images of a node.

        :param node_uuid: The node UUID or name
        :type node_uuid: String

        :param kernel_name: Value for the deploy and rescue kernel
        :type kernel_name: String

        :param ramdisk_name: Value for the deploy and rescue ramdisk
        :type ramdisk_name: String

        :param instance_boot_option: When given, stored as the boot_option
                                     capability
        :type instance_boot_option: String

        :param boot_mode: When given, stored as the boot_mode capability
        :type boot_mode: String
        """
        baremetal_client = self.conn.baremetal

        # Fail with the SDK's not-found error rather than on a None node.
        node = baremetal_client.find_node(node_uuid, ignore_missing=False)
        capabilities = node_utils.capabilities_to_dict(
            node.properties.get('capabilities', {}))

        if instance_boot_option is not None:
            capabilities['boot_option'] = instance_boot_option
        if boot_mode is not None:
            capabilities['boot_mode'] = boot_mode

        capabilities = node_utils.dict_to_capabilities(capabilities)

        # A JSON patch document has to be applied with patch_node;
        # update_node expects the new values as keyword arguments.
        baremetal_client.patch_node(node.id, [
            {
                'op': 'add',
                'path': '/properties/capabilities',
                'value': capabilities,
            },
            {
                'op': 'add',
                'path': '/driver_info/deploy_ramdisk',
                'value': ramdisk_name,
            },
            {
                'op': 'add',
                'path': '/driver_info/deploy_kernel',
                'value': kernel_name,
            },
            {
                'op': 'add',
                'path': '/driver_info/rescue_ramdisk',
                'value': ramdisk_name,
            },
            {
                'op': 'add',
                'path': '/driver_info/rescue_kernel',
                'value': kernel_name,
            },
        ])

    def configure(self, node_uuids: List):

        """Configure Node boot options.

        Every node gets its boot configuration applied; the root device
        strategy is only applied when a root_device was given.

        :param node_uuids: List of instance UUID(s).
        :type node_uuids: List

        """
        for node_uuid in node_uuids:
            self._configure_boot(node_uuid, self.kernel_name,
                                 self.ramdisk_name, self.instance_boot_option,
                                 self.boot_mode)
            if self.root_device:
                self._apply_root_device_strategy(
                    node_uuid,
                    strategy=self.root_device,
                    minimum_size=self.root_device_minimum_size,
                    overwrite=self.overwrite_root_device_hints)

        self.log.info('Successfully configured the nodes.')

    def configure_manageable_nodes(self):
        """Configure every manageable node that is not in maintenance."""
        self.configure(node_uuids=self.all_manageable_nodes())