drydock/python/drydock_provisioner/orchestrator/orchestrator.py

# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Workflow orchestrator for Drydock tasks."""
import concurrent.futures
import importlib
import logging
import os
import time
import uuid

import ulid2

import drydock_provisioner.config as config
import drydock_provisioner.error as errors
import drydock_provisioner.objects as objects
import drydock_provisioner.objects.fields as hd_fields
from .actions.orchestrator import Noop
from .actions.orchestrator import ValidateDesign
from .actions.orchestrator import VerifySite
from .actions.orchestrator import PrepareSite
from .actions.orchestrator import VerifyNodes
from .actions.orchestrator import PrepareNodes
from .actions.orchestrator import DeployNodes
from .actions.orchestrator import RelabelNodes
from .actions.orchestrator import DestroyNodes
from .validations.validator import Validator


class Orchestrator(object):
    """Defines functionality for task execution workflow."""

    def __init__(self, enabled_drivers=None, state_manager=None,
                 ingester=None):
        """Initialize the orchestrator. A single instance should be executing at a time.

        :param enabled_drivers: a dictionary of drivers to enable for executing downstream tasks
        :param state_manager: the instance of statemgr.state.DrydockState to use for accessing app state
        :param ingester: instance of ingester.Ingester used to process design documents
        """
self.orch_id = uuid.uuid4()
self.stop_flag = False
self.enabled_drivers = {}
self.state_manager = state_manager
self.ingester = ingester
if self.state_manager is None or self.ingester is None:
raise errors.OrchestratorError(
"Orchestrator requires instantiated state manager and ingester."
)
self.logger = logging.getLogger('drydock.orchestrator')
if enabled_drivers is not None:
oob_drivers = enabled_drivers.oob_driver
            # This is needed because oslo_config changes the option value
            # for a multiopt depending on whether multiple values are defined
for d in oob_drivers:
self.logger.info("Enabling OOB driver %s" % d)
if d is not None:
m, c = d.rsplit('.', 1)
oob_driver_class = \
getattr(importlib.import_module(m), c, None)
if oob_driver_class is not None:
if self.enabled_drivers.get('oob', None) is None:
self.enabled_drivers['oob'] = []
self.enabled_drivers['oob'].append(
oob_driver_class(
state_manager=state_manager,
orchestrator=self))
node_driver_name = enabled_drivers.node_driver
if node_driver_name is not None:
m, c = node_driver_name.rsplit('.', 1)
node_driver_class = \
getattr(importlib.import_module(m), c, None)
if node_driver_class is not None:
self.enabled_drivers['node'] = node_driver_class(
state_manager=state_manager, orchestrator=self)
network_driver_name = enabled_drivers.network_driver
if network_driver_name is not None:
m, c = network_driver_name.rsplit('.', 1)
network_driver_class = getattr(
importlib.import_module(m), c, None)
if network_driver_class is not None:
self.enabled_drivers['network'] = network_driver_class(
state_manager=state_manager, orchestrator=self)
kubernetes_driver_name = enabled_drivers.kubernetes_driver
if kubernetes_driver_name is not None:
m, c = kubernetes_driver_name.rsplit('.', 1)
kubernetes_driver_class = getattr(
importlib.import_module(m), c, None)
if kubernetes_driver_class is not None:
self.enabled_drivers[
'kubernetes'] = kubernetes_driver_class(
state_manager=state_manager, orchestrator=self)

    def watch_for_tasks(self):
        """Start polling the database watching for Queued tasks to execute."""
orch_task_actions = {
hd_fields.OrchestratorAction.Noop: Noop,
hd_fields.OrchestratorAction.ValidateDesign: ValidateDesign,
hd_fields.OrchestratorAction.VerifySite: VerifySite,
hd_fields.OrchestratorAction.PrepareSite: PrepareSite,
hd_fields.OrchestratorAction.VerifyNodes: VerifyNodes,
hd_fields.OrchestratorAction.PrepareNodes: PrepareNodes,
hd_fields.OrchestratorAction.DeployNodes: DeployNodes,
hd_fields.OrchestratorAction.RelabelNodes: RelabelNodes,
hd_fields.OrchestratorAction.DestroyNodes: DestroyNodes,
}
# Loop trying to claim status as the active orchestrator
tp = concurrent.futures.ThreadPoolExecutor(max_workers=16)
while True:
if self.stop_flag:
tp.shutdown()
return
claim = self.state_manager.claim_leadership(self.orch_id)
if not claim:
self.logger.info(
"Orchestrator %s denied leadership, sleeping to try again."
% str(self.orch_id))
# TODO(sh8121att) Make this configurable
time.sleep(config.config_mgr.conf.leadership_claim_interval)
else:
self.logger.info(
"Orchestrator %s successfully claimed leadership, polling for tasks."
% str(self.orch_id))
# As active orchestrator, loop looking for queued tasks.
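                # Only one top-level task runs at a time: task_future tracks
                # the in-flight task, and the next queued task is claimed
                # only after the current future completes.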
task_future = None
while True:
# TODO(sh8121att) Need a timeout here
if self.stop_flag:
tp.shutdown()
self.state_manager.abdicate_leadership(self.orch_id)
return
if task_future is not None:
if task_future.done():
self.logger.debug(
"Task execution complete, looking for the next task."
)
exc = task_future.exception()
if exc is not None:
self.logger.error(
"Error in starting orchestrator action.",
exc_info=exc)
task_future = None
if task_future is None:
next_task = self.state_manager.get_next_queued_task(
allowed_actions=list(orch_task_actions.keys()))
if next_task is not None:
self.logger.info(
"Found task %s queued, starting execution." %
str(next_task.get_id()))
if next_task.check_terminate():
self.logger.info(
"Task %s marked for termination, skipping execution."
% str(next_task.get_id()))
next_task.set_status(
hd_fields.TaskStatus.Terminated)
next_task.save()
continue
action = orch_task_actions[next_task.action](
next_task, self, self.state_manager)
if action:
task_future = tp.submit(action.start)
else:
self.logger.warning(
"Task %s has unsupported action %s, ending execution."
% (str(next_task.get_id()),
next_task.action))
next_task.add_status_msg(
msg="Unsupported action %s." %
next_task.action,
error=True,
ctx=str(next_task.get_id()),
ctx_type='task')
next_task.failure()
next_task.set_status(
hd_fields.TaskStatus.Complete)
next_task.save()
else:
self.logger.info(
"No task found, waiting to poll again.")
# TODO(sh8121att) Make this configurable
time.sleep(config.config_mgr.conf.poll_interval)
claim = self.state_manager.maintain_leadership(
self.orch_id)
if not claim:
self.logger.info(
"Orchestrator %s lost leadership, attempting to reclaim."
% str(self.orch_id))
break

    def stop_orchestrator(self):
        """Indicate this orchestrator instance should stop attempting to run."""
self.stop_flag = True

    def terminate_task(self, task, propagate=True, terminated_by=None):
        """Mark a task for termination.

        Optionally propagate the termination recursively to all subtasks.

        :param task: an objects.Task instance to terminate
        :param propagate: whether the termination should propagate to subtasks
        """
        if task is None:
            raise errors.OrchestratorError(
                "Could not find task to terminate.")
else:
# Terminate initial task first to prevent add'l subtasks
self.logger.debug("Terminating task %s." % str(task.get_id()))
task.terminate_task(terminated_by=terminated_by)
if propagate:
# Get subtasks list
subtasks = task.get_subtasks()
for st_id in subtasks:
st = self.state_manager.get_task(st_id)
self.terminate_task(
st, propagate=True, terminated_by=terminated_by)

    def create_task(self, **kwargs):
        """Create a new task and persist it."""
new_task = objects.Task(statemgr=self.state_manager, **kwargs)
self.state_manager.post_task(new_task)
return new_task

    def compute_model_inheritance(self, site_design, resolve_aliases=False):
        """Compute inheritance of the design model.

        Given a fully populated Site model, compute the effective
        design by applying inheritance and references.
        """
node_failed = []
try:
nodes = site_design.baremetal_nodes
for n in nodes or []:
try:
n.compile_applied_model(
site_design,
state_manager=self.state_manager,
resolve_aliases=resolve_aliases)
                except Exception as ex:
                    node_failed.append(n)
                    self.logger.error(
                        "Failed to build applied model for node %s." %
                        n.name,
                        exc_info=ex)
if node_failed:
raise errors.DesignError(
"Failed to build applied model for %s" % ",".join(
[x.name for x in node_failed]))
except AttributeError:
self.logger.debug(
"Model inheritance skipped, no node definitions in site design."
)
return

    def get_described_site(self, design_ref):
        """Ingest design data referenced by design_ref.

        Return a tuple of the processing status and the populated instance
        of SiteDesign.

        :param design_ref: Supported URI referencing a design document
        """
status, site_design = self.ingester.ingest_data(
design_ref=design_ref, design_state=self.state_manager)
return status, site_design

    def get_effective_site(self, design_ref, resolve_aliases=False):
        """Ingest design data and compile the effective model of the design.

        Return a tuple of the processing status and the populated instance
        of SiteDesign after computing the inheritance chain.

        :param design_ref: Supported URI referencing a design document
        """
status = None
site_design = None
val = Validator(self)
try:
status, site_design = self.get_described_site(design_ref)
if status.status == hd_fields.ValidationResult.Success:
self.compute_model_inheritance(
site_design, resolve_aliases=resolve_aliases)
self.compute_bootaction_targets(site_design)
self.render_route_domains(site_design)
status = val.validate_design(site_design, result_status=status)
except Exception as ex:
if status is not None:
status.add_status_msg(
"Error loading effective site: %s" % str(ex),
error=True,
ctx='NA',
ctx_type='NA')
status.set_status(hd_fields.ActionResult.Failure)
self.logger.error(
"Error getting site definition: %s" % str(ex), exc_info=ex)
return status, site_design

    def get_target_nodes(self, task, failures=False, successes=False):
        """Compute the list of target nodes for the given ``task``.

        If failures is true, then create a node_filter based on task result
        failures. If successes is true, then create a node_filter based on
        task result successes. If both are true, raise an exception. If
        neither is true, build the list from the task node_filter.

        :param task: instance of objects.Task
        :param failures: whether to build the target list from previous task failures
        :param successes: whether to build the target list from previous task successes
        """
design_status, site_design = self.get_effective_site(task.design_ref)
if design_status.status != hd_fields.ValidationResult.Success:
raise errors.OrchestratorError(
"Unable to render effective site design.")
if failures and successes:
raise errors.OrchestratorError(
"Cannot specify both failures and successes.")
if failures:
if len(task.result.failures) == 0:
return []
nf = task.node_filter_from_failures()
elif successes:
if len(task.result.successes) == 0:
return []
            nf = task.node_filter_from_successes()
else:
nf = task.node_filter
node_list = self.process_node_filter(nf, site_design)
return node_list

    def create_nodefilter_from_nodelist(self, node_list):
        """Create a node filter to match a list of nodes.

        Returns a dictionary that will be properly processed by the orchestrator.

        :param node_list: list of objects.BaremetalNode instances the filter should match
        """
nf = dict()
nf['filter_set_type'] = 'intersection'
nf['filter_set'] = [
dict(
node_names=[x.get_id() for x in node_list],
filter_type='union')
]
return nf

    def compute_bootaction_targets(self, site_design):
        """Find target nodes for each bootaction in ``site_design``.

        Calculate the node_filter for each bootaction and save the list
        of target node names.

        :param site_design: an instance of objects.SiteDesign
        """
if site_design.bootactions is None:
return
for ba in site_design.bootactions:
nf = ba.node_filter
target_nodes = self.process_node_filter(nf, site_design)
if not target_nodes:
ba.target_nodes = []
else:
ba.target_nodes = [x.get_id() for x in target_nodes]

    def process_node_filter(self, node_filter, site_design):
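        """Resolve a node filter to a list of matching nodes.

        :param node_filter: dict or objects.NodeFilterSet describing the filter
        :param site_design: instance of objects.SiteDesign supplying the
            candidate baremetal nodes
        """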
try:
target_nodes = site_design.baremetal_nodes
if target_nodes is None:
raise AttributeError()
except AttributeError:
self.logger.debug(
"Invalid site design, no baremetal nodes in site_design.")
return []
if node_filter is None:
return target_nodes
        if not isinstance(node_filter, (dict, objects.NodeFilterSet)):
            msg = ("Invalid node_filter: must be a dict with keys "
                   "'filter_set_type' and 'filter_set' or an "
                   "objects.NodeFilterSet instance.")
            self.logger.error(msg)
            raise errors.OrchestratorError(msg)
result_sets = []
if isinstance(node_filter, dict):
for f in node_filter.get('filter_set', []):
result_sets.append(self.process_filter(target_nodes, f))
return self.join_filter_sets(
node_filter.get('filter_set_type'), result_sets)
elif isinstance(node_filter, objects.NodeFilterSet):
for f in node_filter.filter_set:
result_sets.append(self.process_filter(target_nodes, f))
return self.join_filter_sets(node_filter.filter_set_type,
result_sets)

    def join_filter_sets(self, filter_set_type, result_sets):
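        """Combine the filter ``result_sets`` per ``filter_set_type``.

        :param filter_set_type: either 'union' or 'intersection'
        :param result_sets: lists of nodes produced by process_filter
        """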
if filter_set_type == 'union':
return self.list_union(*result_sets)
elif filter_set_type == 'intersection':
return self.list_intersection(*result_sets)
else:
raise errors.OrchestratorError(
"Unknown filter set type %s" % filter_set_type)

    def process_filter(self, node_set, filter_set):
        """Take a filter and apply it to the node_set.

        :param node_set: a full set of objects.BaremetalNode
        :param filter_set: a node filter describing filters to apply to the
            node set, either a dict or objects.NodeFilter
        """
try:
if isinstance(filter_set, dict):
set_type = filter_set.get('filter_type', None)
node_names = filter_set.get('node_names', [])
node_tags = filter_set.get('node_tags', [])
node_labels = filter_set.get('node_labels', {})
rack_names = filter_set.get('rack_names', [])
rack_labels = filter_set.get('rack_labels', {})
elif isinstance(filter_set, objects.NodeFilter):
set_type = filter_set.filter_type
node_names = filter_set.node_names
node_tags = filter_set.node_tags
node_labels = filter_set.node_labels
rack_names = filter_set.rack_names
rack_labels = filter_set.rack_labels
else:
raise errors.OrchestratorError(
"Node filter must be a dictionary or a NodeFilter instance"
)
target_nodes = dict()
if node_names:
self.logger.debug("Filtering nodes based on node names.")
target_nodes['node_names'] = [
x for x in node_set if x.get_name() in node_names
]
if node_tags:
self.logger.debug("Filtering nodes based on node tags.")
target_nodes['node_tags'] = [
x for x in node_set for t in node_tags if x.has_tag(t)
]
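                # A node matching several tags will appear multiple times
                # here; the set operations in list_union/list_intersection
                # deduplicate downstream.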
if rack_names:
self.logger.debug("Filtering nodes based on rack names.")
target_nodes['rack_names'] = [
x for x in node_set if x.get_rack() in rack_names
]
if node_labels:
self.logger.debug("Filtering nodes based on node labels.")
target_nodes['node_labels'] = []
for k, v in node_labels.items():
target_nodes['node_labels'].extend([
x for x in node_set
if getattr(x, 'owner_data', {}).get(k, None) == v
])
if rack_labels:
self.logger.info(
"Rack label filtering not yet implemented, returning all nodes."
)
target_nodes['rack_labels'] = node_set
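            # Union treats an unused filter dimension as contributing nothing
            # ([]), while intersection treats it as "no constraint" (None),
            # which list_intersection passes through unchanged.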
if set_type == 'union':
return self.list_union(
target_nodes.get('node_names', []),
target_nodes.get('node_tags', []),
target_nodes.get('rack_names', []),
target_nodes.get('node_labels', []))
elif set_type == 'intersection':
return self.list_intersection(
target_nodes.get('node_names', None),
target_nodes.get('node_tags', None),
target_nodes.get('rack_names', None),
target_nodes.get('node_labels', None))
except Exception as ex:
self.logger.error("Error processing node filter.", exc_info=ex)
raise errors.OrchestratorError(
"Error processing node filter: %s" % str(ex))

    def list_intersection(self, a, *rest):
        """Take the intersection of a with the intersection of all the rest.

        :param a: list of values
        :param rest: 0 or more lists of values
        """
if len(rest) > 1:
result = self.list_intersection(rest[0], *rest[1:])
elif rest:
result = rest[0]
else:
result = None
if a is None:
return result
elif result is None:
return a
else:
return list(set(a).intersection(set(result)))

    def list_union(self, *lists):
        """Return a deduplicated union of all the lists.

        :param lists: indefinite number of lists
        """
results = set()
if len(lists) > 1:
            for lst in lists:
                results = results.union(set(lst))
return list(results)
elif len(lists) == 1:
return list(set(lists[0]))
else:
return None

    def create_bootaction_context(self, nodename, task):
        """Save a boot action context for ``nodename``.

        Generate an identity key and persist the boot action context
        for nodename pointing at the top level task. Return the
        generated identity key as ``bytes``.

        :param nodename: name of the node the bootaction context is targeted for
        :param task: the task instigating the node deployment
        """
design_status, site_design = self.get_effective_site(task.design_ref)
if site_design.bootactions is None:
return None
identity_key = None
self.logger.debug(
"Creating boot action context for node %s" % nodename)
for ba in site_design.bootactions:
self.logger.debug(
"Boot actions target nodes: %s" % ba.target_nodes)
if nodename in ba.target_nodes:
if identity_key is None:
identity_key = os.urandom(32)
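                    # One random 256-bit key per node/task context, shared
                    # by every boot action targeting this node.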
self.state_manager.post_boot_action_context(
nodename, task.get_id(), identity_key)
self.logger.debug(
"Adding boot action %s for node %s to the database." %
(ba.name, nodename))
                if ba.signaling:
                    init_status = hd_fields.ActionResult.Incomplete
                else:
                    init_status = hd_fields.ActionResult.Unreported
                    self.logger.debug(
                        "Boot action %s has disabled signaling, marking unreported."
                        % ba.name)
action_id = ulid2.generate_binary_ulid()
self.state_manager.post_boot_action(
nodename,
task.get_id(),
identity_key,
action_id,
ba.name,
action_status=init_status)
return identity_key

    def find_node_package_lists(self, nodename, task):
        """Return all packages to be installed on ``nodename``.

        :param nodename: the name of the node to retrieve packages for
        :param task: the task initiating this request
        """
design_status, site_design = self.get_effective_site(task.design_ref)
if site_design.bootactions is None:
return None
self.logger.debug(
"Extracting package install list for node %s" % nodename)
pkg_list = dict()
for ba in site_design.bootactions:
if nodename in ba.target_nodes:
# NOTE(sh8121att) the ulid generation below
# is throw away data as these assets are only used to
# get a full list of packages to deploy
assets = ba.render_assets(
nodename,
site_design,
ulid2.generate_binary_ulid(),
ulid2.generate_binary_ulid(),
task.design_ref,
type_filter=hd_fields.BootactionAssetType.PackageList)
for a in assets:
pkg_list.update(a.package_list)
return pkg_list

    def render_route_domains(self, site_design):
        """Update site_design with static routes for route domains.

        site_design will be updated in place with explicit static routes
        for all routedomain members.

        :param site_design: a populated instance of objects.SiteDesign
        """
self.logger.info("Rendering routes for network route domains.")
if site_design.networks is not None:
routedomains = dict()
for n in site_design.networks:
if n.routedomain is not None:
if n.routedomain not in routedomains:
self.logger.info("Adding routedomain %s to render map."
% n.routedomain)
routedomains[n.routedomain] = list()
routedomains[n.routedomain].append(n)
for rd, nl in routedomains.items():
rd_cidrs = [n.cidr for n in nl]
self.logger.debug("Target CIDRs for routedomain %s: %s" %
(rd, ','.join(rd_cidrs)))
for n in site_design.networks:
gw = None
metric = None
for r in n.routes:
                        if r.get('routedomain') == rd:
gw = r.get('gateway')
metric = r.get('metric')
self.logger.debug(
"Use gateway %s for routedomain %s on network %s."
% (gw, rd, n.get_name()))
break
if gw is not None and metric is not None:
for cidr in rd_cidrs:
if cidr != n.cidr:
n.routes.append(
dict(
subnet=cidr, gateway=gw,
metric=metric))