shipyard/src/bin/shipyard_airflow/shipyard_airflow/common/deployment_group/deployment_group_manager.py

351 lines
13 KiB
Python

# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Deployment group manager module
Encapsulates classes and functions related to the management and use of
deployment groups used during baremetal provisioning.
"""
import logging
import networkx as nx
from .deployment_group import DeploymentGroup
from .deployment_group import Stage
from .errors import DeploymentGroupCycleError
from .errors import DeploymentGroupStageError
from .errors import DeploymentGroupSuccessProcessingError
from .errors import UnknownDeploymentGroupError
from .errors import UnknownNodeError
# Module-level logger, named after this module via __name__.
LOG = logging.getLogger(__name__)
class DeploymentGroupManager:
    """Manager object to control ordering and cross-group interactions

    Builds a DeploymentGroup for each supplied group entry, arranges the
    groups into a directed dependency graph, derives a processing order from
    a topological sort of that graph, and records a stage for every node
    found in any group's full_nodes.

    :param group_dict_list: list of group entries translated from a
        DeploymentStrategy document.
    :param node_lookup: function to lookup nodes based on group selectors
    """

    def __init__(self, group_dict_list, node_lookup):
        LOG.debug("Initializing DeploymentGroupManager")

        # the raw input
        self._group_dict_list = group_dict_list

        # A dictionary of all groups by group name. E.g.:
        # {
        #   'group-1': DeploymentGroup(...),
        # }
        self._all_groups = {}
        for group_dict in group_dict_list:
            group = DeploymentGroup(group_dict, node_lookup)
            self._all_groups[group.name] = group

        self._group_graph = _generate_group_graph(
            self._all_groups.values()
        )
        # Group names, ordered so that a group follows the groups it
        # depends on.
        self._group_order = list(nx.topological_sort(self._group_graph))

        # Setup nodes.
        # self._all_nodes is a dictionary of all nodes by node name,
        # representing each node's status of deployment. E.g.:
        # { 'node-01' : Stage.NOT_STARTED}
        #
        # each group is also updated with group.actionable_nodes based on
        # group ordering (deduplication)
        self._all_nodes = {}
        self._calculate_nodes()

    def get_next_group(self, stage):
        """Get the next eligible group to use for the provided stage

        Walks the groups in group order and returns the first one whose
        current stage is contained in the value returned by
        Stage.previous_stage(stage).

        :param stage: the stage for which an eligible group is sought
        Returns the DeploymentGroup object (not only its name), or None if
        there are no groups ready for the stage
        """
        prev_stage = Stage.previous_stage(stage)
        for group_name in self._group_order:
            group = self._all_groups[group_name]
            if group.stage in prev_stage:
                return group
        return None

    def group_list(self):
        """Return a list of DeploymentGroup objects in group order"""
        return [self._all_groups[name] for name in self._group_order]

    def critical_groups_failed(self):
        """Return True if any critical groups have failed"""
        return any(
            group.stage == Stage.FAILED and group.critical
            for group in self._all_groups.values()
        )

    def evaluate_group_succ_criteria(self, group_name, stage):
        """Checks a group against its success criteria for a stage

        As a side effect the group is marked FAILED (along with its
        dependent groups), PREPARED, or DEPLOYED according to the outcome.

        :param group_name: the name of the group to check
        :param stage: Stage.PREPARED or Stage.DEPLOYED
        Returns a boolean: True = success, False = failure.
        Raises DeploymentGroupStageError (from the failure check) if the
        stage is not one of the two valid values.
        """
        failed_criteria = self.get_group_failures_for_stage(group_name, stage)
        if failed_criteria:
            # Logging of criteria has already occurred during checking.
            self.mark_group_failed(group_name)
            LOG.info("Group %s has failed to meet its success criteria while "
                     "trying to move to stage: %s",
                     group_name, stage)
            return False

        if stage == Stage.DEPLOYED:
            self.mark_group_deployed(group_name)
            LOG.info("Group %s has met its success criteria and is "
                     "successfully deployed (%s)", group_name, stage)
            return True

        if stage == Stage.PREPARED:
            self.mark_group_prepared(group_name)
            LOG.info("Group %s has met its success criteria and is "
                     "now set to stage %s", group_name, stage)
            return True

        # Any other cases are invalid.
        raise DeploymentGroupSuccessProcessingError(
            "Group {} has no failures, but is in an invalid state {}".format(
                group_name, stage
            )
        )

    def report_group_summary(self):
        """Reports the status of all groups handled by this deployment"""
        LOG.info("===== Group Summary =====")
        for group in self.group_list():
            LOG.info(" Group %s%s ended with stage: %s",
                     group.name,
                     " [Critical]" if group.critical else "",
                     group.stage)
        LOG.info("===== End Group Summary =====")

    def report_node_summary(self):
        """Reports the status of all nodes handled by this deployment"""
        # Ordered stages
        stages = [Stage.NOT_STARTED,
                  Stage.PREPARED,
                  Stage.DEPLOYED,
                  Stage.FAILED]

        LOG.info("===== Node Summary =====")
        for stage in stages:
            nodes = self.get_nodes(stage=stage)
            LOG.info(" Nodes %s: %s", stage, ", ".join(nodes))
        LOG.info("===== End Node Summary =====")

    #
    # Methods that support setup of the nodes in groups
    #

    def _calculate_nodes(self):
        """Calculate the mapping of all nodes and each group's actionable set

        Uses self._group_order and self._all_groups. Groups are visited in
        group order; the nodes already recorded by earlier groups are
        excluded from a later group's actionable nodes, and every node in a
        group's full_nodes is recorded as Stage.NOT_STARTED.
        """
        for name in self._group_order:
            group = self._all_groups[name]
            # Snapshot taken before this group's own nodes are recorded.
            known_nodes = set(self._all_nodes)
            _update_group_actionable_nodes(group, known_nodes)
            for node in group.full_nodes:
                self._all_nodes[node] = Stage.NOT_STARTED

    #
    # Methods for managing marking the stage of processing for a group
    #

    def mark_group_failed(self, group_name):
        """Sets status for a group and all successors(dependents) to failed

        The dependency graph is walked iteratively and each group is handled
        once, even when it is reachable through several dependency paths.

        :param group_name: The name of the group to fail
        Raises UnknownDeploymentGroupError if a name does not refer to a
        known group.
        """
        pending = [group_name]
        visited = set()
        while pending:
            name = pending.pop()
            if name in visited:
                # Already failed via another dependency path.
                continue
            visited.add(name)

            group = self._find_group(name)
            group.stage = Stage.FAILED
            successors = list(self._group_graph.successors(name))
            if successors:
                LOG.info("Group %s (now FAILED) has dependent groups %s",
                         name, ", ".join(successors))
                pending.extend(successors)

    def mark_group_prepared(self, group_name):
        """Sets a group to the Stage.PREPARED stage"""
        group = self._find_group(group_name)
        group.stage = Stage.PREPARED

    def mark_group_deployed(self, group_name):
        """Sets a group to the Stage.DEPLOYED stage"""
        group = self._find_group(group_name)
        group.stage = Stage.DEPLOYED

    def _find_group(self, group_name):
        """Wrapper for accessing groups from self._all_groups

        :param group_name: the name of the group to retrieve
        Returns the matching group.
        Raises UnknownDeploymentGroupError if there is no group by that name.
        """
        group = self._all_groups.get(group_name)
        if group is None:
            raise UnknownDeploymentGroupError(
                "Group name {} does not refer to a known group".format(
                    group_name)
            )
        return group

    def get_group_failures_for_stage(self, group_name, stage):
        """Check if the nodes of a group cause the group to fail

        Returns the list of failed success criteria, or [] if the group is
        successful

        This is only for checking transitions to PREPARED and DEPLOYED. The
        valid stages for input to this method are Stage.PREPARED and
        Stage.DEPLOYED.

        Note that nodes that are DEPLOYED count as PREPARED, but not
        the other way around.

        Raises DeploymentGroupStageError for any other stage.
        """
        if stage not in [Stage.DEPLOYED, Stage.PREPARED]:
            raise DeploymentGroupStageError(
                "The stage {} is not valid for checking group"
                " failures.".format(stage))

        success_nodes = set()
        # deployed nodes count as success for prepared and deployed
        success_nodes.update(self.get_nodes(Stage.DEPLOYED))
        if stage == Stage.PREPARED:
            success_nodes.update(self.get_nodes(Stage.PREPARED))

        group = self._find_group(group_name)
        return group.get_failed_success_criteria(success_nodes)

    #
    # Methods for handling nodes
    #

    def fail_unsuccessful_nodes(self, group, successes):
        """Fail nodes that were not successful in a group's actionable list

        :param group: the group to check
        :param successes: the list of successful nodes from processing

        This makes an assumption that all actionable nodes should be in a list
        of successes if they are to be considered successful. If the success
        list is empty, all the actionable nodes in the group would be
        considered failed.
        """
        # Mark non-successes as failed
        failed_nodes = set(group.actionable_nodes).difference(set(successes))
        for node_name in failed_nodes:
            self.mark_node_failed(node_name)

    def mark_node_deployed(self, node_name):
        """Mark a node as deployed"""
        self._set_node_stage(node_name, Stage.DEPLOYED)

    def mark_node_prepared(self, node_name):
        """Mark a node as prepared"""
        self._set_node_stage(node_name, Stage.PREPARED)

    def mark_node_failed(self, node_name):
        """Mark a node as failed"""
        self._set_node_stage(node_name, Stage.FAILED)

    def _set_node_stage(self, node_name, stage):
        """Find and set a node's stage to the specified stage

        Raises UnknownNodeError if the node is not tracked by this manager.
        """
        if node_name not in self._all_nodes:
            raise UnknownNodeError("The specified node {} does not"
                                   " exist in this manager".format(node_name))
        self._all_nodes[node_name] = stage

    def get_nodes(self, stage=None):
        """Get a list of nodes that have the specified status

        :param stage: the stage to filter by; when None, the names of all
            nodes are returned.
        """
        if stage is None:
            return list(self._all_nodes)
        return [name for name, n_stage
                in self._all_nodes.items()
                if n_stage == stage]
def _update_group_actionable_nodes(group, known_nodes):
    """Assign the nodes that a group is responsible for acting upon

    A group's actionable nodes are the members of its full_nodes that are
    not present in known_nodes, i.e. (full_nodes - known_nodes).

    :param group: the group whose actionable_nodes attribute is assigned
    :param known_nodes: node names to exclude from the group's actionable
        nodes
    """
    LOG.debug(
        "Known nodes before processing group %s is %s",
        group.name, ", ".join(known_nodes),
    )

    candidates = set(group.full_nodes)
    unclaimed = candidates.difference(known_nodes)
    group.actionable_nodes = list(unclaimed)

    LOG.debug(
        "Group %s set actionable_nodes to %s. "
        "Full node list for this group is %s",
        group.name,
        ", ".join(group.actionable_nodes),
        ", ".join(group.full_nodes),
    )
def _generate_group_graph(groups):
    """Create the directed graph of groups

    Every group becomes a graph node named by group.name, and an edge is
    added from each entry in a group's depends_on to that group. The
    resulting graph is checked for cycles before it is returned.

    :param groups: An iterable of DeploymentGroup objects. It is
        materialized once, so a one-shot iterator is also acceptable.
    returns a directed graph of group names
    Raises DeploymentGroupCycleError (from the cycle check) if the
    dependencies are circular.
    """
    LOG.debug("Generating directed graph of groups based on dependencies")
    # The groups are traversed twice below; take a snapshot so that an
    # iterator/generator input is not exhausted by the first pass.
    groups = list(groups)

    graph = nx.DiGraph()
    # Add all groups as graph nodes. It is not strictly necessary to do two
    # loops here, but n is small and for obviousness.
    for group in groups:
        graph.add_node(group.name)

    # Add all edges
    for group in groups:
        if group.depends_on:
            for parent in group.depends_on:
                LOG.debug("%s has parent %s", group.name, parent)
                graph.add_edge(parent, group.name)
        else:
            LOG.debug("%s is not dependent upon any other groups", group.name)

    _detect_cycles(graph)
    return graph
def _detect_cycles(graph):
    """Detect if there are cycles between the groups

    :param graph: the directed graph of group names to inspect

    Raise a DeploymentGroupCycleError if there are any circular
    dependencies. The involved group names are listed in sorted order so
    that the error message is the same from run to run.
    """
    LOG.debug("Detecting cycles in graph")
    try:
        circ_deps = list(nx.find_cycle(graph))
    except nx.NetworkXNoCycle:
        LOG.info('There are no cycles detected in the graph')
        return

    if circ_deps:
        involved_nodes = set()
        # a value in this list is like: ('group1', 'group2')
        for dep in circ_deps:
            involved_nodes.update(dep)
        raise DeploymentGroupCycleError(
            "The following are involved in a circular dependency:"
            " {}".format(", ".join(sorted(involved_nodes)))
        )