Workflow to support deployment groups
Updates the Shipyard/Airflow workflow for deploy_site and update_site to use the deployment group/deployment strategy information from the design. This allows baremetal nodes to be deployed in a design-specified order, with criticality and success criteria driving the success or failure of the deployment. Also includes a refactoring of service endpoints that reduces the amount of data passed between operators.

Change-Id: Ib5e9fca535ca74d1819fe46959695acfed5b65c2
changes/35/569835/16
parent ea47f2c77b
commit 04906cce68
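For orientation before the diff: the deployment strategy this workflow consumes is a list of named groups with dependencies, criticality, and success criteria. A minimal sketch of a two-group strategy, in the same Python shape that _default_deployment_strategy() returns further down in this change; the group names, selectors, and thresholds here are hypothetical, not part of this commit:

    # Hypothetical two-group strategy of the shape this change consumes.
    # 'control-nodes' is critical: failing its success criteria fails the
    # workflow. 'compute-nodes' deploys afterward and tolerates partial
    # failure.
    example_strategy = {
        'groups': [
            {
                'name': 'control-nodes',              # hypothetical name
                'critical': True,
                'depends_on': [],
                'selectors': [
                    {'node_names': [],
                     'node_labels': [],
                     'node_tags': [],
                     'rack_names': ['rack01']},       # hypothetical selector
                ],
                'success_criteria': {'percent_successful_nodes': 100},
            },
            {
                'name': 'compute-nodes',
                'critical': False,
                'depends_on': ['control-nodes'],      # ordered after control
                'selectors': [
                    {'node_names': [],
                     'node_labels': [],
                     'node_tags': ['compute'],
                     'rack_names': []},
                ],
                'success_criteria': {'percent_successful_nodes': 75},
            },
        ]
    }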
.gitignore
@@ -2,7 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
-**/.pytest_cache/
+.pytest_cache/
 
 # C extensions
Makefile (4 changed lines)
@@ -79,9 +79,9 @@ run:
 .PHONY: build_airflow
 build_airflow:
 ifeq ($(USE_PROXY), true)
-	docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile $(IMAGE_DIR) --build-arg http_proxy=$(PROXY) --build-arg https_proxy=$(PROXY)
+	docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile --build-arg http_proxy=$(PROXY) --build-arg https_proxy=$(PROXY) --build-arg ctx_base=$(BUILD_CTX) .
 else
-	docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile $(IMAGE_DIR)
+	docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile --build-arg ctx_base=$(BUILD_CTX) .
 endif
 ifeq ($(PUSH_IMAGE), true)
 	docker push $(IMAGE)
images/airflow/Dockerfile
@@ -32,6 +32,7 @@ EXPOSE $WORKER_PORT
 # Set ARG for usage during build
 ARG AIRFLOW_HOME=/usr/local/airflow
 ARG DEBIAN_FRONTEND=noninteractive
+ARG ctx_base=src/bin
 
 # Kubectl version
 ARG KUBECTL_VERSION=1.8.6
@@ -76,17 +77,30 @@ RUN useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \
 # Dependency requirements
 # Note - removing snakebite (python 2 vs. 3). See:
 # https://github.com/puckel/docker-airflow/issues/77
-COPY ./requirements.txt /tmp/
+COPY images/airflow/requirements.txt /tmp/
 RUN pip3 install -r /tmp/requirements.txt \
     && pip3 uninstall -y snakebite || true
 
 # Copy scripts used in the container:
 # entrypoint.sh, airflow_start_service.sh and airflow_logrotate.sh
-COPY script/*.sh ${AIRFLOW_HOME}/
+COPY images/airflow/script/*.sh ${AIRFLOW_HOME}/
 
 # Change permissions
 RUN chown -R airflow: ${AIRFLOW_HOME}
 
+# Shipyard
+#
+# Shipyard provides core functionality used by the airflow plugins/operators
+# Since Shipyard and Airflow are built together as images, this should prevent
+# stale or out-of-date code between these parts.
+# Shipyard requirements, source and installation
+COPY ${ctx_base}/shipyard_airflow/requirements.txt /tmp/api_requirements.txt
+RUN pip3 install -r /tmp/api_requirements.txt
+
+COPY ${ctx_base}/shipyard_airflow /tmp/shipyard/
+RUN cd /tmp/shipyard \
+    && python3 setup.py install
+
 # Set work directory
 USER airflow
 WORKDIR ${AIRFLOW_HOME}
shipyard_airflow/common/deployment_group/deployment_group_manager.py
@@ -82,6 +82,71 @@ class DeploymentGroupManager:
             return self._all_groups[group]
         return None
 
+    def group_list(self):
+        """Return a list of DeploymentGroup objects in group order"""
+        summary = []
+        for group_nm in self._group_order:
+            group = self._all_groups[group_nm]
+            summary.append(group)
+        return summary
+
+    def critical_groups_failed(self):
+        """Return True if any critical groups have failed"""
+        for group in self._all_groups.values():
+            if group.stage == Stage.FAILED and group.critical:
+                return True
+        return False
+
+    def evaluate_group_succ_criteria(self, group_name, stage):
+        """Checks a group against its success criteria for a stage
+
+        :param group_name: the name of the group to check
+        :param stage: Stage.PREPARED or Stage.DEPLOYED
+        Returns a boolean: True = success, False = failure.
+        """
+        failed_criteria = self.get_group_failures_for_stage(group_name, stage)
+        if failed_criteria:
+            # Logging of criteria has already occurred during checking.
+            self.mark_group_failed(group_name)
+            LOG.info("Group %s has failed to meet its success criteria while "
+                     "trying to move to stage: %s",
+                     group_name, stage)
+            return False
+        elif stage == Stage.DEPLOYED:
+            self.mark_group_deployed(group_name)
+            LOG.info("Group %s has met its success criteria and is "
+                     "successfully deployed (%s)", group_name, stage)
+            return True
+        elif stage == Stage.PREPARED:
+            self.mark_group_prepared(group_name)
+            LOG.info("Group %s has met its success criteria and is "
+                     "now set to stage %s", group_name, stage)
+            return True
+
+    def report_group_summary(self):
+        """Reports the status of all groups handled by this deployment"""
+        LOG.info("===== Group Summary =====")
+        for group in self.group_list():
+            LOG.info("  Group %s%s ended with stage: %s",
+                     group.name,
+                     " [Critical]" if group.critical else "",
+                     group.stage)
+        LOG.info("===== End Group Summary =====")
+
+    def report_node_summary(self):
+        """Reports the status of all nodes handled by this deployment"""
+        # Ordered stages
+        stages = [Stage.NOT_STARTED,
+                  Stage.PREPARED,
+                  Stage.DEPLOYED,
+                  Stage.FAILED]
+
+        LOG.info("===== Node Summary =====")
+        for stage in stages:
+            nodes = self.get_nodes(stage=stage)
+            LOG.info("  Nodes %s: %s", stage, ", ".join(nodes))
+        LOG.info("===== End Node Summary =====")
+
     #
     # Methods that support setup of the nodes in groups
     #
@@ -163,6 +228,22 @@ class DeploymentGroupManager:
     # Methods for handling nodes
     #
 
+    def fail_unsuccessful_nodes(self, group, successes):
+        """Fail nodes that were not successful in a group's actionable list
+
+        :param group: the group to check
+        :param successes: the list of successful nodes from processing
+
+        This makes an assumption that all actionable nodes should be in a list
+        of successes if they are to be considered successful. If the success
+        list is empty, all the actionable nodes in the group would be
+        considered failed.
+        """
+        # Mark non-successes as failed
+        failed_nodes = set(group.actionable_nodes).difference(set(successes))
+        for node_name in failed_nodes:
+            self.mark_node_failed(node_name)
+
     def mark_node_deployed(self, node_name):
         """Mark a node as deployed"""
         self._set_node_stage(node_name, Stage.DEPLOYED)
@@ -203,7 +284,7 @@ def _update_group_actionable_nodes(group, known_nodes):
               ", ".join(known_nodes))
 
     group_nodes = set(group.full_nodes)
-    group.actionable_nodes = group_nodes.difference(known_nodes)
+    group.actionable_nodes = list(group_nodes.difference(known_nodes))
    LOG.debug("Group %s set actionable_nodes to %s. "
              "Full node list for this group is %s",
              group.name,
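Taken together, the manager methods above support a per-group prepare/deploy loop. A condensed, hypothetical sketch of how they compose (the authoritative orchestration is _process_deployment_groups in drydock_nodes.py, further down in this change; groups, node_lookup, and prepare_and_deploy are illustrative stand-ins):

    # Condensed sketch, assuming the API added above.
    dgm = DeploymentGroupManager(groups, node_lookup)
    group = dgm.get_next_group(Stage.PREPARED)
    while group is not None:
        # hypothetical helper returning the node names that succeeded
        successes = prepare_and_deploy(group.actionable_nodes)
        for node_name in successes:
            dgm.mark_node_prepared(node_name)
            dgm.mark_node_deployed(node_name)
        dgm.fail_unsuccessful_nodes(group, successes)
        # follow the state transition PREPARED -> DEPLOYED
        dgm.evaluate_group_succ_criteria(group.name, Stage.PREPARED)
        dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)
        group = dgm.get_next_group(Stage.PREPARED)

    dgm.report_group_summary()
    dgm.report_node_summary()
    if dgm.critical_groups_failed():
        raise RuntimeError("a critical group failed")  # workflow-level error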
shipyard_airflow/dags/drydock_deploy_site.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 from airflow.models import DAG
-from airflow.operators import DrydockDeployNodesOperator
-from airflow.operators import DrydockPrepareNodesOperator
+from airflow.operators import DrydockNodesOperator
 from airflow.operators import DrydockPrepareSiteOperator
 from airflow.operators import DrydockVerifySiteOperator
 
@@ -43,15 +42,8 @@ def deploy_site_drydock(parent_dag_name, child_dag_name, args):
         sub_dag_name=child_dag_name,
         dag=dag)
 
-    drydock_prepare_nodes = DrydockPrepareNodesOperator(
-        task_id='prepare_nodes',
-        shipyard_conf=config_path,
-        main_dag_name=parent_dag_name,
-        sub_dag_name=child_dag_name,
-        dag=dag)
-
-    drydock_deploy_nodes = DrydockDeployNodesOperator(
-        task_id='deploy_nodes',
+    drydock_nodes = DrydockNodesOperator(
+        task_id='prepare_and_deploy_nodes',
         shipyard_conf=config_path,
         main_dag_name=parent_dag_name,
         sub_dag_name=child_dag_name,
@@ -59,7 +51,6 @@ def deploy_site_drydock(parent_dag_name, child_dag_name, args):
 
     # Define dependencies
     drydock_prepare_site.set_upstream(drydock_verify_site)
-    drydock_prepare_nodes.set_upstream(drydock_prepare_site)
-    drydock_deploy_nodes.set_upstream(drydock_prepare_nodes)
+    drydock_nodes.set_upstream(drydock_prepare_site)
 
     return dag
shipyard_airflow/plugins/armada_base_operator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import os
 from urllib.parse import urlparse
 
 from airflow.exceptions import AirflowException
@@ -22,9 +21,9 @@ from airflow.utils.decorators import apply_defaults
 import armada.common.client as client
 import armada.common.session as session
 from get_k8s_pod_port_ip import get_pod_port_ip
-from service_endpoint import ucp_service_endpoint
 from service_token import shipyard_service_token
 from ucp_base_operator import UcpBaseOperator
+import service_endpoint
+from xcom_pusher import XcomPusher
 
 LOG = logging.getLogger(__name__)
@@ -42,16 +41,12 @@ class ArmadaBaseOperator(UcpBaseOperator):
 
     @apply_defaults
     def __init__(self,
-                 armada_svc_type='armada',
-                 deckhand_svc_type='deckhand',
                  query={},
                  svc_session=None,
                  svc_token=None,
                  *args, **kwargs):
        """Initialization of ArmadaBaseOperator object.
 
-        :param armada_svc_type: Armada Service Type
-        :param deckhand_svc_type: Deckhand Service Type
         :param query: A dictionary containing explicit query string parameters
         :param svc_session: Keystone Session
         :param svc_token: Keystone Token
@@ -66,8 +61,6 @@ class ArmadaBaseOperator(UcpBaseOperator):
             pod_selector_pattern=[{'pod_pattern': 'armada-api',
                                    'container': 'armada-api'}],
             *args, **kwargs)
-        self.armada_svc_type = armada_svc_type
-        self.deckhand_svc_type = deckhand_svc_type
         self.query = query
         self.svc_session = svc_session
         self.svc_token = svc_token
@@ -81,21 +74,11 @@ class ArmadaBaseOperator(UcpBaseOperator):
         # Logs uuid of action performed by the Operator
         LOG.info("Armada Operator for action %s", self.action_info['id'])
 
-        # Retrieve Endpoint Information
-        armada_svc_endpoint = ucp_service_endpoint(
-            self, svc_type=self.armada_svc_type)
-
         # Set up armada client
-        self.armada_client = self._init_armada_client(armada_svc_endpoint,
-                                                      self.svc_token)
-
-        # Retrieve DeckHand Endpoint Information
-        deckhand_svc_endpoint = ucp_service_endpoint(
-            self, svc_type=self.deckhand_svc_type)
-
-        # Get deckhand design reference url
-        self.deckhand_design_ref = self._init_deckhand_design_ref(
-            deckhand_svc_endpoint)
+        self.armada_client = self._init_armada_client(
+            self.endpoints.endpoint_by_name(service_endpoint.ARMADA),
+            self.svc_token
+        )
 
     @staticmethod
     def _init_armada_client(armada_svc_endpoint, svc_token):
@@ -133,26 +116,6 @@ class ArmadaBaseOperator(UcpBaseOperator):
         else:
             raise AirflowException("Failed to set up Armada client!")
 
-    def _init_deckhand_design_ref(self, deckhand_svc_endpoint):
-
-        LOG.info("Deckhand endpoint is %s", deckhand_svc_endpoint)
-
-        # Form DeckHand Design Reference Path
-        # This URL will be used to retrieve the Site Design YAMLs
-        deckhand_path = "deckhand+" + deckhand_svc_endpoint
-        _deckhand_design_ref = os.path.join(deckhand_path,
-                                            "revisions",
-                                            str(self.revision_id),
-                                            "rendered-documents")
-
-        if _deckhand_design_ref:
-            LOG.info("Design YAMLs will be retrieved from %s",
-                     _deckhand_design_ref)
-
-            return _deckhand_design_ref
-        else:
-            raise AirflowException("Unable to Retrieve Design Reference!")
-
     @get_pod_port_ip('tiller', namespace='kube-system')
     def get_tiller_info(self, pods_ip_port={}):
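The recurring change in the operators above is endpoint resolution: instead of each operator calling ucp_service_endpoint with a svc_type it carried around, operators now ask a shared self.endpoints object for a named endpoint (service_endpoint.ARMADA, .DECKHAND, .DRYDOCK). A hedged sketch of the idea only, not the actual implementation; the only pieces taken from the diff are endpoint_by_name and the constants:

    class ServiceEndpoints:
        """Hypothetical stand-in for the object exposed as self.endpoints."""

        def __init__(self, resolver):
            # resolver: a callable mapping a service name to its URL
            self._resolver = resolver
            self._cache = {}

        def endpoint_by_name(self, name):
            # Resolve each endpoint once and reuse it, so individual
            # operators no longer pass svc_type strings and URLs around.
            if name not in self._cache:
                self._cache[name] = self._resolver(name)
            return self._cache[name]

    # usage mirroring the diff:
    #     self.endpoints.endpoint_by_name(service_endpoint.ARMADA)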
shipyard_airflow/plugins/armada_post_apply.py
@@ -58,7 +58,7 @@ class ArmadaPostApplyOperator(ArmadaBaseOperator):
         try:
             armada_post_apply = self.armada_client.post_apply(
                 manifest=armada_manifest,
-                manifest_ref=self.deckhand_design_ref,
+                manifest_ref=self.design_ref,
                 values=override_values,
                 set=chart_set,
                 query=self.query,
shipyard_airflow/plugins/armada_validate_design.py
@@ -42,8 +42,7 @@ class ArmadaValidateDesignOperator(ArmadaBaseOperator):
         # Validate Site Design
         try:
             post_validate = self.armada_client.post_validate(
-                manifest=self.deckhand_design_ref,
-                timeout=timeout)
+                manifest=self.design_ref, timeout=timeout)
 
         except errors.ClientError as client_error:
             # Dump logs from Armada API pods
shipyard_airflow/plugins/check_k8s_node_status.py
@@ -15,7 +15,6 @@
 import logging
 import time
 
-from airflow.exceptions import AirflowException
 from kubernetes import client, config
 
 
@@ -54,7 +53,7 @@ def check_node_status(time_out, interval):
     # Logs initial state of all nodes in the cluster
     ret_init = v1.list_node(watch=False)
 
-    logging.info("Current state of nodes in Cluster is")
+    logging.info("Current state of nodes in the cluster is")
 
     for i in ret_init.items:
         logging.info("%s\t%s\t%s", i.metadata.name,
@@ -86,7 +85,7 @@ def check_node_status(time_out, interval):
                 cluster_ready = False
 
                 # Print current state of node
-                logging.info("Node %s is not Ready", j.metadata.name)
+                logging.info("Node %s is not ready", j.metadata.name)
                 logging.debug("Current status of %s is %s",
                               j.metadata.name,
                               j.status.conditions[-1].message)
@@ -96,16 +95,18 @@ def check_node_status(time_out, interval):
 
                 logging.info("Node %s is in Ready state", j.metadata.name)
 
-        # Raise Time Out Exception
+        # If any nodes are not ready and the timeout is reached, stop waiting
         if not cluster_ready and i == end_range:
-            raise AirflowException("Timed Out! One or more Nodes fail to "
-                                   "get into Ready State!")
-
-        # Exit loop if Cluster is in Ready state
-        if cluster_ready:
-            logging.info("All nodes are in Ready state")
+            logging.info("Timed Out! One or more Nodes failed to reach ready "
+                         "state")
+            break
+        elif cluster_ready:
+            # Exit loop if Cluster is in Ready state
+            logging.info("All nodes are in ready state")
             break
         else:
             # Back off and check again in next iteration
            logging.info("Wait for %d seconds...", int(interval))
            time.sleep(int(interval))
+
+    # Return the nodes that are not ready.
+    return not_ready_node_list
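With this change, check_node_status reports rather than aborts: a timeout logs a message and the function returns the list of nodes that never reached Ready, leaving the pass/fail decision to the caller. A small usage sketch (the timeout and interval values are illustrative):

    # Wait up to 30 minutes, polling every 30 seconds; the return value is
    # the list of node names that never reached the Ready state.
    not_ready = check_node_status(1800, 30)
    if not_ready:
        # e.g. strip these from a success list, as drydock_nodes.py does
        # later in this change
        print("Nodes not Ready:", ", ".join(not_ready))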
shipyard_airflow/plugins/deckhand_base_operator.py
@@ -19,7 +19,7 @@ from airflow.plugins_manager import AirflowPlugin
 from airflow.exceptions import AirflowException
 
 from deckhand.client import client as deckhand_client
-from service_endpoint import ucp_service_endpoint
+import service_endpoint
 from service_token import shipyard_service_token
 from ucp_base_operator import UcpBaseOperator
 
@@ -41,8 +41,6 @@ class DeckhandBaseOperator(UcpBaseOperator):
                  committed_ver=None,
                  deckhandclient=None,
                  deckhand_client_read_timeout=None,
-                 deckhand_svc_endpoint=None,
-                 deckhand_svc_type='deckhand',
                  revision_id=None,
                  svc_session=None,
                  svc_token=None,
@@ -53,8 +51,6 @@ class DeckhandBaseOperator(UcpBaseOperator):
         :param committed_ver: Last committed version
         :param deckhandclient: An instance of deckhand client
         :param deckhand_client_read_timeout: Deckhand client read timeout
-        :param deckhand_svc_endpoint: Deckhand Service Endpoint
-        :param deckhand_svc_type: Deckhand Service Type
         :param revision_id: Target revision for workflow
         :param svc_session: Keystone Session
         :param svc_token: Keystone Token
@@ -70,8 +66,6 @@ class DeckhandBaseOperator(UcpBaseOperator):
         self.committed_ver = committed_ver
         self.deckhandclient = deckhandclient
         self.deckhand_client_read_timeout = deckhand_client_read_timeout
-        self.deckhand_svc_endpoint = deckhand_svc_endpoint
-        self.deckhand_svc_type = deckhand_svc_type
         self.revision_id = revision_id
         self.svc_session = svc_session
         self.svc_token = svc_token
@@ -96,8 +90,9 @@ class DeckhandBaseOperator(UcpBaseOperator):
                  self.action_info['id'])
 
         # Retrieve Endpoint Information
-        self.deckhand_svc_endpoint = ucp_service_endpoint(
-            self, svc_type=self.deckhand_svc_type)
+        self.deckhand_svc_endpoint = self.endpoints.endpoint_by_name(
+            service_endpoint.DECKHAND
+        )
 
         LOG.info("Deckhand endpoint is %s",
                  self.deckhand_svc_endpoint)
shipyard_airflow/plugins/deployment_configuration_operator.py
@@ -45,7 +45,7 @@ class DeploymentConfigurationOperator(BaseOperator):
         cannot be retrieved
         """
    config_keys_defaults = {
-        "physical_provisioner.deployment_strategy": "all-at-once",
+        "physical_provisioner.deployment_strategy": None,
        "physical_provisioner.deploy_interval": 30,
        "physical_provisioner.deploy_timeout": 3600,
        "physical_provisioner.destroy_interval": 30,
@ -14,7 +14,6 @@
|
|||
import copy
|
||||
import pprint
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
@ -25,9 +24,35 @@ from airflow.utils.decorators import apply_defaults
|
|||
import drydock_provisioner.drydock_client.client as client
|
||||
import drydock_provisioner.drydock_client.session as session
|
||||
from drydock_provisioner import error as errors
|
||||
from service_endpoint import ucp_service_endpoint
|
||||
from service_token import shipyard_service_token
|
||||
from ucp_base_operator import UcpBaseOperator
|
||||
|
||||
try:
|
||||
import service_endpoint
|
||||
except ImportError:
|
||||
from shipyard_airflow.plugins import service_endpoint
|
||||
try:
|
||||
from service_token import shipyard_service_token
|
||||
except ImportError:
|
||||
from shipyard_airflow.plugins.service_token import shipyard_service_token
|
||||
|
||||
try:
|
||||
from ucp_base_operator import UcpBaseOperator
|
||||
except ImportError:
|
||||
from shipyard_airflow.plugins.ucp_base_operator import UcpBaseOperator
|
||||
|
||||
try:
|
||||
from drydock_errors import (
|
||||
DrydockClientUseFailureException,
|
||||
DrydockTaskFailedException,
|
||||
DrydockTaskNotCreatedException,
|
||||
DrydockTaskTimeoutException
|
||||
)
|
||||
except ImportError:
|
||||
from shipyard_airflow.plugins.drydock_errors import (
|
||||
DrydockClientUseFailureException,
|
||||
DrydockTaskFailedException,
|
||||
DrydockTaskNotCreatedException,
|
||||
DrydockTaskTimeoutException
|
||||
)
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@ -44,11 +69,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
|
||||
@apply_defaults
|
||||
def __init__(self,
|
||||
deckhand_design_ref=None,
|
||||
deckhand_svc_type='deckhand',
|
||||
drydock_client=None,
|
||||
drydock_svc_endpoint=None,
|
||||
drydock_svc_type='physicalprovisioner',
|
||||
drydock_task_id=None,
|
||||
node_filter=None,
|
||||
redeploy_server=None,
|
||||
|
@ -57,11 +78,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
*args, **kwargs):
|
||||
"""Initialization of DrydockBaseOperator object.
|
||||
|
||||
:param deckhand_design_ref: A URI reference to the design documents
|
||||
:param deckhand_svc_type: Deckhand Service Type
|
||||
:param drydockclient: An instance of drydock client
|
||||
:param drydock_svc_endpoint: Drydock Service Endpoint
|
||||
:param drydock_svc_type: Drydock Service Type
|
||||
:param drydock_task_id: Drydock Task ID
|
||||
:param node_filter: A filter for narrowing the scope of the task.
|
||||
Valid fields are 'node_names', 'rack_names',
|
||||
|
@ -81,11 +98,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
pod_selector_pattern=[{'pod_pattern': 'drydock-api',
|
||||
'container': 'drydock-api'}],
|
||||
*args, **kwargs)
|
||||
self.deckhand_design_ref = deckhand_design_ref
|
||||
self.deckhand_svc_type = deckhand_svc_type
|
||||
self.drydock_client = drydock_client
|
||||
self.drydock_svc_endpoint = drydock_svc_endpoint
|
||||
self.drydock_svc_type = drydock_svc_type
|
||||
self.drydock_task_id = drydock_task_id
|
||||
self.node_filter = node_filter
|
||||
self.redeploy_server = redeploy_server
|
||||
|
@ -126,8 +139,9 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
% self.__class__.__name__)
|
||||
|
||||
# Retrieve Endpoint Information
|
||||
self.drydock_svc_endpoint = ucp_service_endpoint(
|
||||
self, svc_type=self.drydock_svc_type)
|
||||
self.drydock_svc_endpoint = self.endpoints.endpoint_by_name(
|
||||
service_endpoint.DRYDOCK
|
||||
)
|
||||
|
||||
LOG.info("Drydock endpoint is %s", self.drydock_svc_endpoint)
|
||||
|
||||
|
@ -147,7 +161,9 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
if dd_session:
|
||||
LOG.info("Successfully Set Up DryDock Session")
|
||||
else:
|
||||
raise AirflowException("Failed to set up Drydock Session!")
|
||||
raise DrydockClientUseFailureException(
|
||||
"Failed to set up Drydock Session!"
|
||||
)
|
||||
|
||||
# Use the DrydockSession to build a DrydockClient that can
|
||||
# be used to make one or more API calls
|
||||
|
@ -158,26 +174,9 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
if self.drydock_client:
|
||||
LOG.info("Successfully Set Up DryDock client")
|
||||
else:
|
||||
raise AirflowException("Failed to set up Drydock Client!")
|
||||
|
||||
# Retrieve DeckHand Endpoint Information
|
||||
deckhand_svc_endpoint = ucp_service_endpoint(
|
||||
self, svc_type=self.deckhand_svc_type)
|
||||
|
||||
LOG.info("Deckhand endpoint is %s", deckhand_svc_endpoint)
|
||||
|
||||
# Form DeckHand Design Reference Path
|
||||
# This URL will be used to retrieve the Site Design YAMLs
|
||||
deckhand_path = "deckhand+" + deckhand_svc_endpoint
|
||||
self.deckhand_design_ref = os.path.join(deckhand_path,
|
||||
"revisions",
|
||||
str(self.revision_id),
|
||||
"rendered-documents")
|
||||
if self.deckhand_design_ref:
|
||||
LOG.info("Design YAMLs will be retrieved from %s",
|
||||
self.deckhand_design_ref)
|
||||
else:
|
||||
raise AirflowException("Unable to Retrieve Design Reference!")
|
||||
raise DrydockClientUseFailureException(
|
||||
"Failed to set up Drydock Client!"
|
||||
)
|
||||
|
||||
@shipyard_service_token
|
||||
def _auth_gen(self):
|
||||
|
@ -196,7 +195,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
try:
|
||||
# Create Task
|
||||
create_task_response = self.drydock_client.create_task(
|
||||
design_ref=self.deckhand_design_ref,
|
||||
design_ref=self.design_ref,
|
||||
task_action=task_action,
|
||||
node_filter=self.node_filter)
|
||||
|
||||
|
@ -204,7 +203,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
# Dump logs from Drydock pods
|
||||
self.get_k8s_logs()
|
||||
|
||||
raise AirflowException(client_error)
|
||||
raise DrydockClientUseFailureException(client_error)
|
||||
|
||||
# Retrieve Task ID
|
||||
self.drydock_task_id = create_task_response['task_id']
|
||||
|
@ -216,7 +215,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
if self.drydock_task_id:
|
||||
return self.drydock_task_id
|
||||
else:
|
||||
raise AirflowException("Unable to create task!")
|
||||
raise DrydockTaskNotCreatedException("Unable to create task!")
|
||||
|
||||
def query_task(self, interval, time_out):
|
||||
|
||||
|
@ -235,21 +234,16 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
|
||||
try:
|
||||
# Retrieve current task state
|
||||
task_state = self.drydock_client.get_task(
|
||||
task_id=self.drydock_task_id)
|
||||
task_state = self.get_task_dict(task_id=self.drydock_task_id)
|
||||
|
||||
task_status = task_state['status']
|
||||
task_result = task_state['result']['status']
|
||||
|
||||
LOG.info("Current status of task id %s is %s",
|
||||
self.drydock_task_id, task_status)
|
||||
|
||||
except errors.ClientError as client_error:
|
||||
# Dump logs from Drydock pods
|
||||
except DrydockClientUseFailureException:
|
||||
self.get_k8s_logs()
|
||||
|
||||
raise AirflowException(client_error)
|
||||
|
||||
raise
|
||||
except:
|
||||
# There can be situations where there are intermittent network
|
||||
# issues that prevents us from retrieving the task state. We
|
||||
|
@ -275,6 +269,21 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
else:
|
||||
self.task_failure(True)
|
||||
|
||||
def get_task_dict(self, task_id):
|
||||
"""Retrieve task output in its raw dictionary format
|
||||
|
||||
:param task_id: The id of the task to retrieve
|
||||
Raises DrydockClientUseFailureException if the client raises an
|
||||
exception
|
||||
See:
|
||||
http://att-comdev-drydock.readthedocs.io/en/latest/task.html#task-status-schema
|
||||
"""
|
||||
try:
|
||||
return self.drydock_client.get_task(task_id=task_id)
|
||||
except errors.ClientError as client_error:
|
||||
# Dump logs from Drydock pods
|
||||
raise DrydockClientUseFailureException(client_error)
|
||||
|
||||
def task_failure(self, _task_failure):
|
||||
# Dump logs from Drydock pods
|
||||
self.get_k8s_logs()
|
||||
|
@ -289,7 +298,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
self.all_task_ids = {t['task_id']: t for t in all_tasks}
|
||||
|
||||
except errors.ClientError as client_error:
|
||||
raise AirflowException(client_error)
|
||||
raise DrydockClientUseFailureException(client_error)
|
||||
|
||||
# Retrieve the failed parent task and assign it to list
|
||||
failed_parent_task = (
|
||||
|
@ -299,7 +308,7 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
# Since there is only 1 failed parent task, we will print index 0
|
||||
# of the list
|
||||
if failed_parent_task:
|
||||
LOG.error('%s task has either failed or timed out',
|
||||
LOG.error("%s task has either failed or timed out",
|
||||
failed_parent_task[0]['action'])
|
||||
|
||||
LOG.error(pprint.pprint(failed_parent_task[0]))
|
||||
|
@ -312,9 +321,13 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
|
||||
# Raise Exception to terminate workflow
|
||||
if _task_failure:
|
||||
raise AirflowException("Failed to Execute/Complete Task!")
|
||||
raise DrydockTaskFailedException(
|
||||
"Failed to Execute/Complete Task!"
|
||||
)
|
||||
else:
|
||||
raise AirflowException("Task Execution Timed Out!")
|
||||
raise DrydockTaskTimeoutException(
|
||||
"Task Execution Timed Out!"
|
||||
)
|
||||
|
||||
def check_subtask_failure(self, subtask_id_list):
|
||||
|
||||
|
@ -367,7 +380,9 @@ class DrydockBaseOperator(UcpBaseOperator):
|
|||
subtask_id)
|
||||
|
||||
else:
|
||||
raise AirflowException("Unable to retrieve subtask info!")
|
||||
raise DrydockClientUseFailureException(
|
||||
"Unable to retrieve subtask info!"
|
||||
)
|
||||
|
||||
|
||||
class DrydockBaseOperatorPlugin(AirflowPlugin):
|
||||
|
|
|
shipyard_airflow/plugins/drydock_deploy_nodes.py (deleted)
@@ -1,70 +0,0 @@
-# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import time
-
-from airflow.plugins_manager import AirflowPlugin
-
-from check_k8s_node_status import check_node_status
-from drydock_base_operator import DrydockBaseOperator
-
-LOG = logging.getLogger(__name__)
-
-
-class DrydockDeployNodesOperator(DrydockBaseOperator):
-
-    """Drydock Deploy Nodes Operator
-
-    This operator will trigger drydock to deploy the bare metal
-    nodes
-
-    """
-
-    def do_execute(self):
-
-        # Trigger DryDock to execute task
-        self.create_task('deploy_nodes')
-
-        # Retrieve query interval and timeout
-        q_interval = self.dc['physical_provisioner.deploy_interval']
-        task_timeout = self.dc['physical_provisioner.deploy_timeout']
-
-        # Query Task
-        self.query_task(q_interval, task_timeout)
-
-        # It takes time for the cluster join process to be triggered across
-        # all the nodes in the cluster. Hence there is a need to back off
-        # and wait before checking the state of the cluster join process.
-        join_wait = self.dc['physical_provisioner.join_wait']
-
-        LOG.info("All nodes deployed in MAAS")
-        LOG.info("Wait for %d seconds before checking node state...",
-                 join_wait)
-
-        time.sleep(join_wait)
-
-        # Check that cluster join process is completed before declaring
-        # deploy_node as 'completed'.
-        node_st_timeout = self.dc['kubernetes.node_status_timeout']
-        node_st_interval = self.dc['kubernetes.node_status_interval']
-
-        check_node_status(node_st_timeout, node_st_interval)
-
-
-class DrydockDeployNodesOperatorPlugin(AirflowPlugin):
-
-    """Creates DrydockDeployNodesOperator in Airflow."""
-
-    name = 'drydock_deploy_nodes_operator'
-    operators = [DrydockDeployNodesOperator]
shipyard_airflow/plugins/drydock_errors.py (new)
@@ -0,0 +1,34 @@
+# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Drydock specific exceptions generated during operator execution.
+
+Generally marker exceptions extending AirflowException
+"""
+from airflow.exceptions import AirflowException
+
+
+class DrydockClientUseFailureException(AirflowException):
+    pass
+
+
+class DrydockTaskFailedException(AirflowException):
+    pass
+
+
+class DrydockTaskNotCreatedException(AirflowException):
+    pass
+
+
+class DrydockTaskTimeoutException(AirflowException):
+    pass
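Because each marker class extends AirflowException, callers can single out specific Drydock failure modes while any unhandled case still fails the Airflow task as before. A minimal sketch of that pattern (the query_task call site below is illustrative, not from this change):

    from airflow.exceptions import AirflowException

    from shipyard_airflow.plugins.drydock_errors import (
        DrydockTaskFailedException,
        DrydockTaskTimeoutException
    )

    try:
        operator.query_task(interval=30, time_out=3600)  # illustrative call
    except (DrydockTaskFailedException, DrydockTaskTimeoutException):
        # Tolerated here: success criteria may still be met by the nodes
        # that did succeed (see drydock_nodes.py below).
        pass
    except AirflowException:
        raise  # anything else still fails the workflow step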
shipyard_airflow/plugins/drydock_nodes.py (new)
@@ -0,0 +1,486 @@
+# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare and deploy nodes using Drydock
+
+Uses the deployment strategy named in the deployment-configuration to
+progress through preparation and deployment of nodes in a group-based fashion.
+
+In the case of no specified deployment strategy, an "all-at-once" approach is
+taken, by which all nodes are deployed together.
+
+Historical Note: This operator replaces the function of drydock_prepare_nodes
+and drydock_deploy_nodes operators that existed previously.
+"""
+import logging
+import time
+
+from airflow.exceptions import AirflowException
+from airflow.plugins_manager import AirflowPlugin
+
+from shipyard_airflow.common.deployment_group.deployment_group import Stage
+from shipyard_airflow.common.deployment_group.deployment_group_manager import \
+    DeploymentGroupManager
+from shipyard_airflow.common.deployment_group.node_lookup import NodeLookup
+
+try:
+    import check_k8s_node_status
+except ImportError:
+    from shipyard_airflow.plugins import check_k8s_node_status
+
+try:
+    from drydock_base_operator import DrydockBaseOperator
+except ImportError:
+    from shipyard_airflow.plugins.drydock_base_operator import \
+        DrydockBaseOperator
+
+try:
+    from drydock_errors import (
+        DrydockTaskFailedException,
+        DrydockTaskTimeoutException
+    )
+except ImportError:
+    from shipyard_airflow.plugins.drydock_errors import (
+        DrydockTaskFailedException,
+        DrydockTaskTimeoutException
+    )
+
+LOG = logging.getLogger(__name__)
+
+
+class DrydockNodesOperator(DrydockBaseOperator):
+    """Drydock Nodes Operator
+
+    Using a deployment strategy to calculate the deployment sequence,
+    deploy a series of baremetal nodes using Drydock.
+    """
+
+    def do_execute(self):
+        self._setup_configured_values()
+        # setup self.strat_name and self.strategy
+        self.strategy = {}
+        self._setup_deployment_strategy()
+        dgm = _get_deployment_group_manager(
+            self.strategy['groups'],
+            _get_node_lookup(self.drydock_client, self.design_ref)
+        )
+
+        _process_deployment_groups(dgm,
+                                   self._execute_prepare,
+                                   self._execute_deployment)
+
+        # All groups "complete" (as they're going to be). Report summary
+        dgm.report_group_summary()
+        dgm.report_node_summary()
+        if dgm.critical_groups_failed():
+            raise AirflowException(
+                "One or more deployment groups marked as critical have failed"
+            )
+        else:
+            LOG.info("All critical groups have met their success criteria")
+            # TODO (bryan-strassner) it is very possible that many nodes failed
+            #     deployment, but all critical groups had enough success to
+            #     continue processing. This will be non-obvious to the casual
+            #     observer of the workflow. A likely enhancement is to allow
+            #     notes be added to the shipyard action associated with this
+            #     workflow that would be reported back to the end user doing a
+            #     describe of the action. This will require new database
+            #     structures to hold the notes, and a means to insert the
+            #     notes. A shared functionality in the base ucp operator or a
+            #     common module would be a reasonable way to support this.
+
+    def _setup_configured_values(self):
+        """Sets self.<name> values from the deployment configuration"""
+        # Retrieve query intervals and timeouts
+        # Intervals - How often will something be queried for status.
+        self.dep_interval = self.dc['physical_provisioner.deploy_interval']
+        self.node_st_interval = self.dc['kubernetes.node_status_interval']
+        self.prep_interval = self.dc[
+            'physical_provisioner.prepare_node_interval'
+        ]
+        # Timeouts - Time Shipyard waits for completion of a task.
+        self.dep_timeout = self.dc['physical_provisioner.deploy_timeout']
+        self.node_st_timeout = self.dc['kubernetes.node_status_timeout']
+        self.prep_timeout = self.dc[
+            'physical_provisioner.prepare_node_timeout'
+        ]
+        # The time to wait before querying k8s nodes after Drydock deploy nodes
+        self.join_wait = self.dc['physical_provisioner.join_wait']
+
+    def _execute_prepare(self, group):
+        """Executes the prepare nodes step for the group.
+
+        :param group: the DeploymentGroup to prepare
+        Returns a QueryTaskResult object
+        """
+        LOG.info("Group %s is preparing nodes", group.name)
+
+        self.node_filter = _gen_node_name_filter(group.actionable_nodes)
+        return self._execute_task('prepare_nodes',
+                                  self.prep_interval,
+                                  self.prep_timeout)
+
+    def _execute_deployment(self, group):
+        """Execute the deployment of nodes for the group.
+
+        :param group: The DeploymentGroup to deploy
+        Returns a QueryTaskResult object
+        """
+        LOG.info("Group %s is deploying nodes", group.name)
+
+        self.node_filter = _gen_node_name_filter(group.actionable_nodes)
+        task_result = self._execute_task('deploy_nodes',
+                                         self.dep_interval,
+                                         self.dep_timeout)
+
+        if not task_result.successes:
+            # if there are no successes from Drydock, there is no need to
+            # wait and check on the results from node status.
+            LOG.info("There are no nodes indicated as successful from Drydock."
+                     " Skipping waiting for Kubernetes node join and "
+                     "proceeding to validation")
+            return task_result
+
+        # It takes time for the cluster join process to be triggered across
+        # all the nodes in the cluster. Hence there is a need to back off
+        # and wait before checking the state of the cluster join process.
+        LOG.info("Nodes <%s> reported as deployed in MAAS",
+                 ", ".join(task_result.successes))
+        LOG.info("Waiting for %d seconds before checking node state...",
+                 self.join_wait)
+        time.sleep(self.join_wait)
+
+        # Check that cluster join process is completed before declaring
+        # deploy_node as 'completed'.
+        # This should only include nodes that drydock has indicated as
+        # successful and has passed the join script to.
+        # Anything not ready in the timeout needs to be considered a failure
+        not_ready_list = check_k8s_node_status.check_node_status(
+            self.node_st_timeout,
+            self.node_st_interval
+        )
+        for node in not_ready_list:
+            # Remove nodes that are not ready from the list of successes, since
+            # they did not complete deployment successfully.
+            try:
+                LOG.info("Node %s failed to join the Kubernetes cluster or was"
+                         " not timely enough", node)
+                task_result.successes.remove(node)
+            except ValueError:
+                # This node is not joined, but was not one that we were
+                # looking for either.
+                LOG.info("%s failed to join Kubernetes, but was not in the "
+                         "Drydock results: %s",
+                         node,
+                         ", ".join(task_result.successes))
+        return task_result
+
+    def _execute_task(self, task_name, interval, timeout):
+        """Execute the Drydock task requested
+
+        :param task_name: 'prepare_nodes', 'deploy_nodes'
+        :param interval: The time between checking status on the task
+        :param timeout: The total time allowed for the task
+
+        Wraps the query_task method in the base class, capturing
+        AirflowExceptions and summarizing results into a response
+        QueryTaskResult object
+
+        Note: It does not matter if the task ultimately succeeds or fails in
+        Drydock - the base class will handle all the logging and etc for
+        the purposes of troubleshooting. What matters is the node successes.
+        Following any result of query_task, this code will re-query the task
+        results from Drydock to gather the node successes, placing them into
+        the successes list in the response object. In the case of a failure to
+        get the task results, this workflow must assume that the result is a
+        total loss, and pass back no successes
+        """
+        self.create_task(task_name)
+        result = QueryTaskResult(self.drydock_task_id, task_name)
+
+        try:
+            self.query_task(interval, timeout)
+        except DrydockTaskFailedException:
+            # Task failure may be successful enough based on success criteria.
+            # This should not halt the overall flow of this workflow step.
+            LOG.warn(
+                "Task %s has failed. Logs contain details of the failure. "
+                "Some nodes may be successful, processing continues", task_name
+            )
+        except DrydockTaskTimeoutException:
+            # Task timeout may be successful enough based on success criteria.
+            # This should not halt the overall flow of this workflow step.
+            LOG.warn(
+                "Task %s has timed out after %s seconds. Logs contain details "
+                "of the failure. Some nodes may be successful, processing "
+                "continues", task_name, timeout
+            )
+        # Other AirflowExceptions will fail the whole task - let them do this.
+
+        # find successes
+        result.successes = self._get_successes_for_task(self.drydock_task_id)
+        return result
+
+    def _get_successes_for_task(self, task_id, extend_success=True):
+        """Discover the successful nodes based on the current task id.
+
+        :param task_id: The id of the task
+        :param extend_success: determines if this result extends successes
+            or simply reports on the task.
+        Gets the set of successful nodes by examining the self.drydock_task_id.
+        The children are traversed recursively to display each sub-task's
+        information.
+
+        Only a reported success at the parent task indicates success of the
+        task. Drydock is assumed to roll up overall success to the top level.
+        """
+        success_nodes = []
+        task_dict = self.get_task_dict(task_id)
+        task_status = task_dict.get('status', "Not Specified")
+        task_result = task_dict.get('result')
+        if task_result is None:
+            LOG.warn("Task result is missing for task %s, with status %s."
+                     " Neither successes nor further details can be extracted"
+                     " from this result",
+                     task_id, task_status)
+        else:
+            if extend_success:
+                try:
+                    # successes and failures on the task result drive the
+                    # interpretation of success or failure for this workflow.
+                    #  - Any node that is _only_ success for a task is a
+                    #    success to us.
+                    #  - Any node that is listed as a failure is a failure.
+                    #    This implies that a node listed as a success and a
+                    #    failure is a failure. E.g. some subtasks succeeded and
+                    #    some failed
+                    t_successes = task_result.get('successes', [])
+                    t_failures = task_result.get('failures', [])
+                    actual_successes = set(t_successes) - set(t_failures)
+                    # acquire the successes from success nodes
+                    success_nodes.extend(actual_successes)
+                    LOG.info("Nodes <%s> added as successes for task %s",
+                             ", ".join(success_nodes), task_id)
+                except KeyError:
+                    # missing key on the path to getting nodes - don't add any
+                    LOG.warn("Missing successes field on result of task %s, "
+                             "but a success field was expected. No successes"
+                             " can be extracted from this result",
+                             task_id)
+                    pass
+            _report_task_info(task_id, task_result, task_status)
+
+        # for each child, report only the step info, do not add to overall
+        # success list.
+        for ch_task_id in task_dict.get('subtask_id_list', []):
+            success_nodes.extend(
+                self._get_successes_for_task(ch_task_id, extend_success=False)
+            )
+        # deduplicate and return
+        return set(success_nodes)
+
+    def _setup_deployment_strategy(self):
+        """Determine the deployment strategy
+
+        Uses the specified strategy from the deployment configuration
+        or returns a default configuration of 'all-at-once'
+        """
+        self.strat_name = self.dc['physical_provisioner.deployment_strategy']
+        if self.strat_name:
+            # if there is a deployment strategy specified, get it and use it
+            self.strategy = self.get_unique_doc(
+                name=self.strat_name,
+                schema="shipyard/DeploymentStrategy/v1"
+            )
+        else:
+            # The default behavior is to deploy all nodes, and fail if
+            # any nodes fail to deploy.
+            self.strat_name = 'all-at-once (defaulted)'
+            self.strategy = _default_deployment_strategy()
+        LOG.info("Strategy Name: %s has %s groups",
+                 self.strat_name,
+                 len(self.strategy.get('groups', [])))
+
+
+#
+# Functions supporting the nodes operator class
+#
+
+def _get_node_lookup(drydock_client, design_ref):
+    """Return a NodeLookup suitable for the DeploymentGroupManager
+
+    :param drydock_client: the drydock_client object
+    :param design_ref: the design_ref for the NodeLookup
+    """
+    return NodeLookup(drydock_client, design_ref).lookup
+
+
+def _get_deployment_group_manager(groups_dict_list, node_lookup):
+    """Return a DeploymentGroupManager suitable for managing this deployment
+
+    :param groups_dict_list: the list of group dictionaries to use
+    :param node_lookup: a NodeLookup object that will be used by this
+        DeploymentGroupManager
+    """
+    return DeploymentGroupManager(groups_dict_list, node_lookup)
+
+
+def _process_deployment_groups(dgm, prepare_func, deploy_func):
+    """Executes the deployment group deployments
+
+    :param dgm: the DeploymentGroupManager object that manages the
+        dependency chain of groups
+    :param prepare_func: a function that accepts a DeploymentGroup and returns
+        a QueryTaskResult with the purpose of preparing nodes
+    :param deploy_func: a function that accepts a DeploymentGroup and returns
+        a QueryTaskResult with the purpose of deploying nodes
+    """
+    complete = False
+    while not complete:
+        # Find the next group to be prepared. Prepare and deploy it.
+        group = dgm.get_next_group(Stage.PREPARED)
+        if group is None:
+            LOG.info("There are no more groups eligible to process")
+            # whether or not really complete, the processing loop is done.
+            complete = True
+            continue
+
+        LOG.info("*** Deployment Group: %s is being processed ***", group.name)
+        if not group.actionable_nodes:
+            LOG.info("There were no actionable nodes for group %s. It is "
+                     "possible that all nodes: [%s] have previously been "
+                     "deployed. Group will be immediately checked "
+                     "against its success criteria", group.name,
+                     ", ".join(group.full_nodes))
+
+            # In the case of a group having no actionable nodes, since groups
+            # prepare -> deploy in direct sequence, we can check against
+            # deployment, since all nodes would need to be deployed or have
+            # been attempted. Need to follow the state-transition, so
+            # PREPARED -> DEPLOYED
+            dgm.evaluate_group_succ_criteria(group.name, Stage.PREPARED)
+            dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)
+            # success or failure, move on to next group
+            continue
+
+        LOG.info("%s has actionable nodes: [%s]", group.name,
+                 ", ".join(group.actionable_nodes))
+        if len(group.actionable_nodes) < len(group.full_nodes):
+            LOG.info("Some nodes are not actionable because they were "
+                     "included in a prior group, but will be considered in "
+                     "the success criteria calculation for this group")
+
+        # Group has actionable nodes.
+        # Prepare Nodes for group, store QueryTaskResults
+        prep_qtr = prepare_func(group)
+        # Mark successes as prepared
+        for node_name in prep_qtr.successes:
+            dgm.mark_node_prepared(node_name)
+
+        dgm.fail_unsuccessful_nodes(group, prep_qtr.successes)
+        should_deploy = dgm.evaluate_group_succ_criteria(group.name,
+                                                         Stage.PREPARED)
+        if not should_deploy:
+            # group has failed, move on to next group. Current group has
+            # been marked as failed.
+            continue
+
+        # Continue with deployment
+        dep_qtr = deploy_func(group)
+        # Mark successes as deployed
+        for node_name in dep_qtr.successes:
+            dgm.mark_node_deployed(node_name)
+        dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
+        dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)
+
+
+def _report_task_info(task_id, task_result, task_status):
+    """Logs information regarding a task.
+
+    :param task_id: id of the task
+    :param task_result: The result dictionary of the task
+    :param task_status: The status for the task
+    """
+    # setup fields, or defaults if missing values
+    task_failures = task_result.get('failures', [])
+    task_successes = task_result.get('successes', [])
+    result_details = task_result.get('details', {'messageList': []})
+    result_status = task_result.get('status', "No status supplied")
+    LOG.info("Task %s with status %s/%s reports successes: [%s] and"
+             " failures: [%s]", task_id, task_status, result_status,
+             ", ".join(task_successes), ", ".join(task_failures))
+    for message_item in result_details['messageList']:
+        context_type = message_item.get('context_type', 'N/A')
+        context_id = message_item.get('context', 'N/A')
+        message = message_item.get('message', "No message text supplied")
+        error = message_item.get('error', False)
+        timestamp = message_item.get('ts', 'No timestamp supplied')
+        LOG.info(" - Task %s for item %s:%s has message: %s [err=%s, at %s]",
+                 task_id, context_type, context_id, message, error, timestamp)
+
+
+def _default_deployment_strategy():
+    """The default deployment strategy for 'all-at-once'"""
+    return {
+        'groups': [
+            {
+                'name': 'default',
+                'critical': True,
+                'depends_on': [],
+                'selectors': [
+                    {
+                        'node_names': [],
+                        'node_labels': [],
+                        'node_tags': [],
+                        'rack_names': [],
+                    },
+                ],
+                'success_criteria': {
+                    'percent_successful_nodes': 100
+                },
+            }
+        ]
+    }
+
+
+def _gen_node_name_filter(node_names):
+    """Generates a drydock compatible node filter using only node names
+
+    :param node_names: the nodes with which to create a filter
+    """
+    return {
+        'filter_set_type': 'union',
+        'filter_set': [
+            {
+                'filter_type': 'union',
+                'node_names': node_names
+            }
+        ]
+    }
+
+
+class QueryTaskResult:
+    """Represents a summarized query result from a task"""
+    def __init__(self, task_id, task_name):
+        self.task_id = task_id
+        self.task_name = task_name
+        # The succeeded node names
+        self.successes = []
+
+
+class DrydockNodesOperatorPlugin(AirflowPlugin):
+
+    """Creates DrydockNodesOperator in Airflow."""
+
+    name = 'drydock_nodes_operator'
+    operators = [DrydockNodesOperator]
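A worked example of the node filter that _gen_node_name_filter (above) produces for Drydock; the node names are hypothetical:

    node_filter = _gen_node_name_filter(['node01', 'node02'])
    # node_filter ==
    # {'filter_set_type': 'union',
    #  'filter_set': [{'filter_type': 'union',
    #                  'node_names': ['node01', 'node02']}]}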
shipyard_airflow/plugins/drydock_prepare_nodes.py (deleted)
@@ -1,46 +0,0 @@
-# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from airflow.plugins_manager import AirflowPlugin
-
-from drydock_base_operator import DrydockBaseOperator
-
-
-class DrydockPrepareNodesOperator(DrydockBaseOperator):
-
-    """Drydock Prepare Nodes Operator
-
-    This operator will trigger drydock to prepare nodes for
-    site deployment
-
-    """
-
-    def do_execute(self):
-
-        # Trigger DryDock to execute task
-        self.create_task('prepare_nodes')
-
-        # Retrieve query interval and timeout
-        q_interval = self.dc['physical_provisioner.prepare_node_interval']
-        task_timeout = self.dc['physical_provisioner.prepare_node_timeout']
-
-        # Query Task
-        self.query_task(q_interval, task_timeout)
-
-
-class DrydockPrepareNodesOperatorPlugin(AirflowPlugin):
-
-    """Creates DrydockPrepareNodesOperator in Airflow."""
-
-    name = 'drydock_prepare_nodes_operator'
-    operators = [DrydockPrepareNodesOperator]
shipyard_airflow/plugins/drydock_validate_design.py
@@ -49,7 +49,7 @@ class DrydockValidateDesignOperator(DrydockBaseOperator):
 
         payload = {
             'rel': "design",
-            'href': self.deckhand_design_ref,
+            'href': self.design_ref,
             'type': "application/x-yaml"
         }
 
shipyard_airflow/plugins/promenade_base_operator.py
@@ -12,16 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import os
 
 from airflow.utils.decorators import apply_defaults
 from airflow.plugins_manager import AirflowPlugin
 from airflow.exceptions import AirflowException
 
 try:
-    from service_endpoint import ucp_service_endpoint
+    import service_endpoint
 except ImportError:
-    from shipyard_airflow.plugins.service_endpoint import ucp_service_endpoint
+    from shipyard_airflow.plugins import service_endpoint
 
 try:
     from service_token import shipyard_service_token
 
@@ -47,19 +46,11 @@ class PromenadeBaseOperator(UcpBaseOperator):