Workflow to support deployment groups

Updates the Shipyard/Airflow workflow for deploy_site and
update_site to use the deployment group/deployment strategy
information from the design.

This allows for baremetal nodes to be deployed in a design-
specified order, with criticality and success criteria driving
the success and failure of deployment.

Includes refactoring of service endpoints to reduce the need
for so much data passing.

Change-Id: Ib5e9fca535ca74d1819fe46959695acfed5b65c2
changes/35/569835/16
Bryan Strassner 2018-05-10 17:15:59 -05:00
parent ea47f2c77b
commit 04906cce68
29 changed files with 1391 additions and 345 deletions

2
.gitignore vendored
View File

@ -2,7 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
**/.pytest_cache/
.pytest_cache/
# C extensions

View File

@ -79,9 +79,9 @@ run:
.PHONY: build_airflow
build_airflow:
ifeq ($(USE_PROXY), true)
docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile $(IMAGE_DIR) --build-arg http_proxy=$(PROXY) --build-arg https_proxy=$(PROXY)
docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile --build-arg http_proxy=$(PROXY) --build-arg https_proxy=$(PROXY) --build-arg ctx_base=$(BUILD_CTX) .
else
docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile $(IMAGE_DIR)
docker build --network host -t $(IMAGE) --label $(LABEL) -f $(IMAGE_DIR)/Dockerfile --build-arg ctx_base=$(BUILD_CTX) .
endif
ifeq ($(PUSH_IMAGE), true)
docker push $(IMAGE)

View File

@ -32,6 +32,7 @@ EXPOSE $WORKER_PORT
# Set ARG for usage during build
ARG AIRFLOW_HOME=/usr/local/airflow
ARG DEBIAN_FRONTEND=noninteractive
ARG ctx_base=src/bin
# Kubectl version
ARG KUBECTL_VERSION=1.8.6
@ -76,17 +77,30 @@ RUN useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \
# Dependency requirements
# Note - removing snakebite (python 2 vs. 3). See:
# https://github.com/puckel/docker-airflow/issues/77
COPY ./requirements.txt /tmp/
COPY images/airflow/requirements.txt /tmp/
RUN pip3 install -r /tmp/requirements.txt \
&& pip3 uninstall -y snakebite || true
# Copy scripts used in the container:
# entrypoint.sh, airflow_start_service.sh and airflow_logrotate.sh
COPY script/*.sh ${AIRFLOW_HOME}/
COPY images/airflow/script/*.sh ${AIRFLOW_HOME}/
# Change permissions
RUN chown -R airflow: ${AIRFLOW_HOME}
# Shipyard
#
# Shipyard provides core functionality used by the airflow plugins/operators
# Since Shipyard and Airflow are built together as images, this should prevent
# stale or out-of-date code between these parts.
# Shipyard requirements, source and installation
COPY ${ctx_base}/shipyard_airflow/requirements.txt /tmp/api_requirements.txt
RUN pip3 install -r /tmp/api_requirements.txt
COPY ${ctx_base}/shipyard_airflow /tmp/shipyard/
RUN cd /tmp/shipyard \
&& python3 setup.py install
# Set work directory
USER airflow
WORKDIR ${AIRFLOW_HOME}

View File

@ -82,6 +82,71 @@ class DeploymentGroupManager:
return self._all_groups[group]
return None
def group_list(self):
"""Return a list of DeploymentGroup objects in group order"""
summary = []
for group_nm in self._group_order:
group = self._all_groups[group_nm]
summary.append(group)
return summary
def critical_groups_failed(self):
"""Return True if any critical groups have failed"""
for group in self._all_groups.values():
if group.stage == Stage.FAILED and group.critical:
return True
return False
def evaluate_group_succ_criteria(self, group_name, stage):
"""Checks a group against its success criteria for a stage
:param group_name: the name of the group to check
:param stage: Stage.PREPARED or Stage.DEPLOYED
Returns a boolean: True = success, False = failure.
"""
failed_criteria = self.get_group_failures_for_stage(group_name, stage)
if failed_criteria:
# Logging of criteria has already occurred during checking.
self.mark_group_failed(group_name)
LOG.info("Group %s has failed to meet its success criteria while "
"trying to move to stage: %s",
group_name, stage)
return False
elif stage == Stage.DEPLOYED:
self.mark_group_deployed(group_name)
LOG.info("Group %s has met its success criteria and is "
"successfully deployed (%s)", group_name, stage)
return True
elif stage == Stage.PREPARED:
self.mark_group_prepared(group_name)
LOG.info("Group %s has met its success criteria and is "
"now set to stage %s", group_name, stage)
return True
def report_group_summary(self):
"""Reports the status of all groups handled by this deployment"""
LOG.info("===== Group Summary =====")
for group in self.group_list():
LOG.info(" Group %s%s ended with stage: %s",
group.name,
" [Critical]" if group.critical else "",
group.stage)
LOG.info("===== End Group Summary =====")
def report_node_summary(self):
"""Reports the status of all nodes handled by this deployment"""
# Ordered stages
stages = [Stage.NOT_STARTED,
Stage.PREPARED,
Stage.DEPLOYED,
Stage.FAILED]
LOG.info("===== Node Summary =====")
for stage in stages:
nodes = self.get_nodes(stage=stage)
LOG.info(" Nodes %s: %s", stage, ", ".join(nodes))
LOG.info("===== End Node Summary =====")
#
# Methods that support setup of the nodes in groups
#
@ -163,6 +228,22 @@ class DeploymentGroupManager:
# Methods for handling nodes
#
def fail_unsuccessful_nodes(self, group, successes):
"""Fail nodes that were not successful in a group's actionable list
:param group: the group to check
:param successes: the list of successful nodes from processing
This makes an assumption that all actionable nodes should be in a list
of successes if they are to be considered successful. If the success
list is empty, all the actionable nodes in the group would be
considered failed.
"""
# Mark non-successes as failed
failed_nodes = set(group.actionable_nodes).difference(set(successes))
for node_name in failed_nodes:
self.mark_node_failed(node_name)
def mark_node_deployed(self, node_name):
"""Mark a node as deployed"""
self._set_node_stage(node_name, Stage.DEPLOYED)
@ -203,7 +284,7 @@ def _update_group_actionable_nodes(group, known_nodes):
", ".join(known_nodes))
group_nodes = set(group.full_nodes)
group.actionable_nodes = group_nodes.difference(known_nodes)
group.actionable_nodes = list(group_nodes.difference(known_nodes))
LOG.debug("Group %s set actionable_nodes to %s. "
"Full node list for this group is %s",
group.name,

View File

@ -13,8 +13,7 @@
# limitations under the License.
from airflow.models import DAG
from airflow.operators import DrydockDeployNodesOperator
from airflow.operators import DrydockPrepareNodesOperator
from airflow.operators import DrydockNodesOperator
from airflow.operators import DrydockPrepareSiteOperator
from airflow.operators import DrydockVerifySiteOperator
@ -43,15 +42,8 @@ def deploy_site_drydock(parent_dag_name, child_dag_name, args):
sub_dag_name=child_dag_name,
dag=dag)
drydock_prepare_nodes = DrydockPrepareNodesOperator(
task_id='prepare_nodes',
shipyard_conf=config_path,
main_dag_name=parent_dag_name,
sub_dag_name=child_dag_name,
dag=dag)
drydock_deploy_nodes = DrydockDeployNodesOperator(
task_id='deploy_nodes',
drydock_nodes = DrydockNodesOperator(
task_id='prepare_and_deploy_nodes',
shipyard_conf=config_path,
main_dag_name=parent_dag_name,
sub_dag_name=child_dag_name,
@ -59,7 +51,6 @@ def deploy_site_drydock(parent_dag_name, child_dag_name, args):
# Define dependencies
drydock_prepare_site.set_upstream(drydock_verify_site)
drydock_prepare_nodes.set_upstream(drydock_prepare_site)
drydock_deploy_nodes.set_upstream(drydock_prepare_nodes)
drydock_nodes.set_upstream(drydock_prepare_site)
return dag

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from urllib.parse import urlparse
from airflow.exceptions import AirflowException
@ -22,9 +21,9 @@ from airflow.utils.decorators import apply_defaults
import armada.common.client as client
import armada.common.session as session
from get_k8s_pod_port_ip import get_pod_port_ip
from service_endpoint import ucp_service_endpoint
from service_token import shipyard_service_token
from ucp_base_operator import UcpBaseOperator
import service_endpoint
from xcom_pusher import XcomPusher
LOG = logging.getLogger(__name__)
@ -42,16 +41,12 @@ class ArmadaBaseOperator(UcpBaseOperator):
@apply_defaults
def __init__(self,
armada_svc_type='armada',
deckhand_svc_type='deckhand',
query={},
svc_session=None,
svc_token=None,
*args, **kwargs):
"""Initialization of ArmadaBaseOperator object.
:param armada_svc_type: Armada Service Type
:param deckhand_svc_type: Deckhand Service Type
:param query: A dictionary containing explicit query string parameters
:param svc_session: Keystone Session
:param svc_token: Keystone Token
@ -66,8 +61,6 @@ class ArmadaBaseOperator(UcpBaseOperator):
pod_selector_pattern=[{'pod_pattern': 'armada-api',
'container': 'armada-api'}],
*args, **kwargs)
self.armada_svc_type = armada_svc_type
self.deckhand_svc_type = deckhand_svc_type
self.query = query
self.svc_session = svc_session
self.svc_token = svc_token
@ -81,21 +74,11 @@ class ArmadaBaseOperator(UcpBaseOperator):
# Logs uuid of action performed by the Operator
LOG.info("Armada Operator for action %s", self.action_info['id'])
# Retrieve Endpoint Information
armada_svc_endpoint = ucp_service_endpoint(
self, svc_type=self.armada_svc_type)
# Set up armada client
self.armada_client = self._init_armada_client(armada_svc_endpoint,
self.svc_token)
# Retrieve DeckHand Endpoint Information
deckhand_svc_endpoint = ucp_service_endpoint(
self, svc_type=self.deckhand_svc_type)
# Get deckhand design reference url
self.deckhand_design_ref = self._init_deckhand_design_ref(
deckhand_svc_endpoint)
self.armada_client = self._init_armada_client(
self.endpoints.endpoint_by_name(service_endpoint.ARMADA),
self.svc_token
)
@staticmethod
def _init_armada_client(armada_svc_endpoint, svc_token):
@ -133,26 +116,6 @@ class ArmadaBaseOperator(UcpBaseOperator):
else:
raise AirflowException("Failed to set up Armada client!")
def _init_deckhand_design_ref(self, deckhand_svc_endpoint):
LOG.info("Deckhand endpoint is %s", deckhand_svc_endpoint)
# Form DeckHand Design Reference Path
# This URL will be used to retrieve the Site Design YAMLs
deckhand_path = "deckhand+" + deckhand_svc_endpoint
_deckhand_design_ref = os.path.join(deckhand_path,
"revisions",
str(self.revision_id),
"rendered-documents")
if _deckhand_design_ref:
LOG.info("Design YAMLs will be retrieved from %s",
_deckhand_design_ref)
return _deckhand_design_ref
else:
raise AirflowException("Unable to Retrieve Design Reference!")
@get_pod_port_ip('tiller', namespace='kube-system')
def get_tiller_info(self, pods_ip_port={}):

View File

@ -58,7 +58,7 @@ class ArmadaPostApplyOperator(ArmadaBaseOperator):
try:
armada_post_apply = self.armada_client.post_apply(
manifest=armada_manifest,
manifest_ref=self.deckhand_design_ref,
manifest_ref=self.design_ref,
values=override_values,
set=chart_set,
query=self.query,

View File

@ -42,8 +42,7 @@ class ArmadaValidateDesignOperator(ArmadaBaseOperator):
# Validate Site Design
try:
post_validate = self.armada_client.post_validate(
manifest=self.deckhand_design_ref,
timeout=timeout)
manifest=self.design_ref, timeout=timeout)
except errors.ClientError as client_error:
# Dump logs from Armada API pods

View File

@ -15,7 +15,6 @@
import logging
import time
from airflow.exceptions import AirflowException
from kubernetes import client, config
@ -54,7 +53,7 @@ def check_node_status(time_out, interval):
# Logs initial state of all nodes in the cluster
ret_init = v1.list_node(watch=False)
logging.info("Current state of nodes in Cluster is")
logging.info("Current state of nodes in the cluster is")
for i in ret_init.items:
logging.info("%s\t%s\t%s", i.metadata.name,
@ -86,7 +85,7 @@ def check_node_status(time_out, interval):
cluster_ready = False
# Print current state of node
logging.info("Node %s is not Ready", j.metadata.name)
logging.info("Node %s is not ready", j.metadata.name)
logging.debug("Current status of %s is %s",
j.metadata.name,
j.status.conditions[-1].message)
@ -96,16 +95,18 @@ def check_node_status(time_out, interval):
logging.info("Node %s is in Ready state", j.metadata.name)
# Raise Time Out Exception
# If any nodes are not ready and the timeout is reached, stop waiting
if not cluster_ready and i == end_range:
raise AirflowException("Timed Out! One or more Nodes fail to "
"get into Ready State!")
# Exit loop if Cluster is in Ready state
if cluster_ready:
logging.info("All nodes are in Ready state")
logging.info("Timed Out! One or more Nodes failed to reach ready "
"state")
break
elif cluster_ready:
# Exit loop if Cluster is in Ready state
logging.info("All nodes are in ready state")
break
else:
# Back off and check again in next iteration
logging.info("Wait for %d seconds...", int(interval))
time.sleep(int(interval))
# Return the nodes that are not ready.
return not_ready_node_list

View File

@ -19,7 +19,7 @@ from airflow.plugins_manager import AirflowPlugin
from airflow.exceptions import AirflowException
from deckhand.client import client as deckhand_client
from service_endpoint import ucp_service_endpoint
import service_endpoint
from service_token import shipyard_service_token
from ucp_base_operator import UcpBaseOperator
@ -41,8 +41,6 @@ class DeckhandBaseOperator(UcpBaseOperator):
committed_ver=None,
deckhandclient=None,
deckhand_client_read_timeout=None,
deckhand_svc_endpoint=None,
deckhand_svc_type='deckhand',
revision_id=None,
svc_session=None,
svc_token=None,
@ -53,8 +51,6 @@ class DeckhandBaseOperator(UcpBaseOperator):
:param committed_ver: Last committed version
:param deckhandclient: An instance of deckhand client
:param deckhand_client_read_timeout: Deckhand client connect timeout
:param deckhand_svc_endpoint: Deckhand Service Endpoint
:param deckhand_svc_type: Deckhand Service Type
:param revision_id: Target revision for workflow
:param svc_session: Keystone Session
:param svc_token: Keystone Token
@ -70,8 +66,6 @@ class DeckhandBaseOperator(UcpBaseOperator):
self.committed_ver = committed_ver
self.deckhandclient = deckhandclient
self.deckhand_client_read_timeout = deckhand_client_read_timeout
self.deckhand_svc_endpoint = deckhand_svc_endpoint
self.deckhand_svc_type = deckhand_svc_type
self.revision_id = revision_id
self.svc_session = svc_session
self.svc_token = svc_token
@ -96,8 +90,9 @@ class DeckhandBaseOperator(UcpBaseOperator):
self.action_info['id'])
# Retrieve Endpoint Information
self.deckhand_svc_endpoint = ucp_service_endpoint(
self, svc_type=self.deckhand_svc_type)
self.deckhand_svc_endpoint = self.endpoints.endpoint_by_name(
service_endpoint.DECKHAND
)
LOG.info("Deckhand endpoint is %s",
self.deckhand_svc_endpoint)

View File

@ -45,7 +45,7 @@ class DeploymentConfigurationOperator(BaseOperator):
cannot be retrieved
"""
config_keys_defaults = {
"physical_provisioner.deployment_strategy": "all-at-once",
"physical_provisioner.deployment_strategy": None,
"physical_provisioner.deploy_interval": 30,
"physical_provisioner.deploy_timeout": 3600,
"physical_provisioner.destroy_interval": 30,

View File

@ -14,7 +14,6 @@
import copy
import pprint
import logging
import os
import time
from urllib.parse import urlparse
@ -25,9 +24,35 @@ from airflow.utils.decorators import apply_defaults
import drydock_provisioner.drydock_client.client as client
import drydock_provisioner.drydock_client.session as session
from drydock_provisioner import error as errors
from service_endpoint import ucp_service_endpoint
from service_token import shipyard_service_token
from ucp_base_operator import UcpBaseOperator
try:
import service_endpoint
except ImportError:
from shipyard_airflow.plugins import service_endpoint
try:
from service_token import shipyard_service_token
except ImportError:
from shipyard_airflow.plugins.service_token import shipyard_service_token
try:
from ucp_base_operator import UcpBaseOperator
except ImportError:
from shipyard_airflow.plugins.ucp_base_operator import UcpBaseOperator
try:
from drydock_errors import (
DrydockClientUseFailureException,
DrydockTaskFailedException,
DrydockTaskNotCreatedException,
DrydockTaskTimeoutException
)
except ImportError:
from shipyard_airflow.plugins.drydock_errors import (
DrydockClientUseFailureException,
DrydockTaskFailedException,
DrydockTaskNotCreatedException,
DrydockTaskTimeoutException
)
LOG = logging.getLogger(__name__)
@ -44,11 +69,7 @@ class DrydockBaseOperator(UcpBaseOperator):
@apply_defaults
def __init__(self,
deckhand_design_ref=None,
deckhand_svc_type='deckhand',
drydock_client=None,
drydock_svc_endpoint=None,
drydock_svc_type='physicalprovisioner',
drydock_task_id=None,
node_filter=None,
redeploy_server=None,
@ -57,11 +78,7 @@ class DrydockBaseOperator(UcpBaseOperator):
*args, **kwargs):
"""Initialization of DrydockBaseOperator object.
:param deckhand_design_ref: A URI reference to the design documents
:param deckhand_svc_type: Deckhand Service Type
:param drydockclient: An instance of drydock client
:param drydock_svc_endpoint: Drydock Service Endpoint
:param drydock_svc_type: Drydock Service Type
:param drydock_task_id: Drydock Task ID
:param node_filter: A filter for narrowing the scope of the task.
Valid fields are 'node_names', 'rack_names',
@ -81,11 +98,7 @@ class DrydockBaseOperator(UcpBaseOperator):
pod_selector_pattern=[{'pod_pattern': 'drydock-api',
'container': 'drydock-api'}],
*args, **kwargs)
self.deckhand_design_ref = deckhand_design_ref
self.deckhand_svc_type = deckhand_svc_type
self.drydock_client = drydock_client
self.drydock_svc_endpoint = drydock_svc_endpoint
self.drydock_svc_type = drydock_svc_type
self.drydock_task_id = drydock_task_id
self.node_filter = node_filter
self.redeploy_server = redeploy_server
@ -126,8 +139,9 @@ class DrydockBaseOperator(UcpBaseOperator):
% self.__class__.__name__)
# Retrieve Endpoint Information
self.drydock_svc_endpoint = ucp_service_endpoint(
self, svc_type=self.drydock_svc_type)
self.drydock_svc_endpoint = self.endpoints.endpoint_by_name(
service_endpoint.DRYDOCK
)
LOG.info("Drydock endpoint is %s", self.drydock_svc_endpoint)
@ -147,7 +161,9 @@ class DrydockBaseOperator(UcpBaseOperator):
if dd_session:
LOG.info("Successfully Set Up DryDock Session")
else:
raise AirflowException("Failed to set up Drydock Session!")
raise DrydockClientUseFailureException(
"Failed to set up Drydock Session!"
)
# Use the DrydockSession to build a DrydockClient that can
# be used to make one or more API calls
@ -158,26 +174,9 @@ class DrydockBaseOperator(UcpBaseOperator):
if self.drydock_client:
LOG.info("Successfully Set Up DryDock client")
else:
raise AirflowException("Failed to set up Drydock Client!")
# Retrieve DeckHand Endpoint Information
deckhand_svc_endpoint = ucp_service_endpoint(
self, svc_type=self.deckhand_svc_type)
LOG.info("Deckhand endpoint is %s", deckhand_svc_endpoint)
# Form DeckHand Design Reference Path
# This URL will be used to retrieve the Site Design YAMLs
deckhand_path = "deckhand+" + deckhand_svc_endpoint
self.deckhand_design_ref = os.path.join(deckhand_path,
"revisions",
str(self.revision_id),
"rendered-documents")
if self.deckhand_design_ref:
LOG.info("Design YAMLs will be retrieved from %s",
self.deckhand_design_ref)
else:
raise AirflowException("Unable to Retrieve Design Reference!")
raise DrydockClientUseFailureException(
"Failed to set up Drydock Client!"
)
@shipyard_service_token
def _auth_gen(self):
@ -196,7 +195,7 @@ class DrydockBaseOperator(UcpBaseOperator):
try:
# Create Task
create_task_response = self.drydock_client.create_task(
design_ref=self.deckhand_design_ref,
design_ref=self.design_ref,
task_action=task_action,
node_filter=self.node_filter)
@ -204,7 +203,7 @@ class DrydockBaseOperator(UcpBaseOperator):
# Dump logs from Drydock pods
self.get_k8s_logs()
raise AirflowException(client_error)
raise DrydockClientUseFailureException(client_error)
# Retrieve Task ID
self.drydock_task_id = create_task_response['task_id']
@ -216,7 +215,7 @@ class DrydockBaseOperator(UcpBaseOperator):
if self.drydock_task_id:
return self.drydock_task_id
else:
raise AirflowException("Unable to create task!")
raise DrydockTaskNotCreatedException("Unable to create task!")
def query_task(self, interval, time_out):
@ -235,21 +234,16 @@ class DrydockBaseOperator(UcpBaseOperator):
try:
# Retrieve current task state
task_state = self.drydock_client.get_task(
task_id=self.drydock_task_id)
task_state = self.get_task_dict(task_id=self.drydock_task_id)
task_status = task_state['status']
task_result = task_state['result']['status']
LOG.info("Current status of task id %s is %s",
self.drydock_task_id, task_status)
except errors.ClientError as client_error:
# Dump logs from Drydock pods
except DrydockClientUseFailureException:
self.get_k8s_logs()
raise AirflowException(client_error)
raise
except:
# There can be situations where there are intermittent network
# issues that prevents us from retrieving the task state. We
@ -275,6 +269,21 @@ class DrydockBaseOperator(UcpBaseOperator):
else:
self.task_failure(True)
def get_task_dict(self, task_id):
"""Retrieve task output in its raw dictionary format
:param task_id: The id of the task to retrieve
Raises DrydockClientUseFailureException if the client raises an
exception
See:
http://att-comdev-drydock.readthedocs.io/en/latest/task.html#task-status-schema
"""
try:
return self.drydock_client.get_task(task_id=task_id)
except errors.ClientError as client_error:
# Dump logs from Drydock pods
raise DrydockClientUseFailureException(client_error)
def task_failure(self, _task_failure):
# Dump logs from Drydock pods
self.get_k8s_logs()
@ -289,7 +298,7 @@ class DrydockBaseOperator(UcpBaseOperator):
self.all_task_ids = {t['task_id']: t for t in all_tasks}
except errors.ClientError as client_error:
raise AirflowException(client_error)
raise DrydockClientUseFailureException(client_error)
# Retrieve the failed parent task and assign it to list
failed_parent_task = (
@ -299,7 +308,7 @@ class DrydockBaseOperator(UcpBaseOperator):
# Since there is only 1 failed parent task, we will print index 0
# of the list
if failed_parent_task:
LOG.error('%s task has either failed or timed out',
LOG.error("%s task has either failed or timed out",
failed_parent_task[0]['action'])
LOG.error(pprint.pprint(failed_parent_task[0]))
@ -312,9 +321,13 @@ class DrydockBaseOperator(UcpBaseOperator):
# Raise Exception to terminate workflow
if _task_failure:
raise AirflowException("Failed to Execute/Complete Task!")
raise DrydockTaskFailedException(
"Failed to Execute/Complete Task!"
)
else:
raise AirflowException("Task Execution Timed Out!")
raise DrydockTaskTimeoutException(
"Task Execution Timed Out!"
)
def check_subtask_failure(self, subtask_id_list):
@ -367,7 +380,9 @@ class DrydockBaseOperator(UcpBaseOperator):
subtask_id)
else:
raise AirflowException("Unable to retrieve subtask info!")
raise DrydockClientUseFailureException(
"Unable to retrieve subtask info!"
)
class DrydockBaseOperatorPlugin(AirflowPlugin):

View File

@ -1,70 +0,0 @@
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from airflow.plugins_manager import AirflowPlugin
from check_k8s_node_status import check_node_status
from drydock_base_operator import DrydockBaseOperator
LOG = logging.getLogger(__name__)
class DrydockDeployNodesOperator(DrydockBaseOperator):
"""Drydock Deploy Nodes Operator
This operator will trigger drydock to deploy the bare metal
nodes
"""
def do_execute(self):
# Trigger DryDock to execute task
self.create_task('deploy_nodes')
# Retrieve query interval and timeout
q_interval = self.dc['physical_provisioner.deploy_interval']
task_timeout = self.dc['physical_provisioner.deploy_timeout']
# Query Task
self.query_task(q_interval, task_timeout)
# It takes time for the cluster join process to be triggered across
# all the nodes in the cluster. Hence there is a need to back off
# and wait before checking the state of the cluster join process.
join_wait = self.dc['physical_provisioner.join_wait']
LOG.info("All nodes deployed in MAAS")
LOG.info("Wait for %d seconds before checking node state...",
join_wait)
time.sleep(join_wait)
# Check that cluster join process is completed before declaring
# deploy_node as 'completed'.
node_st_timeout = self.dc['kubernetes.node_status_timeout']
node_st_interval = self.dc['kubernetes.node_status_interval']
check_node_status(node_st_timeout, node_st_interval)
class DrydockDeployNodesOperatorPlugin(AirflowPlugin):
"""Creates DrydockDeployNodesOperator in Airflow."""
name = 'drydock_deploy_nodes_operator'
operators = [DrydockDeployNodesOperator]

View File

@ -0,0 +1,34 @@
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Drydock specific exceptions generated during operator execution.
Generally marker exceptions extending AirflowException
"""
from airflow.exceptions import AirflowException
class DrydockClientUseFailureException(AirflowException):
pass
class DrydockTaskFailedException(AirflowException):
pass
class DrydockTaskNotCreatedException(AirflowException):
pass
class DrydockTaskTimeoutException(AirflowException):
pass

View File

@ -0,0 +1,486 @@
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare and deploy nodes using Drydock
Uses the deployment strategy named in the deployment-configuration to
progress through preparation and deployment of nodes in a group-based fashion.
In the case of no specified deployment strategy, an "all-at-once" approach is
taken, by which all nodes are deployed together.
Historical Note: This operator replaces the function of drydock_prepare_nodes
and drydock_deploy_nodes operators that existed previously.
"""
import logging
import time
from airflow.exceptions import AirflowException
from airflow.plugins_manager import AirflowPlugin
from shipyard_airflow.common.deployment_group.deployment_group import Stage
from shipyard_airflow.common.deployment_group.deployment_group_manager import \
DeploymentGroupManager
from shipyard_airflow.common.deployment_group.node_lookup import NodeLookup
try:
import check_k8s_node_status
except ImportError:
from shipyard_airflow.plugins import check_k8s_node_status
try:
from drydock_base_operator import DrydockBaseOperator
except ImportError:
from shipyard_airflow.plugins.drydock_base_operator import \
DrydockBaseOperator
try:
from drydock_errors import (
DrydockTaskFailedException,
DrydockTaskTimeoutException
)
except ImportError:
from shipyard_airflow.plugins.drydock_errors import (
DrydockTaskFailedException,
DrydockTaskTimeoutException
)
LOG = logging.getLogger(__name__)
class DrydockNodesOperator(DrydockBaseOperator):
"""Drydock Nodes Operator
Using a deployment strategy to calculate the deployment sequence,
deploy a series of baremetal nodes using Drydock.
"""
def do_execute(self):
self._setup_configured_values()
# setup self.strat_name and self.strategy
self.strategy = {}
self._setup_deployment_strategy()
dgm = _get_deployment_group_manager(
self.strategy['groups'],
_get_node_lookup(self.drydock_client, self.design_ref)
)
_process_deployment_groups(dgm,
self._execute_prepare,
self._execute_deployment)
# All groups "complete" (as they're going to be). Report summary
dgm.report_group_summary()
dgm.report_node_summary()
if dgm.critical_groups_failed():
raise AirflowException(
"One or more deployment groups marked as critical have failed"
)
else:
LOG.info("All critical groups have met their success criteria")
# TODO (bryan-strassner) it is very possible that many nodes failed
# deployment, but all critical groups had enough success to
# continue processing. This will be non-obvious to the casual
# observer of the workflow. A likely enhancement is to allow
# notes be added to the shipyard action associated with this
# workflow that would be reported back to the end user doing a
# describe of the action. This will require new database structures
# to hold the notes, and a means to insert the notes. A shared
# functionality in the base ucp operator or a common module would
# be a reasonable way to support this.
def _setup_configured_values(self):
"""Sets self.<name> values from the deployment configuration"""
# Retrieve query intervals and timeouts
# Intervals - How often will something be queried for status.
self.dep_interval = self.dc['physical_provisioner.deploy_interval']
self.node_st_interval = self.dc['kubernetes.node_status_interval']
self.prep_interval = self.dc[
'physical_provisioner.prepare_node_interval'
]
# Timeouts - Time Shipyard waits for completion of a task.
self.dep_timeout = self.dc['physical_provisioner.deploy_timeout']
self.node_st_timeout = self.dc['kubernetes.node_status_timeout']
self.prep_timeout = self.dc[
'physical_provisioner.prepare_node_timeout'
]
# The time to wait before querying k8s nodes after Drydock deploy nodes
self.join_wait = self.dc['physical_provisioner.join_wait']
def _execute_prepare(self, group):
"""Executes the prepare nodes step for the group.
:param group: the DeploymentGroup to prepare
Returns a QueryTaskResult object
"""
LOG.info("Group %s is preparing nodes", group.name)
self.node_filter = _gen_node_name_filter(group.actionable_nodes)
return self._execute_task('prepare_nodes',
self.prep_interval,
self.prep_timeout)
def _execute_deployment(self, group):
"""Execute the deployment of nodes for the group.
:param group: The DeploymentGroup to deploy
Returns a QueryTaskResult object
"""
LOG.info("Group %s is deploying nodes", group.name)
self.node_filter = _gen_node_name_filter(group.actionable_nodes)
task_result = self._execute_task('deploy_nodes',
self.dep_interval,
self.dep_timeout)
if not task_result.successes:
# if there are no successes from Drydock, there is no need to
# wait and check on the results from node status.
LOG.info("There are no nodes indicated as successful from Drydock."
" Skipping waiting for Kubernetes node join and "
"proceeding to validation")
return task_result
# It takes time for the cluster join process to be triggered across
# all the nodes in the cluster. Hence there is a need to back off
# and wait before checking the state of the cluster join process.
LOG.info("Nodes <%s> reported as deployed in MAAS",
", ".join(task_result.successes))
LOG.info("Waiting for %d seconds before checking node state...",
self.join_wait)
time.sleep(self.join_wait)
# Check that cluster join process is completed before declaring
# deploy_node as 'completed'.
# This should only include nodes that drydock has indicated as
# successful and has passed the join script to.
# Anything not ready in the timeout needs to be considered a failure
not_ready_list = check_k8s_node_status.check_node_status(
self.node_st_timeout,
self.node_st_interval
)
for node in not_ready_list:
# Remove nodes that are not ready from the list of successes, since
# they did not complete deployment successfully.
try:
LOG.info("Node %s failed to join the Kubernetes cluster or was"
" not timely enough", node)
task_result.successes.remove(node)
except ValueError:
# This node is not joined, but was not one that we were
# looking for either.
LOG.info("%s failed to join Kubernetes, but was not in the "
"Drydock results: %s",
node,
", ".join(task_result.successes))
return task_result
def _execute_task(self, task_name, interval, timeout):
"""Execute the Drydock task requested
:param task_name: 'prepare_nodes', 'deploy_nodes'
;param interval: The time between checking status on the task
:param timeout: The total time allowed for the task
Wraps the query_task method in the base class, capturing
AirflowExceptions and summarizing results into a response
QueryTaskResult object
Note: It does not matter if the task ultimately succeeds or fails in
Drydock - the base class will handle all the logging and etc for
the purposes of troubleshooting. What matters is the node successes.
Following any result of query_task, this code will re-query the task
results from Drydock to gather the node successes placing them into
the successes list in the response object. In the case of a failure to
get the task results, this workflow must assume that the result is a
total loss, and pass back no successes
"""
self.create_task(task_name)
result = QueryTaskResult(self.drydock_task_id, task_name)
try:
self.query_task(interval, timeout)
except DrydockTaskFailedException:
# Task failure may be successful enough based on success criteria.
# This should not halt the overall flow of this workflow step.
LOG.warn(
"Task %s has failed. Logs contain details of the failure. "
"Some nodes may be succesful, processing continues", task_name
)
except DrydockTaskTimeoutException:
# Task timeout may be successful enough based on success criteria.
# This should not halt the overall flow of this workflow step.
LOG.warn(
"Task %s has timed out after %s seconds. Logs contain details "
"of the failure. Some nodes may be succesful, processing "
"continues", task_name, timeout
)
# Other AirflowExceptions will fail the whole task - let them do this.
# find successes
result.successes = self._get_successes_for_task(self.drydock_task_id)
return result
def _get_successes_for_task(self, task_id, extend_success=True):
"""Discover the successful nodes based on the current task id.
:param task_id: The id of the task
:param extend_successes: determines if this result extends successes
or simply reports on the task.
Gets the set of successful nodes by examining the self.drydock_task_id.
The children are traversed recursively to display each sub-task's
information.
Only a reported success at the parent task indicates success of the
task. Drydock is assumed to roll up overall success to the top level.
"""
success_nodes = []
task_dict = self.get_task_dict(task_id)
task_status = task_dict.get('status', "Not Specified")
task_result = task_dict.get('result')
if task_result is None:
LOG.warn("Task result is missing for task %s, with status %s."
" Neither successes nor further details can be extracted"
" from this result",
task_id, task_status)
else:
if extend_success:
try:
# successes and failures on the task result drive the
# interpretation of success or failure for this workflow.
# - Any node that is _only_ success for a task is a
# success to us.
# - Any node that is listed as a failure is a failure.
# This implies that a node listed as a success and a
# failure is a failure. E.g. some subtasks succeeded and
# some failed
t_successes = task_result.get('successes', [])
t_failures = task_result.get('failures', [])
actual_successes = set(t_successes) - set(t_failures)
# acquire the successes from success nodes
success_nodes.extend(actual_successes)
LOG.info("Nodes <%s> added as successes for task %s",
", ".join(success_nodes), task_id)
except KeyError:
# missing key on the path to getting nodes - don't add any
LOG.warn("Missing successes field on result of task %s, "
"but a success field was expected. No successes"
" can be extracted from this result",
task_id)
pass
_report_task_info(task_id, task_result, task_status)
# for each child, report only the step info, do not add to overall
# success list.
for ch_task_id in task_dict.get('subtask_id_list', []):
success_nodes.extend(
self._get_successes_for_task(ch_task_id, extend_success=False)
)
# deduplicate and return
return set(success_nodes)
def _setup_deployment_strategy(self):
"""Determine the deployment strategy
Uses the specified strategy from the deployment configuration
or returns a default configuration of 'all-at-once'
"""
self.strat_name = self.dc['physical_provisioner.deployment_strategy']
if self.strat_name:
# if there is a deployment strategy specified, get it and use it
self.strategy = self.get_unique_doc(
name=self.strat_name,
schema="shipyard/DeploymentStrategy/v1"
)
else:
# The default behavior is to deploy all nodes, and fail if
# any nodes fail to deploy.
self.strat_name = 'all-at-once (defaulted)'
self.strategy = _default_deployment_strategy()
LOG.info("Strategy Name: %s has %s groups",
self.strat_name,
len(self.strategy.get('groups', [])))
#
# Functions supporting the nodes operator class
#
def _get_node_lookup(drydock_client, design_ref):
"""Return a NodeLookup suitable for the DeploymentGroupManager
:param drydock_client: the drydock_client object
:param design_ref: the design_ref for the NodeLookup
"""
return NodeLookup(drydock_client, design_ref).lookup
def _get_deployment_group_manager(groups_dict_list, node_lookup):
"""Return a DeploymentGroupManager suitable for managing this deployment
:param groups_dict_list: the list of group dictionaries to use
:param node_lookup: a NodeLookup object that will be used by this
DeploymentGroupManager
"""
return DeploymentGroupManager(groups_dict_list, node_lookup)
def _process_deployment_groups(dgm, prepare_func, deploy_func):
"""Executes the deployment group deployments
:param dgm: the DeploymentGroupManager object that manages the
dependency chain of groups
:param prepare_func: a function that accepts a DeploymentGroup and returns
a QueryTaskResult with the purpose of preparing nodes
:param deploy_func: a function that accepts a DeploymentGroup and returns
a QueryTaskResult with the purpose of deploying nodes
"""
complete = False
while not complete:
# Find the next group to be prepared. Prepare and deploy it.
group = dgm.get_next_group(Stage.PREPARED)
if group is None:
LOG.info("There are no more groups eligible to process")
# whether or not really complete, the processing loop is done.
complete = True
continue
LOG.info("*** Deployment Group: %s is being processed ***", group.name)
if not group.actionable_nodes:
LOG.info("There were no actionable nodes for group %s. It is "
"possible that all nodes: [%s] have previously been "
"deployed. Group will be immediately checked "
"against its success criteria", group.name,
", ".join(group.full_nodes))
# In the case of a group having no actionable nodes, since groups
# prepare -> deploy in direct sequence, we can check against
# deployment, since all nodes would need to be deployed or have
# been attempted. Need to follow the state-transition, so
# PREPARED -> DEPLOYED
dgm.evaluate_group_succ_criteria(group.name, Stage.PREPARED)
dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)
# success or failure, move on to next group
continue
LOG.info("%s has actionable nodes: [%s]", group.name,
", ".join(group.actionable_nodes))
if len(group.actionable_nodes) < len(group.full_nodes):
LOG.info("Some nodes are not actionable because they were "
"included in a prior group, but will be considered in "
"the success critera calculation for this group")
# Group has actionable nodes.
# Prepare Nodes for group, store QueryTaskResults
prep_qtr = prepare_func(group)
# Mark successes as prepared
for node_name in prep_qtr.successes:
dgm.mark_node_prepared(node_name)
dgm.fail_unsuccessful_nodes(group, prep_qtr.successes)
should_deploy = dgm.evaluate_group_succ_criteria(group.name,
Stage.PREPARED)
if not should_deploy:
# group has failed, move on to next group. Current group has
# been marked as failed.
continue
# Continue with deployment
dep_qtr = deploy_func(group)
# Mark successes as deployed
for node_name in dep_qtr.successes:
dgm.mark_node_deployed(node_name)
dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)
def _report_task_info(task_id, task_result, task_status):
"""Logs information regarding a task.
:param task_id: id of the task
:param task_result: The result dictionary of the task
:param task_status: The status for the task
"""
# setup fields, or defaults if missing values
task_failures = task_result.get('failures', [])
task_successes = task_result.get('successes', [])
result_details = task_result.get('details', {'messageList': []})
result_status = task_result.get('status', "No status supplied")
LOG.info("Task %s with status %s/%s reports successes: [%s] and"
" failures: [%s]", task_id, task_status, result_status,
", ".join(task_successes), ", ".join(task_failures))
for message_item in result_details['messageList']:
context_type = message_item.get('context_type', 'N/A')
context_id = message_item.get('context', 'N/A')
message = message_item.get('message', "No message text supplied")
error = message_item.get('error', False)
timestamp = message_item.get('ts', 'No timestamp supplied')
LOG.info(" - Task %s for item %s:%s has message: %s [err=%s, at %s]",
task_id, context_type, context_id, message, error, timestamp)
def _default_deployment_strategy():
"""The default deployment strategy for 'all-at-once'"""
return {
'groups': [
{
'name': 'default',
'critical': True,
'depends_on': [],
'selectors': [
{
'node_names': [],
'node_labels': [],
'node_tags': [],
'rack_names': [],
},
],
'success_criteria': {
'percent_successful_nodes': 100
},
}
]
}
def _gen_node_name_filter(node_names):
"""Generates a drydock compatible node filter using only node names
:param node_names: the nodes with which to create a filter
"""
return {
'filter_set_type': 'union',
'filter_set': [
{
'filter_type': 'union',
'node_names': node_names
}
]
}
class QueryTaskResult:
"""Represents a summarized query result from a task"""
def __init__(self, task_id, task_name):
self.task_id = task_id
self.task_name = task_name
# The succeeded node names
self.successes = []
class DrydockNodesOperatorPlugin(AirflowPlugin):
"""Creates DrydockPrepareNodesOperator in Airflow."""
name = 'drydock_nodes_operator'
operators = [DrydockNodesOperator]

View File

@ -1,46 +0,0 @@
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from airflow.plugins_manager import AirflowPlugin
from drydock_base_operator import DrydockBaseOperator
class DrydockPrepareNodesOperator(DrydockBaseOperator):
"""Drydock Prepare Nodes Operator
This operator will trigger drydock to prepare nodes for
site deployment
"""
def do_execute(self):
# Trigger DryDock to execute task
self.create_task('prepare_nodes')
# Retrieve query interval and timeout
q_interval = self.dc['physical_provisioner.prepare_node_interval']
task_timeout = self.dc['physical_provisioner.prepare_node_timeout']
# Query Task
self.query_task(q_interval, task_timeout)
class DrydockPrepareNodesOperatorPlugin(AirflowPlugin):
"""Creates DrydockPrepareNodesOperator in Airflow."""
name = 'drydock_prepare_nodes_operator'
operators = [DrydockPrepareNodesOperator]

View File

@ -49,7 +49,7 @@ class DrydockValidateDesignOperator(DrydockBaseOperator):
payload = {
'rel': "design",
'href': self.deckhand_design_ref,
'href': self.design_ref,
'type': "application/x-yaml"
}

View File

@ -12,16 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from airflow.utils.decorators import apply_defaults
from airflow.plugins_manager import AirflowPlugin
from airflow.exceptions import AirflowException
try:
from service_endpoint import ucp_service_endpoint
import service_endpoint
except ImportError:
from shipyard_airflow.plugins.service_endpoint import ucp_service_endpoint
from shipyard_airflow.plugins import service_endpoint
try:
from service_token import shipyard_service_token
@ -47,19 +46,11 @@ class PromenadeBaseOperator(UcpBaseOperator):