Add Airflow Worker Upgrade Workflow

This patch set creates a workflow that allows us to upgrade the
Airflow worker without disrupting the currently running workflow.

Note that we will set the update strategy for the Airflow worker
to 'OnDelete'. The 'OnDelete' update strategy implements the legacy
(1.6 and prior) behavior: when this strategy is selected, the
StatefulSet controller will not automatically update Pods when a
modification is made to the StatefulSet's '.spec.template' field.
The strategy is selected by setting '.spec.updateStrategy.type'
to 'OnDelete'. Refer to [0] for more information.

[0] https://kubernetes.io/docs/tutorials/stateful-application/basic-stateful-set/#creating-a-statefulset
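
As a quick, illustrative check of this behavior (assuming the worker
StatefulSet is named 'airflow-worker' and runs in the 'ucp' namespace,
as the upgrade script in this patch set does, and that its Pods follow
the usual StatefulSet naming such as 'airflow-worker-0'):

  # Confirm the update strategy in effect
  kubectl get statefulset airflow-worker -n ucp \
    -o jsonpath='{.spec.updateStrategy.type}'

  # After a chart upgrade, the update revision diverges from the current
  # revision until the Pods are deleted and respawned
  kubectl get statefulset airflow-worker -n ucp \
    -o jsonpath='{.status.currentRevision}{"\n"}{.status.updateRevision}{"\n"}'

  # Under 'OnDelete', a Pod only picks up the new '.spec.template' once
  # it is deleted and recreated by the controller
  kubectl delete pod airflow-worker-0 -n ucp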

Change-Id: I1f6c3564b7fba6abe422b86e36818eb2cd3454ea
Anthony Lin 2018-03-12 09:49:33 +00:00
parent fa105e6da8
commit 7219519135
11 changed files with 258 additions and 6 deletions


@ -33,6 +33,7 @@ rules:
- endpoints
- pods
verbs:
- delete
- get
- list
- watch
@ -57,6 +58,14 @@ metadata:
spec:
serviceName: {{ tuple "airflow_worker" "discovery" $envAll | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
podManagementPolicy: "Parallel"
# NOTE: We are using 'OnDelete' strategy instead of 'RollingUpdate'
# so that the upgrade of airflow worker will only start after the
# completion of the 'update_site' workflow (the worker pods will get
# deleted by the workflow at the very end, after everything is completed).
# This will ensure availability of airflow worker during update/upgrade
# and prevent any disruption to the workflow.
updateStrategy:
type: OnDelete
replicas: {{ .Values.pod.replicas.airflow.worker }}
template:
metadata:

images/airflow/Dockerfile

@ -93,10 +93,10 @@ RUN curl -L -o /usr/local/bin/kubectl \
COPY script/entrypoint.sh ${AIRFLOW_HOME}/entrypoint.sh
COPY script/airflow_start_service.sh ${AIRFLOW_HOME}/airflow_start_service.sh
COPY script/airflow_logrotate.sh ${AIRFLOW_HOME}/airflow_logrotate.sh
COPY script/upgrade_airflow_worker.sh ${AIRFLOW_HOME}/upgrade_airflow_worker.sh
# Change permissions
RUN chown -R airflow: ${AIRFLOW_HOME} \
&& chmod +x ${AIRFLOW_HOME}/entrypoint.sh
RUN chown -R airflow: ${AIRFLOW_HOME}
# Set work directory
USER airflow

images/airflow/script/airflow_logrotate.sh Normal file → Executable file

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -ex
while true; do

images/airflow/script/airflow_start_service.sh Normal file → Executable file

images/airflow/script/entrypoint.sh Normal file → Executable file

images/airflow/script/upgrade_airflow_worker.sh New file

@ -0,0 +1,98 @@
#!/bin/bash
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
# NOTE: All output of this script is redirected to /dev/null by the
# calling BashOperator so that the workflow can complete. Hence we write
# progress to a log file to allow troubleshooting.
check_timeout_counter() {
# Check total elapsed time
# The default time out is set to 5 minutes
if [[ $counter -ge $max_count ]]; then
echo -e "Update Site Workflow Status Check Timed Out!" >> /usr/local/airflow/upgrade_airflow_worker.log
return 1
fi
}
# Define Variables
#
# Allow user to optionally pass in custom query time and max count as $4
# and $5 respectively
# Default query time is 30 seconds
# Default max_count for 5 minutes time out is 60*5/30 = 10
#
# NOTE: Dag ID will take value of $1
#
# NOTE: $2 will look like '2018-03-13' while $3 will look like '05:10:19'
# The execution date that we need to pass into the Airflow CLI will need
# to take the form of '2018-03-13T05:10:19'. Hence we will need to concatenate
# $2 and $3 together to form the dag_execution_date.
#
dag_id=$1
dag_execution_date="$2T$3"
query_time=${4:-30}
max_count=${5:-10}
# Initialize dag_state to "running" state
# Dag can be in "running", "success", "failed", "skipped" or "up for retry" state
dag_state="running"
# Initialize counter to 1
counter=1
echo -e "Checking Dag State..."
while true;
do
# Set current working directory to be the directory where the shell script
# is located. In this way we will be able to import the modules that are
# required for our custom Operators.
cd "${0%/*}"
# Get current state of dag using Airflow CLI
check_dag_state=`airflow dag_state ${dag_id} ${dag_execution_date}`
echo -e ${check_dag_state} >> /usr/local/airflow/upgrade_airflow_worker.log
# We will need to extract the last word in the 'check_dag_state'
# string variable as that will contain the status of the dag run
dag_state=`echo ${check_dag_state} | awk '{print $NF}'`
echo -e ${dag_state} >> /usr/local/airflow/upgrade_airflow_worker.log
if [[ $dag_state == "success" ]]; then
echo -e "\nWorkflow has completed" >> /usr/local/airflow/upgrade_airflow_worker.log
echo -e "\n" >> /usr/local/airflow/upgrade_airflow_worker.log
echo -e "Proceeding to upgrade Airflow Worker..." >> /usr/local/airflow/upgrade_airflow_worker.log
echo -e "Deleting Airflow Worker Pods..." >> /usr/local/airflow/upgrade_airflow_worker.log
for i in `kubectl get pods -n ucp | grep -i airflow-worker | awk '{print $1}'`; do
# Delete Airflow Worker pod so that they will respawn with the new
# configurations and/or images
kubectl delete pod $i -n ucp
done
echo -e "Airflow Worker Pods Deleted!" >> /usr/local/airflow/upgrade_airflow_worker.log
exit 0
fi
echo -e "Workflow is in" $dag_state "state\n" >> /usr/local/airflow/upgrade_airflow_worker.log
echo -e "Back Off for $query_time seconds...\n" >> /usr/local/airflow/upgrade_airflow_worker.log
sleep $query_time
# Step counter and check the timeout counter
((counter++))
check_timeout_counter
done
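
For reference, the BashOperator in this patch set passes '{{ ti.dag_id }}'
and '{{ ti.execution_date }}', so the execution date arrives split into its
date and time parts ($2 and $3). An illustrative invocation, reusing the
sample timestamp from the comments above:

  /usr/local/airflow/upgrade_airflow_worker.sh update_site 2018-03-13 05:10:19

  # Optionally override the query interval (seconds) and the timeout counter
  /usr/local/airflow/upgrade_airflow_worker.sh update_site 2018-03-13 05:10:19 15 20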


@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from airflow.operators import ConcurrencyCheckOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
@ -173,3 +175,69 @@ class CommonStepFactory(object):
task_id=task_id,
on_failure_callback=step_failure_handler,
dag=self.dag)
def get_decide_airflow_upgrade(self, task_id=dn.DECIDE_AIRFLOW_UPGRADE):
"""Generate the decide_airflow_upgrade step
Step responsible for deciding whether to branch to the path
to upgrade airflow worker
"""
def upgrade_airflow_check(**kwargs):
"""upgrade_airflow_check function
Defines a function to decide whether to upgrade airflow
worker. The decision will be based on the xcom value that
is retrieved from the 'armada_apply' task
"""
# DAG ID will be parent + subdag name
dag_id = self.parent_dag_name + '.' + dn.ARMADA_BUILD_DAG_NAME
# Check if Shipyard/Airflow were upgraded by the workflow
upgrade_airflow = kwargs['ti'].xcom_pull(
key='upgrade_airflow_worker',
task_ids='armada_apply',
dag_id=dag_id)
# Go to the branch to upgrade the Airflow worker if the Shipyard
# chart was upgraded/modified
if upgrade_airflow == "true":
return "upgrade_airflow"
else:
return "skip_upgrade_airflow"
return BranchPythonOperator(task_id=task_id,
python_callable=upgrade_airflow_check,
trigger_rule="all_done",
dag=self.dag)
def get_upgrade_airflow(self, task_id=dn.UPGRADE_AIRFLOW):
"""Generate the upgrade_airflow step
Step responsible for upgrading the airflow worker. The step
executes the upgrade script in the background (detached with
nohup) and directs its output to /dev/null so that 'nohup.out'
is not created. This is intentional: the task can complete while
the script waits for the 'update_site' workflow to finish before
deleting the worker pods, which ensures availability of the
airflow worker during update/upgrade and prevents any disruption
to the workflow. Note that dag_id and execution date are required
for proper execution of the script.
"""
return BashOperator(task_id=task_id,
bash_command=(
"nohup "
"/usr/local/airflow/upgrade_airflow_worker.sh "
"{{ ti.dag_id }} {{ ti.execution_date }} "
">/dev/null 2>&1 &"),
dag=self.dag)
def get_skip_upgrade_airflow(self, task_id=dn.SKIP_UPGRADE_AIRFLOW):
"""Generate the skip_upgrade_airflow step
Step will print a message stating that we do not need to
upgrade the airflow worker
"""
return BashOperator(task_id=task_id,
bash_command=(
"echo 'Airflow Worker Upgrade Not Required'"),
dag=self.dag)
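
One hedged way to exercise the branch decision in isolation is the
Airflow 1.x 'test' subcommand, which runs a single task instance without
its dependencies. Assuming the parent DAG is 'update_site' and that a run
with this execution date has already recorded the 'upgrade_airflow_worker'
xcom from 'armada_apply':

  airflow test update_site decide_airflow_upgrade 2018-03-13T05:10:19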


@ -24,3 +24,6 @@ DESTROY_SERVER_DAG_NAME = 'destroy_server'
# Steps
ACTION_XCOM = 'action_xcom'
DECIDE_AIRFLOW_UPGRADE = 'decide_airflow_upgrade'
UPGRADE_AIRFLOW = 'upgrade_airflow'
SKIP_UPGRADE_AIRFLOW = 'skip_upgrade_airflow'


@ -55,6 +55,9 @@ validate_site_design = step_factory.get_validate_site_design()
deployment_configuration = step_factory.get_deployment_configuration()
drydock_build = step_factory.get_drydock_build()
armada_build = step_factory.get_armada_build()
decide_airflow_upgrade = step_factory.get_decide_airflow_upgrade()
upgrade_airflow = step_factory.get_upgrade_airflow()
skip_upgrade_airflow = step_factory.get_skip_upgrade_airflow()
# DAG Wiring
concurrency_check.set_upstream(action_xcom)
@ -66,3 +69,6 @@ drydock_build.set_upstream([
deployment_configuration
])
armada_build.set_upstream(drydock_build)
decide_airflow_upgrade.set_upstream(armada_build)
decide_airflow_upgrade.set_downstream(upgrade_airflow)
decide_airflow_upgrade.set_downstream(skip_upgrade_airflow)
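
With the wiring above in place, the new tasks should appear downstream of
'armada_build' in the DAG's task tree. Assuming the DAG ID is 'update_site',
this can be spot-checked with the Airflow 1.x CLI:

  airflow list_tasks update_site --tree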


@ -1,4 +1,4 @@
# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -29,6 +29,7 @@ from get_k8s_pod_port_ip import get_pod_port_ip
from service_endpoint import ucp_service_endpoint
from service_token import shipyard_service_token
from xcom_puller import XcomPuller
from xcom_pusher import XcomPusher
class ArmadaOperator(BaseOperator):
@ -39,7 +40,7 @@ class ArmadaOperator(BaseOperator):
:param shipyard_conf: Location of shipyard.conf
:param sub_dag_name: Child Dag
The Drydock operator assumes that prior steps have set xcoms for
The Armada operator assumes that prior steps have set xcoms for
the action and the deployment configuration
"""
@ -218,6 +219,8 @@ class ArmadaOperator(BaseOperator):
armada_post_apply = {}
override_values = []
chart_set = []
upgrade_airflow_worker = False
# enhance the context's query entity with target_manifest
query = context.get('query', {})
query['target_manifest'] = target_manifest
@ -230,11 +233,34 @@ class ArmadaOperator(BaseOperator):
set=chart_set,
query=query)
# Search for Shipyard deployment in the list of chart upgrades
# NOTE: It is possible for the chart name to take on different
# values, e.g. 'aic-ucp-shipyard', 'ucp-shipyard'. Hence we
# will search for the word 'shipyard', which should exist as
# part of the name of the Shipyard Helm Chart.
for i in armada_post_apply['message']['upgrade']:
if 'shipyard' in i:
upgrade_airflow_worker = True
break
# Create xcom key 'upgrade_airflow_worker'
# Value of key will depend on whether an upgrade has been
# performed on the Shipyard/Airflow Chart
self.xcom_pusher = XcomPusher(context['task_instance'])
if upgrade_airflow_worker:
self.xcom_pusher.xcom_push(key='upgrade_airflow_worker',
value='true')
else:
self.xcom_pusher.xcom_push(key='upgrade_airflow_worker',
value='false')
# We expect Armada to return the releases that it is deploying.
# Note that if we try to deploy the same release twice, we will
# end up with an empty response as nothing has changed.
if armada_post_apply['message']['install']:
if (armada_post_apply['message']['install'] or
armada_post_apply['message']['upgrade']):
logging.info("Armada Apply Successfully Executed")
logging.info(armada_post_apply)
else:

xcom_pusher.py New file

@ -0,0 +1,42 @@
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from airflow.exceptions import AirflowException
LOG = logging.getLogger(__name__)
class XcomPusher(object):
"""XcomPusher pushes xcom value
Create specific key with value and stores as xcom.
"""
def __init__(self, task_instance):
self.ti = task_instance
def xcom_push(self, key=None, value=None):
"""Push a particular xcom value"""
LOG.info("Pushing xcom from %s.%s with key %s and value %s",
self.ti.dag_id,
self.ti.task_id,
key,
value)
try:
self.ti.xcom_push(key=key, value=value)
except Exception:
raise AirflowException("Xcom push failed!")