Add preinstall and postinstall playbooks

Replacing complex bash logic with simple Ansible playbooks
improves readability and error handling, and makes the
deployment steps idempotent.

Added wait for apt lock on admin node.

Change-Id: I1c8bc4efbdbb3d239c4e8864984b1fbaa15dadcb
Matthew Mosesohn 2016-08-08 18:07:52 +03:00
parent 960c4f13f9
commit 099686b2ca
8 changed files with 156 additions and 84 deletions
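
The idempotency gain is concrete: the bash workarounds removed below rerun
blindly over ssh on every deploy, while their ansible replacements only act
when a node is out of spec. A minimal sketch of the pattern, mirroring the
/etc/hosts task in the new preinstall role (a second run reports "ok" and
changes nothing):

- name: Set correct /etc/hosts entry
  # lineinfile replaces the old "sudo sed -i" ssh call; it is a no-op
  # once the line already matches
  lineinfile:
    dest: /etc/hosts
    regexp: "127.0.1.1.*"
    line: "{{ ip }}\t{{ inventory_hostname }}"
    state: present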


@@ -31,10 +31,8 @@ KARGO_COMMIT=${KARGO_COMMIT:-master}
# Default deployment settings
COMMON_DEFAULTS_YAML="kargo_default_common.yaml"
COMMON_DEFAULTS_SRC="${BASH_SOURCE%/*}/../kargo/${COMMON_DEFAULTS_YAML}"
-COMMON_DEFAULTS_OPT="-e @~/kargo/${COMMON_DEFAULTS_YAML}"
OS_SPECIFIC_DEFAULTS_YAML="kargo_default_${NODE_BASE_OS}.yaml"
OS_SPECIFIC_DEFAULTS_SRC="${BASH_SOURCE%/*}/../kargo/${OS_SPECIFIC_DEFAULTS_YAML}"
-OS_SPECIFIC_DEFAULTS_OPT="-e @~/kargo/${OS_SPECIFIC_DEFAULTS_YAML}"
required_ansible_version="2.1.0"
@@ -55,6 +53,8 @@ function exit_gracefully {
fi
fi
fi
+# Kill current ssh-agent
+eval $(ssh-agent -k)
exit $exit_code
}
@@ -73,9 +73,9 @@ function with_retries {
function admin_node_command {
if [[ "$ADMIN_IP" == "local" ]];then
eval "$@"
else
ssh $SSH_OPTIONS $ADMIN_USER@$ADMIN_IP "$@"
fi
}
@@ -85,15 +85,15 @@ function wait_for_nodes {
master_wait_time=30
while true; do
report=$(sshpass -p ${ADMIN_PASSWORD} ssh ${SSH_OPTIONS} -o PreferredAuthentications=password ${ADMIN_USER}@${IP} echo ok || echo not ready)
if [ "${report}" = "ok" ]; then
break
fi
if [ "${elapsed_time}" -gt "${master_wait_time}" ]; then
exit 2
fi
sleep 1
let elapsed_time+=1
done
@@ -138,14 +138,24 @@ if ! type sshpass > /dev/null; then
fi
# Copy utils/kargo dir to WORKSPACE/utils/kargo so it works across both local
# and remote admin node deployment modes.
echo "Preparing admin node..."
if [[ "$ADMIN_IP" != "local" ]]; then
ADMIN_WORKSPACE="workspace"
sshpass -p $ADMIN_PASSWORD ssh-copy-id $SSH_OPTIONS_COPYID -o PreferredAuthentications=password $ADMIN_USER@${ADMIN_IP} -p 22
else
ADMIN_WORKSPACE="$WORKSPACE"
fi
admin_node_command mkdir -p $ADMIN_WORKSPACE/utils/kargo
tar cz ${BASH_SOURCE%/*}/../kargo | admin_node_command tar xzf - -C $ADMIN_WORKSPACE/utils/
echo "Setting up ansible and required dependencies..."
-installed_ansible_version=$(admin_node_command dpkg-query -W -f='${Version}\n' ansible || echo "0.0")
+installed_ansible_version=$(admin_node_command dpkg-query -W -f='\${Version}\\n' ansible || echo "0.0")
if ! admin_node_command type ansible > /dev/null || \
dpkg --compare-versions "$installed_ansible_version" "lt" "$required_ansible_version"; then
+# Wait for apt lock in case it is updating from cron job
+while admin_node_command pgrep -a -f apt; do echo 'Waiting for apt lock...'; sleep 30; done
case $ADMIN_NODE_BASE_OS in
ubuntu)
with_retries admin_node_command -- sudo apt-get update
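
The bash busy-wait above stays in the script rather than in a playbook because
ansible itself is the package being installed at this point. Once ansible is
present, the same guard could be written as a retried task; a hypothetical
sketch, not part of this commit:

- name: Wait for any apt process to release the dpkg lock
  command: pgrep -a -f apt
  register: apt_procs
  failed_when: false         # pgrep exits 1 when nothing matches; not an error here
  until: apt_procs.rc != 0   # done once no apt process remains
  retries: 20
  delay: 30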
@@ -164,74 +174,68 @@ if ! admin_node_command type ansible > /dev/null || \
fi
echo "Checking out kargo playbook..."
-admin_node_command git clone $KARGO_REPO
-admin_node_command "sh -c 'cd kargo && git checkout $KARGO_COMMIT'"
-echo "Setting up admin node for deployment..."
-cat ${BASH_SOURCE%/*}/../kargo/inventory.py | admin_node_command "cat > inventory.py"
-admin_node_command CONFIG_FILE=kargo/inventory/inventory.cfg python3 inventory.py ${SLAVE_IPS[@]}
+admin_node_command "sh -c 'cd $ADMIN_WORKSPACE && git clone $KARGO_REPO'" || true
+admin_node_command "sh -c 'cd $ADMIN_WORKSPACE/kargo && git fetch --all && git checkout $KARGO_COMMIT'"
cat $WORKSPACE/id_rsa | admin_node_command "cat - > .ssh/id_rsa"
admin_node_command chmod 600 .ssh/id_rsa
echo "Uploading default settings..."
-cat $COMMON_DEFAULTS_SRC | admin_node_command "cat > kargo/${COMMON_DEFAULTS_YAML}"
-cat $OS_SPECIFIC_DEFAULTS_SRC | admin_node_command "cat > kargo/${OS_SPECIFIC_DEFAULTS_YAML}"
+cat $COMMON_DEFAULTS_SRC | admin_node_command "cat > $ADMIN_WORKSPACE/kargo/${COMMON_DEFAULTS_YAML}"
+cat $OS_SPECIFIC_DEFAULTS_SRC | admin_node_command "cat > $ADMIN_WORKSPACE/kargo/${OS_SPECIFIC_DEFAULTS_YAML}"
+COMMON_DEFAULTS_OPT="-e @$ADMIN_WORKSPACE/kargo/${COMMON_DEFAULTS_YAML}"
+OS_SPECIFIC_DEFAULTS_OPT="-e @$ADMIN_WORKSPACE/kargo/${OS_SPECIFIC_DEFAULTS_YAML}"
if [ -n "$CUSTOM_YAML" ]; then
echo "Uploading custom YAML for deployment..."
-echo -e "$CUSTOM_YAML" | admin_node_command "cat > kargo/custom.yaml"
-custom_opts="-e @~/kargo/custom.yaml"
+echo -e "$CUSTOM_YAML" | admin_node_command "cat > $ADMIN_WORKSPACE/kargo/custom.yaml"
+custom_opts="-e @$ADMIN_WORKSPACE/kargo/custom.yaml"
fi
+# TODO(mattymo): move to ansible
+echo "Generating ansible inventory on admin node..."
+admin_node_command CONFIG_FILE=$ADMIN_WORKSPACE/kargo/inventory/inventory.cfg python3 $ADMIN_WORKSPACE/utils/kargo/inventory.py ${SLAVE_IPS[@]}
echo "Waiting for all nodes to be reachable by SSH..."
wait_for_nodes ${SLAVE_IPS[@]}
current_slave=1
deploy_args=""
echo "Adding ssh key authentication and labels to nodes..."
for slaveip in ${SLAVE_IPS[@]}; do
# FIXME(mattymo): Underlay provisioner should set up keys
sshpass -p $ADMIN_PASSWORD ssh-copy-id $SSH_OPTIONS_COPYID -o PreferredAuthentications=password $ADMIN_USER@${slaveip} -p 22
# FIXME(mattymo): underlay should set hostnames
ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "sudo sed -i 's/127.0.1.1.*/$slaveip\tnode${current_slave}/g' /etc/hosts"
ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "sudo hostnamectl set-hostname node${current_slave}"
# TODO(mattymo): Move to kargo
# Workaround to disable ipv6 dns which can cause docker pull to fail
echo "precedence ::ffff:0:0/96 100" | ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "sudo sh -c 'cat - >> /etc/gai.conf'"
# Workaround to fix DNS search domain: https://github.com/kubespray/kargo/issues/322
# Retry in case of apt lock
with_retries ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "sudo DEBIAN_FRONTEND=noninteractive apt-get remove -y resolvconf"
# If resolvconf was installed, copy its conf to fix dangling symlink
ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "sudo cp --remove-destination \`realpath /etc/resolv.conf\` /etc/resolv.conf" || :
ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "sudo rm -rf /etc/resolvconf"
# FIXME(mattymo): Underlay provisioner should set label file
# Add VM label:
ssh $SSH_OPTIONS $ADMIN_USER@$slaveip "echo $VM_LABEL > /home/${ADMIN_USER}/vm_label"
inventory_args+=" ${slaveip}"
((current_slave++))
done
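
Most of the per-node ssh workarounds in this loop (hosts entry, hostname,
resolvconf) reappear below as idempotent preinstall role tasks. The gai.conf
IPv6 workaround could move the same way; a sketch in the role's style, assumed
rather than shown in this diff:

- name: Prefer IPv4 to avoid docker pull failures caused by IPv6 DNS
  lineinfile:
    dest: /etc/gai.conf
    line: "precedence ::ffff:0:0/96 100"
    state: present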
# Stop trapping pre-setup tasks
set +e
echo "Running pre-setup steps on nodes via ansible..."
tries=3
until admin_node_command /usr/bin/ansible-playbook \
--ssh-extra-args "-o\ StrictHostKeyChecking=no" -u ${ADMIN_USER} -b \
--become-user=root -i $ADMIN_WORKSPACE/kargo/inventory/inventory.cfg \
$ADMIN_WORKSPACE/utils/kargo/preinstall.yml $COMMON_DEFAULTS_OPT \
$OS_SPECIFIC_DEFAULTS_OPT $custom_opts; do
if [[ $tries > 1 ]]; then
(( tries-- ))
echo "Deployment failed! Trying $tries more times..."
else
exit_gracefully 1
fi
done
echo "Deploying k8s via ansible..."
tries=3
until admin_node_command /usr/bin/ansible-playbook \
--ssh-extra-args "-o\ StrictHostKeyChecking=no" -u ${ADMIN_USER} -b \
---become-user=root -i /home/${ADMIN_USER}/kargo/inventory/inventory.cfg \
-/home/${ADMIN_USER}/kargo/cluster.yml $COMMON_DEFAULTS_OPT \
+--become-user=root -i $ADMIN_WORKSPACE/kargo/inventory/inventory.cfg \
+$ADMIN_WORKSPACE/kargo/cluster.yml $COMMON_DEFAULTS_OPT \
$OS_SPECIFIC_DEFAULTS_OPT $custom_opts; do
-if [[ $tries > 0 ]]; then
+if [[ $tries > 1 ]]; then
(( tries-- ))
echo "Deployment failed! Trying $tries more times..."
else
@@ -241,47 +245,21 @@ done
deploy_res=0
echo "Initial deploy succeeded. Proceeding with post-install tasks..."
# NOTE: This needs to run on a node with kube-config.yml and kubelet (kube-master role)
PRI_NODE=${SLAVE_IPS[0]}
echo "Setting up kubedns..."
ssh $SSH_OPTIONS $ADMIN_USER@$PRI_NODE sudo pip install kpm
ssh $SSH_OPTIONS $ADMIN_USER@$PRI_NODE sudo /usr/local/bin/kpm deploy kube-system/kubedns --namespace=kube-system
tries=26
for waiting in `seq 1 $tries`; do
if ssh $SSH_OPTIONS $ADMIN_USER@$PRI_NODE kubectl get po --namespace=kube-system | grep kubedns | grep -q Running; then
ssh $SSH_OPTIONS $ADMIN_USER@$PRI_NODE host kubernetes && break
fi
if [ $waiting -lt $tries ]; then
echo "Waiting for kubedns to be up..."
sleep 5
else
echo "Kubedns did not come up in time"
deploy_res=1
fi
+tries=3
+until admin_node_command /usr/bin/ansible-playbook \
+--ssh-extra-args "-o\ StrictHostKeyChecking=no" -u ${ADMIN_USER} -b \
+--become-user=root -i $ADMIN_WORKSPACE/kargo/inventory/inventory.cfg \
+$ADMIN_WORKSPACE/utils/kargo/postinstall.yml $COMMON_DEFAULTS_OPT \
+$OS_SPECIFIC_DEFAULTS_OPT $custom_opts; do
+if [[ $tries > 1 ]]; then
+(( tries-- ))
+echo "Deployment failed! Trying $tries more times..."
+else
+exit_gracefully 1
+fi
+done
if [ "$deploy_res" -eq "0" ]; then
echo "Testing network connectivity..."
. ${BASH_SOURCE%/*}/../kargo/test_networking.sh
test_networking
deploy_res=$?
if [ "$deploy_res" -eq "0" ]; then
echo "Copying connectivity script to node..."
scp $SSH_OPTIONS ${BASH_SOURCE%/*}/../kargo/test_networking.sh $ADMIN_USER@$PRI_NODE:test_networking.sh
fi
fi
if [ "$deploy_res" -eq "0" ]; then
echo "Enabling dashboard UI..."
cat ${BASH_SOURCE%/*}/../kargo/kubernetes-dashboard.yaml | ssh $SSH_OPTIONS $ADMIN_USER@${SLAVE_IPS[0]} "kubectl create -f -"
deploy_res=$?
if [ "$deploy_res" -ne "0" ]; then
echo "Unable to create dashboard UI!"
fi
fi
# FIXME(mattymo): Move this to underlay
# setup VLAN if everything is ok and env will not be deleted
if [ "$VLAN_BRIDGE" ] && [ "${deploy_res}" -eq "0" ] && [ "${DONT_DESTROY_ON_SUCCESS}" = "1" ];then
rm -f VLAN_IPS
@@ -311,6 +289,7 @@ set +x
echo "**************************************"
echo "**************************************"
set -x
+rm -f VLAN_IPS
fi

utils/kargo/ansible.cfg Normal file

@@ -0,0 +1,4 @@
[ssh_connection]
pipelining=True
[defaults]
host_key_checking=False

utils/kargo/postinstall.yml Normal file

@@ -0,0 +1,5 @@
---
- hosts: kube-master[0]
roles:
- { role: postinstall, tags: postinstall }
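
The kube-master[0] pattern pins this play to the first master because the
post-install steps must run exactly once per cluster. The alternative idiom,
sketched here with the kubedns command from the role below, targets every
master and lets run_once deduplicate:

- hosts: kube-master
  tasks:
    - name: Deploy kubedns once from a single master
      command: "{{ bin_dir }}/kpm deploy kube-system/kubedns --namespace=kube-system"
      run_once: true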

utils/kargo/preinstall.yml Normal file

@@ -0,0 +1,5 @@
---
- hosts: all
roles:
- { role: preinstall, tags: preinstall }

utils/kargo/roles/postinstall/tasks/main.yml Normal file

@@ -0,0 +1,45 @@
---
- name: Install kpm via pip
command: pip install kpm
run_once: true
- name: Deploy kubedns pod
command: "{{ bin_dir }}/kpm deploy kube-system/kubedns --namespace=kube-system"
- name: Wait for kubedns to be ready
uri: url=https://kube:{{kube_api_pwd}}@kubernetes validate_certs=no
register: kubedns_ready
until: kubedns_ready.status == 200 or kubedns_ready.status == 401
delay: 30
retries: 15
- name: Copy network test script
copy:
src: test_networking.sh
dest: "{{ bin_dir }}/test_networking.sh"
owner: root
group: root
mode: 0755
# FIXME(mattymo): Refactor test_networking.sh to run without ssh keys
- name: Copy ssh key to primary master
become: no
local_action: shell sh -c "rsync -p -e \"ssh -i $HOME/.ssh/id_rsa -o StrictHostKeyChecking=no\" $HOME/.ssh/id_rsa {{ansible_user}}@{{ ip }}:.ssh/id_rsa"
- name: Test networking connectivity
shell: "bash {{ bin_dir }}/test_networking.sh"
changed_when: false
become: no
- name: Copy dashboard definition
copy:
src: kubernetes-dashboard.yml
dest: /etc/kubernetes/kubernetes-dashboard.yml
owner: root
group: root
mode: 0644
register: dashboard
- name: Create Kubernetes dashboard
command: "{{ bin_dir }}/kubectl create -f /etc/kubernetes/kubernetes-dashboard.yml"
when: dashboard.changed
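
The uri-based wait above polls the API endpoint directly; for comparison, the
removed bash loop's kubectl check would look like this as a task (a sketch,
not the committed version):

- name: Wait for kubedns pod to be Running
  shell: "{{ bin_dir }}/kubectl get po --namespace=kube-system | grep kubedns | grep -q Running"
  register: kubedns_pod
  until: kubedns_pod.rc == 0
  retries: 15
  delay: 30
  changed_when: false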

utils/kargo/roles/preinstall/tasks/main.yml Normal file

@@ -0,0 +1,34 @@
---
- name: Ensure required ansible version
assert:
that: ansible_version.major == 2 and ansible_version.minor >= 1
- name: Wait for nodes to be ready
command: echo "Ready"
retries: 30
delay: 1
- name: Set correct /etc/hosts entry
register: updated_etc_hosts
lineinfile:
dest: /etc/hosts
regexp: "127.0.1.1.*"
line: "{{ ip }}\t{{ inventory_hostname }}"
state: present
- name: Update hostname via hostnamectl
command: hostnamectl set-hostname {{ inventory_hostname }}
when: updated_etc_hosts.changed
# FIXME(mattymo): Fix in kargo
- name: Purge resolvconf
register: purged_resolvconf
apt:
name: resolvconf
force: yes
state: absent
- name: Fix /etc/resolv.conf symlink
shell: "cp --remove-destination `realpath /etc/resolv.conf` /etc/resolv.conf"
when: purged_resolvconf.changed and purged_resolvconf.stdout.find('Removing resolvconf') != -1
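
The register/when chaining used in this role (updated_etc_hosts gating
hostnamectl, purged_resolvconf gating the symlink fix) can also be expressed
with handlers, which run once at the end of the play and only when notified;
an equivalent sketch for the hostname pair:

- hosts: all
  tasks:
    - name: Set correct /etc/hosts entry
      lineinfile: dest=/etc/hosts regexp="127.0.1.1.*" line="{{ ip }}\t{{ inventory_hostname }}"
      notify: update hostname
  handlers:
    - name: update hostname
      command: hostnamectl set-hostname {{ inventory_hostname }}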