#!/bin/bash
#
# Resets a RabbitMQ cluster running in Kubernetes: scales the statefulset
# down, force-unmaps stale RBD volumes, deletes the RabbitMQ PVCs, scales the
# statefulset back up and re-runs the cluster-wait and init jobs.
#
# Every setting below can be overridden through the environment.

# NOTE: `pipefail` is deliberately not enabled. The pod-termination wait later
# in this script pipes `kubectl wait` into `grep` and relies on grep's exit
# status alone.
set -eu

# Timeout for the RabbitMQ statefulset rollout after scaling back up.
RMQ_START_TIMEOUT=${RMQ_START_TIMEOUT:-"900s"}
# Timeout for each re-created job to reach the `complete` condition.
RMQ_INIT_JOB_TIMEOUT=${RMQ_INIT_JOB_TIMEOUT:-"250s"}
# Timeout for the RabbitMQ server pods to be deleted after scaling down.
RMQ_TERMINATE_TIMEOUT=${RMQ_TERMINATE_TIMEOUT:-"200s"}

# Namespace of the RabbitMQ cluster to reset. Defaults to `ucp` (the UCP
# RabbitMQ cluster); set RMQ_NS to another namespace, e.g. `openstack`, to
# reset the cluster running there instead.
RMQ_NS=${RMQ_NS:-"ucp"}
# Name of the RabbitMQ statefulset.
RMQ_SS=${RMQ_SS:-"clcp-${RMQ_NS}-rabbitmq-rabbitmq"}
# Value of the `release_group` label used to select the RabbitMQ PVCs.
RMQ_RELEASE_GROUP=${RMQ_RELEASE_GROUP:-"clcp-${RMQ_NS}-rabbitmq"}
# Name of the job that is re-created to wait for the cluster to become ready.
RMQ_WAIT_JOB=${RMQ_WAIT_JOB:-"clcp-${RMQ_NS}-rabbitmq-cluster-wait"}
# Label selector matching the RabbitMQ server pods.
RMQ_SERVER_LABELS=${RMQ_SERVER_LABELS:-"application=rabbitmq,component=server"}

#######################################
# Deletes a Kubernetes job and creates it again from its own manifest, then
# waits for the new job to complete.
#
# The live manifest is fetched as JSON and stripped of the fields that tie it
# to the old object (status, generated selector, controller-uid labels, uid,
# resourceVersion, selfLink, creation timestamps) before it is re-submitted.
#
# Globals:   RMQ_INIT_JOB_TIMEOUT (read) - timeout passed to `kubectl wait`
# Arguments: $1 - namespace of the job
#            $2 - name of the job
# Returns:   non-zero if python is unavailable or any kubectl/python step
#            fails; the job is not deleted unless the new manifest was built.
#######################################
function job_recreate {
  local ns=$1
  local job=$2
  local python_bin job_orig job_new

  # Prefer `python` (what this script has always used) and fall back to
  # `python3`; the embedded snippet runs unchanged on both.
  python_bin=$(command -v python || command -v python3) || {
    echo "job_recreate: no python interpreter found" >&2
    return 1
  }

  # Declarations are kept separate from the assignments so that a failing
  # command is not masked by the exit status of `local`.
  job_orig=$(kubectl get -n "${ns}" jobs "${job}" -o=json) || return

  # Manifests are held in variables rather than temp files, so nothing is
  # left behind on disk regardless of how the function exits.
  job_new=$(printf '%s\n' "${job_orig}" | "${python_bin}" -c '
import sys, json

d = json.load(sys.stdin)


def rm(obj, path):
    # Walk down to the parent of the last key. A missing or null level is
    # replaced by a throwaway dict so the final pop is a harmless no-op.
    for key in path[:-1]:
        obj = obj.get(key) or {}
    obj.pop(path[-1], None)


# Paths are key lists (not dotted strings) because some label keys contain
# dots themselves.
for path in (
    ["status"],
    ["spec", "selector"],
    ["spec", "template", "metadata", "creationTimestamp"],
    ["spec", "template", "metadata", "labels", "controller-uid"],
    ["spec", "template", "metadata", "labels",
     "batch.kubernetes.io/controller-uid"],
    ["metadata", "creationTimestamp"],
    ["metadata", "labels", "controller-uid"],
    ["metadata", "labels", "batch.kubernetes.io/controller-uid"],
    ["metadata", "resourceVersion"],
    ["metadata", "selfLink"],
    ["metadata", "uid"],
):
    rm(d, path)

print(json.dumps(d))
') || return

  printf '%s\n' "${job_orig}" | kubectl delete -f - || return
  printf '%s\n' "${job_new}" | kubectl create -f - || return
  kubectl wait -n "${ns}" job "${job}" --for=condition=complete \
    --timeout="${RMQ_INIT_JOB_TIMEOUT}"
}

#######################################
# Force-unmaps, on one host, any RBD image backing a PVC that still has a
# client (watcher) attached.
#
# For every RBD image of every PV bound to the PVC, `rbd status` is run in a
# Ceph monitor pod. If the output does not report "Watchers: none", the
# divingbell exec pod on the given host is used to enter the host namespaces
# and, if the image shows up in `rbd showmapped`, run `rbd unmap -o force`.
#
# Arguments: $1 - PVC name
#            $2 - node name on which to attempt the unmap
# Outputs:   progress messages on stdout, warnings on stderr
# Returns:   non-zero if the PV list cannot be fetched or an unmap fails
#######################################
function force_unmap_rbd {
  local pvc_name=$1
  local rbd_host=$2
  local rbd_names ceph_mon_pod exec_pods exec_pod rbd_name
  local -a rbd_images=()

  # NOTE: for a single PVC there may be multiple PVs, try checking all of them.
  rbd_names=$(kubectl get pv \
    -o jsonpath="{.items[?(@.spec.claimRef.name=='${pvc_name}')].spec.rbd.image}") \
    || return
  if [[ -z "${rbd_names}" ]]; then
    # Nothing backs this PVC, so there is nothing to check or unmap.
    return 0
  fi
  # jsonpath prints the matches separated by whitespace.
  read -r -a rbd_images <<<"${rbd_names}"

  # Best effort: `.items[0]` makes kubectl fail when no monitor is running.
  # In that case the watcher state is unknown and every image goes through
  # the showmapped check on the host below.
  ceph_mon_pod=$(kubectl get pods -n ceph \
    -l application=ceph,component=mon \
    --field-selector status.phase=Running \
    -o jsonpath='{.items[0].metadata.name}') || ceph_mon_pod=""
  if [[ -z "${ceph_mon_pod}" ]]; then
    echo "No running ceph-mon pod found, cannot query RBD watchers" >&2
  fi

  for rbd_name in "${rbd_images[@]}"; do
    if [[ -n "${ceph_mon_pod}" ]] \
      && kubectl exec -n ceph "${ceph_mon_pod}" -- rbd status "${rbd_name}" \
        | grep -i "Watchers: none"; then
      echo "No clients connected to '${rbd_name}' - OK"
      continue
    fi

    echo "RBD client is still connected to '${rbd_name}', starting force unmap"
    exec_pods=$(kubectl get pod -l application=divingbell,component=exec -n ucp \
      -o jsonpath="{.items[?(@.spec.nodeName=='${rbd_host}')].metadata.name}") \
      || exec_pods=""
    # Use the first match if the selector returns more than one pod.
    exec_pod=${exec_pods%% *}
    if [[ -z "${exec_pod}" ]]; then
      echo "No divingbell exec pod found on '${rbd_host}', skipping '${rbd_name}'" >&2
      continue
    fi
    echo "Using '${exec_pod}' pod to unmap the volume"

    # No -i/-t on exec: neither command reads stdin, and requesting a TTY
    # from a non-interactive run only produces a warning.
    if kubectl exec "${exec_pod}" -n ucp -- nsenter -t 1 -m -u -n -i \
      rbd showmapped | grep -iF -- "${rbd_name}"; then
      kubectl exec "${exec_pod}" -n ucp -- nsenter -t 1 -m -u -n -i \
        rbd unmap -o force "${rbd_name}"
    fi
  done
}

# ---------------------------------------------------------------------------
# Main flow: scale down, clean up volumes, scale up, re-run the jobs.
# ---------------------------------------------------------------------------

# Get a list of hosts where RabbitMQ servers are present. This has to happen
# before scaling down, while the pods still exist. The kubectl call is kept
# out of the `tr | sort` pipeline so that its failure aborts the script.
rmq_node_names=$(kubectl get po -n "${RMQ_NS}" \
  -l "${RMQ_SERVER_LABELS}" \
  -o jsonpath="{.items[*].spec.nodeName}")
rmq_hosts=$(tr ' ' "\n" <<<"${rmq_node_names}" | sort -u)

echo "Scaling down RabbitMQ"
kubectl scale -n "${RMQ_NS}" --replicas=0 statefulset/"${RMQ_SS}"

echo "Waiting for pods to terminate"
# Success is decided from kubectl's message rather than from its exit status,
# which is why this pipeline relies on grep's status only (no pipefail).
kubectl wait -n "${RMQ_NS}" --for=delete -l "${RMQ_SERVER_LABELS}" \
  --timeout="${RMQ_TERMINATE_TIMEOUT}" pod 2>&1 \
  | grep -Eq 'no matching resources found|condition met'

echo "Check for any stale RBDs before deleting PVCs"
# The PVC list does not depend on the host, so it is fetched once.
pvcs=$(kubectl get pvc -n "${RMQ_NS}" -l release_group="${RMQ_RELEASE_GROUP}" \
  -o jsonpath="{.items[*].metadata.name}")
# Both lists are whitespace-separated Kubernetes object names, so the
# unquoted expansions below split them into one name per iteration.
for host in ${rmq_hosts}; do
  echo "Check host '${host}'"
  for pvc in ${pvcs}; do
    force_unmap_rbd "${pvc}" "${host}"
  done
done

echo "Deleting RabbitMQ volumes"
kubectl delete -n "${RMQ_NS}" pvc -l release_group="${RMQ_RELEASE_GROUP}"

echo "Scaling up RabbitMQ"
# Replica count to restore; defaults to 2, override with RMQ_REPLICAS.
rmq_replicas=${RMQ_REPLICAS:-2}
kubectl scale -n "${RMQ_NS}" --replicas="${rmq_replicas}" statefulset/"${RMQ_SS}"
kubectl rollout status -n "${RMQ_NS}" statefulset/"${RMQ_SS}" \
  --timeout="${RMQ_START_TIMEOUT}"

echo "Waiting for RabbitMQ cluster to become ready"
job_recreate "${RMQ_NS}" "${RMQ_WAIT_JOB}"

echo "Restarting RabbitMQ init jobs"
# `-o name` prints one `<resource>/<name>` reference per line. Fetching the
# list in a plain assignment lets `set -e` catch a kubectl failure, which a
# command substitution in the `for` header would hide.
init_job_refs=$(kubectl get -n "${RMQ_NS}" jobs -l component=rabbit-init \
  --no-headers -o name)
for job_ref in ${init_job_refs}; do
  # Keep only the part after the last '/', i.e. the bare job name.
  job_recreate "${RMQ_NS}" "${job_ref##*/}"
done

# Handle bouncing RO separately if openstack namespace
if [[ "${RMQ_NS}" == "openstack" ]]; then
  kubectl delete pod -n "${RMQ_NS}" -l application="ro",component="api"
fi

echo "DONE"