Merge "Fix cert rotation cron job not renewing conf files"

This commit is contained in:
Zuul 2021-08-06 20:56:34 +00:00 committed by Gerrit Code Review
commit 1df668d5c6
1 changed files with 336 additions and 132 deletions

View File

@ -3,150 +3,354 @@
# SPDX-License-Identifier: Apache-2.0
#
# Copyright (C) 2019 Intel Corporation
# Copyright (c) 2021 Wind River Systems, Inc.
#
#
# This script is to rotate kubernetes cluster certificates automatically
#
# Renew certificates 15 days before expiration
declare -r CUTOFF_DAYS=15
declare -r CUTOFF_DAYS_S=$((${CUTOFF_DAYS}*24*3600))
# Temporary working directory
TEMP_WORK_DIR="/tmp/kube_cert_rotation"
# Expiration date of k8s certs
CERT_LASTDATE=$(openssl x509 -in /etc/kubernetes/pki/apiserver.crt -text | grep 'Not After' | awk -F ' : ' '{print $2}')
CERT_EXP_DATES=$(kubeadm alpha certs check-expiration)
if [ "x${CERT_LASTDATE}" != "x" ]; then
CERT_LASTDATE_S=$(date -d "${CERT_LASTDATE}" +%s)
CURRENT_DATE_S=$(date +%s)
DAY_LEFT_S=$((${CERT_LASTDATE_S}-${CURRENT_DATE_S}))
fi
# Time left in seconds for a cert
# The expiry date is looked up in CERT_EXP_DATES, the cached output of
# 'kubeadm alpha certs check-expiration'.
# Globals:
#   CERT_EXP_DATES (read)
# Arguments:
#   $1 - grep pattern selecting the certificate's line in CERT_EXP_DATES
# Outputs:
#   seconds until expiration on stdout (negative once expired), or an empty
#   line when no matching line with a parsable expiry date is found
time_left_s() {
    local time_left_s=""
    local exp_date=""
    # Keep the intermediate values local so they do not leak into the caller.
    local exp_date_s=""
    local current_date_s=""

    # Only the first date found is used; several dates joined by newlines
    # could not be parsed by 'date -d' anyway.
    exp_date=$(printf '%s\n' "${CERT_EXP_DATES}" \
        | grep -- "$1" \
        | grep -oE '[a-zA-Z]{3} [0-3][0-9], [0-9]{4} ([0-1][0-9]|2[0-3]):[0-5][0-9] UTC' \
        | head -n 1)
    if [ "x${exp_date}" != "x" ]; then
        # If the date cannot be parsed, report "unknown" (empty) rather than
        # computing (0 - now), which would look like a long-expired cert.
        if exp_date_s=$(date -d "${exp_date}" +%s) && [ "x${exp_date_s}" != "x" ]; then
            current_date_s=$(date +%s)
            time_left_s=$((exp_date_s - current_date_s))
        fi
    fi
    echo "${time_left_s}"
}
# Retrieve a certificate's remaining validity time by openssl
# Arguments:
#   $1 - path of the certificate file to inspect
# Outputs:
#   seconds until expiration on stdout (negative once expired), or an empty
#   line when the end date cannot be read or parsed
time_left_s_by_openssl() {
    local time_left_s=""
    local exp_date=""
    # Keep the intermediate values local so they do not leak into the caller.
    local exp_date_s=""
    local current_date_s=""

    # 'openssl x509 -enddate' prints "notAfter=<date>"; keep what follows '='.
    exp_date=$(openssl x509 -in "$1" -enddate -noout | awk -F"=" '{print $2}')
    if [ "x${exp_date}" != "x" ]; then
        # If the date cannot be parsed, report "unknown" (empty) rather than
        # computing (0 - now), which would look like a long-expired cert.
        if exp_date_s=$(date -d "${exp_date}" +%s) && [ "x${exp_date_s}" != "x" ]; then
            current_date_s=$(date +%s)
            time_left_s=$((exp_date_s - current_date_s))
        fi
    fi
    echo "${time_left_s}"
}
# Renew one kubeadm-managed certificate if it is inside the cutoff window
# Arguments:
#   $1 - certificate name as listed by kubeadm (also used as lookup pattern)
# return value:
# 0: renewed successfully
# 255: no need to renew
# 1: renewal failed (including: remaining time could not be determined)
renew_cert() {
    local remaining=""

    remaining=$(time_left_s "$1")
    if [ "x${remaining}" == "x" ]; then
        return 1
    fi

    if [ ${remaining} -lt ${CUTOFF_DAYS_S} ]; then
        # $1 is deliberately left unquoted: callers may pad the name with a
        # trailing space for matching, and word splitting removes it here.
        kubeadm alpha certs renew $1 || return 1
        return 0
    fi

    return 255
}
# Renew a certificate with openssl when it is inside the cutoff window
# A new key, CSR and certificate are staged under TEMP_WORK_DIR, the
# certificate is signed with /etc/etcd/ca.crt, and the new certificate and
# key are then moved over the existing files.
# Arguments:
#   $1 - directory holding the certificate and key
#   $2 - base name of the certificate (files are $2.crt and $2.key)
#   $3 - contents of the openssl request/extension config file
# return value:
# 0: renewed successfully
# 255: no need to renew
# 1: renewal failed (including: remaining time could not be determined)
renew_cert_by_openssl() {
    local cert_dir="$1"
    local cert_name="$2"
    local staged="${TEMP_WORK_DIR}/${cert_name}"
    local remaining=""

    remaining=$(time_left_s_by_openssl "${cert_dir}/${cert_name}.crt")
    if [ "x${remaining}" == "x" ]; then
        return 1
    fi
    if ! [ ${remaining} -lt ${CUTOFF_DAYS_S} ]; then
        return 255
    fi

    # Each step only runs when every previous one succeeded:
    # csr config -> private key -> CSR -> signed certificate -> install both.
    echo "$3" > "${staged}_csr.conf" \
        && openssl genpkey -out "${staged}.key" -algorithm RSA -pkeyopt rsa_keygen_bits:4096 \
        && openssl req -new -key "${staged}.key" -out "${staged}.csr" -config "${staged}_csr.conf" \
        && openssl x509 -req -in "${staged}.csr" -CA /etc/etcd/ca.crt -CAkey /etc/etcd/ca.key -CAcreateserial \
            -out "${staged}.crt" -days 365 -extensions v3_req -extfile "${staged}_csr.conf" \
        && mv "${staged}.crt" "${cert_dir}/${cert_name}.crt" \
        && mv "${staged}.key" "${cert_dir}/${cert_name}.key" \
        || return 1

    return 0
}
# Get cluster host floating IP address
# The address is taken from the first "server:" entry of a kubeconfig file,
# e.g. "server: https://[fd00::1]:6443" yields "fd00::1".
# Arguments:
#   $1 - kubeconfig to read (optional, default /etc/kubernetes/admin.conf)
# Outputs:
#   the address on stdout, or an empty line when no server entry is found
get_cluster_host_floating_ip() {
    local kubeconfig="${1:-/etc/kubernetes/admin.conf}"
    local floating_ip=""

    # Take what follows "//", drop the IPv6 brackets, then strip the port.
    # The port match is anchored to the end so an IPv6 address that contains
    # a ":6443" group is not damaged.
    floating_ip=$(awk -F"//" '/server:/ {print $2; exit}' "${kubeconfig}" \
        | tr -d "[]" \
        | sed -e 's/[[:space:]]*$//' -e 's/:6443$//')
    echo "${floating_ip}"
}
# Renew certificates 90 days before expiration
ERR=0
declare -r NINETY_DAYS_S=$((90*24*3600))
if [ ${DAY_LEFT_S} -lt ${NINETY_DAYS_S} ]; then
# Same expiration date of apiserver, apiserver-kubelet-client and front-proxy-client
if [ ${ERR} -eq 0 ]; then
kubeadm alpha certs renew apiserver
if [ $? -ne 0 ]; then
ERR=1
fi
fi
RESTART_APISERVER=0
RESTART_CONTROLLER_MANAGER=0
RESTART_SCHEDULER=0
RESTART_SYSINV=0
RESTART_CERT_MON=0
RESTART_ETCD=0
if [ ${ERR} -eq 0 ]; then
kubeadm alpha certs renew apiserver-kubelet-client
if [ $? -ne 0 ]; then
ERR=1
fi
fi
if [ ${ERR} -eq 0 ]; then
kubeadm alpha certs renew front-proxy-client
if [ $? -ne 0 ]; then
ERR=1
fi
fi
# Update cluster configuration files using the renewed certificates
if [ ${ERR} -eq 0 ]; then
ADVERTISE_ADDR=$(kubectl get endpoints kubernetes -o jsonpath='{.subsets[0].addresses[0].ip}')
else
ADVERTISE_ADDR=""
fi
if [ "x${ADVERTISE_ADDR}" != "x" ]; then
# Update admin.conf
if [ ${ERR} -eq 0 ]; then
kubeadm alpha kubeconfig user --client-name=kubernetes-admin --apiserver-advertise-address=${ADVERTISE_ADDR} --org system:masters > /tmp/admin.conf
if [ $? -eq 0 ]; then
mv /tmp/admin.conf /etc/kubernetes/admin.conf
if [ $? -ne 0 ]; then
ERR=1
fi
else
ERR=1
fi
fi
# Update controller-manager.conf
if [ ${ERR} -eq 0 ]; then
kubeadm alpha kubeconfig user --client-name=system:kube-controller-manager --apiserver-advertise-address=${ADVERTISE_ADDR} --cert-dir /etc/kubernetes/pki/ > /tmp/controller-manager.conf
if [ $? -eq 0 ]; then
mv /tmp/controller-manager.conf /etc/kubernetes/controller-manager.conf
if [ $? -ne 0 ]; then
ERR=1
fi
else
ERR=1
fi
fi
# Update scheduler.conf
if [ ${ERR} -eq 0 ]; then
kubeadm alpha kubeconfig user --client-name=system:kube-scheduler --apiserver-advertise-address=${ADVERTISE_ADDR} --cert-dir /etc/kubernetes/pki/ > /tmp/scheduler.conf
if [ $? -eq 0 ]; then
mv /tmp/scheduler.conf /etc/kubernetes/scheduler.conf
if [ $? -ne 0 ]; then
ERR=1
fi
else
ERR=1
fi
fi
# Update kubelet.conf
# This block could be removed once this issue is resolved. https://github.com/kubernetes/kubeadm/issues/1753
if [ ${ERR} -eq 0 ]; then
kubeadm alpha kubeconfig user --client-name=system:node:${HOSTNAME} --apiserver-advertise-address=${ADVERTISE_ADDR} --org system:nodes > /tmp/kubelet.conf
if [ $? -eq 0 ]; then
mv /tmp/kubelet.conf /etc/kubernetes/kubelet.conf
if [ $? -ne 0 ]; then
ERR=1
fi
else
ERR=1
fi
fi
else
# step 1, renew kubernetes certificates
# Renew apiserver certificate
if [ ${ERR} -eq 0 ]; then
# The extra space in 'apiserver ' is to distinguish other names with apiserver in them.
renew_cert 'apiserver '
result=$?
if [ ${result} -eq 0 ]; then
RESTART_APISERVER=1
elif [ ${result} -eq 1 ]; then
ERR=1
fi
# Restart the containers of k8s components to refresh the configurations within container
if [ ${ERR} -eq 0 ]; then
crictl ps | awk '/kube-apiserver/{print$1}' | xargs crictl stop > /dev/null
if [ $? -ne 0 ]; then
ERR=2
fi
fi
if [ ${ERR} -eq 0 ]; then
crictl ps | awk '/kube-controller-manager/{print$1}' | xargs crictl stop > /dev/null
if [ $? -ne 0 ]; then
ERR=2
fi
fi
if [ ${ERR} -eq 0 ]; then
crictl ps | awk '/kube-scheduler/{print$1}' | xargs crictl stop > /dev/null
if [ $? -ne 0 ]; then
ERR=2
fi
fi
if [ ${ERR} -eq 0 ]; then
systemctl daemon-reload
systemctl restart kubelet
if [ $? -ne 0 ]; then
ERR=2
fi
fi
if [ ${ERR} -eq 2 ]; then
# Notify admin to lock and unlock this master node if restart k8s components failed
fmClientCli -c "### ###250.003###set###host###host=${HOSTNAME}### ###major###Kubernetes certificates on host ${HOSTNAME} have been renewed but not updated.###operational-violation### ###Lock and unlock host ${HOSTNAME} to update config.### ### ###"
elif [ ${ERR} -eq 1 ]; then
# Notify admin to rotate kube cert manually if cert renew or config failed
fmClientCli -c "### ###250.003###set###host###host=${HOSTNAME}### ###major###Kubernetes certificates automatic rotation failed on host ${HOSTNAME}###operational-violation### ###Rotate kubernetes certificates manually, lock and unlock host ${HOSTNAME} to update config.### ### ###"
else
# Clear the alarm if cert rotation completed
fmClientCli -d "###250.003###host=${HOSTNAME}###"
fi
# Renew the remaining kubeadm-managed certificates, one after another.
# Each renewal is attempted only while no error has been recorded. A result
# of 0 (renewed) flags the services that must be restarted, 1 records a
# failure, and 255 (not due yet) changes nothing.

# Renew apiserver kubelet client certificate
if [ ${ERR} -eq 0 ]; then
    renew_cert 'apiserver-kubelet-client'
    result=$?
    case ${result} in
        0) RESTART_APISERVER=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew front proxy client certificate (no service restart is flagged for it)
if [ ${ERR} -eq 0 ]; then
    renew_cert 'front-proxy-client'
    case $? in
        1) ERR=1 ;;
    esac
fi

# Renew certs in admin.conf
if [ ${ERR} -eq 0 ]; then
    renew_cert 'admin.conf'
    result=$?
    case ${result} in
        0)
            RESTART_SYSINV=1
            RESTART_CERT_MON=1
            ;;
        1) ERR=1 ;;
    esac
fi

# Renew certs in controller-manager.conf
if [ ${ERR} -eq 0 ]; then
    renew_cert 'controller-manager.conf'
    result=$?
    case ${result} in
        0) RESTART_CONTROLLER_MANAGER=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew certs in scheduler.conf
if [ ${ERR} -eq 0 ]; then
    renew_cert 'scheduler.conf'
    result=$?
    case ${result} in
        0) RESTART_SCHEDULER=1 ;;
        1) ERR=1 ;;
    esac
fi
# Create temporary working directory
# Private keys are staged in this directory, so it is created with mktemp
# instead of reusing a fixed, predictable path under /tmp: mktemp fails
# rather than adopting a directory or symlink that already exists, and it
# creates the directory accessible to its owner only.
# TEMP_WORK_DIR is repointed at the new directory so that later steps
# (staging and cleanup) use it.
if [ ${ERR} -eq 0 ]; then
    new_work_dir=$(mktemp -d "${TEMP_WORK_DIR}.XXXXXX")
    if [ $? -eq 0 ] && [ -d "${new_work_dir}" ]; then
        TEMP_WORK_DIR="${new_work_dir}"
    else
        ERR=1
    fi
fi
# Get cluster host floating IP address; an empty answer counts as a failure
if [ ${ERR} -eq 0 ]; then
    floating_ip=$(get_cluster_host_floating_ip)
    [ "x${floating_ip}" != "x" ] || ERR=1
fi
# Renew apiserver-etcd-client certificate
# Skipped as soon as an earlier step has recorded an error.
if [ ${ERR} -eq 0 ]; then
# openssl request config handed to renew_cert_by_openssl as its third
# argument; that function writes it to a file and uses it both as the CSR
# config and as the extension file (section v3_req) when signing.
# The floating IP and 127.0.0.1 are listed as subject alternative names.
config="
[req]
prompt = no
x509_extensions = v3_req
distinguished_name = dn
[dn]
CN = apiserver-etcd-client
[v3_req]
keyUsage = critical, Digital Signature, Key Encipherment
extendedKeyUsage = TLS Web Server Authentication, TLS Web Client Authentication
subjectAltName = @alt_names
[alt_names]
IP.1 = ${floating_ip}
IP.2 = 127.0.0.1
"
# Certificate and key live in /etc/kubernetes/pki/ as apiserver-etcd-client.{crt,key}
renew_cert_by_openssl "/etc/kubernetes/pki/" "apiserver-etcd-client" "${config}"
result=$?
# 0: renewed, so the apiserver is flagged for restart; 1: failure;
# 255 (not due for renewal) leaves both flags untouched.
if [ ${result} -eq 0 ]; then
RESTART_APISERVER=1
elif [ ${result} -eq 1 ]; then
ERR=1
fi
fi
# Renew etcd-server certificate
# Skipped as soon as an earlier step has recorded an error.
if [ ${ERR} -eq 0 ]; then
# openssl request config handed to renew_cert_by_openssl as its third
# argument; that function writes it to a file and uses it both as the CSR
# config and as the extension file (section v3_req) when signing.
# The floating IP and 127.0.0.1 are listed as subject alternative names.
config="
[req]
prompt = no
x509_extensions = v3_req
distinguished_name = dn
[dn]
CN = etcd-server
[v3_req]
keyUsage = critical, Digital Signature, Key Encipherment
extendedKeyUsage = TLS Web Server Authentication, TLS Web Client Authentication
subjectAltName = @alt_names
[alt_names]
IP.1 = ${floating_ip}
IP.2 = 127.0.0.1
"
# Certificate and key live in /etc/etcd/ as etcd-server.{crt,key}
renew_cert_by_openssl "/etc/etcd/" "etcd-server" "${config}"
result=$?
# 0: renewed, so etcd is flagged for restart; 1: failure;
# 255 (not due for renewal) leaves both flags untouched.
if [ ${result} -eq 0 ]; then
RESTART_ETCD=1
elif [ ${result} -eq 1 ]; then
ERR=1
fi
fi
# Renew etcd-client certificate
# Skipped as soon as an earlier step has recorded an error.
if [ ${ERR} -eq 0 ]; then
# openssl request config handed to renew_cert_by_openssl as its third
# argument. Unlike the two certificates above, the subject is CN = root and
# the only subject alternative name is the DNS name "root".
config="
[req]
prompt = no
x509_extensions = v3_req
distinguished_name = dn
[dn]
CN = root
[v3_req]
keyUsage = critical, Digital Signature, Key Encipherment
extendedKeyUsage = TLS Web Server Authentication, TLS Web Client Authentication
subjectAltName = @alt_names
[alt_names]
DNS.1 = root
"
# Certificate and key live in /etc/etcd/ as etcd-client.{crt,key}
renew_cert_by_openssl "/etc/etcd/" "etcd-client" "${config}"
result=$?
# Only a failure (1) is acted upon; a successful renewal (0) flags no
# service for restart here.
if [ ${result} -eq 1 ]; then
ERR=1
fi
fi
# Remove temporary working directory
# Runs unconditionally, whether or not the steps above succeeded.
rm -rf ${TEMP_WORK_DIR}
# step 2, restart affected kubernetes components and system services
# Every restart that was flagged during step 1 is attempted, even if an
# earlier one failed; any failure is recorded as ERR=2.

# Restart apiserver (stop its container)
if [ ${RESTART_APISERVER} -eq 1 ]; then
    crictl ps | awk '/kube-apiserver/{print$1}' | xargs crictl stop > /dev/null || ERR=2
fi

# Restart controller-manager (stop its container)
if [ ${RESTART_CONTROLLER_MANAGER} -eq 1 ]; then
    crictl ps | awk '/kube-controller-manager/{print$1}' | xargs crictl stop > /dev/null || ERR=2
fi

# Restart scheduler (stop its container)
if [ ${RESTART_SCHEDULER} -eq 1 ]; then
    crictl ps | awk '/kube-scheduler/{print$1}' | xargs crictl stop > /dev/null || ERR=2
fi

# Restart sysinv-conductor since it's using credentials from admin.conf
if [ ${RESTART_SYSINV} -eq 1 ]; then
    sm-restart-safe service sysinv-conductor || ERR=2
fi

# Restart cert-mon since it's using credentials from admin.conf
if [ ${RESTART_CERT_MON} -eq 1 ]; then
    sm-restart-safe service cert-mon || ERR=2
fi

# Restart etcd server
if [ ${RESTART_ETCD} -eq 1 ]; then
    sm-restart-safe service etcd || ERR=2
fi
# Report the overall outcome through alarm 250.003 for this host.
# ERR is 2 when a service restart failed (this overrides an earlier 1, since
# the restart step assigns 2 afterwards), 1 when a renewal or preparation
# step failed, and 0 when everything completed.
# NOTE(review): presumably 'fmClientCli -c' raises the alarm and '-d' deletes
# it - confirm against the fmClientCli usage.
if [ ${ERR} -eq 2 ]; then
# Notify admin to lock and unlock this master node if restart k8s components failed
/usr/local/bin/fmClientCli -c "### ###250.003###set###host###host=${HOSTNAME}### ###major###Kubernetes certificates have been renewed but not all services have been updated.###operational-violation### ###Lock and unlock the host to update services with new certificates (Manually renew kubernetes certificates first if renewal failed).### ### ###"
elif [ ${ERR} -eq 1 ]; then
# Notify admin to renew kube cert manually and restart services by lock/unlock if cert renew or config failed
/usr/local/bin/fmClientCli -c "### ###250.003###set###host###host=${HOSTNAME}### ###major###Kubernetes certificates renewal failed.###operational-violation### ###Lock and unlock the host to update services with new certificates (Manually renew kubernetes certificates first if renewal failed).### ### ###"
else
# Clear the alarm if cert rotation completed
/usr/local/bin/fmClientCli -d "###250.003###host=${HOSTNAME}###"
fi