config/sysinv/sysinv/sysinv/scripts/kube-cert-rotation.sh

376 lines
11 KiB
Bash

#!/bin/bash
#
# SPDX-License-Identifier: Apache-2.0
#
# Copyright (C) 2019 Intel Corporation
# Copyright (c) 2021-2023 Wind River Systems, Inc.
#
#
# This script is to rotate kubernetes cluster certificates automatically
#
# Certificates are renewed once fewer than this many days remain before
# they expire; the same threshold is also kept in seconds.
readonly CUTOFF_DAYS=15
readonly CUTOFF_DAYS_S=$((CUTOFF_DAYS * 24 * 3600))

# Temporary working directory
TEMP_WORK_DIR="/tmp/kube_cert_rotation"

# Pick the kubeadm sub-command: try the GA form ("certs") first and fall
# back to the alpha form ("alpha certs") when it is not accepted.
if kubeadm certs &> /dev/null; then
    CERT_CMD='certs'
else
    CERT_CMD='alpha certs'
fi

# Expiration report of the k8s certs, captured once and parsed later.
# $CERT_CMD is left unquoted on purpose: 'alpha certs' must split into
# two separate words.
CERT_EXP_DATES=$(kubeadm $CERT_CMD check-expiration)
#######################################
# Time left in seconds for a cert, taken from the kubeadm expiration
# report held in CERT_EXP_DATES.
# Globals:
#   CERT_EXP_DATES (read) - output of "kubeadm ... check-expiration"
# Arguments:
#   $1 - text identifying the report line of the certificate; it is
#        matched as a literal substring (not as a regular expression)
# Outputs:
#   Seconds until the expiration date on stdout (negative once the date
#   has passed), or an empty line when no date could be found or parsed.
#######################################
time_left_s() {
    local time_left_s=""
    local exp_date=""
    local exp_date_s=""
    local current_date_s=""

    # -F keeps characters such as '.' in "admin.conf" literal. Only the
    # first date is kept so that `date` always receives one timestamp,
    # even when the name occurs in more than one line of the report.
    exp_date=$(printf '%s\n' "${CERT_EXP_DATES}" \
        | grep -F -- "$1" \
        | grep -oE '[a-zA-Z]{3} [0-3][0-9], [0-9]{4} ([0-1][0-9]|2[0-3]):[0-5][0-9] UTC' \
        | head -n 1)

    if [ -n "${exp_date}" ]; then
        # Only compute a result when the date could be converted;
        # otherwise the result stays empty instead of a bogus number.
        if exp_date_s=$(date -d "${exp_date}" +%s); then
            current_date_s=$(date +%s)
            time_left_s=$((exp_date_s - current_date_s))
        fi
    fi
    echo "${time_left_s}"
}
#######################################
# Retrieve a certificate's remaining valid time, in seconds, by openssl.
# Arguments:
#   $1 - path of the certificate file handed to "openssl x509 -in"
# Outputs:
#   Seconds until the end date reported by "openssl x509 -enddate" on
#   stdout (negative once the date has passed), or an empty line when
#   the end date could not be read or parsed.
#######################################
time_left_s_by_openssl() {
    local time_left_s=""
    local exp_date=""
    local exp_date_s=""
    local current_date_s=""

    # openssl prints "<label>=<date>"; keep the part after the '='.
    exp_date=$(openssl x509 -in "$1" -enddate -noout | awk -F"=" '{print $2}')

    if [ -n "${exp_date}" ]; then
        # Only compute a result when the date could be converted;
        # otherwise the result stays empty instead of a bogus number.
        if exp_date_s=$(date -d "${exp_date}" +%s); then
            current_date_s=$(date +%s)
            time_left_s=$((exp_date_s - current_date_s))
        fi
    fi
    echo "${time_left_s}"
}
#######################################
# Renew one kubernetes certificate through kubeadm when it is due.
# Globals:
#   CUTOFF_DAYS_S (read) - renewal threshold in seconds
#   CERT_CMD      (read) - kubeadm certs sub-command
# Arguments:
#   $1 - certificate name, passed to time_left_s and to kubeadm
# Returns:
#   0:   renewed successfully
#   255: no need to renew
#   1:   renewal failed (including an unknown expiration time)
#######################################
renew_cert() {
    local remaining=""
    remaining=$(time_left_s "$1")

    # Without a known expiration time nothing can be decided.
    if [ -z "${remaining}" ]; then
        return 1
    fi

    # Operands stay unquoted so that a malformed value makes the test
    # itself fail, which is handled as "no need to renew".
    if ! [ ${remaining} -lt ${CUTOFF_DAYS_S} ]; then
        return 255
    fi

    # $CERT_CMD and $1 are deliberately unquoted: 'alpha certs' has to
    # split into two words, and padding such as the trailing space of
    # 'apiserver ' has to be dropped before kubeadm sees the name.
    if kubeadm $CERT_CMD renew $1; then
        return 0
    fi
    return 1
}
#######################################
# Renew a certificate with openssl when it is due, signing it with the
# CA in /etc/etcd (ca.crt / ca.key).
# Globals:
#   TEMP_WORK_DIR (read) - existing directory for intermediate files
#   CUTOFF_DAYS_S (read) - renewal threshold in seconds
# Arguments:
#   $1 - directory holding the certificate and its key
#   $2 - base name; the files are $1/$2.crt and $1/$2.key
#   $3 - openssl request configuration, written to a file and used for
#        both the CSR and the v3_req extensions
# Returns:
#   0:   renewed successfully
#   255: no need to renew (also when $1/$2.crt does not exist)
#   1:   renewal failed
#######################################
renew_cert_by_openssl() {
    local cert_dir="$1"
    local cert_name="$2"
    local csr_config="$3"
    # Common prefix of every intermediate file of this certificate.
    local work="${TEMP_WORK_DIR}/${cert_name}"
    local remaining=""

    # A certificate that is not present is not managed here.
    [ -f "${cert_dir}/${cert_name}.crt" ] || return 255

    remaining=$(time_left_s_by_openssl "${cert_dir}/${cert_name}.crt")
    [ -n "${remaining}" ] || return 1

    # Operands stay unquoted so that a malformed value makes the test
    # itself fail, which is handled as "no need to renew".
    if ! [ ${remaining} -lt ${CUTOFF_DAYS_S} ]; then
        return 255
    fi

    # Each step runs only when all previous steps succeeded:
    # config file -> private key -> CSR -> signed certificate ->
    # install certificate -> install key.
    if echo "${csr_config}" > "${work}_csr.conf" \
        && openssl genpkey -out "${work}.key" -algorithm RSA -pkeyopt rsa_keygen_bits:4096 \
        && openssl req -new -key "${work}.key" -out "${work}.csr" -config "${work}_csr.conf" \
        && openssl x509 -req -in "${work}.csr" -CA /etc/etcd/ca.crt -CAkey /etc/etcd/ca.key -CAcreateserial \
            -out "${work}.crt" -days 365 -extensions v3_req -extfile "${work}_csr.conf" \
        && mv "${work}.crt" "${cert_dir}/${cert_name}.crt" \
        && mv "${work}.key" "${cert_dir}/${cert_name}.key"; then
        return 0
    fi
    return 1
}
#######################################
# Get cluster host floating IP address from the "server:" URL of a
# kubeconfig file.
# Arguments:
#   $1 - (optional) kubeconfig file to read; defaults to
#        /etc/kubernetes/admin.conf
# Outputs:
#   The host part of the first "server:" entry on stdout, with the
#   scheme, IPv6 brackets and a trailing ":6443" port removed. An empty
#   line is printed when no entry is found or the file cannot be read.
#######################################
get_cluster_host_floating_ip() {
    local admin_conf="${1:-/etc/kubernetes/admin.conf}"
    local floating_ip=""

    # Only the first "server:" line is used so that the result is always
    # a single address. The port is stripped at the end of the value
    # only, so an IPv6 address containing a "6443" group stays intact.
    floating_ip=$(awk -F"//" '/server:/ {print $2; exit}' "${admin_conf}" \
        | tr -d "[]" \
        | tr -d '[:space:]' \
        | sed -e 's/:6443$//')
    echo "${floating_ip}"
}
# Overall status: 0 = ok, 1 = a renewal or configuration step failed,
# 2 = restarting a component failed.
ERR=0
# Set to 1 when the matching component must be restarted afterwards.
RESTART_APISERVER=0
RESTART_CONTROLLER_MANAGER=0
RESTART_SCHEDULER=0
RESTART_SYSINV=0
RESTART_CERT_MON=0
RESTART_ETCD=0

# step 1, renew kubernetes certificates
# Every renewal is attempted only while no earlier step has failed.
# renew_cert returns 0 (renewed), 1 (failed) or 255 (nothing to do);
# any value other than 0 and 1 leaves the state untouched.

# Renew apiserver certificate
if [ ${ERR} -eq 0 ]; then
    # The extra space in 'apiserver ' is to distinguish other names with
    # apiserver in them.
    renew_cert 'apiserver '
    case $? in
        0) RESTART_APISERVER=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew apiserver kubelet client certificate
if [ ${ERR} -eq 0 ]; then
    renew_cert 'apiserver-kubelet-client'
    case $? in
        0) RESTART_APISERVER=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew front proxy client certificate (no restart is scheduled for it)
if [ ${ERR} -eq 0 ]; then
    renew_cert 'front-proxy-client'
    case $? in
        1) ERR=1 ;;
    esac
fi

# Renew certs in admin.conf
if [ ${ERR} -eq 0 ]; then
    renew_cert 'admin.conf'
    case $? in
        0)
            RESTART_SYSINV=1
            RESTART_CERT_MON=1
            ;;
        1) ERR=1 ;;
    esac
fi

# Renew certs in controller-manager.conf
if [ ${ERR} -eq 0 ]; then
    renew_cert 'controller-manager.conf'
    case $? in
        0) RESTART_CONTROLLER_MANAGER=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew certs in scheduler.conf
if [ ${ERR} -eq 0 ]; then
    renew_cert 'scheduler.conf'
    case $? in
        0) RESTART_SCHEDULER=1 ;;
        1) ERR=1 ;;
    esac
fi
# Create temporary working directory
# Private keys are written into this directory, so it must not live at a
# predictable path that another user could create beforehand. mktemp -d
# creates a new, uniquely named directory accessible only by its owner,
# and TEMP_WORK_DIR is pointed at it for the remainder of the script.
if [ "${ERR:-0}" -eq 0 ]; then
    new_work_dir=$(mktemp -d "${TEMP_WORK_DIR:-/tmp/kube_cert_rotation}.XXXXXX")
    if [ $? -eq 0 ] && [ -d "${new_work_dir}" ]; then
        TEMP_WORK_DIR="${new_work_dir}"
    else
        # Keep TEMP_WORK_DIR unchanged and flag the failure.
        ERR=1
    fi
    unset new_work_dir
fi
# Get cluster host floating IP address; an empty answer is an error
# because the address is placed in the certificate requests below.
if [ ${ERR} -eq 0 ]; then
    floating_ip=$(get_cluster_host_floating_ip)
    if [ -z "${floating_ip}" ]; then
        ERR=1
    fi
fi

# The three renewals below hand an openssl request configuration to
# renew_cert_by_openssl, which returns 0 (renewed), 1 (failed) or
# 255 (nothing to do).

# Renew apiserver-etcd-client certificate
if [ ${ERR} -eq 0 ]; then
    config="
[req]
prompt = no
x509_extensions = v3_req
distinguished_name = dn
[dn]
CN = apiserver-etcd-client
[v3_req]
keyUsage = critical, Digital Signature, Key Encipherment
extendedKeyUsage = TLS Web Server Authentication, TLS Web Client Authentication
subjectAltName = @alt_names
[alt_names]
IP.1 = ${floating_ip}
IP.2 = 127.0.0.1
"
    renew_cert_by_openssl "/etc/kubernetes/pki" "apiserver-etcd-client" "${config}"
    case $? in
        0) RESTART_APISERVER=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew etcd-server certificate
if [ ${ERR} -eq 0 ]; then
    config="
[req]
prompt = no
x509_extensions = v3_req
distinguished_name = dn
[dn]
CN = etcd-server
[v3_req]
keyUsage = critical, Digital Signature, Key Encipherment
extendedKeyUsage = TLS Web Server Authentication, TLS Web Client Authentication
subjectAltName = @alt_names
[alt_names]
IP.1 = ${floating_ip}
IP.2 = 127.0.0.1
"
    renew_cert_by_openssl "/etc/etcd" "etcd-server" "${config}"
    case $? in
        0) RESTART_ETCD=1 ;;
        1) ERR=1 ;;
    esac
fi

# Renew etcd-client certificate (no restart is scheduled for it)
if [ ${ERR} -eq 0 ]; then
    config="
[req]
prompt = no
x509_extensions = v3_req
distinguished_name = dn
[dn]
CN = root
[v3_req]
keyUsage = critical, Digital Signature, Key Encipherment
extendedKeyUsage = TLS Web Server Authentication, TLS Web Client Authentication
subjectAltName = @alt_names
[alt_names]
DNS.1 = root
"
    renew_cert_by_openssl "/etc/etcd" "etcd-client" "${config}"
    case $? in
        1) ERR=1 ;;
    esac
fi
# Remove temporary working directory (done whether or not step 1
# succeeded)
rm -rf ${TEMP_WORK_DIR}

# step 2, restart affected kubernetes components and system services
# A failed restart is recorded as ERR=2. For the crictl pipelines only
# the status of the final xargs stage decides success.

# Restart apiserver by stopping its container
if [ ${RESTART_APISERVER} -eq 1 ]; then
    crictl ps | awk '/kube-apiserver/{print$1}' | xargs crictl stop > /dev/null \
        || ERR=2
fi

# Restart controller-manager by stopping its container
if [ ${RESTART_CONTROLLER_MANAGER} -eq 1 ]; then
    crictl ps | awk '/kube-controller-manager/{print$1}' | xargs crictl stop > /dev/null \
        || ERR=2
fi

# Restart scheduler by stopping its container
if [ ${RESTART_SCHEDULER} -eq 1 ]; then
    crictl ps | awk '/kube-scheduler/{print$1}' | xargs crictl stop > /dev/null \
        || ERR=2
fi

# Restart sysinv services, both conductor and api, since both are using
# credentials from admin.conf. Command sm-restart-safe only restarts
# sysinv-conductor. Command sm-restart will restart sysinv-conductor
# and its dependencies, meaning all sysinv services.
if [ ${RESTART_SYSINV} -eq 1 ]; then
    sm-restart service sysinv-conductor || ERR=2
fi

# Restart cert-mon since it's using credentials from admin.conf
if [ ${RESTART_CERT_MON} -eq 1 ]; then
    sm-restart-safe service cert-mon || ERR=2
fi

# Restart etcd server
if [ ${RESTART_ETCD} -eq 1 ]; then
    sm-restart-safe service etcd || ERR=2
fi
# Report the outcome through alarm 250.003 of the fault manager client.
case "${ERR}" in
    2)
        # Notify admin to lock and unlock this master node if restart
        # k8s components failed
        /usr/local/bin/fmClientCli -c "### ###250.003###set###host###host=${HOSTNAME}### ###major###Kubernetes certificates have been renewed but not all services have been updated.###operational-violation### ###Lock and unlock the host to update services with new certificates (Manually renew kubernetes certificates first if renewal failed).### ### ###"
        ;;
    1)
        # Notify admin to renew kube cert manually and restart services
        # by lock/unlock if cert renew or config failed
        /usr/local/bin/fmClientCli -c "### ###250.003###set###host###host=${HOSTNAME}### ###major###Kubernetes certificates renewal failed.###operational-violation### ###Lock and unlock the host to update services with new certificates (Manually renew kubernetes certificates first if renewal failed).### ### ###"
        ;;
    *)
        # Clear the alarm if cert rotation completed
        # Check if alarm exist first before deleting. fmClientCli -A
        # returns 0 when found and 255 when not found
        if /usr/local/bin/fmClientCli -A "250.003" &> /dev/null; then
            /usr/local/bin/fmClientCli -d "###250.003###host=${HOSTNAME}###"
        fi
        ;;
esac