Merge "Recover subcloud certs after long period offline"

This commit is contained in:
Zuul
2023-08-04 18:49:26 +00:00
committed by Gerrit Code Review
5 changed files with 439 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASK DESCRIPTION:
# These tasks copy a file from the system controller to the subcloud.
# They allow copying between two directories that only root can access.
#
# Copies the file at `item.path` on the system controller to the same path on
# the subcloud, staging it through a temporary file on the system controller.
# Expects the including task's loop to provide `item.path` and, optionally,
# `item.perms` (file mode for the destination).
- block:
    - name: Create tmp file to facilitate copying between root only access dirs
      tempfile:
        state: file
        prefix: tmp_cert_copy_
        path: /tmp/
      register: tmp_cert_file
      # The staging file is written by the next task, read as the `src` of the
      # upload and deleted in the `always` section, all on the system
      # controller. Create it there as well, so no unused temp file is left
      # behind on the subcloud.
      connection: local

    - name: Copy file from source to temporary location first
      copy:
        src: "{{ item.path }}"
        dest: "{{ tmp_cert_file.path }}"
        # Both src and dest live on the host this task runs on (local).
        remote_src: true
      become: true
      connection: local

    - name: Copy from temporary location to remote
      copy:
        src: "{{ tmp_cert_file.path }}"
        dest: "{{ item.path }}"
        owner: root
        group: root
        # The default is quoted: a bare 0644 inside the Jinja expression is an
        # integer literal, not the octal mode string the copy module expects.
        mode: "{{ item.perms | default('0644') }}"
      become: true

  always:
    - name: Delete temporary file after use
      file:
        path: "{{ tmp_cert_file.path }}"
        state: absent
      connection: local
      become: true
      # Best-effort cleanup: never fail the copy because of it.
      failed_when: false

View File

@@ -0,0 +1,18 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# ROLE DESCRIPTION:
# This role recovers the certificates of a subcloud that has been offline
# for a long period of time
#
# The three task files below are imported statically and run in the order
# listed.

# Stage 1: Kubernetes, ETCD and FrontProxy root CAs.
- name: Recover K8s Root CA certificates (K8s Root CA, ETCD CA, FrontProxy CA)
  import_tasks: recover-k8s-root-cas.yml

# Stage 2: Kubernetes leaf certificates.
- name: Renew K8s leaf certificates
  import_tasks: recover-k8s-leaf-certificates.yml

# Stage 3: DC admin endpoint certificate chain.
- name: Recover dc admin endpoint Root CA, subcloud ICA and leaf certificates
  import_tasks: recover-dc-admin-ep-certificate-chain.yml

View File

@@ -0,0 +1,238 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASK DESCRIPTION:
# These tasks create a new ICA for the subcloud and update both the
# dc-adminep-root-ca and the new ICA in the subcloud to
# create a new valid chain
#
# Compares the subcloud's DC admin endpoint chain (root CA file + ICA secret)
# against the root CA held by the system controller and rebuilds the chain
# when `openssl verify` fails. Tasks marked `connection: local` run on the
# system controller; all others run on the subcloud. The `always` section
# removes every temporary file created along the way.
# Expects the variable `name` to be provided by the caller; it is used to
# build the `<name>-adminep-ca-certificate` resource names.
- block:
    - name: Retrieve DC admin endpoint Root CA from k8s secret
      command: >-
        kubectl get secret dc-adminep-root-ca-certificate
        -n dc-cert -o jsonpath='{.data.ca\.crt}'
      environment:
        KUBECONFIG: "/etc/kubernetes/admin.conf"
      register: dc_root_ca_cert_result
      connection: local

    - name: Get software version
      shell: source /etc/platform/platform.conf; echo $sw_version
      args:
        # `source` is a bash builtin; do not depend on what /bin/sh points to.
        executable: /bin/bash
      register: sw_version_result

    - name: Set location of dc-adminep-root-ca.crt certificate on subcloud
      set_fact:
        dc_adminep_root_ca_location: >-
          /opt/platform/config/{{ sw_version_result.stdout }}/dc-adminep-root-ca.crt

    - name: Verify if dc-adminep-root-ca.crt certificate is expired on subcloud
      command: openssl x509 -in {{ dc_adminep_root_ca_location }} -checkend 0
      become: true
      register: adminep_root_ca_info
      # A non-zero rc is a result to act on below, not a task failure.
      failed_when: false

    - name: Get dc-adminep-root-ca.crt certificate from subcloud
      shell: cat {{ dc_adminep_root_ca_location }} | base64 -w0
      become: true
      register: adminep_root_ca_from_subcloud

    # Replace the subcloud copy when the expiry check did not pass or when it
    # differs from the base64 data read from the system controller secret.
    - name: Copy dc-adminep-root-ca.crt from system controller to subcloud
      copy:
        content: "{{ dc_root_ca_cert_result.stdout | b64decode }}"
        dest: "{{ dc_adminep_root_ca_location }}"
      become: true
      when: >-
        adminep_root_ca_info.rc != 0 or
        dc_root_ca_cert_result.stdout != adminep_root_ca_from_subcloud.stdout

    - name: Create tmp file name for current dc-adminep ICA certificate on subcloud
      tempfile:
        state: file
        prefix: tmp_subcloud_ica_subcloud_
        suffix: .pem
        path: /tmp/
      register: tmp_subcloud_ica_subcloud

    - name: Retrieve current dc-adminep ICA cert from k8s secret on subcloud
      command: >-
        kubectl get secret sc-adminep-ca-certificate -n sc-cert -o jsonpath='{.data.tls\.crt}'
      environment:
        KUBECONFIG: "/etc/kubernetes/admin.conf"
      register: current_subcloud_ica_cert_result
      retries: 3
      delay: 5
      until: current_subcloud_ica_cert_result is not failed

    - name: Save current dc-adminep ICA cert to file
      copy:
        content: "{{ current_subcloud_ica_cert_result.stdout | b64decode }}"
        dest: "{{ tmp_subcloud_ica_subcloud.path }}"

    - name: Verify dc-admin-ep certificate chain is correct on subcloud
      command: >-
        openssl verify -CAfile
        {{ dc_adminep_root_ca_location }} {{ tmp_subcloud_ica_subcloud.path }}
      register: ca_verification
      # rc decides whether the recovery block below runs.
      failed_when: false
      become: true

    - name: Recover the invalid certificate chain for dc-admin-ep certificates on subcloud
      when: ca_verification.rc != 0
      block:
        - name: Verify if subcloud ICA certificate exists
          command: kubectl -n dc-cert get secret {{ name }}-adminep-ca-certificate
          register: subcloud_ica_cert_exists
          failed_when: false
          environment:
            KUBECONFIG: "/etc/kubernetes/admin.conf"
          connection: local

        - name: Create dc-admin-ep ICA certificate for subcloud if missing
          when: subcloud_ica_cert_exists.rc != 0
          block:
            - name: Create tmp file name for subcloud ICA certificate template
              tempfile:
                state: file
                prefix: tmp_subcloud_cert_template_
                suffix: .j2
                path: /tmp/
              register: tmp_subcloud_ica_template
              connection: local

            - name: Save certificate template to file {{ tmp_subcloud_ica_template.path }}
              vars:
                # cert-manager Certificate manifest for the subcloud ICA.
                template_content: |
                  apiVersion: cert-manager.io/v1
                  kind: Certificate
                  metadata:
                    creationTimestamp: null
                    name: {{ name }}-adminep-ca-certificate
                    namespace: dc-cert
                  spec:
                    isCA: true
                    commonName: {{ name }}
                    duration: 8760h0m0s # 1 year
                    issuerRef:
                      kind: Issuer
                      name: dc-adminep-root-ca-issuer
                    renewBefore: 720h0m0s # 30 days
                    secretName: {{ name }}-adminep-ca-certificate
              copy:
                dest: "{{ tmp_subcloud_ica_template.path }}"
                content: "{{ template_content }}"
                mode: "0640"
              connection: local

            # The task result is the exit status of the last command (the
            # secret deletion), which is what the retry loop reacts to.
            # NOTE(review): a failing `kubectl apply` is therefore not detected
            # on its own - confirm this is intended.
            - name: Apply ICA certificate for subcloud in k8s in system controller
              shell: |
                kubectl apply -f "{{ tmp_subcloud_ica_template.path }}"
                kubectl -n dc-cert delete secret {{ name }}-adminep-ca-certificate
              environment:
                KUBECONFIG: "/etc/kubernetes/admin.conf"
              register: apply_cert_result
              connection: local
              retries: 5
              delay: 5
              until: apply_cert_result is not failed

        - name: Retrieve subcloud ICA certificate data from from k8s secret
          command: >-
            kubectl get secret {{ name }}-adminep-ca-certificate
            -n dc-cert -o jsonpath='{.data.tls\.crt}'
          environment:
            KUBECONFIG: "/etc/kubernetes/admin.conf"
          register: subcloud_ica_cert_result
          connection: local
          retries: 5
          delay: 5
          until: subcloud_ica_cert_result is not failed

        - name: Retrieve subcloud ICA certificate key from from k8s secret
          command: >-
            kubectl get secret {{ name }}-adminep-ca-certificate
            -n dc-cert -o jsonpath='{.data.tls\.key}'
          environment:
            KUBECONFIG: "/etc/kubernetes/admin.conf"
          register: subcloud_ica_key_result
          connection: local
          retries: 5
          delay: 5
          until: subcloud_ica_key_result is not failed
          # stdout holds the base64-encoded private key; keep it out of the
          # Ansible output and logs.
          no_log: true

        - name: Create tmp file name for the ICA certificate
          tempfile:
            state: file
            prefix: tmp_subcloud_cert_crt
            suffix: .crt
            path: /tmp/
          register: tmp_subcloud_ica_crt

        - name: Create tmp file name for the ICA key
          tempfile:
            state: file
            prefix: tmp_subcloud_cert_key
            suffix: .key
            path: /tmp/
          register: tmp_subcloud_ica_key

        - name: Save the ICA certificate to file
          copy:
            content: "{{ subcloud_ica_cert_result.stdout | b64decode }}"
            dest: "{{ tmp_subcloud_ica_crt.path }}"

        - name: Save the ICA key to file
          copy:
            content: "{{ subcloud_ica_key_result.stdout | b64decode }}"
            dest: "{{ tmp_subcloud_ica_key.path }}"
          # The module arguments contain the decoded private key.
          no_log: true

        # Replaces the ICA secret with the one fetched above and deletes the
        # sc-adminep-certificate secret.
        - name: Update ICA and certificate chain on subcloud
          shell: |
            kubectl -n sc-cert delete secret sc-adminep-ca-certificate --ignore-not-found=true
            kubectl -n sc-cert create secret tls sc-adminep-ca-certificate \
            --cert "{{ tmp_subcloud_ica_crt.path }}" --key "{{ tmp_subcloud_ica_key.path }}"
            kubectl -n sc-cert delete secret sc-adminep-certificate --ignore-not-found=true
          environment:
            KUBECONFIG: "/etc/kubernetes/admin.conf"
          register: subcloud_ica_update_result
          retries: 5
          delay: 5
          until: subcloud_ica_update_result is not failed

        # `grep -c` prints the number of matching lines; success is decided by
        # that count and by stderr being empty, not by the exit status.
        - name: Wait until management affecting alarms clear up
          shell: source /etc/platform/openrc; fm alarm-list --mgmt_affecting | grep -c True
          args:
            # `source` is a bash builtin; do not depend on what /bin/sh points to.
            executable: /bin/bash
          register: mgmt_affecting_present
          failed_when: >-
            mgmt_affecting_present.stdout | int != 0 or
            mgmt_affecting_present.stderr | length > 0
          # Up to 10 mins to have some buffer, but usually takes ~ 3 mins.
          retries: 30
          delay: 20
          until:
            - mgmt_affecting_present.stdout | int == 0
            - mgmt_affecting_present.stderr | length == 0

        - name: Set fact to mark that DC admin endpoint certificate chain recovery was performed
          set_fact:
            subcloud_dc_admin_ep_cert_chain_recovered: true

  always:
    - name: Delete temporary file on systemcontroller
      file:
        path: "{{ tmp_subcloud_ica_template.path }}"
        state: absent
      connection: local
      # The template file only exists when the ICA had to be created.
      when: tmp_subcloud_ica_template.path is defined

    - name: Delete temporary files on subcloud
      file:
        path: "{{ item }}"
        state: absent
      # Temp files that were never created resolve to an empty string and are
      # skipped instead of being handed to the file module as an empty path.
      loop:
        - "{{ tmp_subcloud_ica_crt.path if tmp_subcloud_ica_crt.path is defined else '' }}"
        - "{{ tmp_subcloud_ica_key.path if tmp_subcloud_ica_key.path is defined else '' }}"
        - "{{ tmp_subcloud_ica_subcloud.path if tmp_subcloud_ica_subcloud.path is defined else '' }}"
      when: item | length > 0

View File

@@ -0,0 +1,85 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASK DESCRIPTION:
# These tasks recover the kubernetes leaf certificates after expiry
# and verify the k8s cluster is healthy afterwards
#
# Exit status of this script drives the recovery block below:
#   0 -> nothing reported as expired, recovery is skipped
#   non-zero -> a leaf certificate or the kubelet client certificate is expired
# NOTE(review): when kubeadm itself exits non-zero the `if` body is skipped and
# the script ends with status 0, so recovery is skipped too - confirm that this
# is the intended behavior.
- name: Verify k8s leaf certificates expiration
  shell: |
    K8_OUT=$(kubeadm certs check-expiration --config /etc/kubernetes/kubeadm.yaml)
    if [ "$?" -eq "0" ]; then
      echo "$K8_OUT" | grep invalid
      # if no matches for 'invalid', it means no k8s leaf certificate is expired
      # proceed to check the kubelet certificate
      if [ "$?" -ne "0" ]; then
        # checkend will return 0 if valid and 1 for invalid
        openssl x509 -in /var/lib/kubelet/pki/kubelet-client-current.pem -checkend 0
        exit $?
      else
        exit 1
      fi
    fi
  register: k8s_certs_expiration
  # The rc is evaluated by the block below; it is not a task failure.
  failed_when: false
  become: true

- name: Recover K8s cluster after certificate expiration
  when: k8s_certs_expiration.rc != 0
  block:
    - name: Recover k8s controller plane leaf certificates
      command: bash -x /usr/bin/kube-cert-rotation.sh
      register: kube_cert_rotation_out
      become: true
      retries: 5
      delay: 10
      until: kube_cert_rotation_out is not failed

    - name: Recover kubelet certificates
      command: bash -x /usr/bin/kube-expired-kubelet-cert-recovery.sh
      register: kubelet_cert_rotation_out
      become: true
      retries: 5
      delay: 10
      until: kubelet_cert_rotation_out is not failed

    - name: Wait till kubectl starts replying
      command: kubectl get nodes
      environment:
        KUBECONFIG: "/etc/kubernetes/admin.conf"
      register: k8s_get_nodes
      # Waits up to 2 minutes, but in a freshly installed system, it takes ~ 20secs
      retries: 12
      delay: 10
      until: k8s_get_nodes is not failed

    # awk turns every "<namespace> <kind/name>" row into the five words
    # "rollout restart <kind/name> -n <namespace>", which xargs passes to
    # kubectl one resource at a time.
    - name: Trigger a restart of every pod (deployment,statefulset,daemonset rollout)
      shell: >-
        kubectl get deployment,statefulset,daemonset -A --no-headers |
        awk '{ print " rollout restart " $2 " -n " $1}' | xargs -n5 kubectl
      environment:
        KUBECONFIG: "/etc/kubernetes/admin.conf"

    # Prints the number of pods whose READY column contains no "true".
    # The kubectl exit status is checked explicitly: in a plain pipeline a
    # failed kubectl yields empty output, grep counts 0 lines and the wait
    # would end as if every pod were ready. A literal block is used so that the
    # backslash line continuations reach the shell unchanged.
    - name: Wait pods to restart (become READY)
      shell: |
        pods=$(kubectl get pods -l '!job-name' -A --no-headers \
          -o 'custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[*].ready') \
          || exit 2
        printf '%s' "$pods" | grep -cv true
        # grep exits 1 when the count is 0; the count itself is the result.
        exit 0
      environment:
        KUBECONFIG: "/etc/kubernetes/admin.conf"
      register: pods_starting
      failed_when: pods_starting.rc != 0 or pods_starting.stdout | int != 0
      # Waits for up to one hour. This is the same value used in k8s-rootca-update
      # A large number is used because the recovery time depends on the number of pods running.
      # In a freshly installed system, without additional pods, it takes ~ 1min 10secs
      retries: 360
      delay: 10
      until:
        - pods_starting.rc == 0
        - pods_starting.stdout | int == 0

    - name: Set fact to mark that K8s leaf certificates recovery was performed
      set_fact:
        subcloud_k8s_leaf_certs_recovered: true

View File

@@ -0,0 +1,55 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASK DESCRIPTION:
# These tasks recover the K8s Root CA certificates (K8s Root CA, ETCD CA, FrontProxy CA)
#
# Checks the three root CA certificates on the subcloud and, for each one that
# is expired, copies the certificate and key found at the same paths on the
# system controller (via copy-file-from-local-to-remote.yml, which reads
# `item.path` and the optional `item.perms`).
- name: Verify if Kubernetes Root CA is expired
  community.crypto.x509_certificate_info:
    path: /etc/kubernetes/pki/ca.crt
  register: k8s_root_ca_info
  become: true

- name: Verify if ETCD Root CA is expired
  community.crypto.x509_certificate_info:
    path: /etc/etcd/ca.crt
  register: etcd_root_ca_info
  become: true

- name: Verify if FrontProxy Root CA is expired
  community.crypto.x509_certificate_info:
    path: /etc/kubernetes/pki/front-proxy-ca.crt
  register: frontproxy_ca_info
  become: true

# NOTE(review): entries without `perms` get the default mode of the included
# task file (0644), which also applies to the *.key files - confirm that this
# is the intended mode for private keys.
- name: Copy K8s Root CA cert and key from systemcontroller to the subcloud
  include_tasks: copy-file-from-local-to-remote.yml
  loop:
    - path: /etc/kubernetes/pki/ca.crt
    - path: /etc/kubernetes/pki/ca.key
  when: k8s_root_ca_info.expired

- name: Copy ETCD Root CA cert and key from systemcontroller to the subcloud
  include_tasks: copy-file-from-local-to-remote.yml
  loop:
    - path: /etc/etcd/ca.crt
      perms: '0755'
    - path: /etc/etcd/ca.key
  when: etcd_root_ca_info.expired

- name: Copy FrontProxy Root CA cert and key from systemcontroller to the subcloud
  include_tasks: copy-file-from-local-to-remote.yml
  loop:
    - path: /etc/kubernetes/pki/front-proxy-ca.crt
    - path: /etc/kubernetes/pki/front-proxy-ca.key
  when: frontproxy_ca_info.expired

# Each CA above is copied independently, so the recovery marker is set as soon
# as any one of them was expired (a list of conditions would require all three
# to be expired at once).
- name: Set fact to mark that K8s Root CAs recovery was performed
  set_fact:
    subcloud_k8s_root_ca_recovered: true
  when: >-
    k8s_root_ca_info.expired or
    etcd_root_ca_info.expired or
    frontproxy_ca_info.expired