Remove bootstrap roles from optimized restore

Optimized restore relied on bootstrap roles for several key parts during
development. Now that optimized restore is stable, the bootstrap roles can
be removed to further improve performance.

TEST PLAN
PASS: Optimized B&R on AIO-SX
* With registry filesystem
* Without registry filesystem
* With wipe_ceph_osds=true
* With wipe_ceph_osds=false
* With static policy (https://review.opendev.org/c/starlingx/ansible-playbooks/+/890370)
* Without ceph configured
PASS: Optimized B&R on AIO-SX subcloud
* With and without prestaged data
PASS: Optimized B&R after stx6 to stx8 upgrade
* Done without Kubernetes upgrade after stx upgrade
* With and without registry filesystem
PASS: Optimized B&R on subcloud after stx6 to stx8 upgrade
* Done without Kubernetes upgrade after stx upgrade
* With and without prestaged data
PASS: Optimized upgrade on AIO-SX, stx6 to stx8
* Subcloud upgrade
PASS: AIO-SX bootstrap
PASS: AIO-DX bootstrap

Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/893190
Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/889902
Story: 2010798
Task: 48266
Signed-off-by: Joshua Kraitberg <joshua.kraitberg@windriver.com>
Change-Id: I11640bc7a899bc9428d8d800321a5353cc31a27b
Joshua Kraitberg 2023-08-15 13:38:46 -04:00
parent 748668ad2a
commit ceeccb1dcd
17 changed files with 301 additions and 152 deletions

View File

@@ -1,6 +1,6 @@
---
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -28,8 +28,7 @@
- backup-restore/prepare-env
- optimized-restore/prepare-env
- restore-platform/prepare-env
- bootstrap/prepare-env
- { role: bootstrap/validate-config, become: yes }
- { role: common/wipe-ceph-osds, become: yes }
- { role: optimized-restore/apply-manifest, become: yes }
- { role: optimized-restore/restore-configuration, become: yes }
- { role: optimized-restore/restore-data, become: yes }

View File

@@ -114,6 +114,8 @@
command: >-
sed -i 's/controller-0/controller-1/'
{{ldap_schema_path}}/cn=config/olcDatabase={1}mdb.ldif
args:
warn: false
- name: Restore ldap
command: "slapadd -F {{ ldap_schema_path }} -l {{ staging_dir }}/ldap.db"

View File

@@ -673,68 +673,8 @@
msg: "additional_local_registry_images must be a list"
when: additional_local_registry_images | type_debug != 'list'
# Docker images archive source validation
- block:
- set_fact:
images_archive_md5_file: "{{ images_archive_dir }}/container-image.tar.gz.md5"
- name: Check if images archive(s) exists
find:
paths: "{{ images_archive_dir }}"
patterns: "*.tar.gz"
recurse: no
register: images_archive_find_output
- debug: var=images_archive_find_output.files
- set_fact:
num_of_archive_files_on_disk: "{{ images_archive_find_output.files|length }}"
- block:
- name: Check if images archive md5 exists
stat:
path: "{{ images_archive_md5_file }}"
register: images_archive_md5
- block:
- name: Get number of archive files in md5 file
shell: cat {{ images_archive_md5_file }} | wc -l
register: file_count
- name: Print warning if md5 file content is invalid
debug:
msg: >-
WARNING: Number of archive files listed in {{ images_archive_md5_file }}
does not match with the number of image archive files on disk. Fall
back to downloading images...
when: file_count.stdout != num_of_archive_files_on_disk
- block:
- name: Verify container images archive file checksum
command: md5sum -c {{ images_archive_md5_file }}
args:
chdir: "{{ images_archive_dir }}"
register: checksum_result
failed_when: false
- debug: var=checksum_result
- name: Print warning if images archive checksum failed
debug:
msg: >-
WARNING: Images archive checksum failed. Fall back to downloading
images...
when: checksum_result.rc != 0
- name: Turn on images archive flag if file checksum is successfully validated
set_fact:
images_archive_exists: true
images_archive_files: "{{ images_archive_find_output.files }}"
when: checksum_result.rc == 0
when: file_count.stdout == num_of_archive_files_on_disk
when: images_archive_md5.stat.exists
when: num_of_archive_files_on_disk|int > 0
- import_role:
name: common/validate-image-archives
# System applications validation
- name: Validate applications
@@ -902,27 +842,8 @@
(item.readOnly | type_debug != 'bool')
loop: "{{ apiserver_extra_volumes + controllermanager_extra_volumes + scheduler_extra_volumes }}"
# Wipe ceph osds
# Note that due to Ansible mishandling of boolean values via extra-vars we are
# adding supplementary validation here.
# See: https://github.com/ansible/ansible/issues/17193
- name: Check for Ceph data wipe flag
fail:
msg: "wipe_ceph_osds is misconfigured. Valid value is either 'true' or 'false'"
when: (mode == "restore") and (not wipe_ceph_osds | type_debug == 'bool') and
(wipe_ceph_osds != 'true') and
(wipe_ceph_osds != 'false')
- block:
- name: Wipe ceph osds
script: wipe_osds.sh
register: results
- name: Result of wiping ceph osds
debug: var=results.stdout_lines
when: (mode == "bootstrap") or
(mode == "restore" and wipe_ceph_osds|bool)
- import_role:
name: common/wipe-ceph-osds
# bootstrap_config ini file generation
- block:

View File

@@ -34,8 +34,3 @@
- { name: 'registry.k8s.io', value: "{{ registryk8s_registry }}" }
- { name: 'icr.io', value: "{{ icr_registry }}" }
no_log: true
- name: Infer missing registries values during upgrades
import_tasks:
file: get_missing_docker_registries.yml
when: upgrade_in_progress|default(false)

View File

@@ -32,10 +32,16 @@
when: mode is regex("^upgrade_") or
mode == 'trident_install'
# During a restore/upgrade docker registry information will not be in facts.
# Instead this information will be pulled from the platform backup.
# Obtaining the values from the backup is significantly faster than getting them
# from sysinv and barbican.
- name: Retrieve configured docker registries during upgrades
import_tasks:
file: get_docker_registries.yml
when: upgrade_in_progress|default(false)
file: restore_docker_registries.yml
when:
- mode | default(none) == 'restore'
- restore_mode | default(none) == 'optimized'
# Disable the log to not expose registry password
- name: Get registry credentials if registry type is AWS ECR

View File

@@ -0,0 +1,34 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASKS DESCRIPTION:
# Restore docker registries during restore or upgrade.
# During the backup, the registry information was dumped into the
# override file.
- name: Read docker registries from backup
slurp:
src: "{{ target_backup_dir }}/{{ override_filename }}"
register: restore_overrides
no_log: true
- name: Set docker registries facts
set_fact:
registries:
docker_registries: "{{ (restore_overrides.content | b64decode | from_yaml)['docker_registries'] }}"
no_log: true
# TODO(jkraitbe): Remove after reworking how role uses facts to log in/out of registries
- name: Create single var registry facts
set_fact:
"{{ registry_to_fact[item[0]] }}": "{{ item[1] }}"
loop: "{{ registries['docker_registries'].items() }}"
no_log: true
- name: Infer missing registries values during upgrades
import_tasks:
file: get_missing_docker_registries.yml
when: upgrade_in_progress|default(false)
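
For context, the slurp-and-parse pattern above can be exercised on its own; a minimal sketch, assuming a local /tmp/overrides.yml containing a top-level docker_registries mapping (not the real backup layout):

---
- hosts: localhost
  tasks:
    - name: Read the override file without shelling out to cat
      slurp:
        src: /tmp/overrides.yml
      register: override_raw

    - name: Decode the base64 payload and parse it as YAML
      set_fact:
        docker_registries: "{{ (override_raw.content | b64decode | from_yaml)['docker_registries'] }}"

    - name: Show what was recovered
      debug:
        var: docker_registries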

View File

@@ -13,5 +13,11 @@ registry_to_service:
registry.k8s.io: registryk8s-registry
registry_to_fact:
docker.elastic.co: elastic_registry
docker.io: docker_registry
gcr.io: gcr_registry
ghcr.io: ghcr_registry
icr.io: icr_registry
k8s.gcr.io: k8s_registry
quay.io: quay_registry
registry.k8s.io: registryk8s_registry
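
The map above feeds the per-registry set_fact fan-out in restore_docker_registries.yml. The same idiom written with the dict2items filter, as a sketch that reuses the registries fact set there:

- name: Turn each mapped registry entry into its own named fact
  set_fact:
    "{{ registry_to_fact[item.key] }}": "{{ item.value }}"
  loop: "{{ registries['docker_registries'] | dict2items }}"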

View File

@@ -0,0 +1,72 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASKS DESCRIPTION:
# Validate image archives that are present on system.
#
# Docker images archive source validation
- block:
- set_fact:
images_archive_md5_file: "{{ images_archive_dir }}/container-image.tar.gz.md5"
- name: Check if images archive(s) exists
find:
paths: "{{ images_archive_dir }}"
patterns: "*.tar.gz"
recurse: no
register: images_archive_find_output
- debug: var=images_archive_find_output.files
- set_fact:
num_of_archive_files_on_disk: "{{ images_archive_find_output.files|length }}"
- block:
- name: Check if images archive md5 exists
stat:
path: "{{ images_archive_md5_file }}"
register: images_archive_md5
- block:
- name: Get number of archive files in md5 file
shell: cat {{ images_archive_md5_file }} | wc -l
register: file_count
- name: Print warning if md5 file content is invalid
debug:
msg: >-
WARNING: Number of archive files listed in {{ images_archive_md5_file }}
does not match with the number of image archive files on disk. Fall
back to downloading images...
when: file_count.stdout != num_of_archive_files_on_disk
- block:
- name: Verify container images archive file checksum
command: md5sum -c {{ images_archive_md5_file }}
args:
chdir: "{{ images_archive_dir }}"
register: checksum_result
failed_when: false
- debug: var=checksum_result
- name: Print warning if images archive checksum failed
debug:
msg: >-
WARNING: Images archive checksum failed. Fall back to downloading
images...
when: checksum_result.rc != 0
- name: Turn on images archive flag if file checksum is successfully validated
set_fact:
images_archive_exists: true
images_archive_files: "{{ images_archive_find_output.files }}"
when: checksum_result.rc == 0
when: file_count.stdout == num_of_archive_files_on_disk
when: images_archive_md5.stat.exists
when: num_of_archive_files_on_disk|int > 0
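
The .md5 manifest this role verifies has to list one line per archive for the count check above to pass. One plausible way to produce it at backup time (a sketch, not the actual backup role):

- name: Generate a checksum manifest covering every image archive
  shell: md5sum *.tar.gz > container-image.tar.gz.md5
  args:
    chdir: "{{ images_archive_dir }}"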

View File

@@ -0,0 +1,33 @@
---
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASK DESCRIPTION:
# This will wipe any Ceph OSDs during a bootstrap or optionally
# wipe them during restore.
#
# TODO(jkraitbe): Consider moving this check to happen earlier
# Wipe ceph osds
# Note that due to Ansible mishandling of boolean values via extra-vars we are
# adding supplementary validation here.
# See: https://github.com/ansible/ansible/issues/17193
- name: Check for Ceph data wipe flag
fail:
msg: "wipe_ceph_osds is misconfigured. Valid value is either 'true' or 'false'"
when: (mode == "restore") and (not wipe_ceph_osds | type_debug == 'bool') and
(wipe_ceph_osds != 'true') and
(wipe_ceph_osds != 'false')
- block:
- name: Wipe ceph osds
script: wipe_osds.sh
register: results
- name: Result of wiping ceph osds
debug: var=results.stdout_lines
when: (mode == "bootstrap") or
(mode == "restore" and wipe_ceph_osds|bool)

View File

@@ -0,0 +1,5 @@
---
# Changing this value will only impact restore.
# If set to true, OSDs will be wiped during a restore.
# OSDs are always wiped during bootstrap.
wipe_ceph_osds: false

View File

@@ -7,6 +7,34 @@
# ROLE DESCRIPTION:
# Prepare environment for optimized restore.
- name: Set config path facts
set_fact:
keyring_permdir: "{{ platform_path + '/.keyring/' + software_version }}"
config_permdir: "{{ platform_path + '/config/' + software_version }}"
sysinv_permdir: "{{ platform_path + '/sysinv/' + software_version }}"
puppet_permdir: "{{ platform_path + '/puppet/' + software_version }}"
images_archive_dir: "/opt/platform-backup/{{ software_version }}"
- name: Set config path facts for restore
set_fact:
branding_permdir: "{{ config_permdir }}/branding"
banner_permdir: "{{ config_permdir }}/banner/etc"
ssh_config_permdir: "{{ config_permdir }}/ssh_config"
pxe_config_permdir: "{{ config_permdir }}/pxelinux.cfg"
fluxcd_permdir: "{{ platform_path }}/fluxcd/"
helm_overrides_permdir: "{{ platform_path + '/helm/' + software_version }}"
sysinv_config_permdir: "{{ platform_path + '/sysinv/' + software_version }}"
- name: Check if bootstrap_finalized flag exists on host
stat:
path: "{{ config_permdir }}/.bootstrap_finalized"
register: bootstrap_finalized_flag
- name: Fail if host is unlocked or host configurations have already started
fail:
msg: Restore cannot be performed on deployed host, reinstall the system before trying again.
when: bootstrap_finalized_flag.stat.exists
- name: Set restore file parameter
set_fact:
restore_data_file: "{{ target_backup_dir }}/{{ backup_filename }}"
@@ -47,3 +75,8 @@
previous_software_version: "{{ software_version }}"
when: not upgrade_in_progress
- name: Check if the prestaged registry filesystem exists
stat:
path: "{{ images_archive_dir }}/local_registry_filesystem.tgz"
register: prestage_registry_filesystem

View File

@@ -20,6 +20,7 @@ restore_items:
- etc/haproxy
- etc/hosts
- etc/keystone
- etc/kubernetes
- etc/lighttpd
- etc/mtc
- etc/mtc.conf
@@ -51,6 +52,7 @@ restore_exclude_items:
upgrade_exclude_items:
- etc/group
- etc/group-
- etc/kubernetes
- etc/passwd
- etc/passwd-
- etc/postgresql

View File

@@ -21,7 +21,7 @@
- name: Restore configuration files
command: >-
tar --use-compress-program=pigz -C / -xpf {{ platform_backup_fqpn }} --overwrite
tar --use-compress-program=pigz -C / -xvpf {{ platform_backup_fqpn }} --overwrite
{{ ' '.join(restore_items) }}
{% for v in restore_exclude_items %}
--exclude {{ v | quote }}
@@ -86,6 +86,10 @@
when: previous_software_version != '21.12'
- import_role:
name: roles/bootstrap/prepare-env
tasks_from: restore_prep_tasks.yml
- name: Migrate files and directories during upgrade
block:
- name: Migrate files to latest release
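
The -v added to the tar flags just logs each path as it is restored; --use-compress-program=pigz keeps the existing behavior of delegating (de)compression to pigz, which is generally faster than plain gzip. Reduced to a standalone task, with a hypothetical archive path and item list:

- name: Extract selected paths from a pigz-compressed backup (sketch)
  command: >-
    tar --use-compress-program=pigz -C / -xvpf /opt/backup/platform_backup.tgz
    --overwrite etc/hosts etc/mtc --exclude etc/mtc/tmp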

View File

@@ -11,3 +11,8 @@ remove_containerd_pods_flag: /etc/platform/.remove_containerd_pods
# If set, all images that were present in the local cache from registry.local
# will be redownloaded during restore playbook.
restore_original_images: true
# Kubernetes bringup
kubelet_vol_plugin_dir: /var/opt/libexec/kubernetes/kubelet-plugins/volume/exec/
kubelet_pmond_template: /usr/share/puppet/modules/platform/templates/kubelet-pmond-conf.erb
kubelet_override_template: /usr/share/puppet/modules/platform/templates/kube-stx-override.conf.erb

View File

@@ -38,6 +38,8 @@
- name: Disable local registry authentication
command: "sed -i '/auth:/,$d' /etc/docker-distribution/registry/config.yml"
args:
warn: false
- name: Restart docker services
systemd:
@@ -45,3 +47,7 @@
state: restarted
enabled: true
loop: "{{ docker_services }}"
# Docker images archive source validation
- import_role:
name: common/validate-image-archives

View File

@@ -98,71 +98,97 @@
import_role:
name: common/push-docker-images
- name: Check if var/lib/kublet is present in the backup
command:
cmd: "tar --use-compress-program=pigz -tf '{{ platform_backup_fqpn }}' var/lib/kubelet"
args:
warn: false
failed_when: false
register: kubelet_backup_found
- name: Restore path
block:
- name: Check if var/lib/kubelet is present in the backup
command:
cmd: "tar --use-compress-program=pigz -tf '{{ platform_backup_fqpn }}' var/lib/kubelet"
args:
warn: false
failed_when: false
register: kubelet_backup_found
- name: Restore var/lib/kublet from the backup
command:
# In the case where the backed-up CPU manager policy is 'static', the
# state preserved in "var/lib/kubelet/cpu_manager_state" conflicts with
# the default 'none' policy and causes kubelet to error out in the next
# step. Excluding this file here is safe as it gets regenerated with
# intended values after subsequent unlock after the restore playbook.
cmd: "tar --use-compress-program=pigz -C / -xpf \
'{{ platform_backup_fqpn }}' var/lib/kubelet \
--exclude var/lib/kubelet/cpu_manager_state"
args:
warn: false
when: kubelet_backup_found.rc == 0
- name: Restore var/lib/kubelet from the backup
command:
# In the case where the backed-up CPU manager policy is 'static', the
# state preserved in "var/lib/kubelet/cpu_manager_state" conflicts with
# the default 'none' policy and causes kubelet to error out in the next
# step. Excluding this file here is safe as it gets regenerated with
# intended values after subsequent unlock after the restore playbook.
cmd: "tar --use-compress-program=pigz -C / -xpf \
'{{ platform_backup_fqpn }}' var/lib/kubelet \
--exclude var/lib/kubelet/cpu_manager_state"
args:
warn: false
when: kubelet_backup_found.rc == 0
- name: Bring up Kubernetes master
import_role:
name: common/bringup-kubemaster
- name: Create kubelet override config file
template:
src: roles/common/bringup-kubemaster/templates/kubelet.conf.j2
dest: /etc/default/kubelet
vars:
node_ip: "{{ cluster_vip.stdout_lines[0].strip() }}"
- name: Start kubelet
systemd:
name: kubelet
state: started
retries: 3
delay: 15
- name: Stop etcd
service:
name: etcd
state: stopped
# Check if there is a default-registry-key for kube-system and create it when
# there is not during platform-restore process
- name: Get kube-system default registry key
command: >-
kubectl --kubeconfig=/etc/kubernetes/admin.conf get secret default-registry-key --namespace=kube-system
failed_when: false
register: kube_system_default_registry_key
- name: Restore etcd database
include_role:
name: backup-restore/restore-etcd
- name: Create kube-system default registry key
command: >-
kubectl -n kube-system create secret docker-registry default-registry-key
--docker-server={{ local_registry }}
--docker-username={{ local_registry_credentials['username'] }}
--docker-password={{ local_registry_credentials['password'] }}
environment:
KUBECONFIG: "/etc/kubernetes/admin.conf"
when: kube_system_default_registry_key.rc != 0
- name: Start etcd
service:
name: etcd
state: started
- name: Get deployment namespace default registry key
command: >-
kubectl --kubeconfig=/etc/kubernetes/admin.conf get secret default-registry-key --namespace=deployment
failed_when: false
register: get_deployment_default_registry_key
- name: Initializing Kubernetes master
command: >
kubeadm init
--ignore-preflight-errors=DirAvailable--var-lib-etc
--ignore-preflight-errors=FileAvailable--etc-kubernetes-manifests-kube-apiserver.yaml
--ignore-preflight-errors=FileAvailable--etc-kubernetes-manifests-kube-controller-manager.yaml
--ignore-preflight-errors=FileAvailable--etc-kubernetes-manifests-kube-scheduler.yaml
--config=/etc/kubernetes/kubeadm.yaml
- name: Copy default-registry-key to deployment namespace
shell: >-
kubectl get secret default-registry-key --namespace=kube-system -o yaml
| sed 's/namespace: kube-system/namespace: deployment/'
| kubectl apply --namespace=deployment -f -
environment:
KUBECONFIG: "/etc/kubernetes/admin.conf"
when: get_deployment_default_registry_key.stdout == ""
- name: Set up k8s environment variable
copy:
src: /usr/share/puppet/modules/platform/files/kubeconfig.sh
dest: /etc/profile.d/kubeconfig.sh
remote_src: yes
- name: Add kubelet service override
copy:
src: "{{ kubelet_override_template }}"
dest: /etc/systemd/system/kubelet.service.d/kube-stx-override.conf
mode: preserve
remote_src: yes
- name: Register kubelet with pmond
copy:
src: "{{ kubelet_pmond_template }}"
dest: /etc/pmon.d/kubelet.conf
mode: preserve
remote_src: yes
when: not upgrade_in_progress
# During an upgrade we do full bringup of Kubernetes.
- name: Restore Kubernetes during upgrade
block:
- name: Bring up Kubernetes master
import_role:
name: common/bringup-kubemaster
- name: Start kubelet
systemd:
name: kubelet
state: started
retries: 3
delay: 15
when: upgrade_in_progress
- name: Restore helm service
import_tasks: restore-helm.yml
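
A quick post-run spot check for the registry secret handled above could look like the following (illustrative only; assumes kubectl access on the restored controller):

- name: Verify default-registry-key exists in both namespaces
  command: kubectl get secret default-registry-key --namespace={{ item }}
  environment:
    KUBECONFIG: /etc/kubernetes/admin.conf
  changed_when: false
  loop:
    - kube-system
    - deployment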