Merge "Remove extra reboot from optimized restore"

Zuul
2022-10-27 20:40:13 +00:00
committed by Gerrit Code Review
5 changed files with 289 additions and 97 deletions


@@ -34,6 +34,7 @@
vars_files:
- host_vars/backup-restore/default.yml
- vars/backup-restore/main.yml
tasks:
@@ -50,6 +51,7 @@
sysinv_config_permdir: "{{ '/opt/platform/sysinv/' + software_version }}"
# SSL certs configuration
ca_cert_dir: "/etc/pki/ca-trust/source/anchors"
pxelinux_config_permdir: "{{ '/opt/platform/config/' + software_version + '/pxelinux.cfg' }}"
- name: Setup flags to control puppet manifest apply
file:
@@ -126,60 +128,110 @@
loop:
- etc/barbican
- etc/containerd
- etc/cni
- etc/default
- etc/docker
- etc/docker-distribution
- etc/drbd.d
- etc/etcd
- etc/fm
- etc/group
- etc/group-
- etc/haproxy
- etc/hosts
- etc/keystone
- etc/kubernetes
- etc/lighttpd
- etc/mtc
- etc/mtc.conf
- etc/mtc.ini
- etc/passwd
- etc/passwd-
- etc/pki
- etc/platform/openrc
- etc/profile.d/kubeconfig.sh
- etc/resolv.conf
- etc/shadow
- etc/shadow-
- etc/sm
- etc/ssl
- etc/sysctl.d
- etc/sysinv
args:
warn: false
- name: Update kernel parameters for iptables
command: sysctl --system &>/dev/null
- name: Update boot loader configuration
command: "{{ grub_mkconfig }} -o /boot/grub2/grub.cfg"
- name: Determine network configuration files
find:
paths: "{{ network_scripts_location }}"
patterns: "ifcfg-*"
register: network_files_to_delete
# Bring up networking; this is meant to replicate the state during bootstrapping
- name: Restore networking
block:
- name: Determine network configuration files
find:
paths: "{{ network_scripts_location }}"
patterns: "ifcfg-*"
register: network_files_to_delete
- name: Remove network configuration files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ network_files_to_delete.files }}"
- name: Remove network configuration files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ network_files_to_delete.files }}"
- name: Restore network configuration files
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite --wildcards {{ network_scripts_location_bkp }}/*"
- name: Restore network configuration files
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite --wildcards {{ network_scripts_location_bkp }}/*"
- name: Restore profile files
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite {{ item }}"
loop:
- "etc/profile.d/kubeconfig.sh"
args:
warn: false
# Fails due to enp0s9 not having the IP set in ifcfg-enp0s9
# - name: Restart networking daemon
# systemd:
# name: networking
# state: restarted
- name: Restore ldap data
import_role:
name: backup-restore/restore-ldap
- name: Bring lo up
command: ifup lo lo:1 lo:5
- name: Restore etcd snapshot
import_role:
name: backup-restore/restore-etcd
- name: Lookup controller host address
command: "gethostip -d controller"
register: host_lookup
- name: Define controller host address
set_fact:
controller_address: "{{ host_lookup.stdout_lines[0] }}"
- name: Configure controller host address
command: "ip addr add {{ controller_address }} dev lo scope host"
- name: Lookup pxecontroller host address
command: "gethostip -d pxecontroller"
register: pxe_host_lookup
- name: Define pxecontroller host address
set_fact:
pxecontroller_address: "{{ pxe_host_lookup.stdout_lines[0] }}"
- name: Configure pxecontroller host address
command: "ip addr add {{ pxecontroller_address }} dev lo scope host"
ignore_errors: true
- name: Restore Postgres
import_role:
name: backup-restore/restore-postgres
# restore-more-data/tasks/main.yml#459
# Set all the hosts including controller-0 to locked/disabled/offline state.
# After the services are restarted, mtce will update controller-0 to
# locked/disabled/online state. Setting controller-0 to offline state now
# will ensure that keystone, sysinv and mtcAgent are indeed in-service after being restarted.
- name: Set all the hosts to locked/disabled/offline state
shell: >-
psql -c "update i_host set administrative='locked', operational='disabled',
availability='offline'" sysinv
become_user: postgres
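# A quick way to confirm the update took effect (editor's illustrative sketch,
# not part of this change; the hostname column is an assumption, the other
# columns come from the update statement above):
# - name: Verify host states after the update
#   shell: >-
#     psql -c "select hostname, administrative, operational, availability from i_host" sysinv
#   become_user: postgres
#   register: host_states_check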
- name: Restore persistent configuration
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite {{ item }}"
loop:
@@ -189,33 +241,6 @@
args:
warn: false
- name: Check archived kubelet dir
shell: "tar -tf {{ platform_backup_fqpn }} | grep 'var/lib/kubelet'"
args:
warn: false
register: kubelet_dir_result
- name: Restore kubelet configuration
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite var/lib/kubelet/"
args:
warn: false
when: kubelet_dir_result.rc == 0
- name: Restore kubelet pmond configuration file
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite {{ item }}"
loop:
- etc/pmon.d/kubelet.conf
args:
warn: false
- name: Reload systemd
command: systemctl daemon-reload
- name: Restore container registry filesystem
command: "tar -C / -xpf {{ registry_backup_fqpn }} --overwrite var/lib/docker-distribution/"
args:
warn: false
- name: Check home dir for CentOS
block:
@@ -232,7 +257,6 @@
when: home_dir_result.rc == 0
when: os_release == "centos"
- name: Check home dir for Debian
block:
@@ -250,53 +274,182 @@
when: os_release == "debian"
- name: Lookup controller host address
command: "gethostip -d controller"
register: host_lookup
# This shouldn't be needed after restoring /etc/shadow and /etc/passwd. Cache?
- name: Make sure user sysinv is ready
user:
name: sysinv
group: sysinv
groups: sys_protected
shell: /sbin/nologin
state: present
- name: Define controller host address
set_fact:
controller_address: "{{ host_lookup.stdout_lines[0] }}"
- name: Configure controller host address
command: "ip addr add {{ controller_address }} dev lo scope host"
- name: Disable local registry authentication
command: "sed -i '/auth:/,$d' /etc/docker-distribution/registry/config.yml"
- name: Start docker registry service
- name: Bringup flock services
systemd:
name: "{{ docker_registry_service }}"
name: "{{ item }}"
state: restarted
- name: Start containerd service
systemd:
name: containerd
state: restarted
- name: Pull kubernetes local container images
command: "crictl pull registry.local:9001/{{ item }}"
loop:
- k8s.gcr.io/kube-apiserver:v1.23.1
- k8s.gcr.io/kube-scheduler:v1.23.1
- k8s.gcr.io/kube-controller-manager:v1.23.1
- k8s.gcr.io/coredns/coredns:v1.8.6
- "{{ 'keystone' if os_release == 'debian' else 'openstack-keystone' }}"
- fminit
- fm-api
- sysinv-conductor
- sysinv-agent
- sysinv-api
- mtcClient
- "{{ 'barbican-api' if os_release == 'debian' else 'openstack-barbican-api' }}"
# restore-more-data/tasks/main.yml#459
# Set all the hosts including controller-0 to locked/disabled/offline state.
# After the services are restarted, mtce will update controller-0 to
# locked/disabled/online state. Setting controller-0 to offline state now
# will ensure that keystone, sysinv and mtcAgent are indeed in-service after being restarted.
- name: Set all the hosts to locked/disabled/offline state
shell: >-
psql -c "update i_host set administrative='locked', operational='disabled',
availability='offline'" sysinv
become_user: postgres
- name: Bringup ocf flock services
command: "{{ item }} start"
environment:
OCF_ROOT: "/usr/lib/ocf"
OCF_RESKEY_state: "active"
loop:
- /usr/lib/ocf/resource.d/platform/mtcAgent
# NOTE(outbrito): If I leave the task below like this, sm comes up as part of the restore and
# brings drbd up once the node reboots; I then had to enable/start kubelet manually. I also had
# to bounce drbd, since after the snapshot restore drbd doesn't pick up the restored data promptly.
# I think there is some kind of caching involved.
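# Roughly what those manual steps look like as tasks (editor's hedged sketch,
# not part of this change; bouncing all drbd resources with drbdadm down/up and
# a plain kubelet restart are assumptions about the workaround described above):
# - name: Bounce drbd so it picks up the restored data
#   command: "drbdadm {{ item }} all"
#   loop:
#     - down
#     - up
# - name: Enable and start kubelet manually
#   systemd:
#     name: kubelet
#     enabled: yes
#     state: started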
- name: Restore ldap data
import_role:
name: backup-restore/restore-ldap
- name: Restore docker registry
block:
- name: Restore container registry filesystem
command: "tar -C / -xpf {{ registry_backup_fqpn }} --overwrite var/lib/docker-distribution/"
args:
warn: false
- name: Disable local registry authentication
command: "sed -i '/auth:/,$d' /etc/docker-distribution/registry/config.yml"
- name: Start docker registry service
systemd:
name: "{{ docker_registry_service }}"
state: restarted
- name: Restore etcd
block:
- name: Restore etcd snapshot
import_role:
name: backup-restore/restore-etcd
- name: Start etcd
systemd:
name: etcd
state: restarted
- name: Restore kubernetes
block:
- name: Start containerd service
systemd:
name: containerd
state: restarted
- name: Pull kubernetes local container images
command: "crictl pull registry.local:9001/{{ item }}"
loop:
- k8s.gcr.io/kube-apiserver:v1.23.1
- k8s.gcr.io/kube-scheduler:v1.23.1
- k8s.gcr.io/kube-controller-manager:v1.23.1
- k8s.gcr.io/coredns/coredns:v1.8.6
- name: Check archived kubelet dir
shell: "tar -tf {{ platform_backup_fqpn }} | grep 'var/lib/kubelet'"
args:
warn: false
register: kubelet_dir_result
- name: Restore kubelet configuration
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite var/lib/kubelet/"
args:
warn: false
when: kubelet_dir_result.rc == 0
- name: Restore kubelet pmond configuration file
command: "tar -C / -xpf {{ platform_backup_fqpn }} --overwrite {{ item }}"
loop:
- etc/pmon.d/kubelet.conf
args:
warn: false
- name: Get Kubernetes version
import_role:
name: common/get-kube-version
- name: Mount k8s bind mount
import_role:
name: common/k8s-bind-mount
- name: Reload systemd
command: systemctl daemon-reload
- name: Start kubelet
systemd:
name: kubelet
state: restarted
- name: Restore helm service
block: # excerpt from bringup_helm.yml
- name: Ensure helm directories exist
file:
path: "{{ item }}"
state: directory
recurse: yes
owner: www
group: root
with_items:
- /var/www/var
- /var/www/var/log
- /var/www/tmp
- name: Create source and target helm bind directories
file:
path: "{{ item }}"
state: directory
owner: www
group: root
mode: 0755
with_items:
- "{{ source_helm_bind_dir }}"
- "{{ target_helm_bind_dir }}"
- name: Restore Helm charts if the host is bootstrapped in restore mode
command: tar -C / --overwrite -xpf {{ platform_backup_fqpn }} {{ item }}
args:
warn: false
become_user: root
with_items:
- "{{ source_helm_bind_dir | regex_replace('^\\/', '') }}"
# Note that /opt/platform/helm_charts is owned by www
# NOTE: the helm --debug option displays vital information; no harm in enabling it.
# These messages only show up in ansible.log on failure.
- name: Generate Helm repo indices
command: /sbin/helm repo index "{{ source_helm_bind_dir }}/{{ item }}" --debug
become_user: www
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
HOME: /home/sysadmin
with_items:
- "{{ helm_repo_name_apps }}"
- "{{ helm_repo_name_platform }}"
- name: Bind mount on {{ target_helm_bind_dir }}
# Due to a deficiency of the mount module, resort to the command module for now
command: mount -o bind -t ext4 {{ source_helm_bind_dir }} {{ target_helm_bind_dir }}
args:
warn: false
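# For reference, the mount-module form the comment above alludes to would look
# roughly like this (untested editor's sketch, mirroring the bind mounts added
# elsewhere in this change):
# - name: Bind mount on {{ target_helm_bind_dir }}
#   mount:
#     path: "{{ target_helm_bind_dir }}"
#     src: "{{ source_helm_bind_dir }}"
#     opts: bind
#     fstype: none
#     state: mounted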
- name: Enable and Restart lighttpd for Helm
systemd:
name: lighttpd
enabled: yes
state: restarted
- name: Create a symlink to PXE config files
file:
src: "{{ pxelinux_config_permdir }}"
dest: /var/pxeboot/pxelinux.cfg
state: link
# Make system ready for unlock
- name: Restore complete, set flags
file:
path: "{{ item }}"


@@ -35,8 +35,9 @@
when: last_config_file_exists and reconfigure_endpoints and
(mgmt_floating_virtual != prev_mgmt_floating_virtual)
- name: Mount current kubernetes version
import_tasks: k8s_bind_mount.yml
- name: Mount current Kubernetes version
import_role:
name: common/k8s-bind-mount
- name: Refresh local DNS (i.e. /etc/hosts)
import_tasks: refresh_local_dns.yml


@@ -0,0 +1,29 @@
---
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# ROLE DESCRIPTION:
# These tasks prepare the staged Kubernetes versions for use.
#
- block:
- name: Set Kubernetes local directory
set_fact:
kube_local_dir: "/usr/local/kubernetes"
# When updating Kubernetes, kubeadm and kubelet/kubectl need to be updated separately,
# so "stage1" and "stage2" subdirectories are used to separate these stages.
- name: Bind Kubernetes stage1 and stage2 directories
mount:
path: "{{ kube_local_dir }}/current/{{ item }}"
src: "{{ kube_local_dir }}/{{ kubernetes_version }}/{{ item }}"
opts: bind
state: mounted
fstype: none
with_items:
- "stage1"
- "stage2"
when: kubernetes_version is defined
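# Typical usage of this role, as seen elsewhere in this change:
# - name: Mount current Kubernetes version
#   import_role:
#     name: common/k8s-bind-mount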


@@ -483,7 +483,7 @@
# Set all the hosts including controller-0 to locked/disabled/offline state.
# After the services are restarted, mtce will update controller-0 to
# locked/disabled/online state. Setting controller-0 to offline state now
# will ensure that keystone, sysinv and mtcAgent are indeed in-service after being restated.
# will ensure that keystone, sysinv and mtcAgent are indeed in-service after being restarted.
- name: Set all the hosts to locked/disabled/offline state
shell: >-
psql -c "update i_host set administrative='locked', operational='disabled',
@@ -537,7 +537,7 @@
# Run "system host-list" to verify that controller-0 is in
# "online" state. This will ensure that keystone, sysinv and
# mtcAgent are indeed in-service after being restated.
# mtcAgent are indeed in-service after being restarted.
- name: Check controller-0 is in online state
shell: source /etc/platform/openrc; system host-show controller-0 --column availability --format value
register: check_online


@@ -0,0 +1,9 @@
---
# Should we move these to vars/common? They're used in:
# task bringup_helm.yml
# playbook upgrade-k8s-armada-helm.yml
# role restore-more-data.yml
source_helm_bind_dir: /opt/platform/helm_charts
target_helm_bind_dir: /var/www/pages/helm_charts
helm_repo_name_apps: starlingx
helm_repo_name_platform: stx-platform
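# Example reference to these vars (illustrative only; mirrors the
# restore-more-data tasks above):
# - name: Generate Helm repo indices
#   command: /sbin/helm repo index "{{ source_helm_bind_dir }}/{{ helm_repo_name_apps }}" --debug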