ansible-playbooks/playbookconfig/src/playbooks/roles/bootstrap/prepare-env/tasks/load_patching_tasks.yml

---
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASKS DESCRIPTION:
# These tasks perform the following activities:
# - verify if the installed load matches the backup load
# - install patches if the system was patched
# - reboot the controller if it is required by the patching
#
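#
# Illustrative usage note (the include and condition below are hypothetical,
# shown only to make the calling context concrete): a platform restore flow
# would typically pull these tasks into the prepare-env role with something
# like:
#   - include_tasks: load_patching_tasks.yml
#     when: restore_mode_is_active
#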
- block:
  - name: Create {{ restore_in_progress_flag }} flag file
    file:
      path: "{{ restore_in_progress_flag }}"
      state: touch

  # For remote play, the backup tarball will be transferred to /scratch
  - block:
    # Check if the backup tarball already exists. If it is the second run
    # after the reboot, there is no need to transfer the backup tarball again.
    - name: Check if {{ backup_filename }} has been uploaded already
      stat:
        path: "/scratch/{{ backup_filename }}"
      register: check_backup_tarball

    - block:
      # TODO(wzhou): Consider breaking the backup tarball into multiple small
      # tarfiles. During restore, upload each small tarfile one at a time to
      # restore a subfunction.
      #
      # Because the Ansible copy module uses the ansible_remote_tmp directory
      # as a staging area to transfer files, the default ansible_remote_tmp,
      # which is set in /tmp (1GB), may be too small for the backup tarball.
      # We therefore require the user to set ansible_remote_tmp to a new
      # directory in /home/sysadmin via the -e option on the command line.
      # For example:
      #   -e "ansible_remote_tmp=/home/sysadmin/ansible-restore"
- name: Transfer backup tarball to /scratch on controller-0
copy:
src: "{{ initial_backup_dir }}/{{ backup_filename }}"
dest: /scratch
owner: root
group: root
mode: 0644
# As an alternative to Ansible copy, synchronize module may be
# used to transfer large files. But synchronize is broken in Ansible 2.8
# https://github.com/ansible/ansible/issues/56629.
# - name: Transfer backup tarball to /scratch on controller-0
# synchronize:
# src: "{{ initial_backup_dir }}/{{ backup_filename }}"
# dest: "/scratch/{{ backup_filename }}"
when: not check_backup_tarball.stat.exists

    - name: Set target_backup_dir to /scratch
      set_fact:
        target_backup_dir: /scratch
    when: inventory_hostname != "localhost"

  - name: For local play set target_backup_dir to initial_backup_dir
    set_fact:
      target_backup_dir: "{{ initial_backup_dir }}"
    when: inventory_hostname == "localhost"

  - name: Set fact for patching staging dir
    set_fact:
      patching_staging_dir: /scratch/patching

  - name: Create staging directory {{ patching_staging_dir }} for patch files
    file:
      path: "{{ patching_staging_dir }}"
      state: directory

  - block:
    - name: Get the checksum of the build.info file of the installed load
      stat:
        path: /etc/build.info
        get_checksum: yes
      register: installed_buildinfo_check
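
    # Note on the tar option below: --transform='s,.*/,,' rewrites each
    # extracted member name by stripping everything up to the last '/',
    # so etc/build.info lands directly in the staging directory rather
    # than under an etc/ subdirectory.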
    - name: Retrieve build.info file from backup
      command: >-
        tar -C {{ patching_staging_dir }} -xpf {{ target_backup_dir }}/{{ backup_filename }} --transform='s,.*/,,'
        etc/build.info
      args:
        warn: false

    - name: Get the checksum of the build.info file from the backup
      stat:
        path: "{{ patching_staging_dir }}/build.info"
        get_checksum: yes
      register: backup_buildinfo_check

    - name: Fail if load version of backup does not match the version of the installed load
      fail:
        msg: "Load version of backup does not match the version of the installed load."
      when: installed_buildinfo_check.stat.checksum != backup_buildinfo_check.stat.checksum

    - name: Retrieve platform.conf file from the backup
      command: >-
        tar -C {{ patching_staging_dir }} -xpf {{ target_backup_dir }}/{{ backup_filename }} --transform='s,.*/,,'
        etc/platform/platform.conf
      args:
        warn: false
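
    # Illustrative example (values are hypothetical): a platform.conf entry
    # such as "subfunction=controller,worker" is split below into the set
    # ['controller', 'worker']; the backup and installed sets must contain
    # exactly the same items for the restore to proceed.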
    - name: Get subfunction from the backup
      shell: grep -F 'subfunction' {{ patching_staging_dir }}/platform.conf
      register: backup_subfunc

    - name: Get subfunction set from backup platform.conf
      set_fact:
        backup_subfunc_set: "{{ backup_subfunc.stdout_lines[0].split('=')[1].split(',') }}"

    - name: Get subfunction from the installed load
      shell: grep -F 'subfunction' /etc/platform/platform.conf
      register: installed_subfunc

    - name: Get subfunction set from installed platform.conf
      set_fact:
        installed_subfunc_set: "{{ installed_subfunc.stdout_lines[0].split('=')[1].split(',') }}"

    - name: Check the difference between the two subfunction sets
      set_fact:
        diff_set: "{{ backup_subfunc_set | symmetric_difference(installed_subfunc_set) }}"

    - name: Fail if subfunction of backup does not match the subfunction of the installed load
      fail:
        msg: "Subfunction mismatch - backup: {{ backup_subfunc_set }}, installed: {{ installed_subfunc_set }}"
      when: diff_set != []

    # Patching is potentially a multi-phase step. If the controller is impacted
    # by patches from the backup, it must be rebooted before continuing the
    # restore. If the restore_patching_complete_flag file exists, this is the
    # second run after the reboot, and the restore-and-apply patching block
    # will be skipped.
    - name: Check if {{ restore_patching_complete_flag }} file exists
      stat:
        path: "{{ restore_patching_complete_flag }}"
      register: check_patching_complete

    # Restore and apply patching
    - block:
      - name: Strip the leading slash in dirname and assign it to a new variable
        set_fact:
          short_patching_permdir: "{{ patching_permdir | regex_replace('^\\/', '') }}"
          short_patching_repo_permdir: "{{ patching_repo_permdir | regex_replace('^\\/', '') }}"
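
      # For example (illustrative value): a patching_permdir of /opt/patching
      # becomes opt/patching, which matches how member paths are stored in the
      # backup tarball (tar strips the leading '/' when archiving).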
      - name: Delete {{ patching_permdir }} dir if it exists
        file:
          path: "{{ patching_permdir }}"
          state: absent

      - name: Restore patching
        command: >-
          tar -C /opt -xpf {{ target_backup_dir }}/{{ backup_filename }} --strip-components=1
          {{ short_patching_permdir }}
        args:
          warn: false

      - name: Delete {{ patching_repo_permdir }} dir if it exists
        file:
          path: "{{ patching_repo_permdir }}"
          state: absent

      - name: Restore patching repo
        command: >-
          tar -C /www/pages -xpf {{ target_backup_dir }}/{{ backup_filename }} --strip-components=2
          {{ short_patching_repo_permdir }}
        args:
          warn: false
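
      # sw-patch install-local installs the patches from the restored repo
      # directly on this node, without going through the normal patch
      # orchestration workflow.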
      - name: Apply patches
        command: sw-patch install-local
        args:
          warn: false

      - name: Create {{ restore_patching_complete_flag }} file
        file:
          path: "{{ restore_patching_complete_flag }}"
          state: touch

      # Check if the controller was impacted by patches
      - name: Check if {{ node_is_patched_flag }} file exists
        stat:
          path: "{{ node_is_patched_flag }}"
        register: check_node_is_patched

      # The controller was not impacted by patches, so a reboot is not required.
      # However, we need to restart the patch controller and agent, since we
      # set up the repo and patch store outside their control.
      - block:
        - name: Restart the patch controller and agent
          systemd:
            name: "{{ item }}"
            state: restarted
          with_items:
            - sw-patch-controller-daemon
            - sw-patch-agent
        when: not check_node_is_patched.stat.exists

      # The controller was impacted by patches. A reboot is required.
      - block:
        - name: Inform user that this controller will be rebooted
          debug:
            msg: >-
              This controller has been patched. A reboot will start.
              After the reboot is completed, please re-run the playbook to
              restore the platform again.

        - name: Remove the {{ restore_in_progress_flag }} file
          file:
            path: "{{ restore_in_progress_flag }}"
            state: absent

        - name: Remove staging directory {{ patching_staging_dir }} for patch files
          file:
            path: "{{ patching_staging_dir }}"
            state: absent

        # For better control of the restore, we don't invoke the Ansible reboot
        # module to reboot the node. Instead, we require the user to re-run the
        # playbook to restore the platform after the reboot has completed.
        # TODO(wzhou): Support patching without a re-run of the restore_platform
        # playbook, by either invoking the Ansible reboot module or defining the
        # reboot as an async task.
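        #
        # Note: "sleep 5" briefly delays the reboot, and failed_when: false
        # ensures that a non-zero exit status from this command does not mark
        # the play as failed while the node is going down.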
        - name: Reboot the controller
          shell: sleep 5 && reboot
          failed_when: false

        - name: >-
            Define a variable to indicate that the play was ended due to required controller reboot
          set_fact:
            required_reboot: true

        - name: Trigger the play to end and do cleanup
          fail:
            msg: Trigger the play to end and do cleanup.
        when: check_node_is_patched.stat.exists
      when: not check_patching_complete.stat.exists

    # The restore_patching_complete_flag file is removed in the following two scenarios:
    # 1. This is the first run with no patches to apply.
    # 2. This is the second run after the node reboot due to patching.
    - name: Clear {{ restore_patching_complete_flag }} flag file
      file:
        path: "{{ restore_patching_complete_flag }}"
        state: absent

    - name: Remove staging directory {{ patching_staging_dir }} for patch files
      file:
        path: "{{ patching_staging_dir }}"
        state: absent

    rescue:
    - block:
      - name: Remove the {{ restore_in_progress_flag }} file
        file:
          path: "{{ restore_in_progress_flag }}"
          state: absent

      - name: Remove staging directory {{ patching_staging_dir }} for patch files
        file:
          path: "{{ patching_staging_dir }}"
          state: absent

      - name: Fail the platform restore
        fail:
          msg: Restore platform failed!
      when: required_reboot is not defined

    # This is inside the bootstrap playbook, so invoking end_play would only end
    # bootstrap; the restore_platform playbook would then continue to play, which
    # is not what we want.
    - name: Terminate the platform restore
      fail:
        msg: >-
          The restore is terminated due to a required controller node reboot. Please
          re-run the playbook to restore the platform after the reboot is completed.
      when: required_reboot

  become: yes
  become_user: root