---
# system-config/playbooks/zuul_reboot.yaml

# This relies on flock -n /var/run/zuul_reboot.lock to ensure
# we don't run multiple copies of this playbook concurrently.

# TODO: stop pulling in the hourly job if we do this
# Run zuul_pull.yaml before any host is touched; per the play name, this is
# so that every restart/reboot performed afterwards uses the same image.
- name: Ensure we are going to restart/reboot on the same image
  import_playbook: zuul_pull.yaml

# TODO Do we want to force disabled servers to be rebooted too?
# Restart the enabled executors one host at a time: gracefully stop the
# service, apply package upgrades, reboot, wait for AFS, then start it again.
- name: "Reboot zuul-executors gracefully one at a time"
  hosts: "zuul-executor:!disabled"
  # serial: 1 finishes the whole task list on one host before the next begins.
  serial: 1
  tasks:
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice, not a boolean; quote it so YAML does
        # not turn the value into a bool before the module sees it.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait (40 tries, 30 seconds apart) for unattended-upgrades
      # to complete
      delay: 30
      retries: 40
      until: apt_action is success

    - name: Reboot the executor server
      reboot:

    # TODO(clarkb) newer ansible (core >=2.18) has a mount_facts module we
    # might be able to use instead.
    - name: Wait for AFS to be mounted before starting executors
      command: 'mount -t afs'
      register: mount_output
      # Listing mounts never modifies the host, so do not report "changed".
      changed_when: false
      until: "'AFS on /afs type afs' in mount_output.stdout"
      # Booting servers can sometimes be slow. Give AFS 5 minutes
      # (60 tries, 5 seconds apart).
      retries: 60
      delay: 5

    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start

# Restart the enabled mergers one host at a time: gracefully stop the
# service, apply package upgrades, reboot, then start it again.
- name: "Reboot zuul-mergers gracefully one at a time"
  hosts: "zuul-merger:!disabled"
  # serial: 1 finishes the whole task list on one host before the next begins.
  serial: 1
  tasks:
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    - name: Upgrade merger server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice, not a boolean; quote it so YAML does
        # not turn the value into a bool before the module sees it.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait (40 tries, 30 seconds apart) for unattended-upgrades
      # to complete
      delay: 30
      retries: 40
      until: apt_action is success

    - name: Reboot the merger server
      reboot:

    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start

# Restart the enabled launchers one host at a time: gracefully stop the
# service, apply package upgrades, reboot, then start it again.
- name: "Reboot zuul-launchers gracefully one at a time"
  hosts: "zuul-launcher:!disabled"
  # serial: 1 finishes the whole task list on one host before the next begins.
  serial: 1
  tasks:
    - name: Gracefully stop the launcher
      include_role:
        name: zuul-launcher
        tasks_from: graceful

    - name: Upgrade launcher server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice, not a boolean; quote it so YAML does
        # not turn the value into a bool before the module sees it.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait (40 tries, 30 seconds apart) for unattended-upgrades
      # to complete
      delay: 30
      retries: 40
      until: apt_action is success

    - name: Reboot the launcher server
      reboot:

    - name: Start the launcher
      include_role:
        name: zuul-launcher
        tasks_from: start

# TODO should we do both schedulers with reboots then do the webs without
# reboots?
# Restart the enabled scheduler hosts one at a time: stop the scheduler and
# web processes, apply package upgrades, reboot, start both again, then poll
# the components API until this host's scheduler, web and fingergw entries
# all report 'running'.
- name: "Reboot zuul-schedulers gracefully one at a time"
  hosts: "zuul-scheduler:!disabled"
  # serial: 1 finishes the whole task list (including the waits at the end)
  # on one host before the next host begins.
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice, not a boolean; quote it so YAML does
        # not turn the value into a bool before the module sees it.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait (40 tries, 30 seconds apart) for unattended-upgrades
      # to complete
      delay: 30
      retries: 40
      until: apt_action is success

    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # Each wait below fetches the components listing and succeeds once the
    # JMESPath query matches exactly one entry for this host and that entry's
    # state is 'running'. The conditions are bare Jinja expressions (no
    # "{{ }}" wrapper) because "until" is already evaluated as one; "and"
    # short-circuits, so the body is only parsed after a 200 response.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 tries, 30 seconds apart)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and (components.content | from_json | json_query(scheduler_query) | length) == 1
        and (components.content | from_json | json_query(scheduler_query) | first) == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 tries, 30 seconds apart)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and (components.content | from_json | json_query(web_query) | length) == 1
        and (components.content | from_json | json_query(web_query) | first) == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 tries, 15 seconds apart)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and (components.content | from_json | json_query(finger_query) | length) == 1
        and (components.content | from_json | json_query(finger_query) | first) == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"