# system-config/playbooks/zuul_reboot.yaml
#
# File-viewer metadata captured with this copy: 126 lines, 4.3 KiB, YAML

# Concurrent runs of this playbook are prevented by relying on
# `flock -n /var/run/zuul_reboot.lock`; nothing in this file takes the lock.
# TODO: stop pulling in the hourly job if we do this
- import_playbook: zuul_pull.yaml
  name: "Ensure we are going to restart/reboot on the same image"
# TODO Do we want to force disabled servers to be rebooted too?
# Executors are processed strictly one host at a time (serial: 1): each host
# is gracefully stopped, upgraded, rebooted and started again before the next
# host is touched. Hosts in the "disabled" group are excluded.
- name: "Reboot zuul-executors gracefully one at a time"
  hosts: "zuul-executor:!disabled"
  serial: 1
  tasks:
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # Quoted on purpose: `upgrade` is a string choice, and a bare `yes`
        # would be loaded by YAML as a boolean.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 seconds).
      delay: 30
      retries: 40
      # Retry only while the failure is the apt lock being held; success or
      # any other failure ends the retry loop.
      until: >-
        apt_action is success or
        'Failed to lock apt for exclusive operation' not in apt_action.msg

    - name: Reboot the executor server
      reboot:

    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
# Mergers are processed strictly one host at a time (serial: 1): each host is
# gracefully stopped, upgraded, rebooted and started again before the next
# host is touched. Hosts in the "disabled" group are excluded.
- name: "Reboot zuul-mergers gracefully one at a time"
  hosts: "zuul-merger:!disabled"
  serial: 1
  tasks:
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    - name: Upgrade merger server packages
      apt:
        update_cache: true
        # Quoted on purpose: `upgrade` is a string choice, and a bare `yes`
        # would be loaded by YAML as a boolean.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 seconds).
      delay: 30
      retries: 40
      # Retry only while the failure is the apt lock being held; success or
      # any other failure ends the retry loop.
      until: >-
        apt_action is success or
        'Failed to lock apt for exclusive operation' not in apt_action.msg

    - name: Reboot the merger server
      reboot:

    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start
# TODO should we do both schedulers with reboots then do the webs without
# reboots?
# Schedulers are processed strictly one host at a time (serial: 1). On each
# host the scheduler and web processes are stopped, packages are upgraded,
# the server is rebooted, both processes are started again, and the play then
# polls the Zuul components API until this host's scheduler, web and fingergw
# entries all report "running". Hosts in the "disabled" group are excluded.
- name: "Reboot zuul-schedulers gracefully one at a time"
  hosts: "zuul-scheduler:!disabled"
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # Quoted on purpose: `upgrade` is a string choice, and a bare `yes`
        # would be loaded by YAML as a boolean.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 seconds).
      delay: 30
      retries: 40
      # Retry only while the failure is the apt lock being held; success or
      # any other failure ends the retry loop.
      until: >-
        apt_action is success or
        'Failed to lock apt for exclusive operation' not in apt_action.msg

    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # The three waits below share one pattern: fetch the components listing,
    # select the entries whose hostname equals inventory_hostname, and succeed
    # once there is exactly one such entry and its state is "running".
    # `until` is written as a bare Jinja expression (no "{{ }}" wrapper)
    # because Ansible conditionals are already evaluated as expressions.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 seconds)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 seconds)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15 seconds)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"