bc833f1dfd
Ansible syntax got me. When I updated the apt tasks to retry on apt/dpkg locks, I over-indented the register, delay, retries, and until parameters. These belong to the task, not the module. Change-Id: I955d96b5467597503e0e5563e37ffa736ef2fcdc
126 lines
4.3 KiB
YAML
126 lines
4.3 KiB
YAML
# This relies on flock -n /var/run/zuul_reboot.lock to ensure
# we don't run multiple copies of this playbook concurrently.
|
|
|
|
# TODO: stop pulling in the hourly job if we do this
|
|
# Run zuul_pull.yaml first, ahead of the restart/reboot plays that follow.
- import_playbook: zuul_pull.yaml
  name: "Ensure we are going to restart/reboot on the same image"
|
|
|
|
# TODO Do we want to force disabled servers to be rebooted too?
|
|
# Play: for each host in zuul-executor (excluding the disabled group), stop
# the executor gracefully, upgrade packages, reboot, and start it again.
- hosts: "zuul-executor:!disabled"
  name: "Reboot zuul-executors gracefully one at a time"
  # One host per batch, so the next executor is not touched until this one
  # has finished every task below.
  serial: 1
  tasks:
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # Quoted so the module receives the string choice "yes" instead of a
        # YAML boolean that has to be coerced back into a choice.
        upgrade: "yes"
      # register/delay/retries/until are task keywords, not apt arguments,
      # so they are siblings of `apt:` rather than nested under it.
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 seconds).
      delay: 30
      retries: 40
      # Keep retrying only while the failure is the apt lock error; stop on
      # success or on any other failure message.
      until: apt_action is success or 'Failed to lock apt for exclusive operation' not in apt_action.msg

    - name: Reboot the executor server
      reboot:

    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
|
|
|
|
# Play: for each host in zuul-merger (excluding the disabled group), stop the
# merger gracefully, upgrade packages, reboot, and start it again.
- hosts: "zuul-merger:!disabled"
  name: "Reboot zuul-mergers gracefully one at a time"
  # One host per batch, so the next merger is not touched until this one has
  # finished every task below.
  serial: 1
  tasks:
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    - name: Upgrade merger server packages
      apt:
        update_cache: true
        # Quoted so the module receives the string choice "yes" instead of a
        # YAML boolean that has to be coerced back into a choice.
        upgrade: "yes"
      # register/delay/retries/until are task keywords, not apt arguments,
      # so they are siblings of `apt:` rather than nested under it.
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 seconds).
      delay: 30
      retries: 40
      # Keep retrying only while the failure is the apt lock error; stop on
      # success or on any other failure message.
      until: apt_action is success or 'Failed to lock apt for exclusive operation' not in apt_action.msg

    - name: Reboot the merger server
      reboot:

    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start
|
|
|
|
# TODO: should we do both schedulers with reboots, then do the webs without
# reboots?
|
|
# Play: for each host in zuul-scheduler (excluding the disabled group), stop
# the scheduler and web processes, upgrade packages, reboot, start both
# again, then wait until the components API reports the scheduler, web and
# fingergw entries for this host as running.
- hosts: "zuul-scheduler:!disabled"
  name: "Reboot zuul-schedulers gracefully one at a time"
  # One host per batch, so the next scheduler is not touched until this one
  # has passed all of the wait tasks below.
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # Quoted so the module receives the string choice "yes" instead of a
        # YAML boolean that has to be coerced back into a choice.
        upgrade: "yes"
      # register/delay/retries/until are task keywords, not apt arguments,
      # so they are siblings of `apt:` rather than nested under it.
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 seconds).
      delay: 30
      retries: 40
      # Keep retrying only while the failure is the apt lock error; stop on
      # success or on any other failure message.
      until: apt_action is success or 'Failed to lock apt for exclusive operation' not in apt_action.msg

    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 seconds)
      retries: 360
      delay: 30
      # `until` is already evaluated as a Jinja expression, so it is written
      # bare (no "{{ }}"). Done when the API answers 200 and lists exactly one
      # scheduler entry for this host whose state is 'running'.
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        # Query selecting the state of scheduler entries whose hostname is
        # this inventory host.
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 seconds)
      retries: 360
      delay: 30
      # Done when the API answers 200 and lists exactly one web entry for
      # this host whose state is 'running'.
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        # Query selecting the state of web entries whose hostname is this
        # inventory host.
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15 seconds)
      retries: 180
      delay: 15
      # Done when the API answers 200 and lists exactly one fingergw entry
      # for this host whose state is 'running'.
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        # Query selecting the state of fingergw entries whose hostname is
        # this inventory host.
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"
|