We run the zuul_reboot playbook across many hours, which means that we can hit conflicts with daily unattended-upgrade runs. When this happens we don't immediately get the dpkg lock. We had code in place to retry when this conflict occurs, but it looks like the Ansible apt module's error messages have changed since we first implemented this, so we don't actually retry. The reason for checking the message is to avoid retrying when a failure other than a failure to get the lock occurs. Considering this has already been problematic (due to changing messages), let's just go ahead and retry until we have success. Other failures are not anticipated, so this should make the overall upgrade and reboot process more resilient. Change-Id: Iecbe8962bf14aeb4a1543ef1dbe5a9fe49cd1b55
159 lines
5.1 KiB
YAML
159 lines
5.1 KiB
YAML
# This relies on flock -n /var/run/zuul_reboot.lock to ensure
|
|
# we don't run multiple copies of this playbook concurrently.
|
|
|
|
# TODO: stop pulling in the hourly job if we do this
|
|
# Import zuul_pull.yaml ahead of the restart plays that follow; per the
# play name, this is what keeps every restart/reboot on the same image.
- import_playbook: zuul_pull.yaml
  name: 'Ensure we are going to restart/reboot on the same image'
|
|
|
|
# TODO Do we want to force disabled servers to be rebooted too?
|
|
# Restart the executors host by host (serial: 1). Each host is gracefully
# stopped, has its packages upgraded, is rebooted, waits for AFS to be
# mounted and only then has the executor started again.
- hosts: "zuul-executor:!disabled"
  name: "Reboot zuul-executors gracefully one at a time"
  serial: 1
  tasks:
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # `upgrade` is a mode selector of the apt module (not a plain
        # boolean), so the original `yes` value is kept as written.
        upgrade: yes
      register: apt_action
      # This playbook can overlap with the daily unattended-upgrades run,
      # in which case the dpkg lock is not immediately available. Retry on
      # any failure instead of matching on the module's error message.
      # 40 retries x 30s = 20 minute wait for unattended-upgrades to complete
      delay: 30
      retries: 40
      until: apt_action is success

    - name: Reboot the executor server
      reboot:

    # TODO(clarkb) newer ansible (core >=2.18) has a mount_facts module we
    # might be able to use instead.
    - name: Wait for AFS to be mounted before starting executors
      command: 'mount -t afs'
      register: mount_output
      # Listing mounts is read-only; without this the command module would
      # report the host as changed on every run.
      changed_when: false
      until: "'AFS on /afs type afs' in mount_output.stdout"
      # Booting servers can sometimes be slow. Give AFS 5 minutes
      # (60 retries x 5s).
      retries: 60
      delay: 5

    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
|
|
|
|
# Merger restart play. With `serial: 1` the hosts are handled one at a
# time: graceful stop, package upgrade, reboot, then start.
- name: Reboot zuul-mergers gracefully one at a time
  hosts: 'zuul-merger:!disabled'
  serial: 1
  tasks:
    - name: Gracefully stop the merger
      include_role:
        tasks_from: graceful
        name: zuul-merger

    - name: Upgrade merger server packages
      # Keep retrying until the apt run succeeds; the budget below allows
      # up to 20 minutes (40 attempts, 30s apart) for unattended-upgrades
      # to complete.
      until: apt_action is success
      retries: 40
      delay: 30
      register: apt_action
      apt:
        # `upgrade` is a mode selector of the apt module, so its value is
        # left exactly as `yes`.
        upgrade: yes
        update_cache: true

    - name: Reboot the merger server
      reboot:

    - name: Start the merger
      include_role:
        tasks_from: start
        name: zuul-merger
|
|
|
|
# Launcher restart play: one host per batch (serial: 1), running
# graceful stop -> package upgrade -> reboot -> start on each.
- name: Reboot zuul-launchers gracefully one at a time
  serial: 1
  hosts: 'zuul-launcher:!disabled'
  tasks:
    - name: Gracefully stop the launcher
      include_role:
        tasks_from: graceful
        name: zuul-launcher

    - name: Upgrade launcher server packages
      register: apt_action
      apt:
        # `upgrade` is a mode selector of the apt module, so its value is
        # left exactly as `yes`.
        upgrade: yes
        update_cache: true
      # Retried until it succeeds: 40 attempts spaced 30s apart give
      # unattended-upgrades 20 minutes to complete.
      until: apt_action is success
      retries: 40
      delay: 30

    - name: Reboot the launcher server
      reboot:

    - name: Start the launcher
      include_role:
        tasks_from: start
        name: zuul-launcher
|
|
|
|
# TODO should we do both schedulers with reboots then do the webs without
|
|
# reboots?
|
|
# Restart the schedulers host by host (serial: 1). On each host the
# scheduler and web processes are stopped, packages are upgraded, the
# server is rebooted, the processes are started again, and the play then
# polls the Zuul components API until this host's scheduler, web and
# fingergw components each report 'running'.
- hosts: "zuul-scheduler:!disabled"
  name: "Reboot zuul-schedulers gracefully one at a time"
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # `upgrade` is a mode selector of the apt module (not a plain
        # boolean), so the original `yes` value is kept as written.
        upgrade: yes
      register: apt_action
      # 40 retries x 30s = 20 minute wait for unattended-upgrades to complete
      delay: 30
      retries: 40
      until: apt_action is success

    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # The `until` conditions below are bare Jinja2 expressions: conditionals
    # are already templated, so wrapping them in {{ }} would template twice.
    # The status check comes first so the content is only parsed once the
    # request has returned 200.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30s)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        # JMESPath query: state of the scheduler entry whose hostname
        # matches the host currently being processed.
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30s)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        # JMESPath query: state of the web entry for this host.
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15s)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        # JMESPath query: state of the fingergw entry for this host.
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"
|