Files
system-config/playbooks/zuul_reboot.yaml
Clark Boylan a74251cad3 Update zuul_reboot package upgrade error conditions
We run the zuul_reboot playbook across many hours which means that we
can hit conflicts with daily unattended-upgrade runs. When this happens
we don't immediately get the dpkg lock. We had code in place to retry
when this conflict occurs but it looks like the Ansible apt module's
error messages have changed since we first implemented this so we don't
actually retry.

The reason for checking the message is to avoid retrying when a failure
other than a failure to get the lock occurs. Considering this has
already been problematic (due to changing messages) let's just go ahead
and retry until we have success. Other failures are not anticipated so
this should make the overall upgrade and reboot process more resilient.

Change-Id: Iecbe8962bf14aeb4a1543ef1dbe5a9fe49cd1b55
2025-11-24 15:39:35 -08:00

159 lines
5.1 KiB
YAML

# Concurrency guard: this playbook relies on being run under
# `flock -n /var/run/zuul_reboot.lock` so that multiple copies of it
# never run at the same time.
# TODO: stop pulling in the hourly job if we do this
- import_playbook: zuul_pull.yaml
  name: 'Ensure we are going to restart/reboot on the same image'
# TODO Do we want to force disabled servers to be rebooted too?
# For every executor that is not in the `disabled` group: stop it gracefully,
# upgrade its packages, reboot, wait for AFS to be mounted and start it again.
- hosts: "zuul-executor:!disabled"
  name: "Reboot zuul-executors gracefully one at a time"
  # Process a single host per batch so only one executor is down at a time.
  serial: 1
  tasks:
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful
    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # `upgrade` is a string choice of the apt module, not a boolean.
        # Quote it so YAML does not parse it as a bool first.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 second delay). We retry on any failure rather than
      # matching on the module's error message.
      delay: 30
      retries: 40
      until: apt_action is success
    - name: Reboot the executor server
      reboot:
    # TODO(clarkb) newer ansible (core >=2.18) has a mount_facts module we
    # might be able to use instead.
    - name: Wait for AFS to be mounted before starting executors
      command: 'mount -t afs'
      register: mount_output
      # This only lists mounts; never report it as a change.
      changed_when: false
      until: "'AFS on /afs type afs' in mount_output.stdout"
      # Booting servers can sometimes be slow. Give AFS 5 minutes
      # (60 retries x 5 second delay).
      retries: 60
      delay: 5
    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
# For every merger that is not in the `disabled` group: stop it gracefully,
# upgrade its packages, reboot and start it again.
- hosts: "zuul-merger:!disabled"
  name: "Reboot zuul-mergers gracefully one at a time"
  # Process a single host per batch so only one merger is down at a time.
  serial: 1
  tasks:
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful
    - name: Upgrade merger server packages
      apt:
        update_cache: true
        # `upgrade` is a string choice of the apt module, not a boolean.
        # Quote it so YAML does not parse it as a bool first.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 second delay). We retry on any failure rather than
      # matching on the module's error message.
      delay: 30
      retries: 40
      until: apt_action is success
    - name: Reboot the merger server
      reboot:
    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start
# For every launcher that is not in the `disabled` group: stop it gracefully,
# upgrade its packages, reboot and start it again.
- hosts: "zuul-launcher:!disabled"
  name: "Reboot zuul-launchers gracefully one at a time"
  # Process a single host per batch so only one launcher is down at a time.
  serial: 1
  tasks:
    - name: Gracefully stop the launcher
      include_role:
        name: zuul-launcher
        tasks_from: graceful
    - name: Upgrade launcher server packages
      apt:
        update_cache: true
        # `upgrade` is a string choice of the apt module, not a boolean.
        # Quote it so YAML does not parse it as a bool first.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 second delay). We retry on any failure rather than
      # matching on the module's error message.
      delay: 30
      retries: 40
      until: apt_action is success
    - name: Reboot the launcher server
      reboot:
    - name: Start the launcher
      include_role:
        name: zuul-launcher
        tasks_from: start
# TODO should we do both schedulers with reboots then do the webs without
# reboots?
# For every scheduler that is not in the `disabled` group: stop the scheduler
# and web processes, upgrade packages, reboot, start both again, then poll the
# Zuul components API until this host's scheduler, web and fingergw components
# each report the `running` state.
- hosts: "zuul-scheduler:!disabled"
  name: "Reboot zuul-schedulers gracefully one at a time"
  # Process a single host per batch so only one scheduler is down at a time.
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop
    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop
    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # `upgrade` is a string choice of the apt module, not a boolean.
        # Quote it so YAML does not parse it as a bool first.
        upgrade: "yes"
      register: apt_action
      # 20 minute wait for unattended-upgrades to complete
      # (40 retries x 30 second delay). We retry on any failure rather than
      # matching on the module's error message.
      delay: 30
      retries: 40
      until: apt_action is success
    - name: Reboot the scheduler server
      reboot:
    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start
    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start
    # Each wait below succeeds once the API answers 200 and the query for
    # this host returns exactly one component whose state is 'running'.
    # The `until` conditions are raw Jinja2 expressions; conditionals must not
    # be wrapped in template delimiters.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 second delay)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"
    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 second delay)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"
    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15 second delay)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"