Merge "backups: add retirement and purge lists"

This commit is contained in:
Zuul 2024-11-25 16:54:26 +00:00 committed by Gerrit Code Review
commit 0ce69e8cb3
11 changed files with 145 additions and 13 deletions

View File

@ -262,6 +262,19 @@ then monthly backups for 1 year and yearly backups for each archive.
The backup servers will send a warning when backup volume usage is
high, at which point this can be run manually.
Retiring hosts
--------------
When a host that runs backups is retired, it should have its user
added to the ``borg_retire_users`` list (this list can differ per backup
server, depending on storage requirements). Retired users will have
their backup accounts disabled and we will only keep the latest backup
revision.
When we are ready to completely remove the backups, the user can be
moved to the ``borg_purge_users`` list, which will purge all borg
backups on the next ansible run.
.. _force-merging-a-change:
Force-Merging a Change

View File

@ -13,3 +13,18 @@ Their ``authorized_keys`` file is configured with the public key to
allow the remote host to log in and only run ``borg`` in server mode.
**Role Variables**
.. zuul:rolevar:: borg_retire_users
:default: []
A list of backup user names that are in a "retired" state. The
host should not be in the inventory or active. The backup user
will be disabled and when running a prune, we will only keep the
latest backup to save space.
.. zuul:rolevar:: borg_purge_users
:default: []
A list of backup user names whose data should be purged. This list
represents backups for hosts that have been retired and we now
agree we do not want to retain any of their data.

View File

@ -1 +1,3 @@
# Backup users; the role's tasks loop over this list with the loop
# variable "borg_user".
borg_users: []
# Backup user names in a "retired" state: the account is disabled and
# pruning keeps only the latest backup.
borg_retire_users: []
# Backup user names whose backup data should be purged entirely.
borg_purge_users: []

View File

@ -10,10 +10,12 @@ if [[ ${borg_op} == 'noop' ]]; then
BORG_OP='--dry-run'
elif [[ ${borg_op} == 'prune' ]]; then
BORG_OP=''
LOG_FILE="/opt/backups/prune-$(date '+%Y-%m-%d-%H-%M-%S').log"
echo "*** Logging output to ${LOG_FILE}"
exec 1>${LOG_FILE}
exec 2>&1
if [ -z ${NO_LOG_FILE+x} ]; then
LOG_FILE="/opt/backups/prune-$(date '+%Y-%m-%d-%H-%M-%S').log"
echo "*** Logging output to ${LOG_FILE}"
exec 1>${LOG_FILE}
exec 2>&1
fi
else
echo "*** Invalid input"
exit 1
@ -22,25 +24,39 @@ fi
pushd /opt/backups
for u in borg-*; do
BORG_REPO=/opt/backups/$u/backup
BORG_BASE=/opt/backups/$u
BORG_REPO=${BORG_BASE}/backup
sudo BORG_OP=${BORG_OP} BORG_RELOCATED_REPO_ACCESS_IS_OK=y BORG_REPO=${BORG_REPO} -u ${u} -s <<'EOF'
_prune_flags='--keep-daily 7 --keep-weekly 4 --keep-monthly 12'
_retired=''
if [[ -f ${BORG_BASE}/.retired ]]; then
_prune_flags='--keep-daily 1'
_retired=' (retired)'
fi
sudo BORG_OP=${BORG_OP} BORG_UNKNOWN_UNENCRYPTED_REPO_ACCESS_IS_OK=yes BORG_REPO=${BORG_REPO} _retired="${_retired}" _prune_flags="${_prune_flags}" -u ${u} -s <<'EOF'
# Look at all archives and strip the timestamp, leaving just the archive names
# We limit the prune by --prefix so each archive is considered separately
# Long-running aborted backups might leave a ".checkpoint" archive around; ignore
# these as prune will remove them automatically
#
# Note we are assuming the archives are in the format made by our backup scripts,
# which include -YYYY-MM-DDTHH:MM:SS on the end.
archives=$(/opt/borg/bin/borg list ${BORG_REPO} | awk '$1 !~ /\.checkpoint$/ { print substr($1, 0, length($1)-20) }' | sort | uniq)
echo "+------"
echo "| $(date) Pruning ${BORG_REPO}${_retired}"
for prefix in ${archives};
do
echo
echo
echo "+------"
echo "| $(date) Pruning ${BORG_REPO} archive ${prefix}"
echo "+------"
/opt/borg/bin/borg prune --prefix ${prefix} ${BORG_OP} --verbose --list --show-rc --keep-daily 7 --keep-weekly 4 --keep-monthly 12
echo "| $(date) - archive ${prefix}"
/opt/borg/bin/borg prune --prefix ${prefix} ${BORG_OP} --verbose --list --show-rc ${_prune_flags}
done
echo "| $(date) done!"
echo "+------"
echo
EOF
done

View File

@ -64,3 +64,15 @@
loop: '{{ borg_users }}'
loop_control:
loop_var: borg_user
# Delete the stored backups of every user listed in borg_purge_users.
# Only the "backup" directory under /opt/backups/<user> is removed.
- name: Remove purged user's backup dirs
file:
name: '/opt/backups/{{ item }}/backup'
state: absent
loop: '{{ borg_purge_users }}'
# Run retire.yaml once per entry in borg_retire_users; each entry is
# exposed to the included tasks as "borg_user".
- name: Disable retired users
include_tasks: retire.yaml
loop: '{{ borg_retire_users }}'
loop_control:
loop_var: borg_user

View File

@ -0,0 +1,17 @@
# Retire a single backup user. These tasks are included once per entry
# of borg_retire_users, with the entry available as "borg_user".
- name: Disable backup user login
  user:
    name: '{{ borg_user }}'
    # NOTE(review): on Debian/Ubuntu nologin normally lives at
    # /usr/sbin/nologin; a non-existent shell still blocks logins, but
    # confirm this path is the intended one.
    shell: /bin/nologin

# Without the .ssh directory there is no authorized_keys file left for
# the remote host to log in with.
- name: Remove ssh key
  file:
    name: '/opt/backups/{{ borg_user }}/.ssh'
    state: absent

# The prune script looks for this stamp file and, when present, keeps
# only the latest backup for the user.
- name: Mark as retired
  file:
    name: '/opt/backups/{{ borg_user }}/.retired'
    state: touch
    owner: '{{ borg_user }}'
    group: '{{ borg_user }}'
    # Quoted so the mode is not subject to YAML's implicit octal typing.
    mode: '0644'

View File

@ -0,0 +1,31 @@
# Pre-run fixtures for the borg backup tests: create the backup area
# plus one fake "retired" and one fake "purge" user on the backup server
# before the service playbook runs.
- hosts: "borg-backup-server"
  tasks:
    - name: Setup backup area
      file:
        name: '/opt/backups/'
        state: directory
        # Quoted so the mode is not subject to YAML's implicit octal typing.
        mode: '0755'
        owner: root
        group: root

    # We put this in borg_retire_users and check it gets
    # marked as retired
    - name: Setup fake retired user
      user:
        name: borg-retired
        home: '/opt/backups/borg-retired'

    # We put "borg-purge" in borg_purge_users and check its
    # backup directory gets removed.
    - name: Setup fake purge user
      user:
        name: borg-purge
        home: '/opt/backups/borg-purge'

    - name: Setup fake purge user backup directory
      file:
        name: '/opt/backups/borg-purge/backup'
        state: directory
        mode: '0755'
        owner: borg-purge
        group: borg-purge

View File

@ -140,6 +140,7 @@
- group_vars/zuul-merger.yaml
- group_vars/zuul-scheduler.yaml
- group_vars/zuul-web.yaml
- host_vars/borg-backup01.region.provider.opendev.org.yaml
- host_vars/codesearch01.opendev.org.yaml
- host_vars/etherpad99.opendev.org.yaml
- host_vars/letsencrypt01.opendev.org.yaml

View File

@ -0,0 +1,4 @@
# Test fixture values: "borg-retired" and "borg-purge" are the fake users
# created by the test pre-run playbook. The tests then check for the
# .retired stamp file and for the removed backup directory respectively.
borg_retire_users:
- borg-retired
borg_purge_users:
- borg-purge

View File

@ -54,6 +54,14 @@ def test_borg_server_users(host):
f = host.file(borg_repo)
assert f.exists
# test retired stamp is made for host in retired group
f = host.file('/opt/backups/borg-retired/.retired')
assert f.exists
# test purge for host in purge group
f = host.file('/opt/backups/borg-purge/backup')
assert not f.exists
def test_borg_backup_host_config(host):
hostname = host.backend.get_hostname()
if hostname == 'borg-backup01.region.provider.opendev.org':
@ -97,7 +105,19 @@ def test_borg_server_prune(host):
if hostname != 'borg-backup01.region.provider.opendev.org':
pytest.skip()
cmd = host.run('echo "prune" | /usr/local/bin/prune-borg-backups &> /var/log/prune-borg-backups.log')
# bit of a hack; instead of making a host, backing it up, and then
# retiring it -- which would require testing multiple runs of the
# backup process -- simulate the retired user being active by just
# making a small archive. This ensures the prune script works on a
# user flagged as retired.
cmd = host.run('dd if=/dev/urandom of=/tmp/borg-backup.random bs=1M count=5')
assert cmd.succeeded
cmd = host.run('sudo -u borg-retired /opt/borg/bin/borg init --encryption=none /opt/backups/borg-retired/backup')
assert cmd.succeeded
cmd = host.run('sudo -u borg-retired /opt/borg/bin/borg create /opt/backups/borg-retired/backup::test-9999-12-12T00:00:00 /tmp/borg-backup.random')
assert cmd.succeeded
cmd = host.run('echo "prune" | NO_LOG_FILE=1 /usr/local/bin/prune-borg-backups &> /var/log/prune-borg-backups.log')
assert cmd.succeeded
def test_borg_server_verify(host):

View File

@ -427,6 +427,7 @@
- <<: *bastion_group
vars:
run_playbooks:
- playbooks/test-borg-backup-pre.yaml
- playbooks/service-borg-backup.yaml
run_test_playbook: playbooks/test-borg-backup.yaml
files: