From 8361ab701ccce10fe5bfb5901252a6470f23753f Mon Sep 17 00:00:00 2001 From: Ian Wienand Date: Wed, 30 Oct 2024 17:08:09 +1100 Subject: [PATCH] backups: add retirement and purge lists This adds a retirement and purge list to the borg management role. The idea here is that when a backed-up host is shut down, we add its backup user to the retired list. On the next ansible run the user will be disabled on the backup-server and the backup repo marked as retired. On the next prune, we will trim the backup to only the last run to save space. This gives us a grace period to restore if we should need to. When we are sure we don't want the data, we can put it in the purge list, and the backup repo is removed on the next ansible run (hosts can go straight into this if we want). This allows us to have a review process/history before we purge data. To test, we create a fake "borg-retired" user on the backup-server, and give it a simple backup. This is marked as retired, which is reflected in the testinfra run of the prune script. Similarly a "borg-purge" user is created, and we ensure its backup dir is removed. Documentation is updated. 
Change-Id: I5dff0a9d35b11a1f021048a12ecddce952c0c13c --- doc/source/sysadmin.rst | 13 ++++++ playbooks/roles/borg-backup-server/README.rst | 15 +++++++ .../borg-backup-server/defaults/main.yaml | 2 + .../files/prune-borg-backups.sh | 40 +++++++++++++------ .../roles/borg-backup-server/tasks/main.yaml | 12 ++++++ .../borg-backup-server/tasks/retire.yaml | 17 ++++++++ playbooks/test-borg-backup-pre.yaml | 31 ++++++++++++++ playbooks/zuul/run-base.yaml | 1 + ...ckup01.region.provider.opendev.org.yaml.j2 | 4 ++ testinfra/test_borg_backups.py | 22 +++++++++- zuul.d/system-config-run.yaml | 1 + 11 files changed, 145 insertions(+), 13 deletions(-) create mode 100644 playbooks/roles/borg-backup-server/tasks/retire.yaml create mode 100644 playbooks/test-borg-backup-pre.yaml create mode 100644 playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 diff --git a/doc/source/sysadmin.rst b/doc/source/sysadmin.rst index 2694e34cfa..6735caa132 100644 --- a/doc/source/sysadmin.rst +++ b/doc/source/sysadmin.rst @@ -262,6 +262,19 @@ then monthly backups for 1 year and yearly backups for each archive. The backup servers will send a warning when backup volume usage is high, at which point this can be run manually. +Retiring hosts +-------------- + +When a host that runs backups is retired, it should have its user +added to the `borg_retire_users` list (this list can differ per-backup +server, depending on storage requirements). Retired users will have +their backup accounts disabled and we will only keep the latest backup +revision. + +When we are ready to completely remove the backups, the user can be +moved to the `borg_purge_users` list, which will purge all borg +backups on the next ansible run. + .. 
_force-merging-a-change: Force-Merging a Change diff --git a/playbooks/roles/borg-backup-server/README.rst b/playbooks/roles/borg-backup-server/README.rst index c78c557dec..f16bfa704a 100644 --- a/playbooks/roles/borg-backup-server/README.rst +++ b/playbooks/roles/borg-backup-server/README.rst @@ -13,3 +13,18 @@ Their ``authorized_keys`` file is configured with the public key to allow the remote host to log in and only run ``borg`` in server mode. **Role Variables** + +.. zuul:rolevar:: borg_retire_users + :default: [] + + A list of backup user names that are in a "retired" state. The + host should not be in the inventory or active. The backup user + will be disabled and when running a prune, we will only keep the + latest backup to save space. + +.. zuul:rolevar:: borg_purge_users + :default: [] + + A list of backup user names whose data should be purged. This list + represents backups for hosts that have been retired and we now + agree we do not want to retain any of their data. diff --git a/playbooks/roles/borg-backup-server/defaults/main.yaml b/playbooks/roles/borg-backup-server/defaults/main.yaml index 9ba22faee0..c3d0fc7839 100644 --- a/playbooks/roles/borg-backup-server/defaults/main.yaml +++ b/playbooks/roles/borg-backup-server/defaults/main.yaml @@ -1 +1,3 @@ borg_users: [] +borg_retire_users: [] +borg_purge_users: [] diff --git a/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh b/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh index af6b684dc6..7669047f63 100644 --- a/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh +++ b/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh @@ -10,10 +10,12 @@ if [[ ${borg_op} == 'noop' ]]; then BORG_OP='--dry-run' elif [[ ${borg_op} == 'prune' ]]; then BORG_OP='' - LOG_FILE="/opt/backups/prune-$(date '+%Y-%m-%d-%H-%M-%S').log" - echo "*** Logging output to ${LOG_FILE}" - exec 1>${LOG_FILE} - exec 2>&1 + if [ -z ${NO_LOG_FILE+x} ]; then + 
LOG_FILE="/opt/backups/prune-$(date '+%Y-%m-%d-%H-%M-%S').log" + echo "*** Logging output to ${LOG_FILE}" + exec 1>${LOG_FILE} + exec 2>&1 + fi else echo "*** Invalid input" exit 1 @@ -22,25 +24,39 @@ fi pushd /opt/backups for u in borg-*; do - BORG_REPO=/opt/backups/$u/backup + BORG_BASE=/opt/backups/$u + BORG_REPO=${BORG_BASE}/backup - sudo BORG_OP=${BORG_OP} BORG_RELOCATED_REPO_ACCESS_IS_OK=y BORG_REPO=${BORG_REPO} -u ${u} -s <<'EOF' + _prune_flags='--keep-daily 7 --keep-weekly 4 --keep-monthly 12' + _retired='' + if [[ -f ${BORG_BASE}/.retired ]]; then + _prune_flags='--keep-daily 1' + _retired=' (retired)' + fi + + sudo BORG_OP=${BORG_OP} BORG_UNKNOWN_UNENCRYPTED_REPO_ACCESS_IS_OK=yes BORG_REPO=${BORG_REPO} _retired="${_retired}" _prune_flags="${_prune_flags}" -u ${u} -s <<'EOF' # Look at all archives and strip the timestamp, leaving just the archive names # We limit the prune by --prefix so each archive is considered separately # Long-running aborted backups might leave a ".checkpoint" archive around; ignore # these as prune will remove them automatically + # + # Note we are assuming the archives are in the format made by our backup scripts, + # which include -YYYY-MM-DDTHH:MM:SS on the end. archives=$(/opt/borg/bin/borg list ${BORG_REPO} | awk '$1 !~ /\.checkpoint$/ { print substr($1, 0, length($1)-20) }' | sort | uniq) + echo "+------" + echo "| $(date) Pruning ${BORG_REPO}${_retired}" + for prefix in ${archives}; do - echo - echo - echo "+------" - echo "| $(date) Pruning ${BORG_REPO} archive ${prefix}" - echo "+------" - /opt/borg/bin/borg prune --prefix ${prefix} ${BORG_OP} --verbose --list --show-rc --keep-daily 7 --keep-weekly 4 --keep-monthly 12 + echo "| $(date) - archive ${prefix}" + /opt/borg/bin/borg prune --prefix ${prefix} ${BORG_OP} --verbose --list --show-rc ${_prune_flags} done + echo "| $(date) done!" 
+ echo "+------" + echo + EOF done diff --git a/playbooks/roles/borg-backup-server/tasks/main.yaml b/playbooks/roles/borg-backup-server/tasks/main.yaml index 5ea555008d..67ea53e813 100644 --- a/playbooks/roles/borg-backup-server/tasks/main.yaml +++ b/playbooks/roles/borg-backup-server/tasks/main.yaml @@ -64,3 +64,15 @@ loop: '{{ borg_users }}' loop_control: loop_var: borg_user + +- name: Remove purged user's backup dirs + file: + name: '/opt/backups/{{ item }}/backup' + state: absent + loop: '{{ borg_purge_users }}' + +- name: Disable retired users + include_tasks: retire.yaml + loop: '{{ borg_retire_users }}' + loop_control: + loop_var: borg_user diff --git a/playbooks/roles/borg-backup-server/tasks/retire.yaml b/playbooks/roles/borg-backup-server/tasks/retire.yaml new file mode 100644 index 0000000000..0208fac0b7 --- /dev/null +++ b/playbooks/roles/borg-backup-server/tasks/retire.yaml @@ -0,0 +1,17 @@ +- name: Disable backup user login + user: + name: '{{ borg_user }}' + shell: /bin/nologin + +- name: Remove ssh key + file: + name: '/opt/backups/{{ borg_user }}/.ssh' + state: absent + +- name: Mark as retired + file: + name: '/opt/backups/{{ borg_user }}/.retired' + state: touch + owner: '{{ borg_user }}' + group: '{{ borg_user }}' + mode: 0644 diff --git a/playbooks/test-borg-backup-pre.yaml b/playbooks/test-borg-backup-pre.yaml new file mode 100644 index 0000000000..bfff0fbf07 --- /dev/null +++ b/playbooks/test-borg-backup-pre.yaml @@ -0,0 +1,31 @@ +- hosts: "borg-backup-server" + tasks: + - name: Setup backup area + file: + name: '/opt/backups/' + state: directory + mode: 0755 + owner: root + group: root + + # We put this in borg_retire_users and check it gets + # marked as retired + - name: Setup fake retired user + user: + name: borg-retired + home: '/opt/backups/borg-retired' + + # We put "borg-purge" in borg_purge_users and check its + # backup directory gets removed. 
+ - name: Setup fake purge user + user: + name: borg-purge + home: '/opt/backups/borg-purge' + + - name: Setup fake purge user backup directory + file: + name: '/opt/backups/borg-purge/backup' + state: directory + mode: 0755 + owner: borg-purge + group: borg-purge diff --git a/playbooks/zuul/run-base.yaml b/playbooks/zuul/run-base.yaml index 554365399a..9c363adfe8 100644 --- a/playbooks/zuul/run-base.yaml +++ b/playbooks/zuul/run-base.yaml @@ -140,6 +140,7 @@ - group_vars/zuul-merger.yaml - group_vars/zuul-scheduler.yaml - group_vars/zuul-web.yaml + - host_vars/borg-backup01.region.provider.opendev.org.yaml - host_vars/codesearch01.opendev.org.yaml - host_vars/etherpad99.opendev.org.yaml - host_vars/letsencrypt01.opendev.org.yaml diff --git a/playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 b/playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 new file mode 100644 index 0000000000..7de413411b --- /dev/null +++ b/playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 @@ -0,0 +1,4 @@ +borg_retire_users: + - borg-retired +borg_purge_users: + - borg-purge diff --git a/testinfra/test_borg_backups.py b/testinfra/test_borg_backups.py index 9975fc0459..17b37602a1 100644 --- a/testinfra/test_borg_backups.py +++ b/testinfra/test_borg_backups.py @@ -54,6 +54,14 @@ def test_borg_server_users(host): f = host.file(borg_repo) assert f.exists + # test retired stamp is made for host in retired group + f = host.file('/opt/backups/borg-retired/.retired') + assert f.exists + + # test purge for host in purge group + f = host.file('/opt/backups/borg-purge/backup') + assert not f.exists + def test_borg_backup_host_config(host): hostname = host.backend.get_hostname() if hostname == 'borg-backup01.region.provider.opendev.org': @@ -97,7 +105,19 @@ def test_borg_server_prune(host): if hostname != 'borg-backup01.region.provider.opendev.org': pytest.skip() - cmd = host.run('echo "prune" | 
/usr/local/bin/prune-borg-backups &> /var/log/prune-borg-backups.log') + # bit of a hack; instead of making a host, backing it up, and then + # retiring it -- which would require testing multiple runs of the + # backup process -- simulate the retired user being active by just + # making a small archive. This ensures the prune script works on a + # user flagged as retired. + cmd = host.run('dd if=/dev/urandom of=/tmp/borg-backup.random bs=1M count=5') + assert cmd.succeeded + cmd = host.run('sudo -u borg-retired /opt/borg/bin/borg init --encryption=none /opt/backups/borg-retired/backup') + assert cmd.succeeded + cmd = host.run('sudo -u borg-retired /opt/borg/bin/borg create /opt/backups/borg-retired/backup::test-9999-12-12T00:00:00 /tmp/borg-backup.random') + assert cmd.succeeded + + cmd = host.run('echo "prune" | NO_LOG_FILE=1 /usr/local/bin/prune-borg-backups &> /var/log/prune-borg-backups.log') assert cmd.succeeded def test_borg_server_verify(host): diff --git a/zuul.d/system-config-run.yaml b/zuul.d/system-config-run.yaml index 6dd62932b6..227b27d3b5 100644 --- a/zuul.d/system-config-run.yaml +++ b/zuul.d/system-config-run.yaml @@ -425,6 +425,7 @@ - <<: *bastion_group vars: run_playbooks: + - playbooks/test-borg-backup-pre.yaml - playbooks/service-borg-backup.yaml run_test_playbook: playbooks/test-borg-backup.yaml files: