jenkins-pipelines/scripts/helpers/archive-dir.sh

511 lines
16 KiB
Bash
Executable File

#!/bin/bash

# Base name of this script; handed to getopt as the name used in its messages
PROGNAME="${BASH_SOURCE[0]##*/}"

# Positional arguments, assigned once the options have been parsed
SRC_DIR=""
DST_DIR=""

# Values of --checksum-hardlink and --output-checksums
CHECKSUM_FILES_LIST_FILE=""
DST_CHECKSUMS_FILE=""

# Values of --owner and --group
CHANGE_OWNER=""
CHANGE_GROUP=""

# Value of -j/--jobs, and the --xtrace flag (0 or 1)
JOBS=1
XTRACE=0
# Print the help text on STDOUT, then terminate the script with status 0.
usage() {
    cat <<EOF
Usage: $0 [OPTIONS...] SRC_DIR DST_DIR TMP_DIR
Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.
-j,--jobs=N calculate checksums in parallel (default: 1)
--owner=OWNER set copied file's owner as specified
--group=GROUP set copied file's group as specified
--output-checksums=CK_FILE
save StxChecksums to this file; by default print it to
STDOUT
--checksum-hardlink=CK_LIST_FILE
Hardlink destination files if possible. CK_LIST_FILE
must contain a list of existing StxChecksums file names
from previously-archived directories, one per line.
We will use the files with matching properties & checksums
to create hard links in DST_DIR.
--xtrace Enable debug output
If executed by root, we will preserve owners/groups of the copied files,
unless they are overridden on the command line.
If this script is called by non-root, it will create all files with the
calling user's effective user & group ownership.
EOF
    exit 0
}
# Report a command-line problem on STDERR and terminate with status 1.
# Arguments: optional message words; when present they are printed after
#            "ERROR:" ahead of the hint about --help.
cmdline_error() {
    [[ "$#" -eq 0 ]] || echo "ERROR:" "$@" >&2
    echo "Type \`$0 --help' for more info" >&2
    exit 1
}
# Return 0 when every stage of the most recently executed pipeline exited
# with status 0, otherwise return 1.
# Call it directly after the pipeline: any command in between replaces
# PIPESTATUS.
check_pipe_status() {
    # Must be the first command: take a snapshot before PIPESTATUS changes
    local -a stage_rc=(${PIPESTATUS[*]})
    local rc
    for rc in "${stage_rc[@]}" ; do
        if [[ "$rc" -ne 0 ]] ; then
            return 1
        fi
    done
    return 0
}
# Process command line
#
# getopt(1) validates and reorders the options; its output is re-loaded into
# the positional parameters and consumed one option at a time. After the
# loop only the non-option arguments remain in "$@".
temp=$(getopt -o h,j: --long help,jobs:,owner:,group:,output-checksums:,checksum-hardlink:,xtrace -n "$PROGNAME" -- "$@") || cmdline_error
eval set -- "$temp"
while [[ "$#" -gt 0 ]] ; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        -j|--jobs)
            # Accept 1..99 inclusive. The pattern is checked first so that
            # only plain digits ever reach the arithmetic; 10# forces base 10
            # so that values such as "08" are not parsed as invalid octal.
            if [[ ! "$2" =~ ^[0-9]{1,2}$ ]] || (( 10#$2 < 1 )) ; then
                cmdline_error "$1 must be an integer [1..99]"
            fi
            JOBS=$(( 10#$2 ))
            shift 2
            ;;
        --owner)
            CHANGE_OWNER="$2"
            shift 2
            ;;
        --group)
            CHANGE_GROUP="$2"
            shift 2
            ;;
        --checksum-hardlink)
            CHECKSUM_FILES_LIST_FILE="$2"
            shift 2
            ;;
        --output-checksums)
            DST_CHECKSUMS_FILE="$2"
            shift 2
            ;;
        --xtrace)
            XTRACE=1
            shift
            ;;
        --)
            # end of options; the rest are positional arguments
            shift
            break
            ;;
        *)
            cmdline_error
            ;;
    esac
done
# Exactly three positional arguments must remain after option parsing
if [[ "$#" -lt 3 ]] ; then
    cmdline_error "not enough arguments"
fi
if [[ "$#" -gt 3 ]] ; then
    cmdline_error "too many arguments"
fi
SRC_DIR="$1"
DST_DIR="$2"
TMP_DIR="$3"

# EGID is exported to the worker shells further down; ask id(1) when it is
# empty
if [[ -z "$EGID" ]] ; then
    EGID="$(id -g)" || exit 1
fi

# --xtrace: trace everything from here on
if [[ "$XTRACE" -eq 1 ]] ; then
    set -x
fi
# Make sure look(1) is installed.
# Probe with "command -v" rather than running "look --help": it does not
# depend on the installed look accepting --help, and it prints no
# "command not found" noise when look is missing.
if ! command -v look >/dev/null 2>&1 ; then
    echo "This script requires \"look\" to be installed" >&2
    exit 1
fi

# Check for GNU parallel; process_lines() falls back to xargs without it
if parallel --help >/dev/null 2>&1 ; then
    GNU_PARALLEL_EXISTS=1
else
    GNU_PARALLEL_EXISTS=0
fi
set -e

#
# Combine checksum list files into one
#

#
# Usage: combine_checksum_lists LIST_FILE OUT_FILE
#
# LIST_FILE names one StxChecksums file per line; blank lines, "#" comments
# and files that do not exist are skipped. Every line of each StxChecksums
# file is copied to OUT_FILE with the directory of that StxChecksums file
# prepended to its second token (the file name). OUT_FILE is sorted in byte
# order, which is what look(1) needs for its binary search.
#
# Returns non-zero if reading/rewriting or sorting failed.
#
combine_checksum_lists() {
    local list_file="$1"
    local out_file="$2"
    local skip_re='^[[:space:]]*(#.*)?$'
    local checksums_file checksums_dir
    local -a stage_rc

    # "|| [[ -n ... ]]" keeps a last line that lacks a trailing newline
    while read -r checksums_file || [[ -n "$checksums_file" ]] ; do
        # skip empty lines and comments; tested in-shell so that nothing is
        # written into the pipe feeding the combined list
        [[ ! "$checksums_file" =~ $skip_re ]] || continue
        # skip missing files
        [[ -f "$checksums_file" ]] || continue
        # add file path to the second token (file name); lines that have no
        # second token are dropped (match() returns 0 for them)
        checksums_dir="$(dirname "$checksums_file")"
        awk -v "DIR=$checksums_dir/" '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) > 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }' \
            "$checksums_file" || exit 1
    done <"$list_file" | LC_ALL=C sort >"$out_file"
    stage_rc=("${PIPESTATUS[@]}")
    [[ "${stage_rc[0]}" -eq 0 && "${stage_rc[1]}" -eq 0 ]]
}

if [[ "$CHECKSUM_FILES_LIST_FILE" ]] ; then
    echo $'\n## Combining checksum lists into one' >&2
    combined_checksums_file="$TMP_DIR/combined_checksums.list"
    combine_checksum_lists "$CHECKSUM_FILES_LIST_FILE" "$combined_checksums_file" || exit 1
fi
#
# Create source file lists
#

# Create a list file with each source file or dir + their stat properties.
# Every line has the form
#   type=T owner=UID group=GID mode=MODE size=SIZE mtime=SECONDS name=PATH
# with PATH relative to SRC_DIR.
echo $'\n## Compiling file list: '"$SRC_DIR" >&2
full_list_file="$TMP_DIR/full.list"
# Both substitutions deliberately touch only the FIRST match on a line, which
# is always the field itself because the fields precede the file name:
#   - strip the leading "./" from the name= field
#   - drop the fractional part of the mtime= field
# (a global replace would also rewrite "mtime=N.N" inside a file name)
( cd "$SRC_DIR" && find . -printf 'type=%y owner=%U group=%G mode=%#m size=%s mtime=%T@ name=%p\n' ) \
    | sed -e 's#name=[.]/#name=#' -e 's#\(mtime=[0-9]\+\)[.][0-9]\+#\1#' \
    >"${full_list_file}"
check_pipe_status

# Create another list file that contains only regular files
# Sort by the last field "name=..."
regfile_list_file="$TMP_DIR/regfile.list"
\grep '^type=f' "$full_list_file" | sort -k 7 >"$regfile_list_file" || exit 1

# Create a list file that contains only directories
# Sort by the last field "name=..."
dir_list_file="$TMP_DIR/dir.list"
\grep '^type=d' "$full_list_file" | sort -k 7 >"$dir_list_file" || exit 1

# Create a list file that contains all other entries (non-dirs & non-files)
other_list_file="$TMP_DIR/other.list"
\grep '^type=[^df]' "$full_list_file" | sort -k 7 >"$other_list_file" || exit 1
#
# Usage: process_lines MESSAGE INPUT_FILE FUNC ARGS...
#
# Call shell function FUNC in parallel, similar to xargs.
# We will read lines from INPUT_FILE, then pass some subset of lines
# to FUNC many times in parallel, until all lines have been processed.
# Input lines will be appended as additional arguments to FUNC calls.
#
# FUNC and any global vars it references must be exported before
# calling process_lines().
#
# MESSAGE will be printed to STDERR before starting
#
# Globals read: JOBS, GNU_PARALLEL_EXISTS
# Returns: 0 if INPUT_FILE is empty or every FUNC call succeeded,
#          1 if the parallel runner reported a failure.
#
process_lines() {
    local message="$1" ; shift
    local input_file="$1" ; shift

    # how many input lines? bail out if 0
    local line_count
    line_count="$(wc -l <"$input_file")" || exit 1
    [[ "$line_count" -gt 0 ]] || return 0

    # How many lines to process at a time. The more the better, but with too
    # many some child jobs may starve -- cap it at 256
    local lines_per_job=256
    if [[ "$JOBS" -gt 1 ]] ; then
        # $(( )) instead of "let": "let" returns non-zero when the result is
        # 0, which would abort the caller under "set -e"
        lines_per_job=$(( line_count / JOBS / 2 ))
        if (( lines_per_job < 1 )) ; then
            lines_per_job=1
        elif (( lines_per_job > 256 )) ; then
            lines_per_job=256
        fi
    fi

    echo "** $message [JOBS=$JOBS lines_per_job=$lines_per_job]" >&2

    # Prefer GNU parallel because it can exit early
    local -a cmd
    if [[ $GNU_PARALLEL_EXISTS -eq 1 ]] ; then
        cmd=(parallel --halt now,fail=1 -q -r -d '\n' -n "$lines_per_job" -P "$JOBS" "$@")
    else
        # FUNC is an exported bash function, so the child must be bash
        # itself; $SHELL is the user's login shell and may be something else
        cmd=(xargs -r -d '\n' -n "$lines_per_job" -P "$JOBS" "${BASH:-bash}" -c '"$@"' unused_arg "$@")
    fi
    if ! "${cmd[@]}" <"$input_file" ; then
        echo "ERROR: command failed (\"$message\")" >&2
        return 1
    fi
}
#
# create directories in sort order, ie create parents before
# children
#
echo $'\n## Creating directories: '"$DST_DIR" >&2

# Layout of a list line. Anchoring at the start of the line guarantees that
# the fields are never picked up from inside a directory name, and that the
# name is everything after the FIRST "name=" (names may contain "name=").
dir_entry_re='^type=. owner=([0-9]+) group=([0-9]+) mode=[^ ]+ size=[^ ]+ mtime=[^ ]+ name=(.*)$'

# IFS= keeps trailing blanks that belong to a directory name
while IFS= read -r line ; do
    [[ -n "$line" ]] || continue
    if [[ ! "$line" =~ $dir_entry_re ]] ; then
        echo "ERROR: malformed directory list entry: $line" >&2
        exit 1
    fi
    owner="${BASH_REMATCH[1]}"
    group="${BASH_REMATCH[2]}"
    name="${BASH_REMATCH[3]}"

    # Owner/group: command-line override first; otherwise root preserves the
    # source values and non-root leaves them to install(1).
    # NOTE(review): the source directory's mode is not passed to install, so
    # directories get install's default mode -- confirm this is intended.
    install_args=()
    if [[ "$CHANGE_OWNER" ]] ; then
        install_args+=("--owner" "$CHANGE_OWNER")
    elif [[ $EUID -eq 0 ]] ; then
        install_args+=("--owner" "$owner")
    fi
    if [[ "$CHANGE_GROUP" ]] ; then
        install_args+=("--group" "$CHANGE_GROUP")
    elif [[ $EUID -eq 0 ]] ; then
        install_args+=("--group" "$group")
    fi

    echo " MKDIR $name" >&2
    # a non-directory is in the way: remove it
    if [[ -e "$DST_DIR/$name" && ! -d "$DST_DIR/$name" ]] ; then
        \rm "$DST_DIR/$name" || exit 1
    fi
    # Explicit check: this loop is the left-hand side of "||", where
    # "set -e" is ignored, so a failing install would otherwise go unnoticed
    # unless it happened to be the last one.
    install -d "${install_args[@]}" "$DST_DIR/$name" || exit 1
done <"$dir_list_file" || exit 1
#
# Copy or hardlink regular files
#
echo $'\n## Copying regular files: '"$SRC_DIR" >&2

# helper function to process regular files
#
# Arguments: lines from the regular-file list, each of the form
#   type=f owner=UID group=GID mode=MODE size=SIZE mtime=SECONDS name=PATH
# For every line the file SRC_DIR/PATH is either hard-linked from a matching
# previously-archived file (when combined_checksums_file is set and a match
# is found) or copied to DST_DIR/PATH.
# Outputs: one StxChecksums line per file on STDOUT:
#   CHECKSUM PATH SIZE MTIME DEV INODE DST_DIR/PATH
# Progress and error messages go to STDERR.
# Returns/exits non-zero on the first failure.
#
# global vars used:
# SRC_DIR
# DST_DIR
# CHANGE_OWNER
# CHANGE_GROUP
# EUID (always defined by bash)
# EGID
# TMP_DIR
# XTRACE
# combined_checksums_file
process_regfiles() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi

    # Temp file generated by this function. Its name must be unique to
    # prevent interference from other jobs with -j N.
    local matching_checksums_file
    matching_checksums_file="$TMP_DIR/matching_checksums-$$.list"

    # Layout of a list line. Anchoring at the start of the line guarantees
    # that fields are never picked up from inside a file name, and that the
    # name is everything after the FIRST "name=" (names may contain "name=").
    local entry_re='^type=. owner=([0-9]+) group=([0-9]+) mode=([^[:space:]]+) size=([^[:space:]]+) mtime=([^[:space:]]+) name=(.*)$'

    local line
    for line in "$@" ; do
        [[ -n "$line" ]] || continue
        if [[ ! "$line" =~ $entry_re ]] ; then
            flock -s "$DST_DIR" echo "ERROR: malformed file list entry: $line" >&2
            return 1
        fi

        # source file properties; name is relative to SRC_DIR
        local src_owner="${BASH_REMATCH[1]}"
        local src_group="${BASH_REMATCH[2]}"
        local mode="${BASH_REMATCH[3]}"
        local size="${BASH_REMATCH[4]}"
        local mtime="${BASH_REMATCH[5]}"
        local name="${BASH_REMATCH[6]}"
        [[ -n "$name" ]] || continue

        # source checksum; reading via STDIN keeps sha256sum from escaping
        # its output line when the file name contains special characters
        local checksum
        #flock -s "$DST_DIR" echo " SHA256 $name" >&2
        checksum="$(sha256sum <"$SRC_DIR/$name" | awk '{print $1}')"
        if [[ ! "$checksum" ]] ; then
            flock -s "$DST_DIR" echo "$SRC_DIR/$name: failed to calculate checksum" >&2
            return 1
        fi

        # source owner; or a user-provided override
        local -a install_args=()
        local owner
        if [[ "$CHANGE_OWNER" ]] ; then
            owner="$CHANGE_OWNER"
            install_args+=("--owner" "$owner")
        elif [[ $EUID -eq 0 ]] ; then
            owner="$src_owner"
            install_args+=("--owner" "$owner")
        else
            owner=$EUID
        fi

        # source group; or a user-provided override.
        # Preserving the source group is tied to running as root (EUID 0),
        # the same rule the directory-creation loop and the usage text apply.
        local group
        if [[ "$CHANGE_GROUP" ]] ; then
            group="$CHANGE_GROUP"
            install_args+=("--group" "$group")
        elif [[ $EUID -eq 0 ]] ; then
            group="$src_group"
            install_args+=("--group" "$group")
        else
            group=$EGID
        fi

        # Search for the checksum in an older StxChecksums file
        if [[ "$combined_checksums_file" ]] ; then
            if look "$checksum " "$combined_checksums_file" >"$matching_checksums_file" ; then
                # The subshell exits 0 as soon as a hard link was created and
                # 1 when no candidate was usable.
                if (
                    # As we read previously-archived files properties from StxChecksums,
                    # make sure they have not changed compared to the actual files on disk.
                    # NOTE(review): the properties are checked on ref_path but the
                    # link is created from ref_name -- presumably both refer to
                    # the same file; confirm.
                    while read -r ref_checksum ref_name ref_size ref_mtime ref_dev ref_inode ref_path x_rest ; do
                        [[ -f "$ref_path" ]] || continue

                        # read on-disk file properties
                        local ref_stat
                        ref_stat=($(stat -c '%s %Y %u %g %#04a' "$ref_path" || true))
                        [[ "${#ref_stat[@]}" -eq 5 ]] || continue

                        # on-disk size does not match StxChecksums
                        local ref_ondisk_size
                        ref_ondisk_size="${ref_stat[0]}"
                        [[ "$ref_size" == "$ref_ondisk_size" ]] || continue

                        # on-disk mtime does not match StxChecksums
                        local ref_ondisk_mtime
                        ref_ondisk_mtime="${ref_stat[1]}"
                        [[ "${ref_mtime}" == "$ref_ondisk_mtime" ]] || continue

                        # on-disk owner does not match requested owner
                        local ref_ondisk_owner
                        ref_ondisk_owner="${ref_stat[2]}"
                        [[ "${owner}" == "$ref_ondisk_owner" ]] || continue

                        # on-disk group does not match requested group
                        local ref_ondisk_group
                        ref_ondisk_group="${ref_stat[3]}"
                        [[ "${group}" == "$ref_ondisk_group" ]] || continue

                        # on-disk mode does not match the mode of the source file
                        local ref_ondisk_mode
                        ref_ondisk_mode="${ref_stat[4]}"
                        [[ "${mode}" == "$ref_ondisk_mode" ]] || continue

                        # At this point checksum, size, mtime, mode, owner and group of the
                        # existing file match with the file we are trying to copy.
                        # Use that file to create a hardlink.
                        flock -s "$DST_DIR" echo " LINK $name (from $ref_name)" >&2
                        if ln -f "$ref_name" "${DST_DIR}/$name" ; then
                            flock -s "$DST_DIR" echo "$checksum $name $ref_size $ref_mtime $ref_dev $ref_inode $DST_DIR/$name"
                            exit 0
                        fi
                    done <"$matching_checksums_file"
                    # checksum not found in older archives
                    exit 1
                ) ; then
                    continue
                fi
            fi
        fi

        # No matching files found: really copy it to $DST_DIR, replacing
        # whatever is there already
        flock -s "$DST_DIR" echo " COPY $name" >&2
        rm -f "$DST_DIR/$name" || exit 1
        install --preserve-timestamps "${install_args[@]}" --mode="$mode" -T "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1

        # check destination file properties
        local -a dst_stat
        local dst_size dst_dev dst_ino
        dst_stat=($(stat -c '%s %d %i' "$DST_DIR/$name")) || exit 1
        dst_size="${dst_stat[0]}"
        dst_dev="${dst_stat[1]}"
        dst_ino="${dst_stat[2]}"

        # file changed while copying
        if [[ "$dst_size" != "$size" ]] ; then
            flock -s "$DST_DIR" echo "ERROR: $SRC_DIR/$name changed while copying!" >&2
            exit 1
        fi

        # print out a line for StxChecksums using source file properties (preserved
        # during copying), but with destination file's dev & ino.
        flock -s "$DST_DIR" echo "$checksum $name $size $mtime $dst_dev $dst_ino $DST_DIR/$name"
    done

    rm -f "$matching_checksums_file"
}
# process files in parallel
#
# The StxChecksums lines printed by the workers are sorted and written either
# to DST_CHECKSUMS_FILE (when given) or to STDOUT.
(
    if [[ "$DST_CHECKSUMS_FILE" ]] ; then
        dst_checksums_fd=5
        # Open with ">" so that an existing file is truncated; "<>" would
        # leave stale trailing lines behind whenever the previous contents
        # were longer than the new ones.
        exec 5>"$DST_CHECKSUMS_FILE" || exit 1
    else
        dst_checksums_fd=1
    fi

    # the workers run in separate shells: export everything they read
    export SRC_DIR \
        DST_DIR \
        CHANGE_OWNER \
        CHANGE_GROUP \
        EGID \
        TMP_DIR \
        XTRACE \
        combined_checksums_file
    export -f process_regfiles

    message="processing regular files"
    process_lines "$message" "$regfile_list_file" process_regfiles | sort >&$dst_checksums_fd
    # snapshot first: the next command replaces PIPESTATUS
    stage_rc=("${PIPESTATUS[@]}")
    [[ "${stage_rc[0]}" -eq 0 && "${stage_rc[1]}" -eq 0 ]] || exit 1
) || exit 1
#
# copy special files
#
echo $'\n## Copying special files: '"$DST_DIR" >&2

# helper function for processing special files
#
# Arguments: lines from the "other" list (neither directories nor regular
# files), each of the form
#   type=T owner=UID group=GID mode=MODE size=SIZE mtime=SECONDS name=PATH
# Each SRC_DIR/PATH is re-created as DST_DIR/PATH with "cp -a"; the owner
# and/or group are then changed when overrides were given.
# Exits non-zero on the first failure.
#
# global vars used:
# SRC_DIR
# DST_DIR
# CHANGE_OWNER
# CHANGE_GROUP
# XTRACE
process_other() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi

    # Anchored at the start of the line so that the name is everything after
    # the FIRST "name=" (names may contain "name=").
    local entry_re='^type=(.) owner=[^ ]+ group=[^ ]+ mode=[^ ]+ size=[^ ]+ mtime=[^ ]+ name=(.*)$'

    local line
    for line in "$@" ; do
        [[ -n "$line" ]] || continue
        if [[ ! "$line" =~ $entry_re ]] ; then
            flock -s "$DST_DIR" echo "ERROR: malformed file list entry: $line" >&2
            exit 1
        fi
        local type="${BASH_REMATCH[1]}"
        local name="${BASH_REMATCH[2]}"
        [[ -n "$name" ]] || continue

        flock -s "$DST_DIR" echo " CREATE type=$type $name" >&2

        # Remove whatever is in the way. "rm -f" also removes a dangling
        # symlink, which a "-e" test would not see.
        rm -f "$DST_DIR/$name" || exit 1
        cp -a --no-dereference "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1

        # apply owner/group overrides as OWNER, :GROUP or OWNER:GROUP
        if [[ "$CHANGE_OWNER" || "$CHANGE_GROUP" ]] ; then
            local chown_arg=
            if [[ "$CHANGE_OWNER" ]] ; then
                chown_arg="$CHANGE_OWNER"
            fi
            if [[ "$CHANGE_GROUP" ]] ; then
                chown_arg+=":$CHANGE_GROUP"
            fi
            chown --no-dereference "$chown_arg" "$DST_DIR/$name" || exit 1
        fi
    done
}
# process them in parallel
(
    # the workers run in separate shells: export what process_other reads
    export SRC_DIR DST_DIR CHANGE_OWNER CHANGE_GROUP XTRACE
    export -f process_other

    message="processing other files"
    if ! process_lines "$message" "$other_list_file" process_other ; then
        exit 1
    fi
) || exit 1