#!/bin/bash
#
# Archive SRC_DIR into DST_DIR, using TMP_DIR for temporary files.
#
# Regular files are checksummed (sha256) and either hard-linked to an
# identical file listed in a previously generated StxChecksums file, or
# copied. One StxChecksums line per regular file is written to STDOUT or
# to the file given by --output-checksums:
#
#   CHECKSUM NAME SIZE MTIME DEV INODE FULL_PATH
#
# Requires GNU getopt/find/install/stat/xargs and "look"; GNU parallel is
# used when available.
#

PROGNAME="${BASH_SOURCE[0]##*/}"
SRC_DIR=
DST_DIR=
TMP_DIR=
CHECKSUM_FILES_LIST_FILE=
DST_CHECKSUMS_FILE=
CHANGE_OWNER=
CHANGE_GROUP=
JOBS=1
XTRACE=0

# Print the help text to STDOUT and exit successfully.
usage() {
    echo -n "\
Usage: $0 [OPTIONS...] SRC_DIR DST_DIR TMP_DIR
Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.

 -j,--jobs=N       calculate checksums in parallel (default: 1)
    --owner=OWNER  set copied file's owner as specified
    --group=GROUP  set copied file's group as specified

    --output-checksums=CK_FILE
                   save StxChecksums to this file; by default print it to
                   STDOUT

    --checksum-hardlink=CK_LIST_FILE
                   Hardlink destination files if possible. CK_LIST_FILE
                   must contain a list of existing StxChecksums file names
                   from previously-archived directories, one per line.
                   We will use the files with matching properties & checksums
                   to create hard links in DST_DIR.

    --xtrace       Enable debug output

If executed by root, we will preserve owners/groups of the copied files,
unless they are overridden on the command line.

If this script is called by non-root, it will create all files with the
calling user's effective user & group ownership.

"
    exit 0
}

# Usage: cmdline_error [MESSAGE...]
# Print an optional error message plus a hint to STDERR, then exit 1.
cmdline_error() {
    if [[ "$#" -gt 0 ]] ; then
        echo "ERROR:" "$@" >&2
    fi
    echo "Type \`$0 --help' for more info" >&2
    exit 1
}

# Return 0 only if every stage of the pipeline that ran immediately before
# this call exited with status 0.
check_pipe_status() {
    # PIPESTATUS must be captured by the very first command in here:
    # any command that completes overwrites it.
    local -a pipestatus=("${PIPESTATUS[@]}")
    local status
    for status in "${pipestatus[@]}" ; do
        [[ "$status" -eq 0 ]] || return 1
    done
    return 0
}

# Process command line
temp=$(getopt -o hj: --long help,jobs:,owner:,group:,output-checksums:,checksum-hardlink:,xtrace -n "$PROGNAME" -- "$@") || cmdline_error
eval set -- "$temp"
while [[ "$#" -gt 0 ]] ; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        -j|--jobs)
            JOBS="$2"
            # The regex already caps the value at 99. Force base 10 so
            # that inputs such as "08" are not parsed as (invalid) octal.
            if [[ ! "$JOBS" =~ ^[0-9]{1,2}$ ]] || (( 10#$JOBS < 1 )) ; then
                cmdline_error "$1 must be an integer [1..99]"
            fi
            JOBS=$(( 10#$JOBS ))
            shift 2
            ;;
        --owner)
            CHANGE_OWNER="$2"
            shift 2
            ;;
        --group)
            CHANGE_GROUP="$2"
            shift 2
            ;;
        --checksum-hardlink)
            CHECKSUM_FILES_LIST_FILE="$2"
            shift 2
            ;;
        --output-checksums)
            DST_CHECKSUMS_FILE="$2"
            shift 2
            ;;
        --xtrace)
            XTRACE=1
            shift
            ;;
        --)
            shift
            break
            ;;
        *)
            cmdline_error
            ;;
    esac
done
[[ "$#" -ge 3 ]] || cmdline_error "not enough arguments"
[[ "$#" -le 3 ]] || cmdline_error "too many arguments"
SRC_DIR="$1"
DST_DIR="$2"
TMP_DIR="$3"

# bash provides EUID but not EGID
if [[ ! "$EGID" ]] ; then
    EGID="$(id -g)" || exit 1
fi

if [[ $XTRACE -eq 1 ]] ; then
    set -x
fi

# Make sure "look" is installed. Only test that the command exists,
# not that it understands any particular option.
if ! command -v look >/dev/null ; then
    echo "This script requires \"look\" to be installed" >&2
    exit 1
fi

# Check for GNU parallel
if parallel --help >/dev/null 2>&1 ; then
    GNU_PARALLEL_EXISTS=1
else
    GNU_PARALLEL_EXISTS=0
fi

set -e

#
# Combine checksum list files into one
#
if [[ "$CHECKSUM_FILES_LIST_FILE" ]] ; then
    echo $'\n## Combining checksum lists into one' >&2
    combined_checksums_file="$TMP_DIR/combined_checksums.list"
    # kept in a variable because "#" is awkward to write inline in [[ =~ ]]
    skip_line_re='^[[:space:]]*(#.*)?$'
    while read -r checksums_file ; do
        # Skip empty lines and comments. This must not print anything:
        # STDOUT of this loop is the combined checksums list.
        if [[ "$checksums_file" =~ $skip_line_re ]] ; then
            continue
        fi
        # skip missing files
        [[ -f "$checksums_file" ]] || continue
        # Prefix the second token (file name) with the directory that
        # contains the StxChecksums file. Lines that do not start with
        # "TOKEN WHITESPACE" are dropped.
        checksums_dir="$(dirname "$checksums_file")"
        awk -v "DIR=$checksums_dir/" '{
            if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) > 0)
                print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1)
        }' "$checksums_file"
    done <"$CHECKSUM_FILES_LIST_FILE" \
        | LC_ALL=C sort >"$combined_checksums_file"   # byte order, for look(1)
    check_pipe_status
fi

#
# Create source file lists
#

# Create a list file with each source file or dir + their stat properties.
# Line format:
#   type=T owner=UID group=GID mode=MODE size=BYTES mtime=SECONDS name=PATH
# PATH is relative to SRC_DIR and is always the last field, because it may
# contain spaces.
echo $'\n## Compiling file list: '"$SRC_DIR" >&2
full_list_file="$TMP_DIR/full.list"
(
    cd "$SRC_DIR" && find . -printf 'type=%y owner=%U group=%G mode=%#m size=%s mtime=%T@ name=%p\n'
) \
    | sed -e 's#name=[.]/#name=#' \
          -e 's#\(mtime=[0-9]\+\)[.][0-9]\+#\1#' \
    >"${full_list_file}"
# The sed expressions above deliberately replace only the first occurrence
# on each line (the real field), so that file names which happen to
# contain "mtime=1.5" or "name=./" are left alone.
check_pipe_status

# Create another list file that contains only regular files
regfile_list_file="$TMP_DIR/regfile.list"
\grep '^type=f' "$full_list_file" | sort -k 7 >"$regfile_list_file" || exit 1

# Create a list file that contains only directories
# Sort by the last field "name=..."
dir_list_file="$TMP_DIR/dir.list"
\grep '^type=d' "$full_list_file" | sort -k 7 >"$dir_list_file" || exit 1

# Create a list file that contains all other entries (non-dirs & non-files)
other_list_file="$TMP_DIR/other.list"
\grep '^type=[^df]' "$full_list_file" | sort -k 7 >"$other_list_file" || exit 1

#
# Usage: process_lines MESSAGE INPUT_FILE FUNC ARGS...
#
# Call shell function FUNC in parallel, similar to xargs.
# We will read lines from INPUT_FILE, then pass some subset of lines
# to FUNC many times in parallel, until all lines have been processed.
# Input lines will be appended as additional arguments to FUNC calls.
#
# FUNC and any global vars it references must be exported before
# calling process_lines().
#
# MESSAGE will be printed to STDERR before starting
#
# Globals read: JOBS, GNU_PARALLEL_EXISTS
# Returns: 0 on success or empty input, 1 if any FUNC invocation failed
#
process_lines() {
    local message="$1" ; shift
    local input_file="$1" ; shift

    # how many input lines? bail out if 0
    local line_count
    line_count="$(wc -l <"$input_file")" || exit 1
    [[ "$line_count" -gt 0 ]] || return 0

    # How many lines to process at a time. The more the better, but with too
    # many some child jobs may starve -- cap it at 256
    local lines_per_job
    if [[ "$JOBS" -gt 1 ]] ; then
        lines_per_job=$(( line_count / JOBS / 2 ))
        if [[ "$lines_per_job" -eq 0 ]] ; then
            lines_per_job=1
        elif [[ "$lines_per_job" -gt 256 ]] ; then
            lines_per_job=256
        fi
    else
        lines_per_job=256
    fi

    echo "** $message [JOBS=$JOBS lines_per_job=$lines_per_job]" >&2

    # Prefer GNU parallel because it can exit early
    local -a cmd
    if [[ $GNU_PARALLEL_EXISTS -eq 1 ]] ; then
        cmd=(parallel --halt now,fail=1 -q -r -d '\n' -n "$lines_per_job" -P "$JOBS" "$@")
    else
        # FUNC is a function exported with "export -f"; only bash can
        # import it, so run the children with bash rather than $SHELL.
        cmd=(xargs -r -d '\n' -n "$lines_per_job" -P "$JOBS" "${BASH:-bash}" -c '"$@"' unused_arg "$@")
    fi
    if ! "${cmd[@]}" <"$input_file" ; then
        echo "ERROR: command failed (\"$message\")" >&2
        return 1
    fi
}

#
# create directories in sort order, ie create parents before
# children
#
# Directories are created with install(1)'s default mode; the source
# directory's mode is not applied.
#
echo $'\n## Creating directories: '"$DST_DIR" >&2
# Everything in front of " name=" in a list line; fields: owner, group
dir_props_re='^type=. owner=([0-9]+) group=([0-9]+) mode=[^ ]+ size=[^ ]+ mtime=[^ ]+$'
while read -r line ; do
    [[ -n "$line" ]] || continue
    # Split at the FIRST " name=": the path itself may contain "name="
    props="${line%% name=*}"
    name="${line#* name=}"
    if [[ "$name" == "$line" || ! "$props" =~ $dir_props_re ]] ; then
        echo "ERROR: malformed file list entry: $line" >&2
        exit 1
    fi
    owner="${BASH_REMATCH[1]}"
    group="${BASH_REMATCH[2]}"

    install_args=()
    if [[ "$CHANGE_OWNER" ]] ; then
        install_args+=("--owner" "$CHANGE_OWNER")
    elif [[ $EUID -eq 0 ]] ; then
        install_args+=("--owner" "$owner")
    fi
    if [[ "$CHANGE_GROUP" ]] ; then
        install_args+=("--group" "$CHANGE_GROUP")
    elif [[ $EUID -eq 0 ]] ; then
        install_args+=("--group" "$group")
    fi

    echo "  MKDIR $name" >&2
    if [[ -e "$DST_DIR/$name" && ! -d "$DST_DIR/$name" ]] ; then
        \rm "$DST_DIR/$name" || exit 1
    fi
    # "set -e" is not honored inside a loop that is followed by "||",
    # so failures must be checked explicitly.
    install -d "${install_args[@]}" "$DST_DIR/$name" || exit 1
done <"$dir_list_file" || exit 1

#
# Copy or hardlink regular files
#
echo $'\n## Copying regular files: '"$SRC_DIR" >&2

# Usage: process_regfiles LIST_LINE...
#
# Helper function to process regular files. Each LIST_LINE is one line
# of the regular-file list. For every file: calculate its sha256, try to
# hard-link it from a matching previously-archived file, otherwise copy
# it; then print its StxChecksums line to STDOUT.
#
# global vars used:
#   SRC_DIR
#   DST_DIR
#   CHANGE_OWNER
#   CHANGE_GROUP
#   EUID (always defined by bash)
#   EGID
#   TMP_DIR
#   XTRACE
#   combined_checksums_file
#
# NOTE(review): progress/output lines are wrapped in "flock -s", a shared
# lock, which does not exclude other shared-lock holders -- confirm
# whether an exclusive lock was intended.
process_regfiles() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi

    # Temp file generated by this function. Its name must be unique to
    # prevent interference from other jobs with -j N.
    local matching_checksums_file
    matching_checksums_file="$TMP_DIR/matching_checksums-$$.list"

    # Everything in front of " name=" in a list line;
    # fields: owner, group, mode, size, mtime
    local props_re='^type=. owner=([0-9]+) group=([0-9]+) mode=([^ ]+) size=([^ ]+) mtime=([^ ]+)$'

    local line
    for line in "$@" ; do
        [[ -n "$line" ]] || continue

        # Source file name relative to SRC_DIR, and the properties in
        # front of it. Split at the FIRST " name=": the file name itself
        # may contain "name=" or text that looks like another field.
        local props name
        props="${line%% name=*}"
        name="${line#* name=}"
        if [[ "$name" == "$line" || ! "$props" =~ $props_re ]] ; then
            echo "ERROR: malformed file list entry: $line" >&2
            return 1
        fi
        local src_owner="${BASH_REMATCH[1]}"
        local src_group="${BASH_REMATCH[2]}"
        local mode="${BASH_REMATCH[3]}"
        local size="${BASH_REMATCH[4]}"
        local mtime="${BASH_REMATCH[5]}"
        [[ "$name" ]] || continue

        # source checksum
        local checksum
        #flock -s "$DST_DIR" echo "  SHA256 $name" >&2
        checksum="$(sha256sum "$SRC_DIR/$name" | awk '{print $1}')"
        if [[ ! "$checksum" ]] ; then
            flock -s "$DST_DIR" echo "$SRC_DIR/$name: failed to calculate checksum" >&2
            return 1
        fi

        # source owner; or a user-provided override
        # NOTE(review): an override given as a name (rather than a numeric
        # id) can never equal the numeric id printed by stat below, so
        # hard-linking is effectively disabled in that case.
        local -a install_args=()
        local owner
        if [[ "$CHANGE_OWNER" ]] ; then
            owner="$CHANGE_OWNER"
            install_args+=("--owner" "$owner")
        elif [[ $EUID -eq 0 ]] ; then
            owner="$src_owner"
            install_args+=("--owner" "$owner")
        else
            owner=$EUID
        fi

        # source group; or a user-provided override. As with the owner
        # and with directories, the source group is preserved only when
        # running as root.
        local group
        if [[ "$CHANGE_GROUP" ]] ; then
            group="$CHANGE_GROUP"
            install_args+=("--group" "$group")
        elif [[ $EUID -eq 0 ]] ; then
            group="$src_group"
            install_args+=("--group" "$group")
        else
            group=$EGID
        fi

        # Search for the checksum in an older StxChecksums file
        if [[ "$combined_checksums_file" ]] ; then
            if look "$checksum " "$combined_checksums_file" >"$matching_checksums_file" ; then
                (
                    # As we read previously-archived files properties from StxChecksums,
                    # make sure they have not changed compared to the actual files on disk.
                    while read -r ref_checksum ref_name ref_size ref_mtime ref_dev ref_inode ref_path x_rest ; do
                        [[ -f "$ref_path" ]] || continue
                        # read on-disk file properties:
                        # size, mtime, uid, gid, mode
                        local ref_stat
                        ref_stat=($(stat -c '%s %Y %u %g %#04a' "$ref_path" || true))
                        [[ "${#ref_stat[@]}" -eq 5 ]] || continue

                        # on-disk size does not match StxChecksums
                        [[ "$ref_size" == "${ref_stat[0]}" ]] || continue
                        # on-disk mtime does not match StxChecksums
                        [[ "$ref_mtime" == "${ref_stat[1]}" ]] || continue
                        # on-disk owner does not match requested owner
                        [[ "$owner" == "${ref_stat[2]}" ]] || continue
                        # on-disk group does not match requested group
                        [[ "$group" == "${ref_stat[3]}" ]] || continue
                        # on-disk mode does not match the mode of the source file
                        [[ "$mode" == "${ref_stat[4]}" ]] || continue

                        # At this point checksum, size, mtime, mode, owner and group of the
                        # existing file match with the file we are trying to copy.
                        # Use that file to create a hardlink.
                        flock -s "$DST_DIR" echo "  LINK $name (from $ref_name)" >&2
                        if ln -f "$ref_name" "${DST_DIR}/$name" ; then
                            flock -s "$DST_DIR" echo "$checksum $name $ref_size $ref_mtime $ref_dev $ref_inode $DST_DIR/$name"
                            exit 0
                        fi
                    done <"$matching_checksums_file"
                    # checksum not found in older archives
                    exit 1
                ) && continue || true
            fi
        fi

        # No matching files found: really copy it to $DST_DIR
        flock -s "$DST_DIR" echo "  COPY $name" >&2
        rm -f "$DST_DIR/$name" || exit 1
        install --preserve-timestamps "${install_args[@]}" --mode="$mode" -T "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1

        # check destination file properties
        local dst_stat dst_size dst_dev dst_ino
        dst_stat=($(stat -c '%s %d %i' "$DST_DIR/$name")) || exit 1
        dst_size="${dst_stat[0]}"
        dst_dev="${dst_stat[1]}"
        dst_ino="${dst_stat[2]}"

        # file changed while copying
        if [[ "$dst_size" != "$size" ]] ; then
            flock -s "$DST_DIR" echo "ERROR: $SRC_DIR/$name changed while copying!" >&2
            exit 1
        fi

        # print out a line for StxChecksums using source file properties (preserved
        # during copying), but with destination file's dev & ino.
        flock -s "$DST_DIR" echo "$checksum $name $size $mtime $dst_dev $dst_ino $DST_DIR/$name"
    done

    rm -f "$matching_checksums_file"
}

# process files in parallel
(
    if [[ "$DST_CHECKSUMS_FILE" ]] ; then
        dst_checksums_fd=5
        # Truncate: lines left over from an older, longer file must not
        # survive after the new content.
        exec 5>"$DST_CHECKSUMS_FILE" || exit 1
    else
        dst_checksums_fd=1
    fi

    export SRC_DIR \
           DST_DIR \
           CHANGE_OWNER \
           CHANGE_GROUP \
           EGID \
           TMP_DIR \
           XTRACE \
           combined_checksums_file

    export -f process_regfiles

    message="processing regular files"
    process_lines "$message" "$regfile_list_file" process_regfiles | sort >&$dst_checksums_fd
    [[ "${PIPESTATUS[0]}" -eq 0 && "${PIPESTATUS[1]}" -eq 0 ]] || exit 1
) || exit 1

#
# copy special files
#
echo $'\n## Copying special files: '"$DST_DIR" >&2

# Usage: process_other LIST_LINE...
#
# Helper function for processing special files (anything that is neither
# a directory nor a regular file). Each LIST_LINE is one line of the
# "other" list. Entries are re-created with "cp -a --no-dereference";
# ownership is changed only when overridden on the command line.
#
# global vars used:
#   SRC_DIR
#   DST_DIR
#   CHANGE_OWNER
#   CHANGE_GROUP
#   XTRACE
process_other() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi
    local line
    for line in "$@" ; do
        [[ -n "$line" ]] || continue

        # Split at the FIRST " name=": the path itself may contain "name="
        local name
        name="${line#* name=}"
        if [[ "$name" == "$line" ]] ; then
            echo "ERROR: malformed file list entry: $line" >&2
            return 1
        fi
        [[ -n "$name" ]] || continue

        # entry type letter (first field); only used for the log message
        local type
        type="${line#type=}"
        type="${type%% *}"
        [[ -n "$type" ]] || continue

        flock -s "$DST_DIR" echo "  CREATE type=$type $name" >&2
        # -e is false for a dangling symlink, hence the extra -L test
        if [[ -e "$DST_DIR/$name" || -L "$DST_DIR/$name" ]] ; then
            rm "$DST_DIR/$name" || exit 1
        fi
        cp -a --no-dereference "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1
        if [[ "$CHANGE_OWNER" || "$CHANGE_GROUP" ]] ; then
            # builds "OWNER", ":GROUP" or "OWNER:GROUP"
            local chown_arg=
            if [[ "$CHANGE_OWNER" ]] ; then
                chown_arg="$CHANGE_OWNER"
            fi
            if [[ "$CHANGE_GROUP" ]] ; then
                chown_arg+=":$CHANGE_GROUP"
            fi
            chown --no-dereference "$chown_arg" "$DST_DIR/$name" || exit 1
        fi
    done
}

# process them in parallel
(
    export SRC_DIR \
           DST_DIR \
           CHANGE_OWNER \
           CHANGE_GROUP \
           XTRACE

    export -f process_other

    message="processing other files"
    process_lines "$message" "$other_list_file" process_other || exit 1
) || exit 1