The old script was slow and didn't preserve hardlinks within the source
set. This script doesn't link files that are identical within the source
set, ie same checksum & attributes but different inode. It can only link
such files to similar files from older builds. This deficiency will be
addressed in a separate commit.

TESTS
===================
* Manually tested various input directories, including:
  - a directory that contains each type of file (regular, devices,
    sockets, symlinks, etc)
  - old index files with spaces in file names
* On a build with a dozen or so historical builds, copied the
  "aptly" directory and compared timing and destination directory
  size before/after this patch:
  - old script: time=4m13s size=56.0G
  - new script: time=14s size=6.1G
* Ran a Jenkins build that rebuilds one package and doesn't
  clean/rebuild the ISO; verified that "archive-misc" works as expected.
Change-Id: Ic8f8931c4143bc355db1ccbad56ed772c0f3081e
Signed-off-by: Davlet Panech <davlet.panech@windriver.com>
#!/usr/bin/env python3

import sys

assert sys.version_info >= (3, 9), "Python >= 3.9 is required"

HELP="""\
Usage: archive-dir.py [<OPTIONS>...] <SRC_DIR> <DST_DIR> <TMP_DIR>
                      [<OLD_INDEX_FILES>...]

Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.

Create the index file, DST_DIR/StxChecksums.

With --checksum-hardlink, attempt to link identical files from older builds
instead of copying them.


  -v,--verbose      be verbose

  -j,--jobs=N       perform various operations in parallel (default: 1)

  --owner=OWNER     set destination files' owner; requires root
                    privileges

  --group=GROUP     set destination files' group as specified; requires root
                    privileges, or the current user must be a member of GROUP

  --checksum-hardlink
                    Hardlink destination files if possible. You must provide
                    one or more index files (StxChecksums) generated by older
                    builds. We will use the files with matching properties &
                    checksums to create hard links in DST_DIR.

  --old-index-files-from=OLD_INDEX_LIST_FILE
                    Read additional index file names from OLD_INDEX_LIST_FILE

  --reflink         Create light-weight (COW) file copies if possible. This
                    only applies when copying (ie when no link candidates are
                    found)

  --skip-existing   Skip files that already exist at the destination. We
                    still need to calculate their checksums in order to
                    create the index, but we will skip the copy.

  --keep-temp-files
                    Normally we delete temporary files upon successful
                    completion; this option keeps them.

"""
# FIXME: this doesn't link files that are identical within the source set,
# ie same checksum & attributes, but different inode. It can only link
# such files to similar files from older builds.

import argparse
from collections.abc import Iterable
from collections.abc import Callable
from dataclasses import dataclass
import grp
import hashlib
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import pwd
import re
import shutil
import shlex
import stat
import subprocess
from typing import TextIO

JOBS = 1
CHECKSUM_READ_SIZE = 4 * 1024 * 1024  # 4 MiB
COPY_REFLINK = False
OLD_INDEX_FILES = []
SKIP_EXISTING = False
SRC_DIR = None
DST_DIR = None
TMP_DIR = None
CHANGE_UID = None
CHANGE_GID = None
VERBOSITY = 0
CURRENT_GID_LIST = []
OUTPUT_INDEX_FILE = None
KEEP_TEMP_FILES = False
def log_error(msg:str)->None:
    print('ERROR: %s' % msg, file=sys.stderr)

def log_warn(msg:str)->None:
    print('WARNING: %s' % msg, file=sys.stderr)

def log_info(msg:str)->None:
    print('%s' % msg, file=sys.stderr)

def log_debug(msg:str)->None:
    if VERBOSITY > 0:
        print('%s' % msg, file=sys.stderr)

def log_shell_cmd(cmd:str)->None:
    if VERBOSITY > 0:
        print('%% %s' % cmd, file=sys.stderr)

# Apply func to items returned by an iterator in parallel.
# Returns an iterator with the results of func, in unpredictable
# order.
def map_p(func:Callable, it:Iterable)->Iterable:
    pool = Pool(JOBS)
    try:
        for x in pool.imap_unordered(func, it):
            yield x
        pool.close()
        pool.join()
    except:
        pool.terminate()
        pool.join()
        raise
# Remove a file if it exists. Raise an exception on directories.
def remove_file(filename:str)->None:
    try:
        os.unlink(filename)
    except FileNotFoundError:
        pass

# Sort a file, ie replace it with a sorted version
def sort_file_inplace(filename:str, tmp_filename:str)->None:
    cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', tmp_filename, filename ]
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)
    log_debug('rename(%s,%s)' % (tmp_filename, filename))
    os.unlink(filename)
    os.rename(tmp_filename, filename)
# Combine old index files into one and sort it by checksum.
# Output saved to TMP_DIR/old_index.list
def combine_old_index_files():
    if OLD_INDEX_FILES:
        log_info('Combining old index files into one')
        # Use 'awk' to add StxChecksums' base directory to each relative filename in it,
        # for each input file, otherwise we won't be able to find the referenced file
        # later when we read these entries.
        #
        # Pipe awk's output to sort
        #
        # ie: ( awk [...] StxChecksums_1 ; awk [...] StxChecksums_2 ; ... ) | sort [...]

        # Start the sort process, reading from STDIN
        combined_index_file = os.path.join(TMP_DIR, 'old_index.list')
        sort_cmd = [ 'sort', '--parallel=%d' % JOBS, '--output=%s' % combined_index_file ]
        log_shell_cmd(shlex.join(sort_cmd))
        sort_proc = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE)

        # For each input file, execute AWK with its STDOUT set to sort's STDIN
        try:
            dst_dir_realpath = os.path.realpath(DST_DIR)
            # NOTE: awk's match() returns 0 (never a negative number) when
            # there is no match, so compare against 0
            awk_expr = '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) > 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }'
            for old_index_file in OLD_INDEX_FILES:
                try:
                    # Skip the StxChecksums file that we are (re-)generating now
                    base_dir = os.path.realpath(os.path.dirname(old_index_file))
                    if base_dir == dst_dir_realpath and os.path.basename(old_index_file) == 'StxChecksums':
                        log_warn('Ignoring output index file %s' % old_index_file)
                        continue

                    # Input file may get deleted by job cleanup scripts from underneath us.
                    # Open the file for reading and pass the open file descriptor to AWK.
                    with open(old_index_file) as old_index_fh:
                        os.set_inheritable(old_index_fh.fileno(), True)
                        log_debug('fd %d = %s' % (old_index_fh.fileno(), old_index_file))
                        awk_cmd = [ 'awk', '-v', 'DIR=%s/' % base_dir, awk_expr, '/dev/fd/%d' % old_index_fh.fileno() ]
                        log_shell_cmd(shlex.join(awk_cmd))
                        subprocess.run(awk_cmd, stdout=sort_proc.stdin, check=True, close_fds=False)
                except OSError as e:
                    # Ignore errors (typically ENOENT) -- fall back to copy elsewhere
                    log_warn('Failed to process %s: %s' % (old_index_file, str(e)))
                    continue
        finally:
            sort_proc.stdin.close()
            sort_proc.wait()
            if sort_proc.returncode != 0:
                raise subprocess.CalledProcessError(returncode=sort_proc.returncode, cmd=sort_cmd)
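# For illustration (hypothetical values): given an old index line
#   <sha256> usr/bin/foo 1234 1700000000 2049 131072 /some/orig/path
# found in /archive/build-0042/StxChecksums, the awk expression above
# rewrites the relative file name to an absolute one:
#   <sha256> /archive/build-0042/usr/bin/foo 1234 1700000000 2049 131072 /some/orig/path
# so that find_old_files() can later stat() the referenced file.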
# Format a line of StxChecksums file
def format_index_line(rel_path:str, orig_path:str, checksum:str, st:os.stat_result)->str:
    return '%s %s %d %d %d %d %s' % (checksum, rel_path, st.st_size, st.st_mtime, st.st_dev, st.st_ino, orig_path)
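# The resulting line has the form (example values are made up):
#   CHECKSUM REL_PATH SIZE MTIME DEV INO ORIG_PATH
# e.g. "3fc9...e1 usr/bin/foo 1234 1700000000 2049 131072 /build/export/usr/bin/foo"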
# File information for intermediate file lists
@dataclass
class FileInfo:
    dev:int
    ino:int
    uid:int
    gid:int
    mode:int
    size:int
    mtime:float
    checksum:str
    rel_path:str

# Create a FileInfo object from a stat record
def stat_to_file_info(st:os.stat_result, checksum:str, rel_path:str)->FileInfo:
    return FileInfo(st.st_dev, st.st_ino, st.st_uid, st.st_gid, st.st_mode, st.st_size, st.st_mtime, checksum, rel_path)

# Format a FileInfo record as a line of text
# DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH
def format_file_info(fi:FileInfo)->str:
    return '%d %d %d %d %d %d %f %s %s' % (fi.dev, fi.ino, fi.uid, fi.gid, fi.mode, fi.size, fi.mtime, fi.checksum, fi.rel_path)
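# For illustration (hypothetical values), an intermediate list line looks like:
#   2049 131072 1000 1000 33188 1234 1700000000.000000 - usr/bin/foo
# (the checksum field stays "-" until calc_checksums() fills it in for
# regular files)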
# Parse a line of text into a FileInfo object
# DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH
RE_FILE_INFO = re.compile(r'^(\d+) (\d+) (\d+) (\d+) (\d+) (\d+) (\S+) (-|[0-9a-f]+) (.+)$', re.ASCII)
def parse_file_info(line:str)->FileInfo:
    match = RE_FILE_INFO.match(line)
    if match:
        return FileInfo(
            int(match.group(1)),    # dev
            int(match.group(2)),    # ino
            int(match.group(3)),    # uid
            int(match.group(4)),    # gid
            int(match.group(5)),    # mode
            int(match.group(6)),    # size
            float(match.group(7)),  # mtime
            match.group(8),         # checksum
            match.group(9),         # rel_path
        )
    return None

# Read a list of FileInfo objects from a file
def read_file_info_lines(filename:str)->Iterable[FileInfo]:
    with open(filename) as fh:
        for line in fh:
            fi = parse_file_info(line.rstrip('\n'))
            if fi:
                yield fi
#
# Find a hardlink candidate among the index (StxChecksums) files
# generated by older builds.
# Returns an iterator of tuples (old_path, stat_result); the iterator
# may yield nothing.
#
RE_OLD_FILE_INFO_LIST = [
    # Faster, but won't match filenames with spaces in them
    re.compile(r'^([0-9a-f]+) (\S+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII),
    # Slower (because of .+ in the middle)
    re.compile(r'^([0-9a-f]+) (.+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII)
]
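# find_old_files() below relies on look(1), which does a binary search and
# therefore requires its input file to be sorted -- that is why
# combine_old_index_files() pipes the combined index through sort(1);
# the checksum is the first field of every line.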
def find_old_files(checksum:str)->Iterable[tuple[str, os.stat_result]]:
    # If there are no index files => no combined index either
    if OLD_INDEX_FILES:
        cmd = [ 'look', '%s ' % checksum, os.path.join(TMP_DIR, 'old_index.list') ]
        log_shell_cmd(shlex.join(cmd))
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding='utf8')
        try:
            for line in p.stdout:
                line = line.rstrip('\n')
                re_match_found = False
                for regex in RE_OLD_FILE_INFO_LIST:
                    match = regex.match(line)
                    if match:
                        re_match_found = True
                        full_path = match.group(2)
                        size = int(match.group(3))
                        mtime = int(match.group(4))
                        try:
                            st = os.stat(full_path, follow_symlinks=False)
                            # NOTE: index files store time stamps as integers (ie truncated)
                            if st.st_size == size and int(st.st_mtime) == mtime:
                                yield (full_path, st)
                            else:
                                log_debug('ignoring old index entry because its metadata doesn\'t match reality [%s] size=%d:%d mtime=%d:%d' % (line, size, st.st_size, mtime, int(st.st_mtime)))
                        except FileNotFoundError:
                            log_debug('ignoring old index entry because the referenced file doesn\'t exist: %s' % full_path)
                        except OSError as e:
                            log_warn('ignoring old index entry: %s: %s' % (full_path, str(e)))
                if not re_match_found:
                    log_warn('Failed to parse (old) index line [%s]' % line)
        finally:
            p.stdout.close()
            p.wait()
#
# Search SRC_DIR and save the FileInfo entries to 3 files:
#   dirs.list  -- directories
#   files.list -- non-directories with unique dev/ino
#   links.list -- duplicate dev/inos
#
# All files will have the checksum field set to "-"; we will calculate
# the checksums separately for files.list.
#
# Returns a tuple with total counts.
#
def find_files()->tuple[int,int,int]:

    log_info("searching for files")

    dirs_file = os.path.join(TMP_DIR, 'dirs.list')
    dirs_fh = None
    dirs_count = 0

    files_file = os.path.join(TMP_DIR, 'files.list')
    files_fh = None
    files_count = 0

    links_file = os.path.join(TMP_DIR, 'links.list')
    links_fh = None
    links_count = 0

    try:
        log_debug('creating %s' % dirs_file)
        dirs_fh = open(dirs_file, 'w')

        log_debug('creating %s' % files_file)
        files_fh = open(files_file, 'w')

        log_debug('creating %s' % links_file)
        links_fh = open(links_file, 'w')

        dev_map = {}

        def walk_error(err:Exception)->None:
            raise err

        dirs_count = 0
        files_count = 0
        links_count = 0

        log_debug(' %s/' % '.')
        st = os.stat(SRC_DIR, follow_symlinks=False)
        print('%s' % format_file_info(stat_to_file_info(st, '-', '.')), file=dirs_fh)
        dirs_count += 1

        for (dirpath, dirnames, filenames) in os.walk(SRC_DIR, onerror=walk_error):
            rel_dirpath = dirpath[len(SRC_DIR)+1:]
            extra_files = []

            # directories
            for dirname in dirnames:
                full_path = os.path.join(dirpath, dirname)
                st = os.stat(full_path, follow_symlinks=False)
                # os.walk() returns directory symlinks as "directories" here.
                # Treat them as any other non-directory file below
                if stat.S_ISDIR(st.st_mode):
                    rel_path = os.path.join(rel_dirpath, dirname)
                    log_debug(' %s/' % rel_path)
                    print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=dirs_fh)
                    dirs_count += 1
                else:
                    extra_files.append(dirname)

            # files
            for filename in itertools.chain.from_iterable([filenames, extra_files]):
                rel_path = os.path.join(rel_dirpath, filename)
                full_path = os.path.join(dirpath, filename)
                log_debug(' %s' % rel_path)
                st = os.stat(full_path, follow_symlinks=False)
                ino_map = dev_map.get(st.st_dev)
                if ino_map is None:
                    ino_map = {}
                    dev_map[st.st_dev] = ino_map
                if st.st_ino not in ino_map:
                    ino_map[st.st_ino] = None
                    fh = files_fh
                    files_count += 1
                else:
                    fh = links_fh
                    links_count += 1
                print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=fh)

    finally:
        for fh in (links_fh, files_fh, dirs_fh):
            if fh is not None:
                fh.close()

    # Sort files.list because we need to look up duplicate devno/ino entries
    # there for creating links
    sort_file_inplace(files_file, '%s.tmp' % files_file)

    log_info('found dirs=%d files=%d links=%d' % (dirs_count, files_count, links_count))
    return (dirs_count, files_count, links_count)
# Get the SHA256 of a file
def get_sha256(path:str)->str:
    with open(path, "rb") as f:
        file_hash = hashlib.sha256()
        while chunk := f.read(CHECKSUM_READ_SIZE):
            file_hash.update(chunk)
        return file_hash.hexdigest()
# Calculate and add the checksum given a FileInfo and return
# the updated FileInfo. Make no changes for non-regfiles.
def add_one_checksum(fi:FileInfo)->FileInfo:
    if stat.S_ISREG(fi.mode):
        src_path = os.path.join(SRC_DIR, fi.rel_path)
        log_debug('sha256(%s)' % src_path)
        fi.checksum = get_sha256(src_path)
    return fi

#
# Add checksums and sort files.list
#
def calc_checksums(files_count:int)->None:
    log_info("calculating checksums, count=%d" % files_count)

    list_file = os.path.join(TMP_DIR, 'files.list')
    tmp_list_file = os.path.join(TMP_DIR, 'files.list.tmp')

    log_debug('creating sorted %s' % tmp_list_file)
    with open(tmp_list_file, 'w') as fh:
        fi_iter = read_file_info_lines(list_file)
        for fi in map_p(add_one_checksum, fi_iter):
            print(format_file_info(fi), file=fh)

    cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', list_file, tmp_list_file ]
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)
    os.unlink(tmp_list_file)
#
# Create directories at destination
#
def create_dirs(dirs_count:int)->None:
    log_info("creating directories, count=%d" % dirs_count)
    for fi in read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list')):
        if fi.rel_path == '.':
            path = DST_DIR
        else:
            path = os.path.join(DST_DIR, fi.rel_path)

        dst_exists = False
        try:
            st = os.stat(path)
            if stat.S_ISDIR(st.st_mode):
                dst_exists = True
            else:
                remove_file(path)
        except FileNotFoundError:
            pass

        if not dst_exists:
            log_debug('mkdir(%s)' % path)
            os.mkdir(path)

        # If we are not root, set directory permissions to be
        # writable by owner, because we will be creating files
        # there. This will fail if the destination directory is not
        # already owned by us (to be expected).
        if os.geteuid() != 0:
            log_debug('chmod(%s, 0%o)' % (path, 0o700))
            # Don't set follow_symlinks because this function
            # is never called for symlinks
            os.chmod(path, 0o700)
# Copy a file and its attributes, but change UID/GID as specified
def do_copy(src_path:str, dst_path:str, new_uid:int, new_gid:int)->os.stat_result:
    #log_debug("copy(%s, %s)" % (src_path, dst_path))
    cmd = [ 'cp', '-a' ]
    if COPY_REFLINK:
        # 'auto' falls back to a regular copy when the filesystem
        # doesn't support COW copies (a bare --reflink means "always"
        # and would make cp fail on such filesystems)
        cmd.append('--reflink=auto')
    cmd.append('--no-dereference')
    cmd.append('--')
    cmd.append(src_path)
    cmd.append(dst_path)
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)

    # Doesn't support reflinks, see https://github.com/python/cpython/issues/81338
    #shutil.copy2(src_path, dst_path, follow_symlinks=False)

    st = os.stat(dst_path, follow_symlinks=False)
    if new_gid != st.st_gid or new_uid != st.st_uid:
        log_debug('chown(%s, %d, %d)' % (dst_path, new_uid, new_gid))
        os.chown(dst_path, new_uid, new_gid)
        st = os.stat(dst_path, follow_symlinks=False)

    return st
#
# Copy or link a regfile:
#   If there's an older file with the same checksum, link it
#   Otherwise copy it
#   If linking fails, also copy it
#
# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED)
#
def copy_one_file(fi:FileInfo)->tuple:
    dst_path = os.path.join(DST_DIR, fi.rel_path)
    src_path = os.path.join(SRC_DIR, fi.rel_path)

    # Work out target file's UID/GID
    if CHANGE_GID is not None:
        new_gid = CHANGE_GID
    else:
        new_gid = fi.gid
        if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST:
            new_gid = os.getegid()

    if CHANGE_UID is not None:
        new_uid = CHANGE_UID
    else:
        new_uid = fi.uid
        if os.geteuid() != 0:
            new_uid = os.geteuid()

    # Skip existing files
    if SKIP_EXISTING:
        try:
            st = os.stat(dst_path, follow_symlinks=False)
            if st.st_uid == new_uid and \
               st.st_gid == new_gid and \
               st.st_size == fi.size and \
               st.st_mtime == fi.mtime and \
               st.st_mode == fi.mode:
                log_debug('skipping existing %s' % dst_path)
                # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
                return (fi.rel_path, dst_path, fi.checksum, st, 0, 0, 1)
        except FileNotFoundError:
            pass

    # Delete destination file if it exists
    remove_file(dst_path)

    # Regular file: try to link it to a file from an older build
    if stat.S_ISREG(fi.mode) and fi.checksum != '-':

        # Look up an identical file among the older builds
        for (old_path, old_st) in find_old_files(fi.checksum):
            try:
                log_debug('found link candidate by checksum: %s' % old_path)
                # Only link old files whose attributes match the source file,
                # except mtime
                if old_st.st_uid == new_uid and \
                   old_st.st_gid == new_gid and \
                   old_st.st_size == fi.size and \
                   old_st.st_mode == fi.mode:
                    log_debug('link(%s,%s)' % (old_path, dst_path))
                    os.link(old_path, dst_path)
                    dst_stat = os.stat(dst_path, follow_symlinks=False)
                    # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
                    return (fi.rel_path, old_path, fi.checksum, dst_stat, 1, 0, 0)
            except OSError as e:
                log_warn('link(%s,%s): %s' % (old_path, dst_path, str(e)))

    # Checksum not found, or link failed: copy
    dst_stat = do_copy(src_path, dst_path, new_uid, new_gid)
    # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
    return (fi.rel_path, dst_path, fi.checksum, dst_stat, 0, 1, 0)
#
# Copy files to DST_DIR
#
# Returns tuple (total_linked, total_copied, total_skipped)
#
def copy_files(files_count:int)->tuple[int,int,int]:
    log_info("copying files, count=%d" % files_count)
    total_linked = 0
    total_copied = 0
    total_skipped = 0
    with open(os.path.join(TMP_DIR, 'files.index'), 'w') as fh:
        fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'files.list'))
        for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_file, fi_iter):
            total_linked += linked
            total_copied += copied
            total_skipped += skipped
            if stat.S_ISREG(st.st_mode):
                index_line = format_index_line(rel_path, full_path, checksum, st)
                print('%s' % index_line, file=fh)

    return (total_linked, total_copied, total_skipped)
#
# Re-create a hardlink at destination, ie create a file
# as a link to a previously copied file, because it was
# linked in SRC_DIR.
#
# Fall back to copy if link fails.
#
# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED)
#
def copy_one_link(fi:FileInfo)->tuple:
    dst_path = os.path.join(DST_DIR, fi.rel_path)
    src_path = os.path.join(SRC_DIR, fi.rel_path)

    # Delete destination file if it exists
    remove_file(dst_path)

    # Try to link it to a file we previously installed in copy_files().
    # Find the previously-installed source file in files.list, by dev/ino
    try:
        cmd = [ 'look', '%d %d ' % (fi.dev, fi.ino), os.path.join(TMP_DIR, 'files.list') ]
        log_shell_cmd(shlex.join(cmd))
        cmd_res = subprocess.run(cmd, check=False, encoding='utf8', stdout=subprocess.PIPE).stdout
        old_fi = parse_file_info(cmd_res)
        if old_fi:
            orig_path = os.path.join(DST_DIR, old_fi.rel_path)
            log_debug('link(%s,%s)' % (orig_path, dst_path))
            os.link(orig_path, dst_path)
            st = os.stat(dst_path, follow_symlinks=False)
            # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
            return (fi.rel_path, orig_path, old_fi.checksum, st, 1, 0, 0)
    except OSError as e:
        log_warn('failed to link %s: %s' % (dst_path, str(e)))

    # Fall back to copy
    return copy_one_file(fi)
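# The dev/ino lookup in copy_one_link() works because files.list is kept
# sorted (find_files() and calc_checksums() both run it through sort(1)),
# and "DEV INO" are the leading fields of every line, so look(1)'s binary
# search can locate the entry.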
#
# Re-create or copy hardlinks at destination
#
# Returns tuple (total_linked, total_copied, total_skipped)
#
def copy_links(links_count:int)->tuple[int,int,int]:
    log_info("copying links, count=%d" % links_count)
    total_linked = 0
    total_copied = 0
    total_skipped = 0
    with open(os.path.join(TMP_DIR, 'links.index'), 'w') as fh:
        fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'links.list'))
        for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_link, fi_iter):
            total_linked += linked
            total_copied += copied
            total_skipped += skipped
            if stat.S_ISREG(st.st_mode):
                index_line = format_index_line(rel_path, full_path, checksum, st)
                print('%s' % index_line, file=fh)
    return (total_linked, total_copied, total_skipped)
#
# Set directory permissions & ownership to how they were at the source
#
def adjust_one_dir_perms(fi:FileInfo)->FileInfo:
    path = os.path.join(DST_DIR, fi.rel_path)
    perms = stat.S_IMODE(fi.mode)
    log_debug("chmod(%s, 0%o)" % (path, perms))
    # Don't set follow_symlinks because this function
    # is never called for symlinks
    os.chmod(path, perms)

    # At this point the target directory exists and is owned
    # by the current UID:GID due to create_dirs().
    st = os.stat(path, follow_symlinks=False)

    if CHANGE_GID is not None:
        new_gid = CHANGE_GID
    else:
        new_gid = fi.gid
        if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST:
            new_gid = os.getegid()

    if CHANGE_UID is not None:
        new_uid = CHANGE_UID
    else:
        new_uid = fi.uid
        if os.geteuid() != 0:
            new_uid = os.geteuid()

    if new_uid != st.st_uid or new_gid != st.st_gid:
        log_debug("chown(%s, %d, %d)" % (path, new_uid, new_gid))
        os.chown(path, new_uid, new_gid, follow_symlinks=False)

    # Set both access time and modification time to the modification time
    # of the source directory
    log_debug("utime(%s, (%f, %f))" % (path, fi.mtime, fi.mtime))
    os.utime(path, (fi.mtime, fi.mtime))

    return fi
#
# Adjust directory permissions & ownership at destination
#
def adjust_dir_perms(dirs_count:int)->None:
    log_info("adjusting directory permissions, count=%d" % dirs_count)
    fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list'))
    for fi in map_p(adjust_one_dir_perms, fi_iter):
        pass
# Save or print "standard" index (StxChecksums) for regfiles and links
def save_index(files_count:int, links_count:int)->None:
    files_index_file = os.path.join(TMP_DIR, 'files.index')
    links_index_file = os.path.join(TMP_DIR, 'links.index')
    full_index_file = os.path.join(DST_DIR, 'StxChecksums')
    log_info('creating index, count=%d' % (files_count + links_count))

    sort_cmd = [ 'sort', '--parallel=%d' % JOBS, '--output=%s' % full_index_file, files_index_file, links_index_file ]

    log_shell_cmd(shlex.join(sort_cmd))
    subprocess.run(sort_cmd, check=True)
# Delete temp files
def cleanup():
    if not KEEP_TEMP_FILES:
        tmp_files = [
            'dirs.list',
            'files.index',
            'files.list',
            'links.index',
            'links.list',
            'old_index.list',
        ]
        for file in tmp_files:
            remove_file(os.path.join(TMP_DIR, file))
# process command line
def init()->None:
    def positive_integer(s:str)->int:
        v = int(s)
        if v < 1:
            raise ValueError()
        return v

    def user_id(s:str)->int:
        try:
            uid = int(s)
        except:
            try:
                uid = pwd.getpwnam(s).pw_uid
            except:
                raise ValueError()
        if uid < 0:
            raise ValueError()
        return uid

    def group_id(s:str)->int:
        try:
            gid = int(s)
        except:
            try:
                gid = grp.getgrnam(s).gr_gid
            except:
                raise ValueError()
        if gid < 0:
            raise ValueError()
        return gid
    p = argparse.ArgumentParser()
    p.add_argument('-j', '--jobs', type=positive_integer, default=1)
    p.add_argument('--owner', type=user_id)
    p.add_argument('--group', type=group_id)
    p.add_argument('--checksum-hardlink', action='store_true', default=False)
    p.add_argument('--old-index-files-from')
    p.add_argument('--output-checksums')
    p.add_argument('--skip-existing', action='store_true', default=False)
    p.add_argument('-v', '--verbose', action='count', default=0, dest='verbosity')
    p.add_argument('--reflink', action='store_true', default=False)
    p.add_argument('--keep-temp-files', action='store_true', default=False)
    p.add_argument('SRC_DIR')
    p.add_argument('DST_DIR')
    p.add_argument('TMP_DIR')
    p.add_argument('old_index_files', nargs='*')
    p.format_help = lambda: HELP
    args = p.parse_args()

    current_gid_list = [ os.getegid(), *os.getgroups() ]
    if args.owner is not None:
        if os.geteuid() != 0 and args.owner != os.geteuid():
            log_error('--owner can only be changed by root')
            sys.exit(1)
    if args.group is not None:
        if os.geteuid() != 0 and args.group not in current_gid_list:
            log_error('--group can only be changed by root; or it must be a group you are a member of')
            sys.exit(1)

    existing_old_index_files = []
    if args.checksum_hardlink:
        old_index_files = []
        old_index_files += args.old_index_files
        if args.old_index_files_from:
            with open(args.old_index_files_from) as fh:
                for filename in fh:
                    filename = filename.rstrip()
                    old_index_files.append(filename)
        # Ignore missing/non-readable files because they may disappear
        # while this script is running
        for filename in old_index_files:
            try:
                with open(filename) as ref_fh:
                    existing_old_index_files.append(filename)
            except OSError as x:
                log_warn('Ignoring index file %s: %s' % (filename, str(x)))
    elif args.old_index_files:
        log_warn('old index files are meaningless without --checksum-hardlink')

    global JOBS, CHANGE_UID, CHANGE_GID, CURRENT_GID_LIST
    global VERBOSITY, COPY_REFLINK, SRC_DIR, DST_DIR, TMP_DIR
    global OLD_INDEX_FILES, OUTPUT_INDEX_FILE
    global KEEP_TEMP_FILES, SKIP_EXISTING
    JOBS = args.jobs
    CHANGE_UID = args.owner
    CHANGE_GID = args.group
    CURRENT_GID_LIST = current_gid_list
    VERBOSITY = args.verbosity
    COPY_REFLINK = args.reflink
    SRC_DIR = str(Path(args.SRC_DIR).absolute())
    DST_DIR = str(Path(args.DST_DIR).absolute())
    TMP_DIR = str(Path(args.TMP_DIR).absolute())
    OLD_INDEX_FILES = existing_old_index_files
    OUTPUT_INDEX_FILE = args.output_checksums
    SKIP_EXISTING = args.skip_existing
    KEEP_TEMP_FILES = args.keep_temp_files
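# Overall pipeline (matches the call sequence below):
#   find_files()              -- scan SRC_DIR into dirs/files/links lists
#   calc_checksums()          -- SHA256 of each regular file, in parallel
#   create_dirs()             -- re-create the directory tree in DST_DIR
#   combine_old_index_files() -- build the sorted lookup index for hardlinking
#   copy_files(), copy_links()-- hardlink or copy everything else
#   adjust_dir_perms(), save_index(), cleanup()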
init()

log_debug('SRC_DIR=%s' % SRC_DIR)
log_debug('DST_DIR=%s' % DST_DIR)
log_debug('TMP_DIR=%s' % TMP_DIR)
log_debug('JOBS=%d' % JOBS)
if CHANGE_UID:
    log_debug('CHANGE_UID=%d' % CHANGE_UID)
if CHANGE_GID:
    log_debug('CHANGE_GID=%d' % CHANGE_GID)
log_debug('OLD_INDEX_FILES=%s' % OLD_INDEX_FILES)
log_debug('KEEP_TEMP_FILES=%d' % KEEP_TEMP_FILES)

if not os.path.isdir(TMP_DIR):
    os.mkdir(TMP_DIR)
(dirs_count, files_count, links_count) = find_files()
calc_checksums(files_count)
create_dirs(dirs_count)
combine_old_index_files()  # DST_DIR must already exist
(linked1, copied1, skipped1) = copy_files(files_count)
(linked2, copied2, skipped2) = copy_links(links_count)
adjust_dir_perms(dirs_count)
save_index(files_count, links_count)
cleanup()
log_info('%s linked=%d copied=%d skipped=%d' % (DST_DIR, linked1+linked2, copied1+copied2, skipped1+skipped2))