#!/bin/bash
# Copyright (c) 2012, Code Aurora Forum. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# Neither the name of Code Aurora Forum, Inc. nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Print the help text on stdout, then the optional error message on
# stderr, and exit with status 128.
usage() { # error_message

    cat <<EOF
usage: $(basename "$0") [-unvt] [--norefs] [--noloose] [-r|--ratio number]
                      [git gc option...] git.repo

-u|-h                usage/help
-v                   verbose
-n dry-run           don't actually repack anything
-t touch             treat repo as if it had been touched
--norefs             avoid extra ref packing timestamp checking
--noloose            do not run just because there are loose object dirs
                     (repacking may still run if they are referenced)
-r|--ratio number    packfile ratio to aim for (default 10)

git gc option        will be passed as args to git gc

git.repo             to run gc against

Garbage collect using a pseudo logarithmic packfile maintenance
approach.  This approach attempts to minimize packfile churn
by keeping several generations of varying sized packfiles around
and only consolidating packfiles (or loose objects) which are
either new packfiles, or packfiles close to the same size as
another packfile.

An estimate is used to predict when rollups (one consolidation
would cause another consolidation) would occur so that this
rollup can be done all at once via a single repack.  This reduces
both the runtime and the pack file churn in rollup cases.

Approach: plan each consolidation by creating a table like this:

Id    Keep Size        Sha1(or consolidation list)              Actions(repack down up note)
1     - 11356          9052edfb7392646cd4e5f362b953675985f01f96 y - - New
2     - 429088         010904d5c11cd26a79fda91b01ab454d1001b402 y - - New
c1    - 440444         [1,2]                                    - - -

Id:    numbers preceded by a c are estimated "c pack" files
Keep:  - none, k private keep, o our keep
Size:  in disk blocks (default du output)
Sha1:  of packfile, or consolidation list of packfile ids
Actions
  repack: - n no, y yes
  down:   - noop, ^ consolidate with a file above
  up:     - noop, v consolidate with a file below
  note:   Human description of script decisions:
            New (file is a new packfile)
            Consolidate with:<ids>
            (too far from:<ids>)

On the first pass, always consolidate any new packfiles along
with loose objects and along with any packfiles which are within
the ratio size of their predecessors (note, the list is ordered
by increasing size).  After each consolidation, insert a fake
consolidation, or "c pack", to naively represent the size and
ordered positioning of the anticipated new consolidated pack.
Every time a new pack is planned, rescan the list in case the
new "c pack" would cause more consolidation...

Once the packfiles which need consolidation are determined, the
packfiles which will not be consolidated are marked with a .keep
file, and those which will be consolidated will have their .keep
removed if they have one.  Thus, the packfiles with a .keep will
not get repacked.

Packfile consolidation is determined by the --ratio parameter
(default is 10).  This ratio is somewhat of a tradeoff.  The
smaller the number, the more packfiles will be kept on average;
this increases disk utilization somewhat.  However, a larger
ratio causes greater churn and may increase disk utilization due
to deleted packfiles not being reclaimed since they may still be
kept open by long running applications such as Gerrit.  Sane
ratio values are probably between 2 and 10.  Since most
consolidations actually end up smaller than the estimated
consolidated packfile size (due to compression), the true ratio
achieved will likely be 1 to 2 greater than the target ratio.
The smaller the target ratio, the greater this discrepancy.

Finally, attempt to skip garbage collection entirely on untouched
repos.  In order to determine if a repo has been touched, use the
timestamp on the script's keep files, if any relevant file/dir
is newer than a keep marker file, assume that the repo has been
touched and gc needs to run.  Also assume gc needs to run whenever
there are loose object dirs since they may contain untouched
unreferenced loose objects which need to be pruned (once they
expire).

In order to allow the keep files to be an effective timestamp
marker to detect relevant changes in a repo since the last run,
all relevant files and directories which may be modified during a
gc run (even during a noop gc run), must have their timestamps
reset to the same time as the keep files or gc will always run
even on untouched repos.  The relevant files/dirs are all those
files and directories which garbage collection, object packing,
ref packing and pruning might change during noop actions.
EOF

    [ -n "$1" ] && info "ERROR $1"

    exit 128
}

# Emit the message on stderr only when -v was given.
debug() { [ -n "$SW_V" ] && info "$1" ; }

# Emit the message on stderr.
info() { echo "$1" >&2 ; }

# Copy the array named by $1 into the array named by $2, element by
# element (indices 0..count-1).  Uses eval for the name indirection.
array_copy() { #v2 # array_src array_dst
    local src=$1 dst=$2
    local s i=0

    eval s=\${#$src[@]}
    while [ $i -lt $s ] ; do
        eval $dst[$i]=\"\${$src[$i]}\"
        i=$(($i + 1))
    done
}

# Compare the array named by $1 against the remaining arguments.
# Returns 0 if equal, 1 if the element counts differ, 2 on the first
# mismatching element.
array_equals() { #v2 # array_name [vals...]
    local a=$1 ; shift
    local s t=() val

    array_copy "$a" t
    s=${#t[@]}   # array_copy always produces a dense array
    [ "$s" -ne "$#" ] && return 1

    for val in "${t[@]}" ; do
        [ "$val" = "$1" ] || return 2
        shift
    done
    return 0
}

# List the repo's packfiles with their du sizes, smallest first.
packs_sizes() { # git.repo > "size pack"...
    # du complains when the glob matches no packfile; silence du itself
    # (the redirect used to sit on sort, where it suppressed nothing).
    du -s "$1"/objects/pack/pack-$SHA1.pack 2> /dev/null | sort -n
}

is_ourkeep() { grep -q "$KEEP" "$1" 2> /dev/null ; } # keep
has_ourkeep() { is_ourkeep "$(keep_for "$1")" ; } # pack
has_keep() { [ -f "$(keep_for "$1")" ] ; } # pack
is_repo() { [ -d "$1/objects" ] && [ -d "$1/refs/heads" ] ; } # git.repo

# Write our keep marker next to the packfile unless a keep file already
# exists there.
keep() { # pack # returns true if we added our keep
    local keep

    keep=$(keep_for "$1") || return 1
    [ -f "$keep" ] && return 1

    echo "$KEEP" > "$keep"
    return 0
}

# Map a packfile path to its .keep path; fails (status 1, no output)
# when the result contains no ".keep".
keep_for() { # packfile > keepfile
    local keep=$(echo "$1" | sed -e 's/\.pack$/.keep/')
    [ "${keep/.keep}" = "$keep" ] && return 1
    echo "$keep"
}

# Map a packfile path to its .idx path; fails (status 1, no output)
# when the result contains no ".idx".
idx_for() { # packfile > idxfile
    local idx=$(echo "$1" | sed -e 's/\.pack$/.idx/')
    [ "${idx/.idx}" = "$idx" ] && return 1
    echo "$idx"
}

# Extract the part between "pack-" and the first following "." of a
# pack/keep path.
# pack_or_keep_file > sha
sha_for() { echo "$1" | sed -e 's|\(.*/\)*pack-\([^.]*\)\..*$|\2|' ; }

# Collect into the caller-visible array "pkeeps" every keep file in the
# repo which does not carry our marker.
private_keeps() { # git.repo -> sets pkeeps
    local repo=$1
    local keep keeps=("$repo"/objects/pack/pack-$SHA1.keep)

    pkeeps=()
    for keep in "${keeps[@]}" ; do
        # An unmatched glob stays literal; it is not a private keep.
        [ -f "$keep" ] || continue
        is_ourkeep "$keep" || pkeeps=("${pkeeps[@]}" "$keep")
    done
}

# True when smaller * RATIO exceeds larger.
is_tooclose() { [ "$(($1 * $RATIO))" -gt "$2" ] ; } # smaller larger

# Print the arguments sorted and de-duplicated, joined by spaces.
unique() { # [args...] > unique_words
    local lines=$(while [ $# -gt 0 ] ; do echo "$1" ; shift ; done)
    lines=$(echo "$lines" | sort -u)
    echo $lines  # as words (deliberately unquoted)
}

# Print the arguments joined by the separator, without trailing newline.
outfs() { # fs [args...] > argfs...
    local fs=$1 ; shift
    [ $# -gt 0 ] && echo -n "$1" ; shift

    while [ $# -gt 0 ] ; do echo -n "$fs$1" ; shift ; done
}

# Format the table rows into columns and order them by size, then id.
# NOTE(review): the tail of this function (everything after "i<") was
# lost from the file and has been reconstructed; the column widths only
# affect display since every consumer re-splits on whitespace -- confirm
# against the upstream script.
sort_list() { # < list > formatted_list
    # n has_keep size sha repack down up note
    awk '{
        note=$8; for(i=8;i<NF;i++) note=note " "$(i+1)
        printf("%-5s %s %-14s %-40s %s %s %s %s\n", \
                     $1,$2,   $3,  $4, $5,$6,$7,note)
    }' | sort -k 3,3n -k 1,1n
}

# Decide whether gc needs to run: returns 0 (touched) when -t was given,
# when loose object dirs exist (unless --noloose), when none of the keep
# files is ours, or when something relevant is newer than our keep.
# NOTE(review): the first lines of this function (up to "-> treat as
# touched") were lost from the file and have been reconstructed.
is_touched() { # git.repo
    local repo=$1
    local loose keep ours newer
    [ -n "$SW_T" ] && { debug "$SW_T -> treat as touched" ; return 0 ; }

    if [ -z "$SW_LOOSE" ] ; then
        # If there are loose objects, they may need to be pruned,
        # run even if nothing has really been touched.
        # Loose object dirs are named with two hex digits, so match
        # with $HEX ([0-9][0-9] missed names such as "a3" or "ff").
        loose=$(find "$repo/objects" -type d \
                  -wholename "$repo/objects/$HEX$HEX" -print -quit 2>/dev/null)
        [ -n "$loose" ] && { info "There are loose object directories" ; return 0 ; }
    fi

    # If we don't have a keep, the current packfiles may not have been
    # compressed with the current gc policy (gc may never have been run),
    # so run at least once to repack everything.  Also, we need a marker
    # file for timestamp tracking (a dir needs to detect changes within
    # it, so it cannot be a marker) and our keeps are something we control,
    # use them.
    for keep in "$repo"/objects/pack/pack-$SHA1.keep ; do
        is_ourkeep "$keep" && { ours=$keep ; break ; }
    done
    [ -z "$ours" ] && { info 'We have no keep (we have never run?): run' ; return 0 ; }

    debug "Our timestamp keep: $ours"
    # The wholename stuff seems to get touched by a noop git gc
    newer=$(find "$repo/objects" "$repo/refs" "$repo/packed-refs" \
                  '!' -wholename "$repo/objects/info" \
                  '!' -wholename "$repo/objects/info/*" \
                  -newer "$ours" \
                  -print -quit 2>/dev/null)
    [ -z "$newer" ] && return 1

    info "Touched since last run: $newer"
    return 0
}

# Reset the timestamp of each listed path (one per line, relative to
# the repo's refs dir) to start_date, without creating missing paths.
touch_refs() { # git.repo start_date refs
    local repo=$1 start_date=$2 refs=$3
    (
        debug "Setting start date($start_date) on unpacked refs:"
        debug "$refs"
        cd "$repo/refs" || return
        # safe to assume no newlines in a ref name
        echo "$refs" | xargs -d '\n' -n 1 touch -c -d "$start_date"
    )
}

# Reset to start_date the timestamps of everything a gc run may have
# touched (packs with their keep/idx, the pack dir, and -- unless
# --norefs -- packed-refs and the ref dirs), so that an otherwise
# untouched repo looks untouched on the next run.
set_start_date() { # git.repo start_date refs refdirs packedrefs [packs]
    local repo=$1 start_date=$2 refs=$3 refdirs=$4 packedrefs=$5 ; shift 5
    local pack keep idx repacked

    # This stuff is touched during object packs
    while [ $# -gt 0 ] ; do
        pack=$1 ; shift
        keep="$(keep_for "$pack")"
        idx="$(idx_for "$pack")"
        touch -c -d "$start_date" "$pack" "$keep" "$idx"
        debug "Setting start date on: $pack $keep $idx"
    done
    # This will prevent us from detecting any deletes in the pack dir
    # since gc ran, except for private keeps which we are checking
    # manually.  But there really shouldn't be any other relevant deletes
    # in this dir which should cause us to rerun next time, deleting a
    # pack or index file by anything but gc would be bad!
    debug "Setting start date on pack dir: $start_date"
    touch -c -d "$start_date" "$repo/objects/pack"

    if [ -z "$SW_REFS" ] ; then
        repacked=$(find "$repo/packed-refs" -newer "$repo/objects/pack" \
                     -print -quit 2>/dev/null)
        if [ -n "$repacked" ] ; then
            # The ref dirs and packed-ref files seem to get touched even on
            # a noop refpacking
            debug "Setting start date on packed-refs"
            touch -c -d "$start_date" "$repo/packed-refs"
            touch_refs "$repo" "$start_date" "$refdirs"

            # A ref repack does not imply a ref change, but since it is
            # hard to tell, simply assume so
            if [ "$refs" != "$(cd "$repo/refs" ; find . -depth)" ] || \
               [ "$packedrefs" != "$(<"$repo/packed-refs")" ] ; then
                # We retouch if needed (instead of simply checking then
                # touching) to avoid a race between the check and the set.
                debug "  but refs actually got packed, so retouch packed-refs"
                touch -c "$repo/packed-refs"
            fi
        fi
    fi
}

# Add entry to the note's "Consolidate with:" id list, keeping the list
# unique and the rest of the note intact.
note_consolidate() { # note entry > note (no duplicated consolidated entries)
    local note=$1 entry=$2
    local entries=() ifs=$IFS

    if echo "$note" | grep -q 'Consolidate with:[0-9,c]' ; then
        IFS=,
        entries=( $(echo "$note" | sed -e 's/^.*Consolidate with:\([0-9,c]*\).*$/\1/') )
        IFS=$ifs
        # Scalar assignment: splitting the remainder on "," used to drop
        # everything after the first comma (e.g. in "(too far from:1,2)").
        note=$(echo "$note" | sed -e 's/Consolidate with:[0-9,c]*//')
    fi
    entries=( $(unique "${entries[@]}" "$entry") )
    echo "$note Consolidate with:$(outfs , "${entries[@]}")"
}

# Add entry to the note's "(too far from:...)" id list, keeping the list
# unique and the rest of the note intact.
note_toofar() { # note entry > note (no duplicated "too far" entries)
    local note=$1 entry=$2
    local entries=() ifs=$IFS

    if echo "$note" | grep -q '(too far from:[0-9,c]*)' ; then
        IFS=,
        entries=( $(echo "$note" | sed -e 's/^.*(too far from:\([0-9,c]*\)).*$/\1/') )
        IFS=$ifs
        # Scalar assignment, see note_consolidate.
        note=$(echo "$note" | sed -e 's/(too far from:[0-9,c]*)//')
    fi
    entries=( $(unique "${entries[@]}" "$entry") )
    echo "$note (too far from:$(outfs , "${entries[@]}"))"
}

# Print the delayed previous row: the repack variant when the current
# row consolidated with it (size_hit set), the plain variant otherwise.
# Prints nothing when there is no previous row yet.
last_entry() { # isRepack pline repackline > last_rows_entry
    local size_hit=$1 pline=$2 repackline=$3

    if [ -n "$pline" ] ; then
        if [ -n "$size_hit" ] ; then
            echo "$repackline"
        else
            echo "$pline"
        fi
    fi
}

# Build the initial table: one row per packfile, numbered in increasing
# size order, none marked for repack yet.
init_list() { # git.repo > shortlist
    local repo=$1
    local file
    local n=0 has_keep size sha repack

    packs_sizes "$repo" | {
        while read -r size file ; do
            n=$((n+1))
            repack=n
            has_keep=-
            if has_keep "$file" ; then
                has_keep=k
                has_ourkeep "$file" && has_keep=o
            fi
            sha=$(sha_for "$file")
            echo "$n $has_keep $size $sha $repack"
        done
    } | sort_list
}

# One planning pass over the table: mark rows for repack and, when
# anything was summed, append the estimated "c$run" pack row.
consolidate_list() { # run < list > list
    local run=$1
    local sum=0 psize=0 sum_size=0 size_hit pn clist=() pline repackline
    local n has_keep size sha repack repack2 down up note

    {
        while read -r n has_keep size sha repack down up note ; do
            [ -z "$up" ] && up='-'
            [ -z "$down" ] && down="-"

            if [ "$has_keep" = "k" ] ; then
                echo "$n $has_keep $size $sha $repack - - Private"
                continue
            fi

            if [ "$repack" = "n" ] ; then
                if is_tooclose $psize $size ; then
                    size_hit=y
                    repack=y
                    sum=$(($sum + $sum_size + $size))
                    sum_size=0 # Prevents double summing this entry
                    clist=($(unique "${clist[@]}" $pn $n))
                    down="^"
                    [ "$has_keep" = "-" ] && note="$note New +"
                    note=$(note_consolidate "$note" "$pn")
                elif [ "$has_keep" = "-" ] ; then
                    repack=y
                    sum=$(($sum + $size))
                    sum_size=0 # Prevents double summing this entry
                    clist=($(unique "${clist[@]}" $n))
                    note="$note New"
                elif [ $psize -ne 0 ] ; then
                    sum_size=$size
                    down="!"
                    note=$(note_toofar "$note" "$pn")
                else
                    sum_size=$size
                fi
            else
                sum_size=$size
            fi

            # By preventing "c files" (consolidated) from being marked
            # "repack" they won't get keeps
            repack2=y
            [ "${n/c}" != "$n" ] && { repack=- ; repack2=- ; }

            last_entry "$size_hit" "$pline" "$repackline"
            # Delay the printout until we know whether we are
            # being consolidated with the entry following us
            # (we won't know until the next iteration).
            # size_hit is used to determine which of the lines
            # below will actually get printed above on the next
            # iteration.
            pline="$n $has_keep $size $sha $repack $down $up $note"
            repackline="$n $has_keep $size $sha $repack2 $down v $note"

            pn=$n ; psize=$size # previous entry data
            size_hit='' # will not be consolidated up

        done
        last_entry "$size_hit" "$pline" "$repackline"

        [ $sum -gt 0 ] && echo "c$run - $sum [$(outfs , "${clist[@]}")] - - -"

    } | sort_list
}

# Run planning passes until the table stops changing; print the final
# table (intermediate tables go to debug output).
process_list() { # git.repo > list
    local list=$(init_list "$1") plist run=0
    local rule="------------------------------------------------------------------------------------"

    while true ; do
        plist=$list
        run=$((run +1))
        list=$(echo "$list" | consolidate_list "$run")
        if [ "$plist" != "$list" ] ; then
            debug "$rule"
            debug "$HEADER"
            debug "$list"
        else
            break
        fi
    done
    debug "$rule"
    echo "$list"
}

# Apply the plan read from stdin: drop the keeps of rows marked for
# repack, run git gc, put our keep on every keep-less pack and reset the
# timestamps.  Returns git gc's exit status.
repack_list() { # git.repo < list
    local repo=$1
    local start_date newpacks=0 pkeeps keeps=1 refs refdirs rtn
    local packedrefs='' packs=() pack keep
    local n has_keep size sha repack down up note

    # packed-refs does not exist in every repo
    [ -f "$repo/packed-refs" ] && packedrefs=$(<"$repo/packed-refs")

    # so they don't appear touched after a noop refpacking
    if [ -z "$SW_REFS" ] ; then
        refs=$(cd "$repo/refs" ; find . -depth)
        # -depth is a global option: it must precede tests such as -type
        refdirs=$(cd "$repo/refs" ; find . -depth -type d)
        debug "Before refs:"
        debug "$refs"
    fi

    # Find a private keep snapshot which has not changed from
    # before our start_date so private keep deletions during gc
    # can be detected
    while ! array_equals pkeeps "${keeps[@]}" ; do
        debug "Getting a private keep snapshot"
        private_keeps "$repo"
        keeps=("${pkeeps[@]}")
        debug "before keeps: ${keeps[*]}"
        start_date=$(date)
        private_keeps "$repo"
        debug "after keeps: ${pkeeps[*]}"
    done

    while read -r n has_keep size sha repack down up note ; do
        if [ "$repack" = "y" ] ; then
            keep="$repo/objects/pack/pack-$sha.keep"
            info "Repacking $repo/objects/pack/pack-$sha.pack"
            [ -f "$keep" ] && rm -f "$keep"
        fi
    done

    ( cd "$repo" && git gc "${GC_OPTS[@]}" ) ; rtn=$?

    # Mark any files without a .keep with our .keep
    for pack in "$repo"/objects/pack/pack-$SHA1.pack ; do
        # An unmatched glob stays literal; never create a keep for it.
        [ -f "$pack" ] || continue
        packs=("${packs[@]}" "$pack")
        if keep "$pack" ; then
            info "New pack: $pack"
            newpacks=$((newpacks+1))
        fi
    done

    # Record start_time.  If there is more than 1 new packfile, we
    # don't want to risk touching it with an older date since that
    # would prevent consolidation on the next run.  If the private
    # keeps have changed, then we should run next time no matter what.
    # NOTE(review): pkeeps is not re-read after gc, so the second clause
    # below can never be true here, and it is OR-ed rather than excluding
    # the timestamp reset as the comment suggests -- confirm the intent.
    if [ $newpacks -le 1 ] || ! array_equals pkeeps "${keeps[@]}" ; then
        set_start_date "$repo" "$start_date" "$refs" "$refdirs" "$packedrefs" "${packs[@]}"
    fi

    return $rtn # we really only care about the gc error code
}

# Plan and then repack one repo; unless verbose (the plan was already
# shown via debug), show the final plan on stderr first.
git_gc() { # git.repo
    local list=$(process_list "$1")
    if [ -z "$SW_V" ] ; then
        # [*] keeps the message a single word: info only prints $1
        info "Running $PROG on $1. git gc options: ${GC_OPTS[*]}"
        echo "$HEADER" >&2
        echo "$list" >&2
    fi
    echo "$list" | repack_list "$1"
}


PROG=$(basename "$0")
HEADER="Id Keep Size Sha1(or consolidation list) Actions(repack down up note)"
KEEP=git-exproll
HEX='[0-9a-f]'
HEX10=$HEX$HEX$HEX$HEX$HEX$HEX$HEX$HEX$HEX$HEX
SHA1=$HEX10$HEX10$HEX10$HEX10

RATIO=10
SW_N='' ; SW_V='' ; SW_T='' ; SW_REFS='' ; SW_LOOSE='' ; GC_OPTS=()
while [ $# -gt 0 ] ; do
    case "$1" in
        -u|-h) usage ;;
        -n) SW_N="$1" ;;
        -v) SW_V="$1" ;;

        -t) SW_T="$1" ;;
        --norefs) SW_REFS="$1" ;;
        --noloose) SW_LOOSE="$1" ;;

        -r|--ratio) shift
            # The ratio feeds shell arithmetic; reject anything else early.
            case "$1" in
                ''|*[!0-9]*) usage "ratio must be a non-negative integer" ;;
            esac
            RATIO="$1"
            ;;

        # The last argument is the repo; everything else goes to git gc.
        *) [ $# -le 1 ] && break
           GC_OPTS=( "${GC_OPTS[@]}" "$1" )
           ;;
    esac
    shift
done


REPO="$1"
if ! is_repo "$REPO" ; then
    REPO=$REPO/.git
    is_repo "$REPO" || usage "($1) is not likely a git repo"
fi


if [ -z "$SW_N" ] ; then
    is_touched "$REPO" || { info "Repo untouched since last run" ; exit ; }
    git_gc "$REPO"
else
    is_touched "$REPO" || info "Repo untouched since last run, analyze anyway."
    process_list "$REPO" >&2
fi