#!/bin/bash
# backupStatistics version #VERSION#

set -e

[ -r "#ETCDIR#/backup.conf" ] && \
  . "#ETCDIR#/backup.conf"

# backup.conf is expected to define (assumption, inferred from the usage below):
#   backups  - associative array mapping each backupID to 'destinationDir ...'
#              (only the first, space-separated field is used here)
#   cacheDir - existing directory where the generated lists are stored
#   maxWait  - how many seconds to wait for a backup destination to appear

do_stage() {
  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # wait (up to maxWait seconds) for the backup destination to show up
      while [ ! -d "${dest}" ] && [ ${maxWait} -gt 0 ]
      do
        sleep 1
        maxWait=$((maxWait-1))
      done
      rm -f "${cacheDir}/${backupID}.inodes"
      for dat in $(ls "${dest}")
      do
        echo "${dat}:"
        find "${dest}/${dat}" -type f -exec \
          stat -c'%i %n' {} \; >> \
          "${cacheDir}/${backupID}.inodes"
      done
      ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $inode'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" > \
        "${cacheDir}/${backupID}.inodes.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
      ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $contentHash'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      # hash every distinct inode once and emit '<contentHash> <inode>' lines
      # note: '-m1' is not a stock GNU uniq option; it presumably limits the
      # comparison to the first field (the inode here, the hash in stage 4)
      uniq -m1 "${cacheDir}/${backupID}.inodes.sorted" | \
        parallel \
          sha512sum {=s/^[[:digit:]]\+ //=} \| \
          sed "\"s|^\([0-9a-f]\{128\}\) .*\$|\1 "{=s/^\([[:digit:]]\+\) .*/\\1/=}"|\"" \
          \; | \
        sort -T "${tmpDirA}" -T "${tmpDirB}" > \
        "${cacheDir}/${backupID}.content"
      rmdir "${tmpDirA}" "${tmpDirB}"
      ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find duplicate hashes'
        return 0
      fi
      # group identical hashes and write one line per duplicate group,
      # containing the space-separated inodes sharing that hash
      (
        uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content"
        echo ""
      ) | \
        sed 's|^\S\+ ||' | \
        while read -r s
        do
          if [ -z "${s}" ]
          then
            echo ""
          else
            echo -n "${s} "
          fi
        done | \
        sed 's| $||' > \
        "${cacheDir}/${backupID}.duplicates"
      ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'remove inodes with duplicate hashes'
        return 0
      fi
      # each line holds all inodes sharing one content hash: keep the first
      # inode and hard-link the remaining ones to it
      while read -r line
      do
        original="$(
          grep "^${line%% *} " "${cacheDir}/${backupID}.inodes.sorted" | \
            sed 's|^\S\+ ||'
        )"
        for kopieInode in ${line#* }
        do
          kopie="$(
            grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted" | \
              sed 's|^\S\+ ||'
          )"
          # paranoia check: equal hashes must mean equal content; if not,
          # diff fails and set -e aborts the script
          diff "${original}" "${kopie}"
          if ${dummy}
          then
            echo "rm \"${kopie}\""
            echo "ln \"${original}\" \"${kopie}\""
          else
            exit 1 DO NOT EXECUTE YET
#            rm "${kopie}"
#            ln "${original}" "${kopie}"
          fi
        done
      done < \
        "${cacheDir}/${backupID}.duplicates"
      ;;
  esac
}

usage() {
  >&2 echo \
    'Usage: backupStatistics [OPTION]
Search for duplicate, not-hardlinked files in the backups and tidy them up.
With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.

Mandatory arguments to long options are mandatory for short options too.
  -d, --dummy          only generate lists, do not modify backup files
  -m, --max=maxNum     stop execution after step maxNum
  -s, --skip=skipNum   skip the first skipNum steps
#HELPTEXT# #

the executed steps are:'
  for ((stage=1; stage<=#NUMSTAGES#; stage++))
  do
    >&2 echo ''
    >&2 echo "  ${stage}. $(do_stage ${stage} '##DESCRIBE##')"
  done
  >&2 echo ''
  [ -z "$1" ] && exit 1
  exit $1
}

eval set -- "$(
  getopt -o dm:s: \
    --long dummy \
    --long help \
    --long max: \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@" || \
    echo usage
)"

dummy=false
maxNum=#NUMSTAGES#
skipNum=0

while true; do
  case "$1" in
    -d|--dummy)
      dummy=true
      ;;
    --help)
      usage 0
      ;;
    -m|--max)
      shift
      maxNum=$1
      ;;
    -s|--skip)
      shift
      skipNum=$1
      ;;
    --version)
      >&2 echo '#VERSION#'
      exit 0
      ;;
    --)
      shift
      [ $# -gt 0 ] && >&2 echo 'ERROR: Unknown parameter: '"$@" && usage
      break
      ;;
    *)
      >&2 echo 'That should not happen: '"$1"' is unknown ...'
      exit 1
      ;;
  esac
  shift
done
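
# sanity checks before doing any real work: the cache directory must exist
# and the requested stage range (skipNum .. maxNum) must lie within the
# available stages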
if [ ! -d "${cacheDir}" ] || [ -z "${cacheDir}" ]
then
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

if [ ! "${skipNum}" -ge 0 ] || \
  [ ! "${skipNum}" -le #NUMSTAGES# ] || \
  [ ! "${maxNum}" -ge 0 ] || \
  [ ! "${maxNum}" -le #NUMSTAGES# ]
then
  usage
fi

for ((stage=${skipNum}+1; stage<=${maxNum}; stage++))
do
  echo "entering stage ${stage} ($(do_stage ${stage} '##DESCRIBE##')) ..."
  for backupID in "${!backups[@]}"
  do
    echo "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  echo "... stage ${stage} completed."
done