#!/bin/bash

# backupStatistics version #VERSION#
#
# Searches the configured backups for files with identical content that are
# not hard-linked to each other. In dummy mode the rm/ln commands which would
# replace the copies by hard links are printed; actually executing them is
# still disabled.
#
# The configuration is sourced from backup.conf. This script reads from it:
#   backups   array indexed by backup ID (presumably associative - confirm
#             against backup.conf); the first space-separated word of each
#             value is used as the backup directory
#   cacheDir  existing directory which receives the intermediate lists
#   maxWait   number of seconds which may still be spent waiting for a backup
#             directory to appear (shared by all backups, counted down)
#
# NOTE(review): tokens of the form #NAME# look like placeholders that are
# substituted before installation - the script is not runnable before that.

. #ETCDIR#/backup.conf

# do_stage STAGE BACKUP_ID
# do_stage STAGE '##DESCRIBE##'
#
# Runs one processing stage for one backup, or - when the second argument is
# the literal '##DESCRIBE##' - only prints a one-line description of the stage
# and returns 0.
#
# Files written below ${cacheDir}:
#   stage 1: <id>.inodes         "<inode> <path>" for every regular file
#   stage 2: <id>.inodes.sorted  the same list, sorted and de-duplicated
#   stage 3: <id>.content        "<sha512> <inode>", one line per inode
#   stage 4: <id>.duplicates     per repeated hash: "<inode> <inode> ..."
#   stage 5: prints the rm/ln commands (needs ${dummy} to be "true")
#
# Globals read: backups, cacheDir, dummy; read and written: maxWait.
# Unknown stage numbers are silently ignored.
do_stage() {
  local backupID="$2"
  local dest entry line original kopieInode kopie

  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # Give the backup directory (e.g. a mount point) some time to show up.
      while [ ! -d "${dest}" ] && [ "${maxWait:-0}" -gt 0 ]
      do
        sleep 1
        maxWait=$((maxWait - 1))
      done
      # Truncate instead of delete, so later stages always find a list, even
      # for an empty backup.
      : > "${cacheDir}/${backupID}.inodes"
      if [ ! -d "${dest}" ]
      then
        >&2 echo "ERROR: Backup directory '${dest}' does not exist."
        return 1
      fi
      # Glob instead of parsing ls: names containing whitespace stay intact.
      for entry in "${dest}"/*
      do
        # An unmatched glob is left as the literal pattern - skip it.
        [ -e "${entry}" ] || [ -L "${entry}" ] || continue
        echo "${entry##*/}:"
        find "${entry}" -type f -exec \
          stat -c'%i %n' {} + >> \
          "${cacheDir}/${backupID}.inodes"
      done
    ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $inode'
        return 0
      fi
      sort -u "${cacheDir}/${backupID}.inodes" > \
        "${cacheDir}/${backupID}.inodes.sorted"
    ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $contentHash'
        return 0
      fi
      # Keep the first line of every run of equal inodes (first field), so
      # each inode is hashed once, however many names it has. The original
      # "uniq -m1" is not an option documented for GNU uniq.
      #
      # The arguments of parallel are intentionally left unquoted: the shell
      # strips the backslashes, so parallel receives the expressions
      # {=s/^[[:digit:]]+ //=} (drop the inode, leaving the path) and
      # {=s/^([[:digit:]]+) .*/\1/=} (keep only the inode). The sed command
      # then rewrites "<hash>  <path>" to "<hash> <inode>".
      # NOTE(review): relies on GNU parallel's {= perl expression =} syntax.
      awk 'NR == 1 || $1 != previous { print } { previous = $1 }' \
        "${cacheDir}/${backupID}.inodes.sorted" | \
        parallel \
          sha512sum {=s/^[[:digit:]]\+ //=} \| \
          sed "\"s|^\([0-9a-f]\{128\}\) .*\$|\1 "{=s/^\([[:digit:]]\+\) .*/\\1/=}"|\"" \
          \; | \
        sort > \
        "${cacheDir}/${backupID}.content"
    ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find duplicate hashes'
        return 0
      fi
      # awk: print every line whose hash (first field) also occurs on an
      #      adjacent line - the input is sorted, so equal hashes are
      #      adjacent (replaces the undocumented "uniq -m1 -D").
      # sed 1: join all lines sharing a hash into "<hash> <inode> <inode> ..."
      # sed 2: drop the hash, leaving only the list of inodes.
      awk '
        NR > 1 && $1 == key {
          if (pending) { print first; pending = 0 }
          print
          next
        }
        { key = $1; first = $0; pending = 1 }
      ' "${cacheDir}/${backupID}.content" | \
        sed '
          :a; $!N; s@^\(\S\+ \)\(.*\)\n\1@\1\2 @; ta; P; D
        ' | \
        sed 's|^\S\+ ||' > \
        "${cacheDir}/${backupID}.duplicates"
    ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'remove inodes with duplicate hashes'
        return 0
      fi
      while IFS= read -r line
      do
        # The first inode of the line is kept. It may have several names;
        # any single one of them is sufficient as source of the new links.
        original="$(
          grep -m 1 "^${line%% *} " "${cacheDir}/${backupID}.inodes.sorted" | \
            sed 's|^\S\+ ||'
        )"
        # Word splitting is intended: the rest of the line is a list of
        # inode numbers.
        for kopieInode in ${line#* }
        do
          # Every name of the copy's inode has to be re-linked separately.
          while IFS= read -r kopie
          do
            if ${dummy}
            then
              echo "rm \"${kopie}\""
              echo "ln \"${original}\" \"${kopie}\""
            else
              # Modifying the backups is deliberately not enabled yet.
              >&2 echo 'ERROR: Modifying the backups is not enabled yet, use --dummy.'
              exit 1
#              rm "${kopie}"
#              ln "${original}" "${kopie}"
            fi
          done < <(
            grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted" | \
              sed 's|^\S\+ ||'
          )
        done
      done < \
        "${cacheDir}/${backupID}.duplicates"
    ;;
  esac
}

# usage [EXIT_CODE]
#
# Prints the help text (including the description of every stage) to stderr
# and exits with EXIT_CODE, or with 1 if none is given.
usage() {
  local stage

  >&2 echo 'Usage: backupStatistics [OPTION]'
  >&2 echo 'Search and tidy duplicate and not-hardlinked files in the backups.'
  >&2 echo ''
  >&2 echo 'With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.'
  >&2 echo ''
  >&2 echo 'Mandatory arguments to long options are mandatory for short options too.'
  >&2 echo ' -d | --dummy only generate lists, do not modify backupfiles'
  >&2 echo ' -h | --help display this help and exit'
  >&2 echo ' -m | --max=maxNum stop execution after step maxNum'
  >&2 echo ' -s | --skip=skipNum skip first skipNum steps'
  >&2 echo ' -V | --version display version and exit'
  >&2 echo ''
  >&2 echo 'the executed steps are:'
  for ((stage=1; stage<=#NUMSTAGES#; stage++))
  do
    >&2 echo ''
    >&2 echo " ${stage}. $(do_stage ${stage} '##DESCRIBE##')"
  done
  >&2 echo ''
  exit "${1:-1}"
}

# Let getopt normalise the command line; it reports malformed options itself,
# afterwards the help text is shown and the script exits with status 1.
parsedArgs="$(
  getopt -o dhm:s:V \
    --long dummy \
    --long help \
    --long max: \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@"
)" || usage
eval set -- "${parsedArgs}"

dummy=false
maxNum=#NUMSTAGES#
skipNum=0

while true
do
  case "$1" in
    -d|--dummy)
      dummy=true
    ;;
    -h|--help)
      usage 0
    ;;
    -m|--max)
      shift
      maxNum=$1
    ;;
    -s|--skip)
      shift
      skipNum=$1
    ;;
    -V|--version)
      >&2 echo '#VERSION#'
      exit 0
    ;;
    --)
      shift
      if [ $# -gt 0 ]
      then
        # Name the offending arguments (not just their count).
        >&2 echo 'ERROR: Unknown parameter: '"$*"
        usage
      fi
      break
    ;;
    *)
      >&2 echo 'That should not happen, '"$1"' unknown though ...'
      exit 1
    ;;
  esac
  shift
done

if [ -z "${cacheDir}" ] || [ ! -d "${cacheDir}" ]
then
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

# Both limits must be plain non-negative integers ...
if ! [[ "${skipNum}" =~ ^[0-9]+$ && "${maxNum}" =~ ^[0-9]+$ ]]
then
  usage
fi
# ... which are forced to base 10, so a leading zero is not read as octal ...
skipNum=$((10#${skipNum}))
maxNum=$((10#${maxNum}))
# ... and must not exceed the number of existing stages.
if [ ! "${skipNum}" -le #NUMSTAGES# ] || \
  [ ! "${maxNum}" -le #NUMSTAGES# ]
then
  usage
fi

# Skipping the first skipNum stages means starting with stage skipNum+1.
for ((stage=skipNum+1; stage<=maxNum; stage++))
do
  echo "entering stage ${stage} ..."
  for backupID in "${!backups[@]}"
  do
    echo "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  echo "... stage ${stage} completed."
done