diff options
Diffstat (limited to 'backupStatistics.in')
-rw-r--r-- | backupStatistics.in | 205 |
1 files changed, 205 insertions, 0 deletions
#!/bin/bash

# backupStatistics version #VERSION#
#
# Search the backups for files with identical content that are not
# hard-linked to each other, working in numbered stages (see do_stage).
#
# NOTE(review): '#VERSION#', '#ETCDIR#' and '#NUMSTAGES#' look like
# placeholders that are substituted when this template is installed --
# confirm against the build system.
#
# Not defined in this file and therefore expected from backup.conf
# (TODO confirm): the array 'backups', 'cacheDir' and 'maxWait'.

. #ETCDIR#/backup.conf

#######################################
# Run one processing stage for one backup, or describe a stage.
# Globals:
#   backups, backupID, cacheDir, dummy (read); maxWait (read and decremented)
# Arguments:
#   $1 - stage number
#   $2 - '##DESCRIBE##' to only print a one-line description of the stage;
#        any other value runs the stage for ${backupID}
# Outputs:
#   progress / description on stdout, intermediate lists in ${cacheDir}
# Returns:
#   0 after describing a stage, 1 if stage 1 finds no backup directory;
#   otherwise the status of the last command of the stage.
#   Stage 5 exits the whole script with 1 when not in dummy mode.
#######################################
do_stage()
{
  local dest datPath line original kopie kopieInode

  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      # The backup directory is everything before the first space of the
      # configured entry, without a trailing slash.
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # Give the directory up to ${maxWait} seconds to appear. The counter
      # is global, so the time budget is shared between all backups.
      while [ ! -d "${dest}" ] && [ "${maxWait:-0}" -gt 0 ]
      do
        sleep 1
        maxWait=$((maxWait - 1))
      done

      rm -f "${cacheDir}/${backupID}.inodes"
      if [ ! -d "${dest}" ]
      then
        >&2 echo "ERROR: Backup directory '${dest}' does not exist."
        return 1
      fi
      # Glob instead of parsing 'ls' so names with whitespace survive.
      for datPath in "${dest}"/*
      do
        # An unmatched glob stays literal; skip it.
        [ -e "${datPath}" ] || continue
        echo "${datPath##*/}:"
        # '{} +' batches many files into one stat call.
        find "${datPath}" -type f -exec \
          stat -c'%i %n' {} + >> \
          "${cacheDir}/${backupID}.inodes"
      done
      ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $inode'
        return 0
      fi
      # Byte-wise collation: locale-aware sorting may ignore the blank
      # after the inode, which would separate lines of the same inode.
      LC_ALL=C sort -u "${cacheDir}/${backupID}.inodes" > \
        "${cacheDir}/${backupID}.inodes.sorted"
      ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $contentHash'
        return 0
      fi
      # Keep the first line of every run of equal inodes (first field), so
      # each inode is hashed only once. Relies on the ordering of stage 2.
      #
      # The {= =} parts are GNU parallel Perl replacement strings; the
      # unquoted backslashes are consumed by this shell, so parallel sees
      # plain Perl regexes extracting the file name and the inode.
      # Every output line becomes "<sha512> <inode>".
      awk 'NR == 1 || $1 != prev { print; prev = $1 }' \
        "${cacheDir}/${backupID}.inodes.sorted" | \
        parallel \
          sha512sum {=s/^[[:digit:]]\+ //=} \| \
          sed "\"s|^\([0-9a-f]\{128\}\) .*\$|\1 "{=s/^\([[:digit:]]\+\) .*/\\1/=}"|\"" \
          \; | \
        LC_ALL=C sort > \
        "${cacheDir}/${backupID}.content"
      ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find duplicate hashes'
        return 0
      fi
      # A sha512 hex digest is 128 characters, so comparing the first 128
      # characters compares exactly the hash. The first sed joins all
      # inodes sharing a hash onto one line, the second drops the hash.
      uniq -w 128 -D "${cacheDir}/${backupID}.content" | \
        sed '
          :a;
          $!N;
          s@^\(\S\+ \)\(.*\)\n\1@\1\2 @;
          ta;
          P;
          D
        ' | \
        sed 's|^\S\+ ||' > \
        "${cacheDir}/${backupID}.duplicates"
      ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'remove inodes with duplicate hashes'
        return 0
      fi
      # Each line lists inodes with identical content: the first one is
      # kept, every further one is to be replaced by a hard link to it.
      while read -r line
      do
        original="$(
          grep "^${line%% *} " "${cacheDir}/${backupID}.inodes.sorted" | \
            sed 's|^\S\+ ||'
        )"
        # Deliberately unquoted: split the remaining inodes into words.
        for kopieInode in ${line#* }
        do
          kopie="$(
            grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted" | \
              sed 's|^\S\+ ||'
          )"
          if ${dummy}
          then
            echo "rm \"${kopie}\""
            echo "ln \"${original}\" \"${kopie}\""
          else
            # DO NOT EXECUTE YET
            # rm "${kopie}"
            # ln "${original}" "${kopie}"
            >&2 echo 'ERROR: Modifying the backups is not enabled yet, use --dummy.'
            exit 1
          fi
        done
      done < \
        "${cacheDir}/${backupID}.duplicates"
      ;;
  esac
}

#######################################
# Print the help text including the list of stages to stderr and exit.
# Arguments:
#   $1 - exit status (optional, defaults to 1)
#######################################
usage()
{
  >&2 echo 'Usage: backupStatistics [OPTION]'
  >&2 echo 'Search and tidy duplicate and not-hardlinked files in the backups.'
  >&2 echo ''
  >&2 echo 'With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.'
  >&2 echo ''
  >&2 echo 'Mandatory arguments to long options are mandatory for short options too.'
  >&2 echo ' -d | --dummy only generate lists, do not modify backupfiles'
  >&2 echo ' -h | --help display this help and exit'
  >&2 echo ' -m | --max=maxNum stop execution after step maxNum'
  >&2 echo ' -s | --skip=skipNum skip first skipNum steps'
  >&2 echo ' -V | --version display version and exit'
  >&2 echo ''
  >&2 echo 'the executed steps are:'
  for ((stage=1; stage<=#NUMSTAGES#; stage++))
  do
    >&2 echo ''
    >&2 echo " ${stage}. $(do_stage ${stage} '##DESCRIBE##')"
  done
  >&2 echo ''
  exit "${1:-1}"
}

# Let getopt normalise the command line. On a parse error getopt has
# already printed a diagnostic, so just show the usage and exit.
if ! parsedArgs="$(
  getopt -o dhm:s:V \
    --long dummy \
    --long help \
    --long max: \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@"
)"
then
  usage
fi
# eval is the documented way to re-read getopt's shell-quoted output.
eval set -- "${parsedArgs}"

dummy=false
maxNum=#NUMSTAGES#
skipNum=0

while true; do
  case "$1" in
    -d|--dummy)
      dummy=true
      ;;
    -h|--help)
      usage 0
      ;;
    -m|--max)
      shift
      maxNum=$1
      ;;
    -s|--skip)
      shift
      skipNum=$1
      ;;
    -V|--version)
      >&2 echo '#VERSION#'
      exit 0
      ;;
    --)
      shift
      if [ $# -gt 0 ]
      then
        >&2 echo 'ERROR: Unknown parameter: '"$1"
        usage
      fi
      break
      ;;
    *)
      >&2 echo 'That should not happen, '"$1"' unknown though ...'
      exit 255
      ;;
  esac
  shift
done

if [ -z "${cacheDir}" ] || [ ! -d "${cacheDir}" ]
then
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

# Both numbers must be plain non-negative integers not above the number of
# stages. The pattern check comes first so that non-numeric input is
# rejected instead of making the numeric tests fail with an error.
if ! [[ "${skipNum}" =~ ^[0-9]+$ ]] || \
  ! [[ "${maxNum}" =~ ^[0-9]+$ ]] || \
  [ "${skipNum}" -gt #NUMSTAGES# ] || \
  [ "${maxNum}" -gt #NUMSTAGES# ]
then
  usage
fi
# Force base 10 so that leading zeros are not read as octal below.
skipNum=$((10#${skipNum}))
maxNum=$((10#${maxNum}))

# Skipping the first skipNum stages means starting with stage skipNum+1.
for ((stage=skipNum+1; stage<=maxNum; stage++))
do
  echo "entering stage ${stage} ..."
  for backupID in "${!backups[@]}"
  do
    echo "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  echo "... stage ${stage} completed."
done