summary refs log tree commit diff
path: root/backup-statistics.in
diff options
context:
space:
mode:
Diffstat (limited to 'backup-statistics.in')
-rw-r--r-- backup-statistics.in 346
1 files changed, 346 insertions, 0 deletions
diff --git a/backup-statistics.in b/backup-statistics.in
new file mode 100644
index 0000000..a44f41d
--- /dev/null
+++ b/backup-statistics.in
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# backup-statistics version #VERSION#
+
+# Stop at the first command that fails outside of a condition.
+set -o errexit
+
+# Load the configuration, but only when the file is readable.
+if [ -r "#ETCDIR#/backup.conf" ]
+then
+  . "#ETCDIR#/backup.conf"
+fi
+
+# do_stage STAGE ARG
+#
+# Run one numbered stage of the duplicate search, or describe it.
+#
+# $1 - stage number; the case below handles 1 to 7, any other value does
+#      nothing.
+# $2 - when it is the literal '##DESCRIBE##', a one-line description of the
+#      stage is printed and the function returns 0 without doing any work.
+#      Any other value is not inspected: the backup to work on is taken
+#      from the global ${backupID}, not from $2.
+#
+# Reads the globals backups, backupID, cacheDir, maxWait, paranoid and
+# dummy.  No variable is declared local.
+#
+# Stages 1 to 6 each write one file "${cacheDir}/${backupID}.<suffix>",
+# created with touch and stripped of group/other permissions before it is
+# filled.
+#
+# NOTE(review): `uniq -m` (stages 3, 5 and 6) is not an option of stock GNU
+# coreutils uniq; presumably a patched uniq that compares only the first N
+# fields is expected - confirm.
+do_stage()
+{
+ case $1 in
+ 1)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'generate lists $filename -> $inode'
+ return 0
+ fi
+ # Destination directory: the text before the first space of the
+ # backups[] entry, without a trailing slash.
+ dest="${backups["${backupID}"]%% *}"
+ dest="${dest%/}"
+ # Poll once per second until the destination is a directory or maxWait
+ # has been counted down to zero.  maxWait is not assigned anywhere in
+ # this script - presumably it comes from backup.conf (confirm).  It is
+ # decremented in place and never reset.
+ while [ ! -d "${dest}" ] && [ ${maxWait} -gt 0 ]
+ do
+ sleep 1
+ maxWait=$[${maxWait}-1]
+ done
+
+ # Start from an empty, owner-only list file.
+ rm -f "${cacheDir}/${backupID}.inodes"
+ touch "${cacheDir}/${backupID}.inodes"
+ chmod go-rwx "${cacheDir}/${backupID}.inodes"
+ # For every entry of the destination, append one line per regular file
+ # with fewer than 64001 hard links:
+ #   <inode> <device>-<permission bits>-<uid>-<gid> <path>
+ # NOTE(review): the unquoted $(ls ...) is word-split and glob-expanded,
+ # so entry names containing whitespace are broken into several words.
+ for dat in $(ls "${dest}")
+ do
+ echo "${dat}:"
+ find "${dest}/${dat}" -type f -links -64001 -printf '%i %D-%m-%U-%G %p\n' >> \
+ "${cacheDir}/${backupID}.inodes"
+ done
+ ;;
+ 2)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'sort previous lists by $inode'
+ return 0
+ fi
+ # Two scratch directories are offered to sort: one in mktemp's default
+ # location and one inside the cache directory.
+ tmpDirA="$(mktemp -d)"
+ tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
+ touch "${cacheDir}/${backupID}.inodes.sorted"
+ chmod go-rwx "${cacheDir}/${backupID}.inodes.sorted"
+ # Sort the stage 1 list and drop identical lines (-u).
+ sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" > \
+ "${cacheDir}/${backupID}.inodes.sorted"
+ # rmdir only succeeds on empty directories.
+ rmdir "${tmpDirA}" "${tmpDirB}"
+ ;;
+ 3)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'generate lists $inode -> $count, $contentHash'
+ return 0
+ fi
+ touch "${cacheDir}/${backupID}.content"
+ chmod go-rwx "${cacheDir}/${backupID}.content"
+ # uniq -c prefixes each kept line with its occurrence count; every
+ # resulting line is handed to parallel, which runs sha512sum on the
+ # path and rewrites the checksum line with sed.  The {= ... =} parts
+ # look like GNU parallel Perl-expression replacement strings; they are
+ # unquoted, so the shell removes the backslashes before parallel sees
+ # them.  Presumably the output lines have the form
+ #   <sha512>-<device-mode-uid-gid> <count> <inode>
+ # TODO confirm against real output.
+ uniq -cm2 "${cacheDir}/${backupID}.inodes.sorted" | \
+ parallel \
+ sha512sum {=s/^ *\([[:digit:]]\+ \)\{2\}[0-9-]\+ //=} \| \
+ sed '"s|^\([0-9a-f]\{128\}\) .*\$|\1'{=s/^ *\([[:digit:]]\+ [[:digit:]]\+\) \([0-9-]\+\) .*/-\\2 \\1/=}'|"' \
+ \; > \
+ "${cacheDir}/${backupID}.content"
+ ;;
+ 4)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'sort previous lists by $contentHash'
+ return 0
+ fi
+ tmpDirA="$(mktemp -d)"
+ tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
+ touch "${cacheDir}/${backupID}.content.sorted"
+ chmod go-rwx "${cacheDir}/${backupID}.content.sorted"
+ # Order by the first field, then by the second field numerically in
+ # descending order.
+ sort -T "${tmpDirA}" -T "${tmpDirB}" -k1,1 -k2nr,2 "${cacheDir}/${backupID}.content" > \
+ "${cacheDir}/${backupID}.content.sorted"
+ rmdir "${tmpDirA}" "${tmpDirB}"
+ ;;
+ 5)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'generate sorted lists of groups of inodes with the same hashes'
+ return 0
+ fi
+ # index numbers the groups; it is advanced on every empty separator
+ # line emitted by uniq.
+ index=0
+ tmpDirA="$(mktemp -d)"
+ tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
+ touch "${cacheDir}/${backupID}.duplicates"
+ chmod go-rwx "${cacheDir}/${backupID}.duplicates"
+ # Keep only repeated lines, groups separated by an empty line; sed
+ # strips the first two space-terminated fields; the loop then writes
+ # "<rest> B <group index>" for every non-empty line and the result is
+ # sorted.
+ uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted" | \
+ sed 's|^\(\S\+ \)\{2\}||' | \
+ while read s
+ do
+ if [ -z "${s}" ]
+ then
+ index=$[${index}+1]
+ else
+ echo "${s#* } B ${index}"
+ fi
+ done | \
+ sort -T "${tmpDirA}" -T "${tmpDirB}" > \
+ "${cacheDir}/${backupID}.duplicates"
+ rmdir "${tmpDirA}" "${tmpDirB}"
+ ;;
+ 6)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'find files to inodes of previous lists'
+ return 0
+ fi
+ tmpDirA="$(mktemp -d)"
+ tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
+
+ # Make sure the loops below start without stale state.
+ unset block
+ unset lastBlock
+ unset firstInode
+ unset lastInode
+
+ touch "${cacheDir}/${backupID}.duplicates.files"
+ chmod go-rwx "${cacheDir}/${backupID}.duplicates.files"
+ # Pipeline overview:
+ #  1. sed replaces the second field of every inode list line by "F",
+ #     giving "<inode> F <path>".
+ #  2. sort -m merges these with the "<inode> B <group>" lines of the
+ #     duplicates list.
+ #  3. first loop: a "B" line remembers its group in block; a following
+ #     line with the same inode as the line before it is printed as
+ #     "<group> <inode> <path>"; any other line forgets the group.
+ #  4. the result is sorted numerically by group.
+ #  5. second loop: prints "<group> <path>" for the first line of a
+ #     group and for every later line whose inode differs from that
+ #     first line's inode.
+ #  6. uniq --group=separate puts an empty line between groups.
+ sed '
+ s|^\(\S\+\) \S\+ |\1 F |
+ ' "${cacheDir}/${backupID}.inodes.sorted" | \
+ sort -m -T "${tmpDirA}" -T "${tmpDirB}" -- \
+ - "${cacheDir}/${backupID}.duplicates" | \
+ while read -r inode type extra
+ do
+ if [ "${type}" == "B" ]
+ then
+ block="${extra}"
+ elif [ "${lastInode}" == "${inode}" ] && [ -n "${block}" ]
+ then
+ echo "${block} ${inode} ${extra}"
+ else
+ unset block
+ fi
+ lastInode="${inode}"
+ done | \
+ sort -T "${tmpDirA}" -T "${tmpDirB}" -k1n,1 | \
+ while read -r block inode extra
+ do
+ if [ "${lastBlock}" != "${block}" ]
+ then
+ firstInode="${inode}"
+ fi
+ if [ "${lastBlock}" != "${block}" ] || [ "${firstInode}" != "${inode}" ]
+ then
+ echo "${block} ${extra}"
+ fi
+ lastBlock="${block}"
+ done | \
+ uniq -m1 --group=separate > \
+ "${cacheDir}/${backupID}.duplicates.files"
+ rmdir "${tmpDirA}" "${tmpDirB}"
+ ;;
+ 7)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'relink files with different inodes and same hashes'
+ return 0
+ fi
+ # Select the input for the relink loop:
+ #  - no next.action file: the complete duplicates.files list;
+ #  - next.action names this backup: only the part of the list starting
+ #    at the line whose group equals the last line of next.action
+ #    (everything before it is deleted by sed);
+ #  - next.action names another backup: nothing.
+ if [ ! -r "${cacheDir}/next.action" ]
+ then
+ cat "${cacheDir}/${backupID}.duplicates.files"
+ elif [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ]
+ then
+ startBlock="$(tail -n1 "${cacheDir}/next.action")"
+ sed "
+ :vor;
+ /^${startBlock} /bnach;
+ d;
+ bvor;
+ :nach;
+ n;
+ bnach
+ " "${cacheDir}/${backupID}.duplicates.files"
+ fi | \
+ while read -r oBlock original
+ do
+ # Record backup id and group in next.action before touching the
+ # group; the file is written under a second name and then renamed.
+ echo "${backupID}" > "${cacheDir}/next.action2"
+ echo "${oBlock}" >> "${cacheDir}/next.action2"
+ mv "${cacheDir}/next.action2" "${cacheDir}/next.action"
+ # The inner loop reads from the same input as the outer one: it
+ # consumes the remaining lines of the group up to the empty separator
+ # line.
+ while read -r kBlock kopie
+ do
+ [ -z "${kopie}" ] && break
+ # A copy carrying another group number than the group's first line
+ # is reported on stderr and ends this loop's shell with status 1.
+ if [ "${kBlock}" != "${oBlock}" ]
+ then
+ >&2 echo "'${kBlock}' != '${oBlock}'"
+ >&2 echo "'${backupID}':"
+ >&2 echo "'${original}'"
+ >&2 echo "'${kopie}'"
+ exit 1
+ fi
+
+ # paranoid and dummy hold the command names true or false and are
+ # executed as commands.
+ if ${paranoid}
+ then
+ diff "${original}" "${kopie}"
+ fi
+ # When the original already has 65000 or more links, the original is
+ # replaced by a link to the copy; otherwise the copy is replaced by a
+ # link to the original.  The commands are always printed and only
+ # executed when dummy is false.
+ if [ $(stat -c'%h' "${original}") -ge 65000 ]
+ then
+ echo "rm \"${original}\""
+ echo "ln \"${kopie}\" \"${original}\""
+ if ! ${dummy}
+ then
+ rm "${original}"
+ ln "${kopie}" "${original}"
+ fi
+ else
+ echo "rm \"${kopie}\""
+ echo "ln \"${original}\" \"${kopie}\""
+ if ! ${dummy}
+ then
+ rm "${kopie}"
+ ln "${original}" "${kopie}"
+ fi
+ fi
+ done
+ done
+ # Drop the progress marker if it belongs to this backup.
+ if [ -r "${cacheDir}/next.action" ] && \
+ [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ]
+ then
+ rm -f "${cacheDir}/next.action" "${cacheDir}/next.action2"
+ fi
+ ;;
+ esac
+}
+
+# usage [STATUS]
+#
+# Print the help text followed by the numbered description of every stage
+# to stderr, then exit with STATUS (1 when STATUS is missing or empty).
+usage()
+{
+ # Everything inside the group goes to stderr.
+ {
+  echo \
+'Usage: backup-statistics [OPTION]
+Search and tidy duplicate and not-hardlinked files in the backups.
+
+With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.
+
+Mandatory arguments to long options are mandatory for short options too.
+ -d, --dummy only generate lists, do not modify backupfiles
+ -m, --max=maxNum stop execution after step maxNum
+ -p, --paranoid test for file differences before relinking (test _should_ be obsolete)
+ -s, --skip=skipNum skip first skipNum steps
+#HELPTEXT# #
+
+the executed steps are:'
+
+  # One blank line, then " <n>. <description>" for each stage.
+  for ((stage=1; stage<=#NUMSTAGES#; stage++))
+  do
+   printf '\n'
+   printf ' %s. %s\n' "${stage}" "$(do_stage "${stage}" '##DESCRIBE##')"
+  done
+  printf '\n'
+ } >&2
+
+ exit "${1:-1}"
+}
+
+# Normalise the command line with getopt(1) and install the result as the
+# new positional parameters.  When getopt rejects the arguments, the word
+# "usage" is appended to its output; presumably the eval then runs the
+# usage function, which exits - TODO confirm with the getopt in use.
+eval set -- "$(
+  getopt -o dm:ps: \
+    --long dummy \
+    --long help \
+    --long max: \
+    --long paranoid \
+    --long skip: \
+    --long version \
+    -n "$(basename "$0")" -- "$@" || \
+  echo usage
+)"
+
+# Defaults: really relink, run all stages, no extra comparison.
+dummy=false
+maxNum=#NUMSTAGES#
+paranoid=false
+skipNum=0
+
+# Consume the normalised options one by one; "--" ends the option list.
+while true; do
+  case "$1" in
+    -d|--dummy)
+      dummy=true
+      ;;
+    --help)
+      usage 0
+      ;;
+    -m|--max)
+      shift
+      maxNum=$1
+      ;;
+    -p|--paranoid)
+      paranoid=true
+      ;;
+    -s|--skip)
+      shift
+      skipNum=$1
+      ;;
+    --version)
+      >&2 echo '#VERSION#'
+      exit 0
+      ;;
+    --)
+      shift
+      # The script takes no positional arguments.  Report the offending
+      # words themselves ("$*"), not their count ("$#"), and do so on
+      # stderr like every other diagnostic of this script.
+      if [ $# -gt 0 ]
+      then
+        >&2 echo 'ERROR: Unknown parameter: '"$*"
+        usage
+      fi
+      break
+      ;;
+    *)
+      >&2 echo 'That should not happen, '"$1"' unknown though ...'
+      exit -1
+      ;;
+  esac
+  shift
+done
+
+# Refuse to run without a usable cache directory.  cacheDir is not assigned
+# in this script - presumably it is set by the sourced backup.conf.
+if [ -z "${cacheDir}" ] || [ ! -d "${cacheDir}" ]
+then
+  # The message has to be passed to echo; without a command word the shell
+  # would try to execute the message text itself.
+  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
+  exit 1
+fi
+
+# Tag the cache directory: CACHEDIR.TAG starts with "Signature: " followed
+# by the MD5 checksum of the string ".IsCacheDirectory", then three comment
+# lines.
+{
+  printf '%s' 'Signature: '
+  printf '%s' '.IsCacheDirectory' | md5sum - | cut -d ' ' -f 1
+  printf '%s\n' \
+    '# This file is a cache directory tag created by '"$(basename "$0")"'.' \
+    '# For information about cache directory tags, see:' \
+    '# http://www.brynosaurus.com/cachedir/'
+} > "${cacheDir}/CACHEDIR.TAG"
+
+# Filter file with two rules: include .rsync-filter, exclude everything.
+printf '%s\n' '+ .rsync-filter' '- *' > "${cacheDir}/.rsync-filter"
+
+# skipNum and maxNum must be plain non-negative integers not larger than
+# the number of stages.  The checks are written positively and negated as a
+# whole: `[ x -le n ]` with a non-numeric x fails with an error status, and
+# in the former chain of `[ ! ... ]` tests that error made every test false,
+# so a value such as "abc" was silently accepted.
+if ! {
+  [[ "${skipNum}" =~ ^[0-9]+$ ]] && \
+  [[ "${maxNum}" =~ ^[0-9]+$ ]] && \
+  [ "${skipNum}" -le #NUMSTAGES# ] && \
+  [ "${maxNum}" -le #NUMSTAGES# ]
+}
+then
+  usage
+fi
+
+# Force base 10 so that a value with leading zeros (e.g. "08") is not
+# taken as an octal constant in later arithmetic.
+skipNum=$((10#${skipNum}))
+maxNum=$((10#${maxNum}))
+
+# Run the stages after the skipped ones, up to and including maxNum; each
+# stage is applied to every configured backup before the next one starts.
+stage=$((${skipNum}+1))
+while ((stage<=${maxNum}))
+do
+  printf 'entering stage %s (%s) ...\n' \
+    "${stage}" "$(do_stage ${stage} '##DESCRIBE##')"
+  for backupID in "${!backups[@]}"
+  do
+    printf '%s:\n' "${backupID}"
+    do_stage ${stage} "${backupID}"
+  done
+  printf '... stage %s completed.\n' "${stage}"
+  stage=$((stage+1))
+done