#!/bin/bash

# backup-statistics version #VERSION#
#
# Search and tidy duplicate, not-hardlinked files in the configured backups:
# stage-by-stage it lists file inodes, hashes contents, groups equal files
# and finally replaces duplicate copies with hardlinks.
# Expects backup.conf to provide: backups (assoc. array backupID -> dest),
# cacheDir, maxWait.

set -e

[ -r "#ETCDIR#/backup.conf" ] && \
  . "#ETCDIR#/backup.conf"

# do_stage $stageNumber $backupID
#   Run one stage of the deduplication pipeline for backup $backupID.
#   Special case: if $2 is the literal token '##DESCRIBE##', only print a
#   one-line description of the stage and return (used by usage() and the
#   progress output of the main loop).
do_stage() {
  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      # first word of the configured backup entry is the destination dir
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # wait up to ${maxWait} seconds for the backup directory to appear
      while [ ! -d "${dest}" ] && [ ${maxWait} -gt 0 ]
      do
        sleep 1
        maxWait=$((maxWait-1))
      done
      rm -f "${cacheDir}/${backupID}.inodes"
      touch "${cacheDir}/${backupID}.inodes"
      chmod go-rwx "${cacheDir}/${backupID}.inodes"
      # iterate over the top-level entries of the backup (glob instead of
      # parsing `ls`, so names with whitespace survive)
      for dat in "${dest}"/*
      do
        [ -e "${dat}" ] || continue
        dat="${dat##*/}"
        echo "${dat}:"
        # only files that can still gain hardlinks (link count < 64001);
        # record: inode, device-mode-owner-group fingerprint, path
        find "${dest}/${dat}" -type f -links -64001 -printf '%i %D-%m-%U-%G %p\n' >> \
          "${cacheDir}/${backupID}.inodes"
      done
      ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $inode'
        return 0
      fi
      # two temp dirs for sort(1) so huge lists can spill to both file systems
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      touch "${cacheDir}/${backupID}.inodes.sorted"
      chmod go-rwx "${cacheDir}/${backupID}.inodes.sorted"
      sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" > \
        "${cacheDir}/${backupID}.inodes.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
      ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $count, $contentHash'
        return 0
      fi
      touch "${cacheDir}/${backupID}.content"
      chmod go-rwx "${cacheDir}/${backupID}.content"
      # NOTE(review): 'uniq -m'/'-cm2' is not an option of stock GNU
      # coreutils uniq — this assumes a patched uniq on the target system
      # (skip/merge semantics); verify before changing.
      # The {= … =} constructs are GNU parallel perl replacement
      # expressions: strip the "count inode fingerprint" prefix for
      # sha512sum, then rewrite its output to "hash-fingerprint count inode".
      uniq -cm2 "${cacheDir}/${backupID}.inodes.sorted" | \
        parallel \
          sha512sum {=s/^ *\([[:digit:]]\+ \)\{2\}[0-9-]\+ //=} \| \
          sed '"s|^\([0-9a-f]\{128\}\) .*\$|\1'{=s/^ *\([[:digit:]]\+ [[:digit:]]\+\) \([0-9-]\+\) .*/-\\2 \\1/=}'|"' \
          \; > \
        "${cacheDir}/${backupID}.content"
      ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $contentHash'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      touch "${cacheDir}/${backupID}.content.sorted"
      chmod go-rwx "${cacheDir}/${backupID}.content.sorted"
      # sort by hash, then by count descending (most-linked inode first)
      sort -T "${tmpDirA}" -T "${tmpDirB}" -k1,1 -k2nr,2 "${cacheDir}/${backupID}.content" > \
        "${cacheDir}/${backupID}.content.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
      ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate sorted lists of groups of inodes with the same hashes'
        return 0
      fi
      index=0
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      touch "${cacheDir}/${backupID}.duplicates"
      chmod go-rwx "${cacheDir}/${backupID}.duplicates"
      # keep only hash-groups with >1 inode; groups are separated by blank
      # lines, and each blank line advances the group index
      uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted" | \
        sed 's|^\(\S\+ \)\{2\}||' | \
        while read s
        do
          if [ -z "${s}" ]
          then
            index=$((index+1))
          else
            # emit: $inode B $groupIndex  ("B" marks a block/group record)
            echo "${s#* } B ${index}"
          fi
        done | \
        sort -T "${tmpDirA}" -T "${tmpDirB}" > \
        "${cacheDir}/${backupID}.duplicates"
      rmdir "${tmpDirA}" "${tmpDirB}"
      ;;
    6)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find files to inodes of previous lists'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      unset block
      unset lastBlock
      unset firstInode
      unset lastInode
      touch "${cacheDir}/${backupID}.duplicates.files"
      chmod go-rwx "${cacheDir}/${backupID}.duplicates.files"
      # tag file records with "F" and merge-sort them with the "B" (group)
      # records; "B" sorts before "F" for equal inodes, so each file record
      # inherits the group of the preceding matching "B" record
      sed ' s|^\(\S\+\) \S\+ |\1 F | ' "${cacheDir}/${backupID}.inodes.sorted" | \
        sort -m -T "${tmpDirA}" -T "${tmpDirB}" -- \
          - "${cacheDir}/${backupID}.duplicates" | \
        while read -r inode type extra
        do
          if [ "${type}" == "B" ]
          then
            block="${extra}"
          elif [ "${lastInode}" == "${inode}" ] && [ -n "${block}" ]
          then
            echo "${block} ${inode} ${extra}"
          else
            unset block
          fi
          lastInode="${inode}"
        done | \
        sort -T "${tmpDirA}" -T "${tmpDirB}" -k1n,1 | \
        while read -r block inode extra
        do
          # within a group, drop all files of the first inode — they stay
          # as-is and the remaining files get relinked onto them in stage 7
          if [ "${lastBlock}" != "${block}" ]
          then
            firstInode="${inode}"
          fi
          if [ "${lastBlock}" != "${block}" ] || [ "${firstInode}" != "${inode}" ]
          then
            echo "${block} ${extra}"
          fi
          lastBlock="${block}"
        done | \
        uniq -m1 --group=separate > \
        "${cacheDir}/${backupID}.duplicates.files"
      rmdir "${tmpDirA}" "${tmpDirB}"
      ;;
    7)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'relink files with different inodes and same hashes'
        return 0
      fi
      # resume support: next.action stores "backupID\nblock" of the last
      # started group, so an interrupted run continues where it stopped
      if [ ! -r "${cacheDir}/next.action" ]
      then
        cat "${cacheDir}/${backupID}.duplicates.files"
      elif [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ]
      then
        startBlock="$(tail -n1 "${cacheDir}/next.action")"
        # sed: skip everything before the first line of ${startBlock},
        # then print the rest
        sed " :vor; /^${startBlock} /bnach; d; bvor; :nach; n; bnach " \
          "${cacheDir}/${backupID}.duplicates.files"
      fi | \
        while read -r oBlock original
        do
          # checkpoint the group we are about to process (write + atomic mv)
          echo "${backupID}" > "${cacheDir}/next.action2"
          echo "${oBlock}" >> "${cacheDir}/next.action2"
          mv "${cacheDir}/next.action2" "${cacheDir}/next.action"
          # inner loop consumes the remaining lines of this blank-line
          # separated group from the same input stream
          while read -r kBlock kopie
          do
            [ -z "${kopie}" ] && break
            if [ "${kBlock}" != "${oBlock}" ]
            then
              >&2 echo "'${kBlock}' != '${oBlock}'"
              >&2 echo "'${backupID}':"
              >&2 echo "'${original}'"
              >&2 echo "'${kopie}'"
              exit 1
            fi
            if ${paranoid}
            then
              # diff exits non-zero on difference -> aborts via set -e
              diff "${original}" "${kopie}"
            fi
            # keep the copy whose inode has room for more links; near the
            # 65000-link ceiling, relink the original onto the copy instead
            if [ $(stat -c'%h' "${original}") -ge 65000 ]
            then
              echo "rm \"${original}\""
              echo "ln \"${kopie}\" \"${original}\""
              if ! ${dummy}
              then
                rm "${original}"
                ln "${kopie}" "${original}"
              fi
            else
              echo "rm \"${kopie}\""
              echo "ln \"${original}\" \"${kopie}\""
              if ! ${dummy}
              then
                rm "${kopie}"
                ln "${original}" "${kopie}"
              fi
            fi
          done
        done
      # this backup finished -> clear the checkpoint
      if [ -r "${cacheDir}/next.action" ] && \
        [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ]
      then
        rm -f "${cacheDir}/next.action" "${cacheDir}/next.action2"
      fi
      ;;
  esac
}

# usage [exitCode]
#   Print help (including the auto-generated stage descriptions) to stderr
#   and exit with $1, or 1 if no code is given.
usage() {
  >&2 echo \
'Usage: backup-statistics [OPTION]
Search and tidy duplicate and not-hardlinked files in the backups.
With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.

Mandatory arguments to long options are mandatory for short options too.
  -d, --dummy          only generate lists, do not modify backupfiles
  -m, --max=maxNum     stop execution after step maxNum
  -p, --paranoid       test for file differences before relinking
                       (test _should_ be obsolete)
  -s, --skip=skipNum   skip first skipNum steps
#HELPTEXT# #

the executed steps are:'
  for ((stage=1; stage<=#NUMSTAGES#; stage++))
  do
    >&2 echo ''
    >&2 echo "  ${stage}. $(do_stage ${stage} '##DESCRIBE##')"
  done
  >&2 echo ''
  [ -z "$1" ] && exit 1
  exit $1
}

eval set -- "$(
  getopt -o dm:ps: \
    --long dummy \
    --long help \
    --long max: \
    --long paranoid \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@" || \
    echo usage
)"

dummy=false
maxNum=#NUMSTAGES#
paranoid=false
skipNum=0

while true; do
  case "$1" in
    -d|--dummy)
      dummy=true
      ;;
    --help)
      usage 0
      ;;
    -m|--max)
      shift
      maxNum=$1
      ;;
    -p|--paranoid)
      paranoid=true
      ;;
    -s|--skip)
      shift
      skipNum=$1
      ;;
    --version)
      >&2 echo '#VERSION#'
      exit 0
      ;;
    --)
      shift
      # fixed: report the leftover parameter(s) themselves (was "$#", the
      # count) and send the diagnostic to stderr like all other errors
      [ $# -gt 0 ] && >&2 echo 'ERROR: Unknown parameter: '"$*" && usage
      break
      ;;
    *)
      >&2 echo 'That should not happen, '"$1"' unknown though ...'
      # fixed: 'exit -1' is not a valid exit status
      exit 1
      ;;
  esac
  shift
done

if [ ! -d "${cacheDir}" ] || [ -z "${cacheDir}" ]
then
  # fixed: 'echo' was missing — the message string itself was being
  # executed as a command
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

# mark the cache directory as such (see http://www.brynosaurus.com/cachedir/)
(
  echo -n 'Signature: '
  echo -n '.IsCacheDirectory' | \
    md5sum - | \
    cut -d ' ' -f 1
  echo '# This file is a cache directory tag created by '"$(basename "$0")"'.'
  echo '# For information about cache directory tags, see:'
  echo '# http://www.brynosaurus.com/cachedir/'
) > "${cacheDir}/CACHEDIR.TAG"
# exclude the cache content from rsync runs that honour .rsync-filter
(
  echo '+ .rsync-filter'
  echo '- *'
) > "${cacheDir}/.rsync-filter"

# sanity-check the requested stage window
if [ ! "${skipNum}" -ge 0 ] || \
  [ ! "${skipNum}" -le #NUMSTAGES# ] || \
  [ ! "${maxNum}" -ge 0 ] || \
  [ ! "${maxNum}" -le #NUMSTAGES# ]
then
  usage
fi

# run the selected stages for every configured backup
for ((stage=${skipNum}+1; stage<=${maxNum}; stage++))
do
  echo "entering stage ${stage} ($(do_stage ${stage} '##DESCRIBE##')) ..."
  for backupID in "${!backups[@]}"
  do
    echo "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  echo "... stage ${stage} completed."
done