#!/bin/bash

# backupStatistics version #VERSION#
#
# Finds files inside the configured backup trees that have identical
# content but live on separate inodes, and re-links them into hardlinks
# to save space.  The work is split into numbered stages (see do_stage);
# intermediate results are cached in ${cacheDir} so stages can be
# skipped or limited via --skip / --max.
#
# Expects #ETCDIR#/backup.conf to define at least:
#   backups  - associative array: backupID -> "destination[ ...]"
#   cacheDir - writable directory for the intermediate lists
#   maxWait  - seconds to wait for a backup destination to appear
#              (NOTE(review): read but never set in this script - it
#              must come from backup.conf; confirm it is always defined)

set -e

[ -r "#ETCDIR#/backup.conf" ] && \
  . "#ETCDIR#/backup.conf"

# do_stage $stage $backupID
# Execute one stage of the pipeline for the given backup.
# When $2 is the literal '##DESCRIBE##', only print a one-line
# description of the stage and return (used by usage() and the
# progress output of the main loop).
do_stage() {
  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      # First word of the configured backup entry is the destination dir.
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # The destination may be on a slow / automounted medium: wait for it.
      while [ ! -d "${dest}" ] && [ ${maxWait} -gt 0 ]
      do
        sleep 1
        # was: maxWait=$[${maxWait}-1] - $[...] is deprecated
        maxWait=$((maxWait-1))
      done
      rm -f "${cacheDir}/${backupID}.inodes"
      # Iterate via globbing instead of parsing `ls` so entries with
      # whitespace in their names are handled correctly.
      for datPath in "${dest}"/*
      do
        [ -e "${datPath}" ] || continue
        dat="${datPath##*/}"
        echo "${dat}:"
        # Skip inodes already close to the maximum link count - they
        # cannot take additional hardlinks anyway.
        find "${dest}/${dat}" -type f -links -64001 -printf '%i %D-%m-%U-%G %p\n' >> \
          "${cacheDir}/${backupID}.inodes"
      done
    ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort and partition previous lists by $inode'
        return 0
      fi
      # Two scratch dirs for sort: the default tmp filesystem plus one
      # inside ${cacheDir} (which should have enough space).
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      rm -rf "${cacheDir}/${backupID}.inodes.sorted"
      mkdir "${cacheDir}/${backupID}.inodes.sorted"
      # Partition the sorted list into buckets keyed by the first (up to)
      # four digits of the inode number, so later per-inode lookups only
      # need to grep a small file.
      sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" | \
        while read -r line
        do
          part="${line:0:4}"
          part="${part%% *}"
          echo "${line}" >> \
            "${cacheDir}/${backupID}.inodes.sorted/part.${part}"
        done
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $count, $contentHash'
        return 0
      fi
      # For each distinct inode hash one representative file: the
      # parallel job strips the bookkeeping fields, sha512sums the path
      # and re-attaches "-$devInfo $count $inode" behind the hash.
      # NOTE(review): 'uniq -m' is not a stock GNU coreutils option -
      # this appears to rely on a patched uniq; confirm on the target
      # system before changing anything here.
      cat "${cacheDir}/${backupID}.inodes.sorted/"part.* | \
        uniq -cm2 | \
        parallel \
          sha512sum {=s/^ *\([[:digit:]]\+ \)\{2\}[0-9-]\+ //=} \| \
          sed '"s|^\([0-9a-f]\{128\}\) .*\$|\1'{=s/^ *\([[:digit:]]\+ [[:digit:]]\+\) \([0-9-]\+\) .*/-\\2 \\1/=}'|"' \
          \; > \
        "${cacheDir}/${backupID}.content"
    ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $contentHash'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      # Sort by content hash, then by link count descending, so the
      # most-linked inode of each group comes first.
      sort -T "${tmpDirA}" -T "${tmpDirB}" -k1,1 -k2nr,2 "${cacheDir}/${backupID}.content" > \
        "${cacheDir}/${backupID}.content.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find duplicate hashes'
        return 0
      fi
      # Keep only groups of lines with identical hash, strip hash and
      # count, then join each group's inode numbers onto one
      # space-separated line.
      (
        uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted"
        echo ""
      ) | \
        sed 's|^\(\S\+ \)\{2\}||' | \
        while read -r s
        do
          if [ -z "${s}" ]
          then
            echo ""
          else
            echo -n "${s} "
          fi
        done | \
        sed 's| $||' > \
        "${cacheDir}/${backupID}.duplicates"
    ;;
    6)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'remove inodes with duplicate hashes'
        return 0
      fi
      # ${cacheDir}/next.action remembers the inode that was about to be
      # processed when a previous run was interrupted; if present, fast
      # forward to that group (and, within the group, to that copy).
      if [ -r "${cacheDir}/next.action" ]
      then
        startInode="$(cat "${cacheDir}/next.action")"
        sed "
          :vor
          / ${startInode}\( \|$\)/{
            s@^\(\S\+ \)\(.* \)\?${startInode}\( \|$\)@\1${startInode}\3@
            bnach
          }
          d
          bvor
          :nach
          n
          bnach
        " "${cacheDir}/${backupID}.duplicates"
      else
        cat "${cacheDir}/${backupID}.duplicates"
      fi | \
        while read -r line
        do
          # The first inode on the line is kept; all others are
          # re-linked to it.
          originalInode="${line%% *}"
          original="$(
            grep -m1 "^${originalInode} " "${cacheDir}/${backupID}.inodes.sorted/part.${originalInode:0:4}" | \
              sed 's|^\S\+ ||'
          )"
          for kopieInode in ${line#* }
          do
            # Persist the checkpoint atomically (write + rename).
            echo "${kopieInode}" > "${cacheDir}/next.action2"
            mv "${cacheDir}/next.action2" "${cacheDir}/next.action"
            OIFS="${IFS}"
            # Split the path list on newline/tab only - paths may
            # contain spaces.
            IFS="$(printf '\n\t')"
            for kopie in $(
              grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted/part.${kopieInode:0:4}" | \
                sed 's|^\S\+ ||'
            )
            do
              IFS="${OIFS}"
              if ${paranoid}
              then
                # set -e aborts the whole script if the contents differ.
                diff "${original}" "${kopie}"
              fi
              # If the original is already close to the hardlink limit,
              # relink it onto the copy instead of the other way round.
              if [ $(stat -c'%h' "${original}") -ge 65000 ]
              then
                echo "rm \"${original}\""
                echo "ln \"${kopie}\" \"${original}\""
                if ! ${dummy}
                then
                  rm "${original}"
                  ln "${kopie}" "${original}"
                fi
              else
                echo "rm \"${kopie}\""
                echo "ln \"${original}\" \"${kopie}\""
                if ! ${dummy}
                then
                  rm "${kopie}"
                  ln "${original}" "${kopie}"
                fi
              fi
            done
          done
        done
      # All groups processed: drop the checkpoint (after sanity-checking
      # that it still refers to a known duplicate).
      if [ -r "${cacheDir}/next.action" ] && \
        grep -q " $(cat "${cacheDir}/next.action")\( \|$\)" "${cacheDir}/${backupID}.duplicates"
      then
        rm -f "${cacheDir}/next.action" "${cacheDir}/next.action2"
      fi
    ;;
  esac
}

# usage [exitCode]
# Print the help text to stderr, then exit with $1 (default 1).
usage() {
  >&2 echo \
'Usage: backupStatistics [OPTION]
Search and tidy duplicate and not-hardlinked files in the backups.
With no options, tidy up all backups.
THIS CAN BE VERY TIME CONSUMING.

Mandatory arguments to long options are mandatory for short options too.
  -d, --dummy         only generate lists, do not modify backupfiles
  -m, --max=maxNum    stop execution after step maxNum
  -p, --paranoid      test for file differences before relinking (test _should_ be obsolete)
  -s, --skip=skipNum  skip first skipNum steps
#HELPTEXT#
#
the executed steps are:'
  for ((stage=1; stage<=#NUMSTAGES#; stage++))
  do
    >&2 echo ''
    >&2 echo " ${stage}. $(do_stage ${stage} '##DESCRIBE##')"
  done
  >&2 echo ''
  [ -z "$1" ] && exit 1
  exit $1
}

eval set -- "$(
  getopt -o dm:ps: \
    --long dummy \
    --long help \
    --long max: \
    --long paranoid \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@" || \
    echo usage
)"

dummy=false
maxNum=#NUMSTAGES#
paranoid=false
skipNum=0

while true; do
  case "$1" in
    -d|--dummy)
      dummy=true
    ;;
    --help)
      usage 0
    ;;
    -m|--max)
      shift
      maxNum=$1
    ;;
    -p|--paranoid)
      paranoid=true
    ;;
    -s|--skip)
      shift
      skipNum=$1
    ;;
    --version)
      >&2 echo '#VERSION#'
      exit 0
    ;;
    --)
      shift
      # This script takes no operands; leftover arguments are an error.
      # (was "$#", which printed the *count* of parameters, not them)
      [ $# -gt 0 ] && echo 'ERROR: Unknown parameter: '"$*" && usage
      break
    ;;
    *)
      >&2 echo 'That should not happen, '"$1"' unknown though ...'
      # was 'exit -1' - negative exit codes are invalid
      exit 1
    ;;
  esac
  shift
done

if [ ! -d "${cacheDir}" ] || [ -z "${cacheDir}" ]
then
  # fixed: 'echo' was missing here, so the message itself was executed
  # as a command instead of being printed
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

if [ ! "${skipNum}" -ge 0 ] || \
  [ ! "${skipNum}" -le #NUMSTAGES# ] || \
  [ ! "${maxNum}" -ge 0 ] || \
  [ ! "${maxNum}" -le #NUMSTAGES# ]
then
  usage
fi

# Run the requested stages for every configured backup.
for ((stage=${skipNum}+1; stage<=${maxNum}; stage++))
do
  echo "entering stage ${stage} ($(do_stage ${stage} '##DESCRIBE##')) ..."
  for backupID in "${!backups[@]}"
  do
    echo "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  echo "... stage ${stage} completed."
done