From 8dea12d03d4c979111682d1f3645aac7f96cd366 Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Wed, 11 Jan 2017 13:33:48 +0100 Subject: Ãsollte jetzt in O(n log(n)) laufen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- backupStatistics.in | 174 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 106 insertions(+), 70 deletions(-) diff --git a/Makefile b/Makefile index 3090227..763626c 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ all: man.commons backup backup.1 lastBackups lastBackups.1 backupStatistics back s/#VERSION#/$(VERSION)/; \ s@#BINDIR#@$(BINDIR)@; \ s@#ETCDIR#@$(ETCDIR)@; \ - s@#NUMSTAGES#@6@; \ + s@#NUMSTAGES#@7@; \ s@#HELPTEXT#\(\s\+\)#@ --help \1display this help and exit\n --version\1display version and exit@; \ " $< > $@ && \ ( [[ "$@" = *.* ]] || chmod +x "$@" ) diff --git a/backupStatistics.in b/backupStatistics.in index 160ba0c..9e9608c 100644 --- a/backupStatistics.in +++ b/backupStatistics.in @@ -29,27 +29,19 @@ do_stage() do echo "${dat}:" find "${dest}/${dat}" -type f -links -64001 -printf '%i %D-%m-%U-%G %p\n' >> \ - "${cacheDir}/${backupID}.inodes" + "${cacheDir}/${backupID}.inodes" done ;; 2) if [ "$2" == '##DESCRIBE##' ] then - echo 'sort and partition previous lists by $inode' + echo 'sort previous lists by $inode' return 0 fi tmpDirA="$(mktemp -d)" tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")" - rm -rf "${cacheDir}/${backupID}.inodes.sorted" - mkdir "${cacheDir}/${backupID}.inodes.sorted" - sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" | \ - while read -r line - do - part="${line:0:4}" - part="${part%% *}" - echo "${line}" >> \ - "${cacheDir}/${backupID}.inodes.sorted/part.${part}" - done + sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" > \ + "${cacheDir}/${backupID}.inodes.sorted" rmdir "${tmpDirA}" "${tmpDirB}" ;; 3) @@ -58,8 +50,7 @@ do_stage() echo 'generate lists $inode -> $count, $contentHash' return 0 fi - cat "${cacheDir}/${backupID}.inodes.sorted/"part.* | \ - uniq -cm2 | \ + uniq -cm2 "${cacheDir}/${backupID}.inodes.sorted" | \ parallel \ sha512sum {=s/^ *\([[:digit:]]\+ \)\{2\}[0-9-]\+ //=} \| \ sed '"s|^\([0-9a-f]\{128\}\) .*\$|\1'{=s/^ *\([[:digit:]]\+ [[:digit:]]\+\) \([0-9-]\+\) .*/-\\2 \\1/=}'|"' \ @@ -81,96 +72,141 @@ do_stage() 5) if [ "$2" == '##DESCRIBE##' ] then - echo 'find duplicate hashes' + echo 'generate sorted lists of groups of inodes with the same hashes' return 0 fi - ( - uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted" - echo "" - ) | \ + index=0 + tmpDirA="$(mktemp -d)" + tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")" + uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted" | \ sed 's|^\(\S\+ \)\{2\}||' | \ while read s do if [ -z "${s}" ] then - echo "" + index=$[${index}+1] else - echo -n "${s} " + echo "${s#* } B ${index}" fi done | \ - sed 's| $||' > \ + sort -T "${tmpDirA}" -T "${tmpDirB}" > \ "${cacheDir}/${backupID}.duplicates" + rmdir "${tmpDirA}" "${tmpDirB}" ;; 6) if [ "$2" == '##DESCRIBE##' ] then - echo 'remove inodes with duplicate hashes' + echo 'find files to inodes of previous lists' return 0 fi - if [ -r "${cacheDir}/next.action" ] + tmpDirA="$(mktemp -d)" + tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")" + + unset block + unset lastBlock + unset firstInode + unset lastInode + + sed ' + s|^\(\S\+\) \S\+ |\1 F | + ' "${cacheDir}/${backupID}.inodes.sorted" | \ + sort -m -T "${tmpDirA}" -T "${tmpDirB}" -- \ + - "${cacheDir}/${backupID}.duplicates" | \ + while read -r inode type extra + do + if [ "${type}" == "B" ] + then + block="${extra}" + elif [ "${lastInode}" == "${inode}" ] && [ -n "${block}" ] + then + echo "${block} ${inode} ${extra}" + else + unset block + fi + lastInode="${inode}" + done | \ + sort -T "${tmpDirA}" -T "${tmpDirB}" -k1n,1 | \ + while read -r block inode extra + do + if [ "${lastBlock}" != "${block}" ] + then + firstInode="${inode}" + fi + if [ "${lastBlock}" != "${block}" ] || [ "${firstInode}" != "${inode}" ] + then + echo "${block} ${extra}" + fi + lastBlock="${block}" + done | \ + uniq -m1 --group=separate > \ + "${cacheDir}/${backupID}.duplicates.files" + rmdir "${tmpDirA}" "${tmpDirB}" + ;; + 7) + if [ "$2" == '##DESCRIBE##' ] then - startInode="$(cat "${cacheDir}/next.action")" + echo 'relink files with different inodes and same hashes' + return 0 + fi + if [ ! -r "${cacheDir}/next.action" ] + then + cat "${cacheDir}/${backupID}.duplicates.files" + elif [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ] + then + startBlock="$(tail -n1 "${cacheDir}/next.action")" sed " :vor; - / ${startInode}\( \|$\)/{ - s@^\(\S\+ \)\(.* \)\?${startInode}\( \|$\)@\1${startInode}\3@; - bnach - }; + /^${startBlock} /bnach; d; bvor; :nach; n; bnach - " "${cacheDir}/${backupID}.duplicates" - else - cat "${cacheDir}/${backupID}.duplicates" + " "${cacheDir}/${backupID}.duplicates.files" fi | \ - while read line + while read -r oBlock original do - originalInode="${line%% *}" - original="$( - grep -m1 "^${originalInode} " "${cacheDir}/${backupID}.inodes.sorted/part.${originalInode:0:4}" | \ - sed 's|^\S\+ ||' - )" - for kopieInode in ${line#* } + echo "${backupID}" > "${cacheDir}/next.action2" + echo "${oBlock}" >> "${cacheDir}/next.action2" + mv "${cacheDir}/next.action2" "${cacheDir}/next.action" + while read -r kBlock kopie do - echo "${kopieInode}" > "${cacheDir}/next.action2" - mv "${cacheDir}/next.action2" "${cacheDir}/next.action" - OIFS="${IFS}" - IFS="$(printf '\n\t')" - for kopie in $( - grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted/part.${kopieInode:0:4}" | \ - sed 's|^\S\+ ||' - ) - do - IFS="${OIFS}" - if ${paranoid} + [ -z "${kopie}" ] && break + if [ "${kBlock}" != "${oBlock}" ] + then + >&2 echo "'${kBlock}' != '${oBlock}'" + >&2 echo "'${backupID}':" + >&2 echo "'${original}'" + >&2 echo "'${kopie}'" + exit 1 + fi + + if ${paranoid} + then + diff "${original}" "${kopie}" + fi + if [ $(stat -c'%h' "${original}") -ge 65000 ] + then + echo "rm \"${original}\"" + echo "ln \"${kopie}\" \"${original}\"" + if ! ${dummy} then - diff "${original}" "${kopie}" + rm "${original}" + ln "${kopie}" "${original}" fi - if [ $(stat -c'%h' "${original}") -ge 65000 ] + else + echo "rm \"${kopie}\"" + echo "ln \"${original}\" \"${kopie}\"" + if ! ${dummy} then - echo "rm \"${original}\"" - echo "ln \"${kopie}\" \"${original}\"" - if ! ${dummy} - then - rm "${original}" - ln "${kopie}" "${original}" - fi - else - echo "rm \"${kopie}\"" - echo "ln \"${original}\" \"${kopie}\"" - if ! ${dummy} - then - rm "${kopie}" - ln "${original}" "${kopie}" - fi + rm "${kopie}" + ln "${original}" "${kopie}" fi - done + fi done done if [ -r "${cacheDir}/next.action" ] && \ - grep -q " $(cat "${cacheDir}/next.action")\( \|$\)" "${cacheDir}/${backupID}.duplicates" + [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ] then rm -f "${cacheDir}/next.action" "${cacheDir}/next.action2" fi -- cgit v1.2.3-54-g00ecf