#!/bin/bash

# backupStatistics version #VERSION#
#
# Finds files inside the configured backup trees that have identical
# content but live on separate inodes, and re-links them into hardlinks
# to save space.  The work is split into numbered stages (see do_stage);
# intermediate results are cached in ${cacheDir} so stages can be
# skipped or limited via --skip / --max.
#
# Expects #ETCDIR#/backup.conf to define at least:
#   backups  - associative array: backupID -> "destination[ ...]"
#   cacheDir - writable directory for the intermediate lists
#   maxWait  - seconds to wait for a backup destination to appear
#              (NOTE(review): read but never set in this script - it
#              must come from backup.conf; confirm it is always defined)

set -e

[ -r "#ETCDIR#/backup.conf" ] && \
  . "#ETCDIR#/backup.conf"

# do_stage $stage $backupID
# Execute one stage of the pipeline for the given backup.
# When $2 is the literal '##DESCRIBE##', only print a one-line
# description of the stage and return (used by usage() and the
# progress output of the main loop).
do_stage() {
  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      # First word of the configured backup entry is the destination dir.
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # The destination may be on a slow / automounted medium: wait for it.
      while [ ! -d "${dest}" ] && [ ${maxWait} -gt 0 ]
      do
        sleep 1
        # was: maxWait=$[${maxWait}-1] - $[...] is deprecated
        maxWait=$((maxWait-1))
      done
      rm -f "${cacheDir}/${backupID}.inodes"
      # Iterate via globbing instead of parsing `ls` so entries with
      # whitespace in their names are handled correctly.
      for datPath in "${dest}"/*
      do
        [ -e "${datPath}" ] || continue
        dat="${datPath##*/}"
        echo "${dat}:"
        # Skip inodes already close to the maximum link count - they
        # cannot take additional hardlinks anyway.
        find "${dest}/${dat}" -type f -links -64001 -printf '%i %D-%m-%U-%G %p\n' >> \
          "${cacheDir}/${backupID}.inodes"
      done
    ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort and partition previous lists by $inode'
        return 0
      fi
      # Two scratch dirs for sort: the default tmp filesystem plus one
      # inside ${cacheDir} (which should have enough space).
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      rm -rf "${cacheDir}/${backupID}.inodes.sorted"
      mkdir "${cacheDir}/${backupID}.inodes.sorted"
      # Partition the sorted list into buckets keyed by the first (up to)
      # four digits of the inode number, so later per-inode lookups only
      # need to grep a small file.
      sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" | \
        while read -r line
        do
          part="${line:0:4}"
          part="${part%% *}"
          echo "${line}" >> \
            "${cacheDir}/${backupID}.inodes.sorted/part.${part}"
        done
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $count, $contentHash'
        return 0
      fi
      # For each distinct inode hash one representative file: the
      # parallel job strips the bookkeeping fields, sha512sums the path
      # and re-attaches "-$devInfo $count $inode" behind the hash.
      # NOTE(review): 'uniq -m' is not a stock GNU coreutils option -
      # this appears to rely on a patched uniq; confirm on the target
      # system before changing anything here.
      cat "${cacheDir}/${backupID}.inodes.sorted/"part.* | \
        uniq -cm2 | \
        parallel \
          sha512sum {=s/^ *\([[:digit:]]\+ \)\{2\}[0-9-]\+ //=} \| \
          sed '"s|^\([0-9a-f]\{128\}\) .*\$|\1'{=s/^ *\([[:digit:]]\+ [[:digit:]]\+\) \([0-9-]\+\) .*/-\\2 \\1/=}'|"' \
          \; > \
        "${cacheDir}/${backupID}.content"
    ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $contentHash'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      # Sort by content hash, then by link count descending, so the
      # most-linked inode of each group comes first.
      sort -T "${tmpDirA}" -T "${tmpDirB}" -k1,1 -k2nr,2 "${cacheDir}/${backupID}.content" > \
        "${cacheDir}/${backupID}.content.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find duplicate hashes'
        return 0
      fi
      # Keep only groups of lines with identical hash, strip hash and
      # count, then join each group's inode numbers onto one
      # space-separated line.
      (
        uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted"
        echo ""
      ) | \
        sed 's|^\(\S\+ \)\{2\}||' | \
        while read -r s
        do
          if [ -z "${s}" ]
          then
            echo ""
          else
            echo -n "${s} "
          fi
        done | \
        sed 's| $||' > \
        "${cacheDir}/${backupID}.duplicates"
    ;;
    6)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'remove inodes with duplicate hashes'
        return 0
      fi
      # ${cacheDir}/next.action remembers the inode that was about to be
      # processed when a previous run was interrupted; if present, fast
      # forward to that group (and, within the group, to that copy).
      if [ -r "${cacheDir}/next.action" ]
      then
        startInode="$(cat "${cacheDir}/next.action")"
        sed "
          :vor
          / ${startInode}\( \|$\)/{
            s@^\(\S\+ \)\(.* \)\?${startInode}\( \|$\)@\1${startInode}\3@
            bnach
          }
          d
          bvor
          :nach
          n
          bnach
        " "${cacheDir}/${backupID}.duplicates"
      else
        cat "${cacheDir}/${backupID}.duplicates"
      fi | \
        while read -r line
        do
          # The first inode on the line is kept; all others are
          # re-linked to it.
          originalInode="${line%% *}"
          original="$(
            grep -m1 "^${originalInode} " "${cacheDir}/${backupID}.inodes.sorted/part.${originalInode:0:4}" | \
              sed 's|^\S\+ ||'
          )"
          for kopieInode in ${line#* }
          do
            # Persist the checkpoint atomically (write + rename).
            echo "${kopieInode}" > "${cacheDir}/next.action2"
            mv "${cacheDir}/next.action2" "${cacheDir}/next.action"
            OIFS="${IFS}"
            # Split the path list on newline/tab only - paths may
            # contain spaces.
            IFS="$(printf '\n\t')"
            for kopie in $(
              grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted/part.${kopieInode:0:4}" | \
                sed 's|^\S\+ ||'
            )
            do
              IFS="${OIFS}"
              if ${paranoid}
              then
                # set -e aborts the whole script if the contents differ.
                diff "${original}" "${kopie}"
              fi
              # If the original is already close to the hardlink limit,
              # relink it onto the copy instead of the other way round.
              if [ $(stat -c'%h' "${original}") -ge 65000 ]
              then
                echo "rm \"${original}\""
                echo "ln \"${kopie}\" \"${original}\""
                if ! ${dummy}
                then
                  rm "${original}"
                  ln "${kopie}" "${original}"
                fi
              else
                echo "rm \"${kopie}\""
                echo "ln \"${original}\" \"${kopie}\""
                if ! ${dummy}
                then
                  rm "${kopie}"
                  ln "${original}" "${kopie}"
                fi
              fi
            done
          done
        done
      # All groups processed: drop the checkpoint (after sanity-checking
      # that it still refers to a known duplicate).
      if [ -r "${cacheDir}/next.action" ] && \
        grep -q " $(cat "${cacheDir}/next.action")\( \|$\)" "${cacheDir}/${backupID}.duplicates"
      then
        rm -f "${cacheDir}/next.action" "${cacheDir}/next.action2"
      fi
    ;;
  esac
}

# usage [exitCode]
# Print the help text to stderr, then exit with $1 (default 1).
usage() {
  >&2 echo \
'Usage: backupStatistics [OPTION]
Search and tidy duplicate and not-hardlinked files in the backups.
With no options, tidy up all backups.
THIS CAN BE VERY TIME CONSUMING.

Mandatory arguments to long options are mandatory for short options too.
  -d, --dummy         only generate lists, do not modify backupfiles
  -m, --max=maxNum    stop execution after step maxNum
  -p, --paranoid      test for file differences before relinking (test _should_ be obsolete)
  -s, --skip=skipNum  skip first skipNum steps
#HELPTEXT#
#
the executed steps are:'
  for ((stage=1; stage<=#NUMSTAGES#; stage++))
  do
    >&2 echo ''
    >&2 echo " ${stage}. $(do_stage ${stage} '##DESCRIBE##')"
  done
  >&2 echo ''
  [ -z "$1" ] && exit 1
  exit $1
}

eval set -- "$(
  getopt -o dm:ps: \
    --long dummy \
    --long help \
    --long max: \
    --long paranoid \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@" || \
    echo usage
)"

dummy=false
maxNum=#NUMSTAGES#
paranoid=false
skipNum=0

while true; do
  case "$1" in
    -d|--dummy)
      dummy=true
    ;;
    --help)
      usage 0
    ;;
    -m|--max)
      shift
      maxNum=$1
    ;;
    -p|--paranoid)
      paranoid=true
    ;;
    -s|--skip)
      shift
      skipNum=$1
    ;;
    --version)
      >&2 echo '#VERSION#'
      exit 0
    ;;
    --)
      shift
      # This script takes no operands; leftover arguments are an error.
      # (was "$#", which printed the *count* of parameters, not them)
      [ $# -gt 0 ] && echo 'ERROR: Unknown parameter: '"$*" && usage
      break
    ;;
    *)
      >&2 echo 'That should not happen, '"$1"' unknown though ...'
      # was 'exit -1' - negative exit codes are invalid
      exit 1
    ;;
  esac
  shift
done

if [ ! -d "${cacheDir}" ] || [ -z "${cacheDir}" ]
then
  # fixed: 'echo' was missing here, so the message itself was executed
  # as a command instead of being printed
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

if [ ! "${skipNum}" -ge 0 ] || \
  [ ! "${skipNum}" -le #NUMSTAGES# ] || \
  [ ! "${maxNum}" -ge 0 ] || \
  [ ! "${maxNum}" -le #NUMSTAGES# ]
then
  usage
fi

# Run the requested stages for every configured backup.
for ((stage=${skipNum}+1; stage<=${maxNum}; stage++))
do
  echo "entering stage ${stage} ($(do_stage ${stage} '##DESCRIBE##')) ..."
  for backupID in "${!backups[@]}"
  do
    echo "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  echo "... stage ${stage} completed."
done