summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore5
-rw-r--r--Makefile18
-rw-r--r--backup.conf3
-rw-r--r--backupStatistics.in205
-rw-r--r--hardlinkedbackups.1.in25
-rw-r--r--man.commons.in8
6 files changed, 256 insertions, 8 deletions
diff --git a/.gitignore b/.gitignore
index 50c0d58..284ebda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
backup
-hardlinkedbackups.1
+backupStatistics
lastBackups
+man.commons
+*.common
+*.1
diff --git a/Makefile b/Makefile
index 8b4d3e5..805b51d 100644
--- a/Makefile
+++ b/Makefile
@@ -26,22 +26,26 @@ MANDIR = /usr/share/man
VERSION = 1.1
-all: hardlinkedbackups.1 backup lastBackups
+all: backup backup.1 lastBackups lastBackups.1 backupStatistics backupStatistics.1
%: %.in
- sed "s/#VERSION#/$(VERSION)/; s@#BINDIR#@$(BINDIR)@; s@#ETCDIR#@$(ETCDIR)@" $< > $@
+ sed "s/#VERSION#/$(VERSION)/; s@#BINDIR#@$(BINDIR)@; s@#ETCDIR#@$(ETCDIR)@; s@#NUMSTAGES#@5@" $< > $@
+
+%.common: man.commons # per-program include file: man.commons without the lines starting with "<name>("
+ grep -v "^$(@:.common=)(" $< > $@
+
+%.1: % %.common # man page produced by help2man from running ./<name>, with <name>.common passed via --include
+ help2man -o "$@" -N --include $<.common --no-discard-stderr "./$<"
.PHONY: install dist clean
install: all
- install -D -m0755 -t $(DESTDIR)$(BINDIR) backup lastBackups
- install -D -m0644 -t $(DESTDIR)$(MANDIR)/man1 hardlinkedbackups.1
- ln -s $(DESTDIR)$(MANDIR)/man1/hardlinkedbackups.1 $(DESTDIR)$(MANDIR)/man1/backups.1
- ln -s $(DESTDIR)$(MANDIR)/man1/hardlinkedbackups.1 $(DESTDIR)$(MANDIR)/man1/lastBackups.1
+ install -D -m0755 -t $(DESTDIR)$(BINDIR) backup backupStatistics lastBackups
+ install -D -m0644 -t $(DESTDIR)$(MANDIR)/man1 backup.1 lastBackups.1 backupStatistics.1
install -D -m0644 -t $(DESTDIR)$(ETCDIR) backup.conf
clean:
- rm -f backup lastBackups hardlinkedbackups.1
+	rm -f -- $$(cat .gitignore) # unquoted on purpose: the shell must expand the glob patterns (*.1, *.common) that xargs passed on literally
dist: clean
git status --porcelain 2> /dev/null | grep -q "\S" && (git add .; git commit -m"neue Version: $(VERSION)") || true
diff --git a/backup.conf b/backup.conf
index 0b65488..82fa2c0 100644
--- a/backup.conf
+++ b/backup.conf
@@ -22,3 +22,6 @@ outdatedLimit=$[2*24*60*60]
# subdirectories which should be appended to the parent directory in the report
recognSubdirRegex="home\|boot\|root\|erich"
+
+# directory for caching valuable information in backupStatistics
+cacheDir="/path/to/cache/directory"
diff --git a/backupStatistics.in b/backupStatistics.in
new file mode 100644
index 0000000..d5511bf
--- /dev/null
+++ b/backupStatistics.in
@@ -0,0 +1,205 @@
+#!/bin/bash
+
+# backupStatistics version #VERSION#
+
+. #ETCDIR#/backup.conf
+
+do_stage() # $1: stage number; $2: '##DESCRIBE##' prints the stage's one-line description and returns, otherwise the stage runs for the caller's ${backupID}
+{
+ case $1 in
+ 1) # record "inode path" for every regular file below the backup destination
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'generate lists $filename -> $inode'
+ return 0
+ fi
+ dest="${backups["${backupID}"]%% *}" # first space-separated word of the backups entry
+ dest="${dest%/}" # drop one trailing slash
+ while [ ! -d "${dest}" ] && [ ${maxWait} -gt 0 ]
+ do # polls once per second while maxWait > 0; maxWait is not set in this script (presumably backup.conf - TODO confirm)
+ sleep 1
+ maxWait=$[${maxWait}-1]
+ done
+
+ rm -f "${cacheDir}/${backupID}.inodes"
+ for dat in $(ls "${dest}")
+ do # NOTE(review): iterating over ls output word-splits entries whose names contain whitespace
+ echo "${dat}:"
+ find "${dest}/${dat}" -type f -exec \
+ stat -c'%i %n' {} \; >> \
+ "${cacheDir}/${backupID}.inodes"
+ done
+ ;;
+ 2) # sort the stage-1 list and drop duplicate lines
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'sort previous lists by $inode'
+ return 0
+ fi
+ sort -u "${cacheDir}/${backupID}.inodes" > \
+ "${cacheDir}/${backupID}.inodes.sorted"
+ ;;
+ 3) # hash the listed files with sha512sum and store sorted "hash inode" lines; NOTE(review): GNU uniq documents no -m option - presumably meant to keep one line per inode, TODO confirm
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'generate lists $inode -> $contentHash'
+ return 0
+ fi
+ uniq -m1 "${cacheDir}/${backupID}.inodes.sorted" | \
+ parallel \
+ sha512sum {=s/^[[:digit:]]\+ //=} \| \
+ sed "\"s|^\([0-9a-f]\{128\}\) .*\$|\1 "{=s/^\([[:digit:]]\+\) .*/\\1/=}"|\"" \
+ \; | \
+ sort > \
+ "${cacheDir}/${backupID}.content"
+ ;;
+ 4) # join the inodes of consecutive lines sharing a hash onto one line, then strip the hash (same uniq -m caveat as stage 3)
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'find duplicate hashes'
+ return 0
+ fi
+ uniq -m1 -D "${cacheDir}/${backupID}.content" |
+ sed '
+ :a;
+ $!N;
+ s@^\(\S\+ \)\(.*\)\n\1@\1\2 @;
+ ta;
+ P;
+ D
+ ' | \
+ sed 's|^\S\+ ||' > \
+ "${cacheDir}/${backupID}.duplicates"
+ ;;
+ 5) # per line of the duplicates list: the first inode is the original, each further inode's file is to become a hard link to it
+ if [ "$2" == '##DESCRIBE##' ]
+ then
+ echo 'remove inodes with duplicate hashes'
+ return 0
+ fi
+ while read line
+ do
+ original="$(
+ grep "^${line%% *} " "${cacheDir}/${backupID}.inodes.sorted" | \
+ sed 's|^\S\+ ||'
+ )"
+ for kopieInode in ${line#* }
+ do
+ kopie="$(
+ grep "^${kopieInode} " "${cacheDir}/${backupID}.inodes.sorted" | \
+ sed 's|^\S\+ ||'
+ )"
+ if ${dummy}
+ then # dry run: only print the commands that would be executed
+ echo "rm \"${kopie}\""
+ echo "ln \"${original}\" \"${kopie}\""
+ else # real run is deliberately disabled: the script exits before reaching the placeholder line below
+ exit 1
+ DO NOT EXECUTE YET
+ # rm "${kopie}"
+ # ln "${original}" "${kopie}"
+ fi
+ done
+ done < \
+ "${cacheDir}/${backupID}.duplicates"
+ ;;
+ esac
+}
+
+usage()
+{
+  # Write the help text and every stage description to stderr, then exit with status $1 (1 if omitted or empty).
+  >&2 cat <<'EOF'
+Usage: backupStatistics [OPTION]
+Search and tidy duplicate and not-hardlinked files in the backups.
+
+With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.
+
+Mandatory arguments to long options are mandatory for short options too.
+ -d | --dummy only generate lists, do not modify backupfiles
+ -h | --help display this help and exit
+ -m | --max=maxNum stop execution after step maxNum
+ -s | --skip=skipNum skip first skipNum steps
+ -V | --version display version and exit
+
+the executed steps are:
+EOF
+  for ((stage=1; stage<=#NUMSTAGES#; stage++)); do
+    >&2 printf '\n %s. %s\n' "${stage}" "$(do_stage ${stage} '##DESCRIBE##')"
+  done
+  >&2 echo ''
+  exit "${1:-1}"
+}
+
+# Normalise the command line with getopt(1). If getopt fails, the substitution
+# additionally yields the word "usage" (presumably on a line of its own, so that
+# eval ends up calling the usage function - TODO confirm against getopt's output).
+eval set -- "$(
+  getopt -o dhm:s:V \
+    --long dummy,help,max:,skip:,version \
+    -n "$(basename "$0")" -- "$@" || \
+    echo usage
+)"
+
+# Defaults: not a dry run, stop after the last stage, skip no stage.
+dummy=false
+maxNum=#NUMSTAGES#
+skipNum=0
+
+while true; do # consume the getopt-normalised arguments until the "--" terminator
+  case "$1" in
+    -d|--dummy)
+      dummy=true
+      ;;
+    -h|--help)
+      usage 0
+      ;;
+    -m|--max)
+      shift # the option's value follows as the next argument
+      maxNum=$1
+      ;;
+    -s|--skip)
+      shift # the option's value follows as the next argument
+      skipNum=$1
+      ;;
+    -V|--version)
+      >&2 echo '#VERSION#'
+      exit 0
+      ;;
+    --)
+      shift # anything still left after "--" is a positional argument, which is rejected below
+      [ $# -gt 0 ] && >&2 echo 'ERROR: Unknown parameter: '"$*" && usage
+      break
+      ;;
+    *)
+      >&2 echo 'That should not happen, '"$1"' unknown though ...'
+      exit -1
+      ;;
+  esac
+  shift
+done
+
+if [ ! -d "${cacheDir}" ] || [ -z "${cacheDir}" ]
+then # cacheDir is not set in this script; it is expected from the sourced backup.conf
+  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
+  exit 1
+fi
+
+if ! [ "${skipNum}" -ge 0 ] || \
+   ! [ "${skipNum}" -le #NUMSTAGES# ] || \
+   ! [ "${maxNum}" -ge 0 ] || \
+   ! [ "${maxNum}" -le #NUMSTAGES# ]
+then # also taken for non-integer values: '[' then fails with status 2, which the outer '!' turns into "invalid"
+  usage
+fi
+
+for ((stage=${skipNum}+1; stage<=${maxNum}; stage++))
+do # run stages skipNum+1 .. maxNum, i.e. skip the first skipNum stages and stop after stage maxNum
+  echo "entering stage ${stage} ..."
+  for backupID in "${!backups[@]}"
+  do # one pass per key of the backups array (not defined in this script; presumably backup.conf)
+    echo "${backupID}:"
+    do_stage ${stage} "${backupID}"
+  done
+  echo "... stage ${stage} completed."
+done
diff --git a/hardlinkedbackups.1.in b/hardlinkedbackups.1.in
index 914edbe..7902568 100644
--- a/hardlinkedbackups.1.in
+++ b/hardlinkedbackups.1.in
@@ -6,9 +6,12 @@ hardlinkedbackups \- hardlinked incremental backups via rsync (and possibly thro
\fBbackup /tmp/pidFile /path/to/destination/ user@source:path [proxy_user@ssh_host]\fP
.TP
\fBlastBackups\fP
+.TP
+\fBbackupStatistics [options]\fP
.SH DESCRIPTION
\fBbackup\fP generates incremental backups (by hardlinking old unchanged files) via rsync and possibly a SOCKS\-tunnel.
\fBlastBackups\fP shows date of backups and warns about outdated ones.
+\fBbackupStatistics\fP exhaustively searches backups for duplicate but not hard-linked files.
.SH USAGE
.TP
.B "backup /tmp/pidFile /path/to/destination/ user@source:path"
@@ -24,6 +27,25 @@ This is designed to be called from a cron daemon for daily backups.
.TP
.B "lastBackups"
reports about actuality of backups defined in \fB#ETCDIR#/backup.conf\fP
+.TP
+.B "backupStatistics"
+Exhaustively search backups defined in \fB#ETCDIR#/backup.conf\fP for duplicate files which are not hard linked.
+For details, see section \fBBACKUPSTATISTICS\fP.
+.SH BACKUPSTATISTICS
+execution happens in several stages:
+.TP
+1.
+.TP
+blabla
+.TP
+.B "-d | --dummy"
+do not modify backed up files
+.TP
+.B "-m | --max \fImaxStage\fP"
+stop execution after completing stage \fImaxStage\fP
+.TP
+.B "-s | --skip \fIskipStages\fP"
+start execution right after stage \fIskipStages\fP
.SH CONFIGURATION
The configfile \fB#ETCDIR#/backup.conf\fP is a bash script, which defines the following variables:
.TP
@@ -41,6 +63,9 @@ time before backups are considered outdated [seconds]
.TP
.B "recognSubdirRegex"
regular expression of subdirectories which should be appended to the parent directory in the report
+.TP
+.B "cacheDir"
+directory for caching valuable information in \fBbackupStatistics\fP
.SH FILES
.TP
.B "#BINDIR#/backup"
diff --git a/man.commons.in b/man.commons.in
new file mode 100644
index 0000000..45918c7
--- /dev/null
+++ b/man.commons.in
@@ -0,0 +1,8 @@
+[FILES]
+#ETCDIR#/backup.conf
+[AUTHOR]
+Erich Eckner <opensource at eckner dot net>.
+[SEE ALSO]
+backup(1)
+lastBackups(1)
+backupStatistics(1)