diff options
author | Jim Meyering <meyering@redhat.com> | 2008-09-22 22:42:12 +0200 |
---|---|---|
committer | Jim Meyering <meyering@redhat.com> | 2008-09-27 00:10:08 +0200 |
commit | 24412edeaf556a96a5ee122851de7c3e37726bdb (patch) | |
tree | 54240c2ae7944434cb6ef9ed2ff69ba6c964c78b | |
parent | a5111af33ea6a5d27c3f7ab67afdb2a5884c38b6 (diff) | |
download | coreutils-24412edeaf556a96a5ee122851de7c3e37726bdb.tar.xz |
rm -r: avoid O(n^2) performance for a directory with very many entries
This enhancement works around a problem that is specific to at least
ext3 and ext4 file systems. With them, it would take hours to remove
a two-million-entry directory. RAM-backed file systems (tmpfs) are
not affected, since there is no seek penalty.
* remove.c (rm_malloc, rm_free, compare_ino): New functions.
(dirent_count, preprocess_dir): New function.
[struct readdir_data]: New struct.
(remove_cwd_entries): Call preprocess_dir.
* tests/rm/ext3-perf: New file. Test for the performance fix.
* NEWS: mention the new feature
-rw-r--r-- | NEWS | 7 | ||||
-rw-r--r-- | src/remove.c | 165 | ||||
-rw-r--r-- | tests/Makefile.am | 1 | ||||
-rwxr-xr-x | tests/rm/ext3-perf | 76 |
4 files changed, 249 insertions, 0 deletions
@@ -9,6 +9,13 @@ GNU coreutils NEWS -*- outline -*- ** New features + chgrp, chmod, chown, chcon, du, rm: now all display linear performance, + even when operating on million-entry directories on ext3 and ext4 file + systems. Before, they would exhibit O(N^2) performance, due to linear + per-entry seek time cost when operating on entries in readdir order. + Rm was improved directly, while the others inherit the improvement + from the newer version of fts in gnulib. + comm now verifies that the inputs are in sorted order. This check can be turned off with the --nocheck-order option. diff --git a/src/remove.c b/src/remove.c index 8143f5015..26f096e2c 100644 --- a/src/remove.c +++ b/src/remove.c @@ -60,6 +60,14 @@ enum CONSECUTIVE_READDIR_UNLINK_THRESHOLD = 10 }; +/* If the heuristics in preprocess_dir suggest that there + are fewer than this many entries in a directory, then it + skips the preprocessing altogether. */ +enum +{ + INODE_SORT_DIR_ENTRIES_THRESHOLD = 10000 +}; + /* FIXME: in 2009, or whenever Darwin 7.9.0 (aka MacOS X 10.3.9) is no longer relevant, remove this work-around code. Then, there will be no need to perform the extra rewinddir call, ever. */ @@ -217,6 +225,24 @@ hash_compare_strings (void const *x, void const *y) return STREQ (x, y) ? true : false; } +/* Obstack allocator: longjump on failure. */ +static void * +rm_malloc (void *v_jumpbuf, long size) +{ + jmp_buf *jumpbuf = v_jumpbuf; + void *p = malloc (size); + if (p) + return p; + longjmp (*jumpbuf, 1); +} + +/* With the 2-arg allocator, we must also provide a two-argument freer. */ +static void +rm_free (void *v_jumpbuf ATTRIBUTE_UNUSED, void *ptr) +{ + free (ptr); +} + static inline void push_dir (Dirstack_state *ds, const char *dir_name) { @@ -1225,6 +1251,141 @@ fd_to_subdirp (int fd_cwd, char const *f, return NULL; } +struct readdir_data +{ + ino_t ino; + char name[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* A comparison function to sort on increasing inode number. */ +static int +compare_ino (void const *av, void const *bv) +{ + struct readdir_data const *const *a = av; + struct readdir_data const *const *b = bv; + return (a[0]->ino < b[0]->ino ? -1 + : b[0]->ino < a[0]->ino ? 1 : 0); +} + +/* Return an approximation of the maximum number of dirent entries + in a directory with stat data *ST. */ +static size_t +dirent_count (struct stat const *st) +{ + return st->st_size / 16; +} + +/* When a directory contains very many entries, operating on N entries in + readdir order can be very seek-intensive (be it to unlink or even to + merely stat each entry), to the point that it results in O(N^2) work. + This is file-system-specific: ext3 and ext4 (as of 2008) are susceptible, + but tmpfs is not. The general solution is to process entries in inode + order. That means reading all entries, and sorting them before operating + on any. As such, it is useful only on systems with useful dirent.d_ino. + Since 'rm -r's removal process must traverse into directories and since + this preprocessing phase can allocate O(N) storage, here we store and + sort only non-directory entries, and then remove all of those, so that we + can free all allocated storage before traversing into any subdirectory. + Perform this optimization only when not interactive and not in verbose + mode, to keep the implementation simple and to minimize duplication. + Upon failure, simply free any resources and return. */ +static void +preprocess_dir (DIR **dirp, struct rm_options const *x) +{ +#if HAVE_STRUCT_DIRENT_D_TYPE + + struct stat st; + /* There are many reasons to return right away, skipping this + optimization. The penalty for being wrong is that we will + perform a small amount of extra work. + + Skip this optimization if... */ + + if (/* - there is a chance of interactivity */ + x->interactive != RMI_NEVER + + /* - we're in verbose mode */ + || x->verbose + + /* - privileged users can unlink nonempty directories. + Otherwise, there'd be a race condition between the readdir + call (in which we learn dirent.d_type) and the unlink, by + which time the non-directory may be replaced with a directory. */ + || ! cannot_unlink_dir () + + /* - we can't fstat the file descriptor */ + || fstat (dirfd (*dirp), &st) != 0 + + /* - the directory is smaller than some threshold. + Estimate the number of inodes with a heuristic. + There's no significant benefit to sorting if there are + too few inodes. */ + || dirent_count (&st) < INODE_SORT_DIR_ENTRIES_THRESHOLD) + return; + + /* FIXME: maybe test file system type, too; skip if it's tmpfs: see fts.c */ + + struct obstack o_readdir_data; /* readdir data: inode,name pairs */ + struct obstack o_p; /* an array of pointers to each inode,name pair */ + + /* Arrange to longjmp upon obstack allocation failure. */ + jmp_buf readdir_jumpbuf; + if (setjmp (readdir_jumpbuf)) + goto cleanup; + + obstack_init_minimal (&o_readdir_data); + obstack_init_minimal (&o_p); + + obstack_specify_allocation_with_arg (&o_readdir_data, 0, 0, + rm_malloc, rm_free, &readdir_jumpbuf); + obstack_specify_allocation_with_arg (&o_p, 0, 0, + rm_malloc, rm_free, &readdir_jumpbuf); + + /* Read all entries, storing <d_ino, d_name> for each non-dir one. + Maintain a parallel list of pointers into the primary buffer. */ + while (1) + { + struct dirent const *dp; + dp = readdir_ignoring_dot_and_dotdot (*dirp); + /* no need to distinguish EOF from failure */ + if (dp == NULL) + break; + + /* Skip known-directory and type-unknown entries. */ + if (D_TYPE (dp) == DT_UNKNOWN || D_TYPE (dp) == DT_DIR) + break; + + size_t name_len = strlen (dp->d_name); + size_t ent_len = offsetof (struct readdir_data, name) + name_len + 1; + struct readdir_data *v = obstack_alloc (&o_readdir_data, ent_len); + v->ino = D_INO (dp); + memcpy (v->name, dp->d_name, name_len + 1); + + /* Append V to the list of pointers. */ + obstack_ptr_grow (&o_p, v); + } + + /* Compute size and finalize VV. */ + size_t n = obstack_object_size (&o_p) / sizeof (void *); + struct readdir_data **vv = obstack_finish (&o_p); + + /* Sort on inode number. */ + qsort(vv, n, sizeof *vv, compare_ino); + + /* Iterate through those pointers, unlinking each name. */ + for (size_t i = 0; i < n; i++) + { + /* ignore failure */ + (void) unlinkat (dirfd (*dirp), vv[i]->name, 0); + } + + cleanup: + obstack_free (&o_readdir_data, NULL); + obstack_free (&o_p, NULL); + rewinddir (*dirp); +#endif +} + /* Remove entries in the directory open on DIRP Upon finding a directory that is both non-empty and that can be chdir'd into, return RM_OK and set *SUBDIR and fill in SUBDIR_SB, where @@ -1247,6 +1408,10 @@ remove_cwd_entries (DIR **dirp, assert (VALID_STATUS (status)); *subdir = NULL; + /* This is just an optimization. + It's not a fatal problem if it fails. */ + preprocess_dir (dirp, x); + while (1) { struct dirent const *dp; diff --git a/tests/Makefile.am b/tests/Makefile.am index bc656c427..e0377cc32 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -72,6 +72,7 @@ EXTRA_DIST += $(TESTS) TESTS = \ misc/help-version \ misc/invalid-opt \ + rm/ext3-perf \ rm/cycle \ chmod/no-x \ chgrp/basic \ diff --git a/tests/rm/ext3-perf b/tests/rm/ext3-perf new file mode 100755 index 000000000..e0439dffb --- /dev/null +++ b/tests/rm/ext3-perf @@ -0,0 +1,76 @@ +#!/bin/sh +# ensure that "rm -rf DIR-with-many-entries" is not O(N^2) + +# Copyright (C) 2008 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +if test "$VERBOSE" = yes; then + set -x + rm --version +fi + +. $srcdir/test-lib.sh + +expensive_ + +# Using rm -rf to remove a 400k-entry directory takes: +# - 9 seconds with the patch, on a 2-yr-old system +# - 350 seconds without the patch, on a high-end system (disk 20-30% faster) +threshold_seconds=60 + +# The number of entries in our test directory. +n=400000 + +# Choose a value that is large enough to ensure an accidentally +# regressed rm would require much longer than $threshold_seconds to remove +# the directory. With n=400k, pre-patch GNU rm would require about 350 +# seconds even on a fast disk. On a relatively modern system, the +# patched version of rm requires about 10 seconds, so even if you +# choose to enable very expensive tests with a disk that is much slower, +# the test should still succeed. + +# Skip unless "." is on an ext[34] file system. +# FIXME-maybe: try to find a suitable file system or allow +# the user to specify it via an envvar. +df -t ext3 -t ext4dev -t ext4 . \ + || skip_test_ 'this test runs only on an ext3 or ext4 file system' + +# Skip if there are too few inodes free. Require some slack. +free_inodes=$(stat -f --format=%d .) || framework_failure +min_free_inodes=$(expr 12 \* $n / 10) +test $min_free_inodes -lt $free_inodes \ + || skip_test_ "too few free inodes on '.': $free_inodes;" \ + "this test requires at least $min_free_inodes" + +ok=0 +mkdir d && + cd d && + seq $n | xargs touch && + test -f 1 && + test -f $n && + cd .. && + ok=1 +test $ok = 1 || framework_failure + +fail=0 +start=$(date +%s) +rm -rf d || fail=1 +duration=$(expr $(date +%s) - $start) +test $duration -lt $threshold_seconds || + { fail=1; echo rm took longer than $threshold_seconds seconds; } + +echo removing a $n-entry directory took $duration seconds + +Exit $fail |