summary | refs | log | tree | commit | diff
path: root/gl/lib/di-set.c
diff options
context:
space:
mode:
author: Paul Eggert <eggert@cs.ucla.edu> 2010-07-06 14:53:14 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2010-07-06 14:58:48 -0700
commit: fb1a26c3f64669a1b61740252c5db5fd5413c7e5 (patch)
tree: 994631f1859b0adbe74bff895919f1636f798e81 /gl/lib/di-set.c
parent: d5427265e30522cfda098bb82ad3d4bff0a0d2bd (diff)
download: coreutils-fb1a26c3f64669a1b61740252c5db5fd5413c7e5.tar.xz
du: Hash with a mechanism that's simpler and takes less memory.
* gl/lib/dev-map.c, gl/lib/dev-map.h, gl/modules/dev-map: Remove.
* gl/lib/ino-map.c, gl/lib/ino-map.h, gl/modules/ino-map: New files.
* gl/modules/dev-map-tests, gl/tests/test-dev-map.c: Remove.
* gl/modules/ino-map-tests, gl/tests/test-ino-map.c: New files.
* gl/lib/di-set.h (struct di_set): Renamed from struct di_set_state, and now private. All uses changed.
(_ATTRIBUTE_NONNULL_): Don't assume C99.
(di_set_alloc): Renamed from di_set_init, with no size arg. Now allocates the object rather than initializing it. For now, this no longer takes an initial size; we can put this back later if it is needed.
* gl/lib/di-set.c: Include hash.h, ino-map.h, and limits.h instead of stdio.h, assert.h, stdint.h, sys/types.h (di-set.h includes that now), sys/stat.h, and verify.h.
(N_DEV_BITS_4, N_INO_BITS_4, N_DEV_BITS_8, N_INO_BITS_8): Remove.
(struct dev_ino_4, struct dev_ino_8, struct dev_ino_full): Remove.
(enum di_mode): Remove.
(hashint): New typedef.
(HASHINT_MAX, LARGE_INO_MIN): New macros.
(struct di_ent): Now maps a dev_t to an inode set, instead of containing a union.
(struct dev_map_ent): Remove.
(struct di_set): New type.
(is_encoded_ptr, decode_ptr, di_ent_create): Remove.
(di_ent_hash, di_ent_compare, di_ent_free, di_set_alloc, di_set_free):
(di_set_insert): Adjust to new representation.
(di_ino_hash, map_device, map_inode_number): New functions.
* gl/modules/di-set (Depends-on): Replace dev-map with ino-map. Remove 'verify'.
* gl/tests/test-di-set.c: Adjust to the above changes to API.
* src/du.c (INITIAL_DI_SET_SIZE): Remove.
(hash_ins, main): Adjust to new di-set API.
Diffstat (limited to 'gl/lib/di-set.c')
-rw-r--r-- gl/lib/di-set.c 353
1 files changed, 156 insertions, 197 deletions
diff --git a/gl/lib/di-set.c b/gl/lib/di-set.c
index 3c4717b73..e0e2b24dd 100644
--- a/gl/lib/di-set.c
+++ b/gl/lib/di-set.c
@@ -15,262 +15,221 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
-/* written by Jim Meyering */
+/* written by Paul Eggert and Jim Meyering */
#include <config.h>
#include "di-set.h"
-#include <stdio.h>
-#include <assert.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "verify.h"
-
-/* Set operations for device-inode pairs stored in a space-efficient manner.
- A naive mapping uses 16 bytes to save a single st_dev, st_ino pair.
- However, in many applications, the vast majority of actual device,inode
- number pairs can be efficiently compressed to fit in 8 or even 4 bytes,
- by using a separate table to map a relatively small number of devices
- to small integers. */
-
-#define N_DEV_BITS_4 5
-#define N_INO_BITS_4 (32 - N_DEV_BITS_4 - 2 - 1)
-
-#define N_DEV_BITS_8 8
-#define N_INO_BITS_8 (64 - N_DEV_BITS_8 - 2 - 1)
-
-/* Note how the last bit is always set.
- This is required, in order to be able to distinguish
- an encoded di_ent value from a malloc-returned pointer,
- which must be 4-byte-aligned or better. */
-struct dev_ino_4
-{
- uint32_t mode:2; /* must be first */
- uint32_t short_ino:N_INO_BITS_4;
- uint32_t mapped_dev:N_DEV_BITS_4;
- uint32_t always_set:1;
-};
-verify (N_DEV_BITS_4 <= 8 * sizeof (int));
-verify (sizeof (struct dev_ino_4) == 4);
-
-struct dev_ino_8
-{
- uint32_t mode:2; /* must be first */
- uint64_t short_ino:N_INO_BITS_8;
- uint32_t mapped_dev:N_DEV_BITS_8;
- uint32_t always_set:1;
-};
-verify (sizeof (struct dev_ino_8) == 8);
-
-struct dev_ino_full
-{
- uint32_t mode:2; /* must be first */
- dev_t dev;
- ino_t ino;
-};
+#include "hash.h"
+#include "ino-map.h"
-enum di_mode
-{
- DI_MODE_4 = 1,
- DI_MODE_8 = 2,
- DI_MODE_FULL = 3
-};
+#include <limits.h>
+#include <stdlib.h>
-/*
- di_mode raw_inode mapped dev always_set
- \____________|_______________\_____/
- 4-byte | 2| 25 | 5 |1| mapped_dev
- `----------------------------------------------------|-----.
- 8-byte | 2| 53 | 8 |1|
- `----------------------------------------------------------'
-*/
+/* The hash package hashes "void *", but this package wants to hash
+ integers. Use integers that are as large as possible, but no
+ larger than void *, so that they can be cast to void * and back
+ without losing information. */
+typedef size_t hashint;
+#define HASHINT_MAX ((hashint) -1)
+
+/* Integers represent inode numbers. Integers in the range
+ 1..(LARGE_INO_MIN-1) represent inode numbers directly. (The hash
+ package does not work with null pointers, so inode 0 cannot be used
+ as a key.) To find the representations of other inode numbers, map
+ them through INO_MAP. */
+#define LARGE_INO_MIN (HASHINT_MAX / 2)
+
+/* Set operations for device-inode pairs stored in a space-efficient
+ manner. Use a two-level hash table. The top level hashes by
+ device number, as there are typically a small number of devices.
+ The lower level hashes by mapped inode numbers. In the typical
+ case where the inode number is positive and small, the inode number
+ maps to itself, masquerading as a void * value; otherwise, its
+ value is the result of hashing the inode value through INO_MAP. */
+
+/* A pair that maps a device number to a set of inode numbers. */
struct di_ent
{
- union
- {
- struct dev_ino_4 di4;
- struct dev_ino_8 di8;
- struct dev_ino_full full;
- uint32_t u32;
- uint64_t u64;
- void *ptr;
- } u;
-};
-
-struct dev_map_ent
-{
dev_t dev;
- uint32_t mapped_dev;
+ struct hash_table *ino_set;
};
-static inline bool
-is_encoded_ptr (struct di_ent const *v)
+/* A two-level hash table that manages and indexes these pairs. */
+struct di_set
{
- return (size_t) v % 4;
-}
+ /* Map device numbers to sets of inode number representatives. */
+ struct hash_table *dev_map;
-static struct di_ent
-decode_ptr (struct di_ent const *v)
-{
- if (!is_encoded_ptr (v))
- return *v;
+ /* If nonnull, map large inode numbers to their small
+ representatives. If null, there are no large inode numbers in
+ this set. */
+ struct ino_map *ino_map;
- struct di_ent di;
- di.u.ptr = (void *) v;
- return di;
-}
+ /* Cache of the most recently allocated and otherwise-unused storage
+ for probing this table. */
+ struct di_ent *probe;
+};
+/* Hash a device-inode-set entry. */
static size_t
di_ent_hash (void const *x, size_t table_size)
{
- struct di_ent e = decode_ptr (x);
- return (e.u.di4.mode == DI_MODE_4
- ? e.u.u32
- : (e.u.di4.mode == DI_MODE_8
- ? e.u.u64
- : e.u.full.ino)) % table_size;
+ struct di_ent const *p = x;
+ dev_t dev = p->dev;
+
+ /* Exclusive-OR the words of DEV into H. This avoids loss of info,
+ without using a wider % that could be quite slow. */
+ size_t h = dev;
+ int i;
+ for (i = 1; i < sizeof dev / sizeof h + (sizeof dev % sizeof h != 0); i++)
+ h ^= dev >>= CHAR_BIT * sizeof h;
+
+ return h % table_size;
}
-/* Compare two di_ent structs.
- Return true if they are the same. */
+/* Return true if two device-inode-set entries are the same. */
static bool
di_ent_compare (void const *x, void const *y)
{
- struct di_ent a = decode_ptr (x);
- struct di_ent b = decode_ptr (y);
- if (a.u.di4.mode != b.u.di4.mode)
- return false;
-
- if (a.u.di4.mode == DI_MODE_4)
- return (a.u.di4.short_ino == b.u.di4.short_ino
- && a.u.di4.mapped_dev == b.u.di4.mapped_dev);
-
- if (a.u.di8.mode == DI_MODE_8)
- return (a.u.di8.short_ino == b.u.di8.short_ino
- && a.u.di8.mapped_dev == b.u.di8.mapped_dev);
-
- return (a.u.full.ino == b.u.full.ino
- && a.u.full.dev == b.u.full.dev);
+ struct di_ent const *a = x;
+ struct di_ent const *b = y;
+ return a->dev == b->dev;
}
+/* Free a device-inode-set entry. */
static void
di_ent_free (void *v)
{
- if ( ! is_encoded_ptr (v))
- free (v);
+ struct di_ent *a = v;
+ hash_free (a->ino_set);
+ free (a);
}
-int
-di_set_init (struct di_set_state *dis, size_t initial_size)
+/* Create a set of device-inode pairs. Return NULL on allocation failure. */
+struct di_set *
+di_set_alloc (void)
{
- if (dev_map_init (&dis->dev_map) < 0)
- return -1;
+ struct di_set *dis = malloc (sizeof *dis);
+ if (dis)
+ {
+ enum { INITIAL_DEV_MAP_SIZE = 11 };
+ dis->dev_map = hash_initialize (INITIAL_DEV_MAP_SIZE, NULL,
+ di_ent_hash, di_ent_compare,
+ di_ent_free);
+ if (! dis->dev_map)
+ {
+ free (dis);
+ return NULL;
+ }
+ dis->ino_map = NULL;
+ dis->probe = NULL;
+ }
- dis->di_set = hash_initialize (initial_size, NULL,
- di_ent_hash, di_ent_compare, di_ent_free);
- return dis->di_set ? 0 : -1;
+ return dis;
}
+/* Free a set of device-inode pairs. */
void
-di_set_free (struct di_set_state *dis)
+di_set_free (struct di_set *dis)
{
- dev_map_free (&dis->dev_map);
- hash_free (dis->di_set);
+ hash_free (dis->dev_map);
+ free (dis->ino_map);
+ free (dis->probe);
+ free (dis);
}
-/* Given a device-inode set, DIS, create an entry for the DEV,INO
- pair, and store it in *V. If possible, encode DEV,INO into the pointer
- itself, but if not, allocate space for a full "struct di_ent" and set *V
- to that pointer. Upon memory allocation failure, return -1.
- Otherwise return 0. */
-int
-di_ent_create (struct di_set_state *dis,
- dev_t dev, ino_t ino,
- struct di_ent **v)
+/* Hash an encoded inode number I. */
+static size_t
+di_ino_hash (void const *i, size_t table_size)
{
- static int prev_m = -1;
- static dev_t prev_dev = -1;
- struct di_ent di_ent;
- int mapped_dev;
+ return (hashint) i % table_size;
+}
- if (dev == prev_dev)
- mapped_dev = prev_m;
+/* Using the DIS table, map a device to a hash table that represents
+ a set of inode numbers. Return NULL on error. */
+static struct hash_table *
+map_device (struct di_set *dis, dev_t dev)
+{
+ /* Find space for the probe, reusing the cache if available. */
+ struct di_ent *ent;
+ struct di_ent *probe = dis->probe;
+ if (probe)
+ {
+ /* If repeating a recent query, return the cached result. */
+ if (probe->dev == dev)
+ return probe->ino_set;
+ }
else
{
- mapped_dev = dev_map_insert (&dis->dev_map, dev);
- if (mapped_dev < 0)
- return -1;
- prev_dev = dev;
- prev_m = mapped_dev;
+ dis->probe = probe = malloc (sizeof *probe);
+ if (! probe)
+ return NULL;
}
- if (mapped_dev < (1 << N_DEV_BITS_4)
- && ino < (1 << N_INO_BITS_4))
+ /* Probe for the device. */
+ probe->dev = dev;
+ ent = hash_insert (dis->dev_map, probe);
+ if (! ent)
+ return NULL;
+
+ if (ent != probe)
{
-#if lint
- /* When this struct is smaller than a pointer, initialize
- the pointer so tools like valgrind don't complain about
- the uninitialized bytes. */
- if (sizeof di_ent.u.di4 < sizeof di_ent.u.ptr)
- di_ent.u.ptr = NULL;
-#endif
- di_ent.u.di4.mode = DI_MODE_4;
- di_ent.u.di4.short_ino = ino;
- di_ent.u.di4.mapped_dev = mapped_dev;
- di_ent.u.di4.always_set = 1;
- *v = di_ent.u.ptr;
+ /* Use the existing entry. */
+ probe->ino_set = ent->ino_set;
}
- else if (mapped_dev < (1 << N_DEV_BITS_8)
- && ino < ((uint64_t) 1 << N_INO_BITS_8))
+ else
{
- di_ent.u.di8.mode = DI_MODE_8;
- di_ent.u.di8.short_ino = ino;
- di_ent.u.di8.mapped_dev = mapped_dev;
- di_ent.u.di8.always_set = 1;
- *v = di_ent.u.ptr;
+ enum { INITIAL_INO_SET_SIZE = 1021 };
+
+ /* Prepare to allocate a new probe next time; this one is in use. */
+ dis->probe = NULL;
+
+ /* DEV is new; allocate an inode set for it. */
+ probe->ino_set = hash_initialize (INITIAL_INO_SET_SIZE, NULL,
+ di_ino_hash, NULL, NULL);
}
- else
+
+ return probe->ino_set;
+}
+
+/* Using the DIS table, map an inode number to a mapped value.
+ Return INO_MAP_INSERT_FAILURE on error. */
+static hashint
+map_inode_number (struct di_set *dis, ino_t ino)
+{
+ if (0 < ino && ino < LARGE_INO_MIN)
+ return ino;
+
+ if (! dis->ino_map)
{
- /* Handle the case in which INO is too large or in which (far less
- likely) we encounter hard-linked files on 2^N_DEV_BITS_8
- different devices. */
- struct di_ent *p = malloc (sizeof *p);
- if (!p)
- return -1;
- assert ((size_t) p % 4 == 0);
- p->u.full.mode = DI_MODE_FULL;
- p->u.full.ino = ino;
- p->u.full.dev = dev;
- *v = p;
+ dis->ino_map = ino_map_alloc (LARGE_INO_MIN);
+ if (! dis->ino_map)
+ return INO_MAP_INSERT_FAILURE;
}
- return 0;
+ return ino_map_insert (dis->ino_map, ino);
}
-/* Attempt to insert the DEV,INO pair into the set, DIS.
- If it matches a pair already in DIS, don't modify DIS and return 0.
+/* Attempt to insert the DEV,INO pair into the set DIS.
+ If it matches a pair already in DIS, keep that pair and return 0.
Otherwise, if insertion is successful, return 1.
Upon any failure return -1. */
int
-di_set_insert (struct di_set_state *dis, dev_t dev, ino_t ino)
+di_set_insert (struct di_set *dis, dev_t dev, ino_t ino)
{
- struct di_ent *v;
- if (di_ent_create (dis, dev, ino, &v) < 0)
- return -1;
+ hashint i;
- int err = hash_insert0 (dis->di_set, v, NULL);
- if (err == -1) /* Insertion failed due to lack of memory. */
+ /* Map the device number to a set of inodes. */
+ struct hash_table *ino_set = map_device (dis, dev);
+ if (! ino_set)
return -1;
- if (err == 1) /* Insertion succeeded. */
- return 1;
-
- /* That pair is already in the table, so ENT was not inserted. Free it. */
- if (! is_encoded_ptr (v))
- free (v);
+ /* Map the inode number to a small representative I. */
+ i = map_inode_number (dis, ino);
+ if (i == INO_MAP_INSERT_FAILURE)
+ return -1;
- return 0;
+ /* Put I into the inode set. */
+ return hash_insert0 (ino_set, (void *) i, NULL);
}