Revamp to allow fine-tuning to control when and by how

much the table grows and shrinks. (next_prime): Don't assert. (hash_reset_tuning): New function. (check_tuning): New function. (hash_initialize): Accept and use new tuning parameter. (hash_rehash): Rewrite, updating for tuning. (hash_insert): Honor tuning semantics. (hash_delete): Likewise. From François Pinard.
author: Jim Meyering <jim@meyering.net> 1999-03-15 15:50:31 +0000
committer: Jim Meyering <jim@meyering.net> 1999-03-15 15:50:31 +0000
commit: 0a1a14a0958643bc5dabc2d0b0901dca08fca722 (patch)
tree: 2d4589b6aad75188449a745799487f488427a00a /lib/hash.c
parent: 674d2ec393f69b015b3ff80343a431b81f94322f (diff)
download: coreutils-0a1a14a0958643bc5dabc2d0b0901dca08fca722.tar.xz
1 files changed, 231 insertions, 100 deletions
diff --git a/lib/hash.c b/lib/hash.c
index d40789846..2a5f5c699 100644
--- a/lib/hash.c
+++ b/lib/hash.c
@@ -76,10 +76,36 @@ char *malloc ();
    become inordinate, as unused slots in the hash table take some space.  The
    best bet is to make sure you are using a good `hasher' function (beware
    that those are not that easy to write! :-), and to use a table size at
-   least bigger than the actual number of entries.
-
-   Currently, whenever the addition of an entry gets 80% of buckets to be
-   non-empty, this package automatically doubles the number of buckets.  */
+   least bigger than the actual number of entries.  */
+
+/* If, while adding an to a bucket which was empty, the ratio of used buckets
+   over table size goes over the growth threshold (a number between 0.0 and
+   1.0), then reorganise the table size bigger by the growth factor (a number
+   greater than 1.0).  The growth threshold defaults to 0.8, and the growth
+   factor defaults to 1.414, meaning that the table will have doubled its size
+   every second time 80% of the buckets get used.  */
+#define DEFAULT_GROWTH_THRESHOLD 0.8
+#define DEFAULT_GROWTH_FACTOR 1.414
+
+/* If, while emptying a bucket, the ratio of used buckets over table size
+   drops below the shrink threshold (a number between 0.0 and 1.0), then
+   reorganise the table size smaller through the usage of a shrink factor (a
+   number greater than the shrink threshold but smaller than 1.0).  The shrink
+   threshold and factor default to 0.0 and 1.0, meaning that the table never
+   shrinks.  */
+#define DEFAULT_SHRINK_THRESHOLD 0.0
+#define DEFAULT_SHRINK_FACTOR 1.0
+
+/* Use this to initialize or reset a TUNING structure to
+   some sensible values. */
+static const Hash_tuning default_tuning =
+  {
+    DEFAULT_SHRINK_THRESHOLD,
+    DEFAULT_SHRINK_FACTOR,
+    DEFAULT_GROWTH_THRESHOLD,
+    DEFAULT_GROWTH_FACTOR,
+    false
+  };
 
 /* Information and lookup.  */
 
@@ -91,7 +117,7 @@ char *malloc ();
    number of buckets (used plus unused), or the maximum number of slots, are
    the same quantity.  */
 
-unsigned int
+unsigned
 hash_get_n_buckets (const Hash_table *table)
 {
   return table->n_buckets;
@@ -99,7 +125,7 @@ hash_get_n_buckets (const Hash_table *table)
 
 /* Return the number of slots in use (non-empty buckets).  */
 
-unsigned int
+unsigned
 hash_get_n_buckets_used (const Hash_table *table)
 {
   return table->n_buckets_used;
@@ -107,7 +133,7 @@ hash_get_n_buckets_used (const Hash_table *table)
 
 /* Return the number of active entries.  */
 
-unsigned int
+unsigned
 hash_get_n_entries (const Hash_table *table)
 {
   return table->n_entries;
@@ -115,18 +141,18 @@ hash_get_n_entries (const Hash_table *table)
 
 /* Return the length of the most lenghty chain (bucket).  */
 
-unsigned int
+unsigned
 hash_get_max_bucket_length (const Hash_table *table)
 {
   struct hash_entry *bucket;
-  unsigned int max_bucket_length = 0;
+  unsigned max_bucket_length = 0;
 
   for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
     {
       if (bucket->data)
 	{
 	  struct hash_entry *cursor = bucket;
-	  unsigned int bucket_length = 1;
+	  unsigned bucket_length = 1;
 
 	  while (cursor = cursor->next, cursor)
 	    bucket_length++;
@@ -146,8 +172,8 @@ bool
 hash_table_ok (const Hash_table *table)
 {
   struct hash_entry *bucket;
-  unsigned int n_buckets_used = 0;
-  unsigned int n_entries = 0;
+  unsigned n_buckets_used = 0;
+  unsigned n_entries = 0;
 
   for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
     {
@@ -174,10 +200,10 @@ hash_table_ok (const Hash_table *table)
 void
 hash_print_statistics (const Hash_table *table, FILE *stream)
 {
-  unsigned int n_entries = hash_get_n_entries (table);
-  unsigned int n_buckets = hash_get_n_buckets (table);
-  unsigned int n_buckets_used = hash_get_n_buckets_used (table);
-  unsigned int max_bucket_length = hash_get_max_bucket_length (table);
+  unsigned n_entries = hash_get_n_entries (table);
+  unsigned n_buckets = hash_get_n_buckets (table);
+  unsigned n_buckets_used = hash_get_n_buckets_used (table);
+  unsigned max_bucket_length = hash_get_max_bucket_length (table);
 
   fprintf (stream, "# entries:         %u\n", n_entries);
   fprintf (stream, "# buckets:         %u\n", n_buckets);
@@ -186,8 +212,8 @@ hash_print_statistics (const Hash_table *table, FILE *stream)
   fprintf (stream, "max bucket length: %u\n", max_bucket_length);
 }
 
-/* If an entry from table, TABLE, matches ENTRY, return the one from
-   the table.  Otherwise, return NULL. */
+/* Return the user entry from the hash table, if some entry in the hash table
+   compares equally with ENTRY, or NULL otherwise. */
 
 void *
 hash_lookup (const Hash_table *table, const void *entry)
@@ -263,11 +289,11 @@ hash_get_next (const Hash_table *table, const void *entry)
    return the number of pointers copied.  Do not copy more than BUFFER_SIZE
    pointers.  */
 
-unsigned int
+unsigned
 hash_get_entries (const Hash_table *table, void **buffer,
-		  unsigned int buffer_size)
+		  unsigned buffer_size)
 {
-  unsigned int counter = 0;
+  unsigned counter = 0;
   struct hash_entry *bucket;
   struct hash_entry *cursor;
 
@@ -295,11 +321,11 @@ hash_get_entries (const Hash_table *table, void **buffer,
    as received.  The walking continue for as long as the PROCESSOR function
    returns nonzero.  When it returns zero, the walking is interrupted.  */
 
-unsigned int
+unsigned
 hash_do_for_each (const Hash_table *table, Hash_processor processor,
 		  void *processor_data)
 {
-  unsigned int counter = 0;
+  unsigned counter = 0;
   struct hash_entry *bucket;
   struct hash_entry *cursor;
 
@@ -332,8 +358,8 @@ hash_do_for_each (const Hash_table *table, Hash_processor processor,
    algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
    may not be good for your application."  */
 
-unsigned int
-hash_string (const char *string, unsigned int n_buckets)
+unsigned
+hash_string (const char *string, unsigned n_buckets)
 {
 # ifndef CHAR_BIT
 #  define CHAR_BIT 8
@@ -360,8 +386,8 @@ hash_string (const char *string, unsigned int n_buckets)
    very old Cyber `snoop', itself written in typical Greg Mansfield style.
    (By the way, what happened to this excellent man?  Is he still alive?)  */
 
-unsigned int
-hash_string (const char *string, unsigned int n_buckets)
+unsigned
+hash_string (const char *string, unsigned n_buckets)
 {
   unsigned value = 0;
 
@@ -393,12 +419,14 @@ is_prime (unsigned long candidate)
 }
 
 /* Round a given CANDIDATE number up to the nearest prime, and return that
-   prime.  CANDIDATE should be at least equal to 10.  */
+   prime.  Primes lower than 10 are merely skipped.  */
 
 static unsigned long
 next_prime (unsigned long candidate)
 {
-  assert (candidate >= 10);
+  /* Skip small primes.  */
+  if (candidate < 10)
+    candidate = 10;
 
   /* Make it definitely odd.  */
   candidate |= 1;
@@ -409,33 +437,72 @@ next_prime (unsigned long candidate)
   return candidate;
 }
 
-/* Allocate and return a new hash table, or NULL if an error is met.  The
-   initial number of buckets would be at least CANDIDATE (which need not be
-   prime).
+void
+hash_reset_tuning (Hash_tuning *tuning)
+{
+  *tuning = default_tuning;
+}
 
-   If DATA_FREER is not NULL, this function may be later called with the data
-   as an argument, just before they entry containing the data gets freed.  The
-   HASHER function should be supplied, and FIXME.  The COMPARATOR function
-   should also be supplied, and FIXME.  */
+/* For the given hash TABLE, check the user supplied tuning structure for
+   reasonable values, and return true if there is no gross error with it.
+   Otherwise, definitvely reset the TUNING field to some acceptable default in
+   the hash table (that is, the user looses the right of further modifying
+   tuning arguments), and return false.  */
 
-    /* User-supplied function for freeing datas.  It is specified in
-       hash_initialize.  If non-null, it is used by hash_free and hash_clear.
-       You should specify `free' here only if you want these functions to free
-       all of your `data' data.  This is typically the case when your data is
-       simply an auxilliary struct that you have malloc'd to aggregate several
-       values.  */
+static bool
+check_tuning (Hash_table *table)
+{
+  const Hash_tuning *tuning = table->tuning;
+
+  if (tuning->growth_threshold > 0.0
+      && tuning->growth_threshold < 1.0
+      && tuning->growth_factor > 1.0
+      && tuning->shrink_threshold >= 0.0
+      && tuning->shrink_threshold < 1.0
+      && tuning->shrink_factor > tuning->shrink_threshold
+      && tuning->shrink_factor <= 1.0
+      && tuning->shrink_threshold < tuning->growth_threshold)
+    return true;
 
-    /* User-supplied hash function that hashes entry ENTRY to an integer in
-       the range 0..TABLE_SIZE-1.  */
+  table->tuning = &default_tuning;
+  return false;
+}
 
-    /* User-supplied function that determines whether a new entry is unique by
-       comparing the new entry to entries that hashed to the same bucket
-       index.  It should return zero for a pair of entries that compare equal,
-       non-zero otherwise.  */
+/* Allocate and return a new hash table, or NULL if an error is met.  The
+   initial number of buckets is automatically selected so to _guarantee_ that
+   you may insert at least CANDIDATE different user entries before any growth
+   of the hash table size occurs.  So, if you happen to know beforehand the
+   number of entries you intend to insert in the hash table, you may save some
+   table memory and insertion time, by specifying it here.  If the
+   IS_N_BUCKETS field of the TUNING structure is true, the CANDIDATE argument
+   has its meaning changed to the wanted number of buckets.
+
+   TUNING points to a structure of user-supplied values, in case some fine
+   tuning is wanted over the default behaviour of the hasher.  If TUNING is
+   NULL, proper defaults are used instead.
+
+   The user-supplied HASHER function should be provided.  It accepts two
+   arguments ENTRY and TABLE_SIZE.  It computes, by hasing ENTRY contents, a
+   slot number for that entry which should be in the range 0..TABLE_SIZE-1.
+   This slot number is then returned.
+
+   The user-supplied COMPARATOR function should be provided.  It accepts two
+   arguments pointing to user data, it then returns true for a pair of entries
+   that compare equal, or false otherwise.  This function is internally called
+   on entries which are already known to hash to the same bucket index.
+
+   The user-supplied DATA_FREER function, when not NULL, may be later called
+   with the user data as an argument, just before the entry containing the
+   data gets freed.  This happens from within `hash_free' or `hash_clear'.
+   You should specify this function only if you want these functions to free
+   all of your `data' data.  This is typically the case when your data is
+   simply an auxiliary struct that you have malloc'd to aggregate several
+   values.  */
 
 Hash_table *
-hash_initialize (unsigned int candidate, Hash_hasher hasher,
-		 Hash_comparator comparator, Hash_data_freer data_freer)
+hash_initialize (unsigned candidate, const Hash_tuning *tuning,
+		 Hash_hasher hasher, Hash_comparator comparator,
+		 Hash_data_freer data_freer)
 {
   Hash_table *table;
   struct hash_entry *bucket;
@@ -447,7 +514,23 @@ hash_initialize (unsigned int candidate, Hash_hasher hasher,
   if (table == NULL)
     return NULL;
 
-  table->n_buckets = next_prime (candidate < 10 ? 10 : candidate);
+  if (!tuning)
+    tuning = &default_tuning;
+  table->tuning = tuning;
+  if (!check_tuning (table))
+    {
+      /* Abort initialisation if tuning arguments are improper.  This is the
+	 only occasion when the user gets some feedback about it.  Later on,
+	 if the user modifies the tuning wrongly, it gets restored to some
+	 proper default, and the user looses the right of tuning further.  */
+      free (table);
+      return NULL;
+    }
+
+  table->n_buckets
+    = next_prime (tuning->is_n_buckets ? candidate
+		  : (unsigned) (candidate / tuning->growth_threshold));
+
   table->bucket = (struct hash_entry *)
     malloc (table->n_buckets * sizeof (struct hash_entry));
   if (table->bucket == NULL)
@@ -682,21 +765,24 @@ hash_find_entry (Hash_table *table, const void *entry,
   return NULL;
 }
 
-/* For an already existing hash table, change the number of buckets and make
-   it NEW_TABLE_SIZE.  The contents of the hash table are preserved.  */
+/* For an already existing hash table, change the number of buckets through
+   specifying CANDIDATE.  The contents of the hash table are preserved.  The
+   new number of buckets is automatically selected so to _guarantee_ that the
+   table may receive at least CANDIDATE different user entries, including
+   those already in the table, before any other growth of the hash table size
+   occurs.  If the IS_N_BUCKETS field of the TUNING structure is true, the
+   CANDIDATE argument has its meaning changed to the wanted number of buckets.
+   */
 
 bool
-hash_rehash (Hash_table *table, unsigned int new_n_buckets)
+hash_rehash (Hash_table *table, unsigned candidate)
 {
   Hash_table *new_table;
   struct hash_entry *bucket;
   struct hash_entry *cursor;
   struct hash_entry *next;
 
-  if (table->n_buckets <= 0 || new_n_buckets == 0)
-    return false;
-
-  new_table = hash_initialize (new_n_buckets, table->hasher,
+  new_table = hash_initialize (candidate, table->tuning, table->hasher,
 			       table->comparator, table->data_freer);
   if (new_table == NULL)
     return false;
@@ -709,43 +795,50 @@ hash_rehash (Hash_table *table, unsigned int new_n_buckets)
   new_table->free_entry_list = table->free_entry_list;
 
   for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
-    {
-      if (bucket->data)
+    if (bucket->data)
+      for (cursor = bucket; cursor; cursor = next)
 	{
-	  for (cursor = bucket; cursor; cursor = next)
-	    {
-	      void *data = cursor->data;
-	      struct hash_entry *new_bucket
-		= new_table->bucket + new_table->hasher (data, new_n_buckets);
+	  void *data = cursor->data;
+	  struct hash_entry *new_bucket
+	    = (new_table->bucket
+	       + new_table->hasher (data, new_table->n_buckets));
 
-	      assert (new_bucket < new_table->bucket_limit);
+	  assert (new_bucket < new_table->bucket_limit);
+	  next = cursor->next;
 
-	      /* Free overflow entries as soon as possible, moving them from the
-		 old hash table into the new one, as they may be needed now.  */
-	      next = cursor->next;
+	  if (new_bucket->data)
+	    if (cursor == bucket)
+	      {
+		/* Allocate or recycle an entry, when moving from a bucket
+		   header into a bucket overflow.  */
+		struct hash_entry *new_entry = allocate_entry (new_table);
+
+		if (new_entry == NULL)
+		  return false;
+
+		new_entry->data = data;
+		new_entry->next = new_bucket->next;
+		new_bucket->next = new_entry;
+	      }
+	    else
+	      {
+		/* Merely relink an existing entry, when moving from a
+		   bucket overflow into a bucket overflow.  */
+		cursor->next = new_bucket->next;
+		new_bucket->next = cursor;
+	      }
+	  else
+	    {
+	      /* Free an existing entry, when moving from a bucket
+		 overflow into a bucket header.  Also take care of the
+		 simple case of moving from a bucket header into a bucket
+		 header.  */
+	      new_bucket->data = data;
+	      new_table->n_buckets_used++;
 	      if (cursor != bucket)
 		free_entry (new_table, cursor);
-
-	      /* Insert the entry into the new hash table.  */
-	      if (new_bucket->data)
-		{
-		  struct hash_entry *new_entry = allocate_entry (new_table);
-
-		  if (new_entry == NULL)
-		    return false;
-
-		  new_entry->data = data;
-		  new_entry->next = new_bucket->next;
-		  new_bucket->next = new_entry;
-		}
-	      else
-		{
-		  new_bucket->data = data;
-		  new_table->n_buckets_used++;
-		}
 	    }
 	}
-    }
 
   free (table->bucket);
   table->bucket = new_table->bucket;
@@ -801,18 +894,31 @@ hash_insert (Hash_table *table, const void *entry)
   table->n_entries++;
   table->n_buckets_used++;
 
-  /* If more than 80% of the buckets are in use, rehash the table so it's two
-     times bigger.  There's no point in checking the number of entries,
-     because if the hashing function is ill-conditioned, rehashing is not
-     likely to improve it.  */
+  /* If the growth threshold of the buckets in use has been reached, rehash
+     the table bigger.  It's no real use checking the number of entries, as
+     if the hashing function is ill-conditioned, rehashing is not likely to
+     improve it.  */
 
-  if (5 * table->n_buckets_used > 4 * table->n_buckets)
+  if (table->n_buckets_used
+      > table->tuning->growth_threshold * table->n_buckets)
     {
-      unsigned int new_n_buckets = next_prime (2 * table->n_buckets + 1);
-
-      /* If the rehash fails, arrange to return NULL.  */
-      if (!hash_rehash (table, new_n_buckets))
-	entry = NULL;
+      /* Check more fully, before starting real work.  If tuning arguments got
+	 improper, the second check will rely on proper defaults.  */
+      check_tuning (table);
+      if (table->n_buckets_used
+	  > table->tuning->growth_threshold * table->n_buckets)
+	{
+	  const Hash_tuning *tuning = table->tuning;
+	  unsigned candidate
+	    = (unsigned) (tuning->is_n_buckets
+			  ? (table->n_buckets * tuning->growth_factor)
+			  : (table->n_buckets * tuning->growth_factor
+			     * tuning->growth_threshold));
+
+	  /* If the rehash fails, arrange to return NULL.  */
+	  if (!hash_rehash (table, candidate))
+	    entry = NULL;
+	}
     }
 
   return (void *) entry;
@@ -831,9 +937,34 @@ hash_delete (Hash_table *table, const void *entry)
   if (data = hash_find_entry (table, entry, &bucket, true), !data)
     return NULL;
 
-  if (!bucket->data)
-    table->n_buckets_used--;
   table->n_entries--;
+  if (!bucket->data)
+    {
+      table->n_buckets_used--;
+
+      /* If the shrink threshold of the buckets in use has been reached,
+	 rehash the table smaller.  */
+
+      if (table->n_buckets_used
+	  < table->tuning->shrink_threshold * table->n_buckets)
+	{
+	  /* Check more fully, before starting real work.  If tuning arguments
+	     got improper, the second check will rely on proper defaults.  */
+	  check_tuning (table);
+	  if (table->n_buckets_used
+	      < table->tuning->shrink_threshold * table->n_buckets)
+	    {
+	      const Hash_tuning *tuning = table->tuning;
+	      unsigned candidate
+		= (unsigned) (tuning->is_n_buckets
+			      ? table->n_buckets * tuning->shrink_factor
+			      : (table->n_buckets * tuning->shrink_factor
+				 * tuning->growth_threshold));
+
+	      hash_rehash (table, candidate);
+	    }
+	}
+    }
 
   return data;
 }
author	Jim Meyering <jim@meyering.net>	1999-03-15 15:50:31 +0000
committer	Jim Meyering <jim@meyering.net>	1999-03-15 15:50:31 +0000
commit	0a1a14a0958643bc5dabc2d0b0901dca08fca722 (patch)
tree	2d4589b6aad75188449a745799487f488427a00a /lib/hash.c
parent	674d2ec393f69b015b3ff80343a431b81f94322f (diff)
download	coreutils-0a1a14a0958643bc5dabc2d0b0901dca08fca722.tar.xz