| Index: gperf/src/gperf/3.0.1/gperf-3.0.1-src/src/search.cc
|
| ===================================================================
|
| --- gperf/src/gperf/3.0.1/gperf-3.0.1-src/src/search.cc (revision 0)
|
| +++ gperf/src/gperf/3.0.1/gperf-3.0.1-src/src/search.cc (revision 0)
|
| @@ -0,0 +1,1684 @@
|
| +/* Search algorithm.
|
| + Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
|
| + Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
|
| + and Bruno Haible <bruno@clisp.org>.
|
| +
|
| + This file is part of GNU GPERF.
|
| +
|
| + GNU GPERF is free software; you can redistribute it and/or modify
|
| + it under the terms of the GNU General Public License as published by
|
| + the Free Software Foundation; either version 2, or (at your option)
|
| + any later version.
|
| +
|
| + GNU GPERF is distributed in the hope that it will be useful,
|
| + but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| + GNU General Public License for more details.
|
| +
|
| + You should have received a copy of the GNU General Public License
|
| + along with this program; see the file COPYING.
|
| + If not, write to the Free Software Foundation, Inc.,
|
| + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
|
| +
|
| +/* Specification. */
|
| +#include "search.h"
|
| +
|
| +#include <stdio.h>
|
| +#include <stdlib.h> /* declares exit(), rand(), srand() */
|
| +#include <string.h> /* declares memset(), memcmp() */
|
| +#include <time.h> /* declares time() */
|
| +#include <math.h> /* declares exp() */
|
| +#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
|
| +#include "options.h"
|
| +#include "hash-table.h"
|
| +#include "config.h"
|
| +
|
| +/* ============================== Portability ============================== */
|
| +
|
| +/* Assume ISO C++ 'for' scoping rule. */
|
| +#define for if (0) ; else for
|
| +
|
| +/* Dynamically allocated array with dynamic extent:
|
| +
|
| + Example:
|
| + DYNAMIC_ARRAY (my_array, int, n);
|
| + ...
|
| + FREE_DYNAMIC_ARRAY (my_array);
|
| +
|
| + Attention: depending on your implementation my_array is either the array
|
| + itself or a pointer to the array! Always use my_array only as expression!
|
| + */
|
| +#if HAVE_DYNAMIC_ARRAY
|
| + #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
|
| + #define FREE_DYNAMIC_ARRAY(var)
|
| +#else
|
| + #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
|
| + #define FREE_DYNAMIC_ARRAY(var) delete[] var
|
| +#endif
|
| +
|
| +/* ================================ Theory ================================= */
|
| +
|
| +/* The general form of the hash function is
|
| +
|
| + hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
|
| + + len (keyword)
|
| +
|
| + where Pos is a set of byte positions,
|
| + each alpha_inc[i] is a nonnegative integer,
|
| + each asso_values[c] is a nonnegative integer,
|
| + len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
|
| +
|
| + Theorem 1: If all keywords are different, there is a set Pos such that
|
| + all tuples (keyword[i] : i in Pos) are different.
|
| +
|
| + Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
|
| + are nonnegative integers alpha_inc[i] such that all multisets
|
| + {keyword[i] + alpha_inc[i] : i in Pos} are different.
|
| +
|
| + Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
|
| +
|
| + Theorem 3: If all multisets selchars[keyword] are different, there are
|
| + nonnegative integers asso_values[c] such that all hash values
|
| + sum (asso_values[c] : c in selchars[keyword]) are different.
|
| +
|
| + Based on these three facts, we find the hash function in three steps:
|
| +
|
| + Step 1 (Finding good byte positions):
|
| + Find a set Pos, as small as possible, such that all tuples
|
| + (keyword[i] : i in Pos) are different.
|
| +
|
| + Step 2 (Finding good alpha increments):
|
| + Find nonnegative integers alpha_inc[i], as many of them as possible being
|
| + zero, and the others being as small as possible, such that all multisets
|
| + {keyword[i] + alpha_inc[i] : i in Pos} are different.
|
| +
|
| + Step 3 (Finding good asso_values):
|
| + Find asso_values[c] such that all hash (keyword) are different.
|
| +
|
| + In other words, each step finds a projection that is injective on the
|
| + given finite set:
|
| + proj1 : String --> Map (Pos --> N)
|
| + proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
|
| + proj3 : Map (Pos --> N) / S(Pos) --> N
|
| + where
|
| + N denotes the set of nonnegative integers,
|
| + Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
|
| + S(Pos) is the symmetric group over Pos.
|
| +
|
| + This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
|
| + modifications apply:
|
| + proj1 : String --> Map (Pos --> N) x N
|
| + proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
|
| + proj3 : Map (Pos --> N) / S(Pos) x N --> N
|
| +
|
| + For a case-insensitive hash function, the general form is
|
| +
|
| + hash (keyword) =
|
| + sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
|
| + + len (keyword)
|
| +
|
| + where alpha_unify[c] is chosen so that an upper/lower case change in
|
| + keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]].
|
| + */
|
| +
|
| +/* ==================== Initialization and Preparation ===================== */
|
| +
|
| +Search::Search (KeywordExt_List *list)
|
| + : _head (list)
|
| +{
|
| +}
|
| +
|
| +void
|
| +Search::prepare ()
|
| +{
|
| + /* Compute the total number of keywords. */
|
| + _total_keys = 0;
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + _total_keys++;
|
| +
|
| + /* Compute the minimum and maximum keyword length. */
|
| + _max_key_len = INT_MIN;
|
| + _min_key_len = INT_MAX;
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| +
|
| + if (_max_key_len < keyword->_allchars_length)
|
| + _max_key_len = keyword->_allchars_length;
|
| + if (_min_key_len > keyword->_allchars_length)
|
| + _min_key_len = keyword->_allchars_length;
|
| + }
|
| +
|
| + /* Exit program if an empty string is used as keyword, since the comparison
|
| + expressions don't work correctly for looking up an empty string. */
|
| + if (_min_key_len == 0)
|
| + {
|
| + fprintf (stderr, "Empty input keyword is not allowed.\n"
|
| + "To recognize an empty input keyword, your code should check for\n"
|
| + "len == 0 before calling the gperf generated lookup function.\n");
|
| + exit (1);
|
| + }
|
| +
|
| + /* Exit program if the characters in the keywords are not in the required
|
| + range. */
|
| + if (option[SEVENBIT])
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| +
|
| + const char *k = keyword->_allchars;
|
| + for (int i = keyword->_allchars_length; i > 0; k++, i--)
|
| + if (!(static_cast<unsigned char>(*k) < 128))
|
| + {
|
| + fprintf (stderr, "Option --seven-bit has been specified,\n"
|
| + "but keyword \"%.*s\" contains non-ASCII characters.\n"
|
| + "Try removing option --seven-bit.\n",
|
| + keyword->_allchars_length, keyword->_allchars);
|
| + exit (1);
|
| + }
|
| + }
|
| +}
|
| +
|
| +/* ====================== Finding good byte positions ====================== */
|
| +
|
| +/* Computes the upper bound on the indices passed to asso_values[],
|
| + assuming no alpha_increments. */
|
| +unsigned int
|
| +Search::compute_alpha_size () const
|
| +{
|
| + return (option[SEVENBIT] ? 128 : 256);
|
| +}
|
| +
|
| +/* Computes the unification rules between different asso_values[c],
|
| + assuming no alpha_increments. */
|
| +unsigned int *
|
| +Search::compute_alpha_unify () const
|
| +{
|
| + if (option[UPPERLOWER])
|
| + {
|
| + /* Uppercase to lowercase mapping. */
|
| + unsigned int alpha_size = compute_alpha_size();
|
| + unsigned int *alpha_unify = new unsigned int[alpha_size];
|
| + for (unsigned int c = 0; c < alpha_size; c++)
|
| + alpha_unify[c] = c;
|
| + for (unsigned int c = 'A'; c <= 'Z'; c++)
|
| + alpha_unify[c] = c + ('a'-'A');
|
| + return alpha_unify;
|
| + }
|
| + else
|
| + /* Identity mapping. */
|
| + return NULL;
|
| +}
|
| +
|
| +/* Initializes each keyword's _selchars array. */
|
| +void
|
| +Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
|
| +{
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + temp->first()->init_selchars_tuple(positions, alpha_unify);
|
| +}
|
| +
|
| +/* Deletes each keyword's _selchars array. */
|
| +void
|
| +Search::delete_selchars () const
|
| +{
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + temp->first()->delete_selchars();
|
| +}
|
| +
|
| +/* Count the duplicate keywords that occur with a given set of positions.
|
| + In other words, it returns the difference
|
| + # K - # proj1 (K)
|
| + where K is the multiset of given keywords. */
|
| +unsigned int
|
| +Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
|
| +{
|
| + /* Run through the keyword list and count the duplicates incrementally.
|
| + The result does not depend on the order of the keyword list, thanks to
|
| + the formula above. */
|
| + init_selchars_tuple (positions, alpha_unify);
|
| +
|
| + unsigned int count = 0;
|
| + {
|
| + Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + if (representatives.insert (keyword))
|
| + count++;
|
| + }
|
| + }
|
| +
|
| + delete_selchars ();
|
| +
|
| + return count;
|
| +}
|
| +
|
| +/* Find good key positions. */
|
| +
|
| +void
|
| +Search::find_positions ()
|
| +{
|
| + /* If the user gave the key positions, we use them. */
|
| + if (option[POSITIONS])
|
| + {
|
| + _key_positions = option.get_key_positions();
|
| + return;
|
| + }
|
| +
|
| + /* Compute preliminary alpha_unify table. */
|
| + unsigned int *alpha_unify = compute_alpha_unify ();
|
| +
|
| + /* 1. Find positions that must occur in order to distinguish duplicates. */
|
| + Positions mandatory;
|
| +
|
| + if (!option[DUP])
|
| + {
|
| + for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
|
| + {
|
| + KeywordExt *keyword1 = l1->first();
|
| + for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
|
| + {
|
| + KeywordExt *keyword2 = l2->first();
|
| +
|
| + /* If keyword1 and keyword2 have the same length and differ
|
| + in just one position, and it is not the last character,
|
| + this position is mandatory. */
|
| + if (keyword1->_allchars_length == keyword2->_allchars_length)
|
| + {
|
| + int n = keyword1->_allchars_length;
|
| + int i;
|
| + for (i = 0; i < n - 1; i++)
|
| + {
|
| + unsigned char c1 = keyword1->_allchars[i];
|
| + unsigned char c2 = keyword2->_allchars[i];
|
| + if (option[UPPERLOWER])
|
| + {
|
| + if (c1 >= 'A' && c1 <= 'Z')
|
| + c1 += 'a' - 'A';
|
| + if (c2 >= 'A' && c2 <= 'Z')
|
| + c2 += 'a' - 'A';
|
| + }
|
| + if (c1 != c2)
|
| + break;
|
| + }
|
| + if (i < n - 1)
|
| + {
|
| + int j;
|
| + for (j = i + 1; j < n; j++)
|
| + {
|
| + unsigned char c1 = keyword1->_allchars[j];
|
| + unsigned char c2 = keyword2->_allchars[j];
|
| + if (option[UPPERLOWER])
|
| + {
|
| + if (c1 >= 'A' && c1 <= 'Z')
|
| + c1 += 'a' - 'A';
|
| + if (c2 >= 'A' && c2 <= 'Z')
|
| + c2 += 'a' - 'A';
|
| + }
|
| + if (c1 != c2)
|
| + break;
|
| + }
|
| + if (j >= n)
|
| + {
|
| + /* Position i is mandatory. */
|
| + if (!mandatory.contains (i))
|
| + mandatory.add (i);
|
| + }
|
| + }
|
| + }
|
| + }
|
| + }
|
| + }
|
| +
|
| + /* 2. Add positions, as long as this decreases the duplicates count. */
|
| + int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
|
| + ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
|
| + Positions current = mandatory;
|
| + unsigned int current_duplicates_count =
|
| + count_duplicates_tuple (current, alpha_unify);
|
| + for (;;)
|
| + {
|
| + Positions best;
|
| + unsigned int best_duplicates_count = UINT_MAX;
|
| +
|
| + for (int i = imax; i >= -1; i--)
|
| + if (!current.contains (i))
|
| + {
|
| + Positions tryal = current;
|
| + tryal.add (i);
|
| + unsigned int try_duplicates_count =
|
| + count_duplicates_tuple (tryal, alpha_unify);
|
| +
|
| + /* We prefer 'try' to 'best' if it produces less duplicates,
|
| + or if it produces the same number of duplicates but with
|
| + a more efficient hash function. */
|
| + if (try_duplicates_count < best_duplicates_count
|
| + || (try_duplicates_count == best_duplicates_count && i >= 0))
|
| + {
|
| + best = tryal;
|
| + best_duplicates_count = try_duplicates_count;
|
| + }
|
| + }
|
| +
|
| + /* Stop adding positions when it gives no improvement. */
|
| + if (best_duplicates_count >= current_duplicates_count)
|
| + break;
|
| +
|
| + current = best;
|
| + current_duplicates_count = best_duplicates_count;
|
| + }
|
| +
|
| + /* 3. Remove positions, as long as this doesn't increase the duplicates
|
| + count. */
|
| + for (;;)
|
| + {
|
| + Positions best;
|
| + unsigned int best_duplicates_count = UINT_MAX;
|
| +
|
| + for (int i = imax; i >= -1; i--)
|
| + if (current.contains (i) && !mandatory.contains (i))
|
| + {
|
| + Positions tryal = current;
|
| + tryal.remove (i);
|
| + unsigned int try_duplicates_count =
|
| + count_duplicates_tuple (tryal, alpha_unify);
|
| +
|
| + /* We prefer 'try' to 'best' if it produces less duplicates,
|
| + or if it produces the same number of duplicates but with
|
| + a more efficient hash function. */
|
| + if (try_duplicates_count < best_duplicates_count
|
| + || (try_duplicates_count == best_duplicates_count && i == -1))
|
| + {
|
| + best = tryal;
|
| + best_duplicates_count = try_duplicates_count;
|
| + }
|
| + }
|
| +
|
| + /* Stop removing positions when it gives no improvement. */
|
| + if (best_duplicates_count > current_duplicates_count)
|
| + break;
|
| +
|
| + current = best;
|
| + current_duplicates_count = best_duplicates_count;
|
| + }
|
| +
|
| + /* 4. Replace two positions by one, as long as this doesn't increase the
|
| + duplicates count. */
|
| + for (;;)
|
| + {
|
| + Positions best;
|
| + unsigned int best_duplicates_count = UINT_MAX;
|
| +
|
| + for (int i1 = imax; i1 >= -1; i1--)
|
| + if (current.contains (i1) && !mandatory.contains (i1))
|
| + for (int i2 = imax; i2 >= -1; i2--)
|
| + if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
|
| + for (int i3 = imax; i3 >= 0; i3--)
|
| + if (!current.contains (i3))
|
| + {
|
| + Positions tryal = current;
|
| + tryal.remove (i1);
|
| + tryal.remove (i2);
|
| + tryal.add (i3);
|
| + unsigned int try_duplicates_count =
|
| + count_duplicates_tuple (tryal, alpha_unify);
|
| +
|
| + /* We prefer 'try' to 'best' if it produces less duplicates,
|
| + or if it produces the same number of duplicates but with
|
| + a more efficient hash function. */
|
| + if (try_duplicates_count < best_duplicates_count
|
| + || (try_duplicates_count == best_duplicates_count
|
| + && (i1 == -1 || i2 == -1 || i3 >= 0)))
|
| + {
|
| + best = tryal;
|
| + best_duplicates_count = try_duplicates_count;
|
| + }
|
| + }
|
| +
|
| + /* Stop removing positions when it gives no improvement. */
|
| + if (best_duplicates_count > current_duplicates_count)
|
| + break;
|
| +
|
| + current = best;
|
| + current_duplicates_count = best_duplicates_count;
|
| + }
|
| +
|
| + /* That's it. Hope it's good enough. */
|
| + _key_positions = current;
|
| +
|
| + if (option[DEBUG])
|
| + {
|
| + /* Print the result. */
|
| + fprintf (stderr, "\nComputed positions: ");
|
| + PositionReverseIterator iter = _key_positions.reviterator();
|
| + bool seen_lastchar = false;
|
| + bool first = true;
|
| + for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
|
| + {
|
| + if (!first)
|
| + fprintf (stderr, ", ");
|
| + if (i == Positions::LASTCHAR)
|
| + seen_lastchar = true;
|
| + else
|
| + {
|
| + fprintf (stderr, "%d", i + 1);
|
| + first = false;
|
| + }
|
| + }
|
| + if (seen_lastchar)
|
| + {
|
| + if (!first)
|
| + fprintf (stderr, ", ");
|
| + fprintf (stderr, "$");
|
| + }
|
| + fprintf (stderr, "\n");
|
| + }
|
| +
|
| + /* Free preliminary alpha_unify table. */
|
| + delete[] alpha_unify;
|
| +}
|
| +
|
| +/* Count the duplicate keywords that occur with the found set of positions.
|
| + In other words, it returns the difference
|
| + # K - # proj1 (K)
|
| + where K is the multiset of given keywords. */
|
| +unsigned int
|
| +Search::count_duplicates_tuple () const
|
| +{
|
| + unsigned int *alpha_unify = compute_alpha_unify ();
|
| + unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
|
| + delete[] alpha_unify;
|
| + return count;
|
| +}
|
| +
|
| +/* ===================== Finding good alpha increments ===================== */
|
| +
|
| +/* Computes the upper bound on the indices passed to asso_values[]. */
|
| +unsigned int
|
| +Search::compute_alpha_size (const unsigned int *alpha_inc) const
|
| +{
|
| + unsigned int max_alpha_inc = 0;
|
| + for (int i = 0; i < _max_key_len; i++)
|
| + if (max_alpha_inc < alpha_inc[i])
|
| + max_alpha_inc = alpha_inc[i];
|
| + return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
|
| +}
|
| +
|
| +/* Computes the unification rules between different asso_values[c]. */
|
| +unsigned int *
|
| +Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
|
| +{
|
| + if (option[UPPERLOWER])
|
| + {
|
| + /* Without alpha increments, we would simply unify
|
| + 'A' -> 'a', ..., 'Z' -> 'z'.
|
| + But when a keyword contains at position i a character c,
|
| + we have the constraint
|
| + asso_values[tolower(c) + alpha_inc[i]] ==
|
| + asso_values[toupper(c) + alpha_inc[i]].
|
| + This introduces a unification
|
| + toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
|
| + Note that this unification can extend outside the range of
|
| + ASCII letters! But still every unified character pair is at
|
| + a distance of 'a'-'A' = 32, or (after chained unification)
|
| + at a multiple of 32. So in the end the alpha_unify vector has
|
| + the form c -> c + 32 * f(c) where f(c) is a nonnegative
|
| + integer. */
|
| + unsigned int alpha_size = compute_alpha_size (alpha_inc);
|
| +
|
| + unsigned int *alpha_unify = new unsigned int[alpha_size];
|
| + for (unsigned int c = 0; c < alpha_size; c++)
|
| + alpha_unify[c] = c;
|
| +
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| +
|
| + /* Iterate through the selected character positions. */
|
| + PositionIterator iter = positions.iterator(keyword->_allchars_length);
|
| +
|
| + for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
| + {
|
| + unsigned int c;
|
| + if (i == Positions::LASTCHAR)
|
| + c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
|
| + else if (i < keyword->_allchars_length)
|
| + c = static_cast<unsigned char>(keyword->_allchars[i]);
|
| + else
|
| + abort ();
|
| + if (c >= 'A' && c <= 'Z')
|
| + c += 'a' - 'A';
|
| + if (c >= 'a' && c <= 'z')
|
| + {
|
| + if (i != Positions::LASTCHAR)
|
| + c += alpha_inc[i];
|
| + /* Unify c with c - ('a'-'A'). */
|
| + unsigned int d = alpha_unify[c];
|
| + unsigned int b = c - ('a'-'A');
|
| + for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
|
| + alpha_unify[a] = d;
|
| + }
|
| + }
|
| + }
|
| + return alpha_unify;
|
| + }
|
| + else
|
| + /* Identity mapping. */
|
| + return NULL;
|
| +}
|
| +
|
| +/* Initializes each keyword's _selchars array. */
|
| +void
|
| +Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
|
| +{
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
|
| +}
|
| +
|
| +/* Count the duplicate keywords that occur with the given set of positions
|
| + and a given alpha_inc[] array.
|
| + In other words, it returns the difference
|
| + # K - # proj2 (proj1 (K))
|
| + where K is the multiset of given keywords. */
|
| +unsigned int
|
| +Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
|
| +{
|
| + /* Run through the keyword list and count the duplicates incrementally.
|
| + The result does not depend on the order of the keyword list, thanks to
|
| + the formula above. */
|
| + unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
|
| + init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
|
| +
|
| + unsigned int count = 0;
|
| + {
|
| + Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + if (representatives.insert (keyword))
|
| + count++;
|
| + }
|
| + }
|
| +
|
| + delete_selchars ();
|
| + delete[] alpha_unify;
|
| +
|
| + return count;
|
| +}
|
| +
|
| +/* Find good _alpha_inc[]. */
|
| +
|
| +void
|
| +Search::find_alpha_inc ()
|
| +{
|
| + /* The goal is to choose _alpha_inc[] such that it doesn't introduce
|
| + artificial duplicates.
|
| + In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
|
| + unsigned int duplicates_goal = count_duplicates_tuple ();
|
| +
|
| + /* Start with zero increments. This is sufficient in most cases. */
|
| + unsigned int *current = new unsigned int [_max_key_len];
|
| + for (int i = 0; i < _max_key_len; i++)
|
| + current[i] = 0;
|
| + unsigned int current_duplicates_count = count_duplicates_multiset (current);
|
| +
|
| + if (current_duplicates_count > duplicates_goal)
|
| + {
|
| + /* Look which _alpha_inc[i] we are free to increment. */
|
| + unsigned int nindices;
|
| + {
|
| + nindices = 0;
|
| + PositionIterator iter = _key_positions.iterator(_max_key_len);
|
| + for (;;)
|
| + {
|
| + int key_pos = iter.next ();
|
| + if (key_pos == PositionIterator::EOS)
|
| + break;
|
| + if (key_pos != Positions::LASTCHAR)
|
| + nindices++;
|
| + }
|
| + }
|
| +
|
| + DYNAMIC_ARRAY (indices, unsigned int, nindices);
|
| + {
|
| + unsigned int j = 0;
|
| + PositionIterator iter = _key_positions.iterator(_max_key_len);
|
| + for (;;)
|
| + {
|
| + int key_pos = iter.next ();
|
| + if (key_pos == PositionIterator::EOS)
|
| + break;
|
| + if (key_pos != Positions::LASTCHAR)
|
| + indices[j++] = key_pos;
|
| + }
|
| + if (!(j == nindices))
|
| + abort ();
|
| + }
|
| +
|
| + /* Perform several rounds of searching for a good alpha increment.
|
| + Each round reduces the number of artificial collisions by adding
|
| + an increment in a single key position. */
|
| + DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
|
| + DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
|
| + do
|
| + {
|
| + /* An increment of 1 is not always enough. Try higher increments
|
| + also. */
|
| + for (unsigned int inc = 1; ; inc++)
|
| + {
|
| + unsigned int best_duplicates_count = UINT_MAX;
|
| +
|
| + for (unsigned int j = 0; j < nindices; j++)
|
| + {
|
| + memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
|
| + tryal[indices[j]] += inc;
|
| + unsigned int try_duplicates_count =
|
| + count_duplicates_multiset (tryal);
|
| +
|
| + /* We prefer 'try' to 'best' if it produces less
|
| + duplicates. */
|
| + if (try_duplicates_count < best_duplicates_count)
|
| + {
|
| + memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
|
| + best_duplicates_count = try_duplicates_count;
|
| + }
|
| + }
|
| +
|
| + /* Stop this round when we got an improvement. */
|
| + if (best_duplicates_count < current_duplicates_count)
|
| + {
|
| + memcpy (current, best, _max_key_len * sizeof (unsigned int));
|
| + current_duplicates_count = best_duplicates_count;
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + while (current_duplicates_count > duplicates_goal);
|
| + FREE_DYNAMIC_ARRAY (tryal);
|
| + FREE_DYNAMIC_ARRAY (best);
|
| +
|
| + if (option[DEBUG])
|
| + {
|
| + /* Print the result. */
|
| + fprintf (stderr, "\nComputed alpha increments: ");
|
| + bool first = true;
|
| + for (unsigned int j = nindices; j-- > 0; )
|
| + if (current[indices[j]] != 0)
|
| + {
|
| + if (!first)
|
| + fprintf (stderr, ", ");
|
| + fprintf (stderr, "%u:+%u",
|
| + indices[j] + 1, current[indices[j]]);
|
| + first = false;
|
| + }
|
| + fprintf (stderr, "\n");
|
| + }
|
| + FREE_DYNAMIC_ARRAY (indices);
|
| + }
|
| +
|
| + _alpha_inc = current;
|
| + _alpha_size = compute_alpha_size (_alpha_inc);
|
| + _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
|
| +}
|
| +
|
| +/* ======================= Finding good asso_values ======================== */
|
| +
|
| +/* Initializes the asso_values[] related parameters. */
|
| +
|
| +void
|
| +Search::prepare_asso_values ()
|
| +{
|
| + KeywordExt_List *temp;
|
| +
|
| + /* Initialize each keyword's _selchars array. */
|
| + init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
|
| +
|
| + /* Compute the maximum _selchars_length over all keywords. */
|
| + _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
|
| +
|
| + /* Check for duplicates, i.e. keywords with the same _selchars array
|
| + (and - if !option[NOLENGTH] - also the same length).
|
| + We deal with these by building an equivalence class, so that only
|
| + 1 keyword is representative of the entire collection. Only this
|
| + representative remains in the keyword list; the others are accessible
|
| + through the _duplicate_link chain, starting at the representative.
|
| + This *greatly* simplifies processing during later stages of the program.
|
| + Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
|
| + {
|
| + _list_len = _total_keys;
|
| + _total_duplicates = 0;
|
| + /* Make hash table for efficiency. */
|
| + Hash_Table representatives (_list_len, option[NOLENGTH]);
|
| +
|
| + KeywordExt_List *prev = NULL; /* list node before temp */
|
| + for (temp = _head; temp; )
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + KeywordExt *other_keyword = representatives.insert (keyword);
|
| + KeywordExt_List *garbage = NULL;
|
| +
|
| + if (other_keyword)
|
| + {
|
| + _total_duplicates++;
|
| + _list_len--;
|
| + /* Remove keyword from the main list. */
|
| + prev->rest() = temp->rest();
|
| + garbage = temp;
|
| + /* And insert it on other_keyword's duplicate list. */
|
| + keyword->_duplicate_link = other_keyword->_duplicate_link;
|
| + other_keyword->_duplicate_link = keyword;
|
| +
|
| + /* Complain if user hasn't enabled the duplicate option. */
|
| + if (!option[DUP] || option[DEBUG])
|
| + {
|
| + fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
|
| + keyword->_allchars_length, keyword->_allchars,
|
| + other_keyword->_allchars_length, other_keyword->_allchars);
|
| + for (int j = 0; j < keyword->_selchars_length; j++)
|
| + putc (keyword->_selchars[j], stderr);
|
| + fprintf (stderr, "\".\n");
|
| + }
|
| + }
|
| + else
|
| + {
|
| + keyword->_duplicate_link = NULL;
|
| + prev = temp;
|
| + }
|
| + temp = temp->rest();
|
| + if (garbage)
|
| + delete garbage;
|
| + }
|
| + if (option[DEBUG])
|
| + representatives.dump();
|
| + }
|
| +
|
| + /* Exit program if duplicates exists and option[DUP] not set, since we
|
| + don't want to continue in this case. (We don't want to turn on
|
| + option[DUP] implicitly, because the generated code is usually much
|
| + slower. */
|
| + if (_total_duplicates)
|
| + {
|
| + if (option[DUP])
|
| + fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
|
| + _total_duplicates);
|
| + else
|
| + {
|
| + fprintf (stderr, "%d input keys have identical hash values,\n",
|
| + _total_duplicates);
|
| + if (option[POSITIONS])
|
| + fprintf (stderr, "try different key positions or use option -D.\n");
|
| + else
|
| + fprintf (stderr, "use option -D.\n");
|
| + exit (1);
|
| + }
|
| + }
|
| +
|
| + /* Compute the occurrences of each character in the alphabet. */
|
| + _occurrences = new int[_alpha_size];
|
| + memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
| + for (temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + const unsigned int *ptr = keyword->_selchars;
|
| + for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
|
| + _occurrences[*ptr]++;
|
| + }
|
| +
|
| + /* Memory allocation. */
|
| + _asso_values = new int[_alpha_size];
|
| +
|
| + int non_linked_length = _list_len;
|
| + unsigned int asso_value_max;
|
| +
|
| + asso_value_max =
|
| + static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
|
| + /* Round up to the next power of two. This makes it easy to ensure
|
| + an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
|
| + being odd, it guarantees that Search::try_asso_value() will iterate
|
| + through different values for _asso_value[c]. */
|
| + if (asso_value_max == 0)
|
| + asso_value_max = 1;
|
| + asso_value_max |= asso_value_max >> 1;
|
| + asso_value_max |= asso_value_max >> 2;
|
| + asso_value_max |= asso_value_max >> 4;
|
| + asso_value_max |= asso_value_max >> 8;
|
| + asso_value_max |= asso_value_max >> 16;
|
| + asso_value_max++;
|
| + _asso_value_max = asso_value_max;
|
| +
|
| + /* Given the bound for _asso_values[c], we have a bound for the possible
|
| + hash values, as computed in compute_hash(). */
|
| + _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
|
| + + (_asso_value_max - 1) * _max_selchars_length;
|
| + /* Allocate a sparse bit vector for detection of collisions of hash
|
| + values. */
|
| + _collision_detector = new Bool_Array (_max_hash_value + 1);
|
| +
|
| + if (option[DEBUG])
|
| + {
|
| + fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
|
| + "\nmaximum size of generated hash table is %d\n",
|
| + non_linked_length, asso_value_max, _max_hash_value);
|
| +
|
| + int field_width;
|
| +
|
| + field_width = 0;
|
| + {
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + if (field_width < keyword->_selchars_length)
|
| + field_width = keyword->_selchars_length;
|
| + }
|
| + }
|
| +
|
| + fprintf (stderr, "\ndumping the keyword list without duplicates\n");
|
| + fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
|
| + int i = 0;
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + fprintf (stderr, "%9d, ", ++i);
|
| + if (field_width > keyword->_selchars_length)
|
| + fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
|
| + for (int j = 0; j < keyword->_selchars_length; j++)
|
| + putc (keyword->_selchars[j], stderr);
|
| + fprintf (stderr, ", %.*s\n",
|
| + keyword->_allchars_length, keyword->_allchars);
|
| + }
|
| + fprintf (stderr, "\nend of keyword list\n\n");
|
| + }
|
| +
|
| + if (option[RANDOM] || option.get_jump () == 0)
|
| + /* We will use rand(), so initialize the random number generator. */
|
| + srand (static_cast<long>(time (0)));
|
| +
|
| + _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
|
| + _jump = option.get_jump ();
|
| +}
|
| +
|
| +/* Finds some _asso_values[] that fit. */
|
| +
|
| +/* The idea is to choose the _asso_values[] one by one, in a way that
|
| + a choice that has been made never needs to be undone later. This
|
| + means that we split the work into several steps. Each step chooses
|
| + one or more _asso_values[c]. The result of choosing one or more
|
| + _asso_values[c] is that the partitioning of the keyword set gets
|
| + broader.
|
| + Look at this partitioning: After every step, the _asso_values[] of a
|
| + certain set C of characters are undetermined. (At the beginning, C
|
| + is the set of characters c with _occurrences[c] > 0. At the end, C
|
| + is empty.) To each keyword K, we associate the multiset of _selchars
|
| + for which the _asso_values[] are undetermined:
|
| + K --> K->_selchars intersect C.
|
| + Consider two keywords equivalent if their value under this mapping is
|
| + the same. This introduces an equivalence relation on the set of
|
| + keywords. The equivalence classes partition the keyword set. (At the
|
| + beginning, the partition is the finest possible: each K is an equivalence
|
| + class by itself, because all K have a different _selchars. At the end,
|
| + all K have been merged into a single equivalence class.)
|
| + The partition before a step is always a refinement of the partition
|
| + after the step.
|
| + We choose the steps in such a way that the partition really becomes
|
| + broader at each step. (A step that only chooses an _asso_values[c]
|
| + without changing the partition is better merged with the previous step,
|
| + to avoid useless backtracking.) */
|
| +
|
| +struct EquivalenceClass
|
| +{
|
| + /* The keywords in this equivalence class. */
|
| + KeywordExt_List * _keywords;
|
| + KeywordExt_List * _keywords_last;
|
| + /* The number of keywords in this equivalence class. */
|
| + unsigned int _cardinality;
|
| + /* The undetermined selected characters for the keywords in this
|
| + equivalence class, as a canonically reordered multiset. */
|
| + unsigned int * _undetermined_chars;
|
| + unsigned int _undetermined_chars_length;
|
| +
|
| + EquivalenceClass * _next;
|
| +};
|
| +
|
| +struct Step
|
| +{
|
| + /* The characters whose values are being determined in this step. */
|
| + unsigned int _changing_count;
|
| + unsigned int * _changing;
|
| + /* Exclusive upper bound for the _asso_values[c] of this step.
|
| + A power of 2. */
|
| + unsigned int _asso_value_max;
|
| + /* The characters whose values will be determined after this step. */
|
| + bool * _undetermined;
|
| + /* The keyword set partition after this step. */
|
| + EquivalenceClass * _partition;
|
| + /* The expected number of iterations in this step. */
|
| + double _expected_lower;
|
| + double _expected_upper;
|
| +
|
| + Step * _next;
|
| +};
|
| +
|
| +static inline bool
|
| +equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
|
| +{
|
| + while (len > 0)
|
| + {
|
| + if (*ptr1 != *ptr2)
|
| + return false;
|
| + ptr1++;
|
| + ptr2++;
|
| + len--;
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +EquivalenceClass *
|
| +Search::compute_partition (bool *undetermined) const
|
| +{
|
| + EquivalenceClass *partition = NULL;
|
| + EquivalenceClass *partition_last = NULL;
|
| + for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| +
|
| + /* Compute the undetermined characters for this keyword. */
|
| + unsigned int *undetermined_chars =
|
| + new unsigned int[keyword->_selchars_length];
|
| + unsigned int undetermined_chars_length = 0;
|
| +
|
| + for (int i = 0; i < keyword->_selchars_length; i++)
|
| + if (undetermined[keyword->_selchars[i]])
|
| + undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
|
| +
|
| + /* Look up the equivalence class to which this keyword belongs. */
|
| + EquivalenceClass *equclass;
|
| + for (equclass = partition; equclass; equclass = equclass->_next)
|
| + if (equclass->_undetermined_chars_length == undetermined_chars_length
|
| + && equals (equclass->_undetermined_chars, undetermined_chars,
|
| + undetermined_chars_length))
|
| + break;
|
| + if (equclass == NULL)
|
| + {
|
| + equclass = new EquivalenceClass();
|
| + equclass->_keywords = NULL;
|
| + equclass->_keywords_last = NULL;
|
| + equclass->_cardinality = 0;
|
| + equclass->_undetermined_chars = undetermined_chars;
|
| + equclass->_undetermined_chars_length = undetermined_chars_length;
|
| + equclass->_next = NULL;
|
| + if (partition)
|
| + partition_last->_next = equclass;
|
| + else
|
| + partition = equclass;
|
| + partition_last = equclass;
|
| + }
|
| + else
|
| + delete[] undetermined_chars;
|
| +
|
| + /* Add the keyword to the equivalence class. */
|
| + KeywordExt_List *cons = new KeywordExt_List(keyword);
|
| + if (equclass->_keywords)
|
| + equclass->_keywords_last->rest() = cons;
|
| + else
|
| + equclass->_keywords = cons;
|
| + equclass->_keywords_last = cons;
|
| + equclass->_cardinality++;
|
| + }
|
| +
|
| + /* Free some of the allocated memory. The caller doesn't need it. */
|
| + for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
|
| + delete[] cls->_undetermined_chars;
|
| +
|
| + return partition;
|
| +}
|
| +
|
| +static void
|
| +delete_partition (EquivalenceClass *partition)
|
| +{
|
| + while (partition != NULL)
|
| + {
|
| + EquivalenceClass *equclass = partition;
|
| + partition = equclass->_next;
|
| + delete_list (equclass->_keywords);
|
| + //delete[] equclass->_undetermined_chars; // already freed above
|
| + delete equclass;
|
| + }
|
| +}
|
| +
|
| +/* Compute the possible number of collisions when _asso_values[c] is
|
| + chosen, leading to the given partition. */
|
| +unsigned int
|
| +Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
|
| +{
|
| + /* Every equivalence class p is split according to the frequency of
|
| + occurrence of c, leading to equivalence classes p1, p2, ...
|
| + This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions.
|
| + Return the sum of this expression over all equivalence classes. */
|
| + unsigned int sum = 0;
|
| + unsigned int m = _max_selchars_length;
|
| + DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
|
| + for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
|
| + {
|
| + for (unsigned int i = 0; i <= m; i++)
|
| + split_cardinalities[i] = 0;
|
| +
|
| + for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| +
|
| + unsigned int count = 0;
|
| + for (int i = 0; i < keyword->_selchars_length; i++)
|
| + if (keyword->_selchars[i] == c)
|
| + count++;
|
| +
|
| + split_cardinalities[count]++;
|
| + }
|
| +
|
| + sum += cls->_cardinality * cls->_cardinality;
|
| + for (unsigned int i = 0; i <= m; i++)
|
| + sum -= split_cardinalities[i] * split_cardinalities[i];
|
| + }
|
| + FREE_DYNAMIC_ARRAY (split_cardinalities);
|
| + return sum;
|
| +}
|
| +
|
| +/* Test whether adding c to the undetermined characters changes the given
|
| + partition. */
|
| +bool
|
| +Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
|
| +{
|
| + for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
|
| + {
|
| + unsigned int first_count = UINT_MAX;
|
| +
|
| + for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| +
|
| + unsigned int count = 0;
|
| + for (int i = 0; i < keyword->_selchars_length; i++)
|
| + if (keyword->_selchars[i] == c)
|
| + count++;
|
| +
|
| + if (temp == cls->_keywords)
|
| + first_count = count;
|
| + else if (count != first_count)
|
| + /* c would split this equivalence class. */
|
| + return false;
|
| + }
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +void
|
| +Search::find_asso_values ()
|
| +{
|
| + Step *steps;
|
| +
|
| + /* Determine the steps, starting with the last one. */
|
| + {
|
| + bool *undetermined;
|
| + bool *determined;
|
| +
|
| + steps = NULL;
|
| +
|
| + undetermined = new bool[_alpha_size];
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + undetermined[c] = false;
|
| +
|
| + determined = new bool[_alpha_size];
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + determined[c] = true;
|
| +
|
| + for (;;)
|
| + {
|
| + /* Compute the partition that needs to be refined. */
|
| + EquivalenceClass *partition = compute_partition (undetermined);
|
| +
|
| + /* Determine the main character to be chosen in this step.
|
| + Choosing such a character c has the effect of splitting every
|
| + equivalence class (according the the frequency of occurrence of c).
|
| + We choose the c with the minimum number of possible collisions,
|
| + so that characters which lead to a large number of collisions get
|
| + handled early during the search. */
|
| + unsigned int chosen_c;
|
| + unsigned int chosen_possible_collisions;
|
| + {
|
| + unsigned int best_c = 0;
|
| + unsigned int best_possible_collisions = UINT_MAX;
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + if (_occurrences[c] > 0 && determined[c])
|
| + {
|
| + unsigned int possible_collisions =
|
| + count_possible_collisions (partition, c);
|
| + if (possible_collisions < best_possible_collisions)
|
| + {
|
| + best_c = c;
|
| + best_possible_collisions = possible_collisions;
|
| + }
|
| + }
|
| + if (best_possible_collisions == UINT_MAX)
|
| + {
|
| + /* All c with _occurrences[c] > 0 are undetermined. We are
|
| + are the starting situation and don't need any more step. */
|
| + delete_partition (partition);
|
| + break;
|
| + }
|
| + chosen_c = best_c;
|
| + chosen_possible_collisions = best_possible_collisions;
|
| + }
|
| +
|
| + /* We need one more step. */
|
| + Step *step = new Step();
|
| +
|
| + step->_undetermined = new bool[_alpha_size];
|
| + memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
|
| +
|
| + step->_partition = partition;
|
| +
|
| + /* Now determine how the equivalence classes will be before this
|
| + step. */
|
| + undetermined[chosen_c] = true;
|
| + partition = compute_partition (undetermined);
|
| +
|
| + /* Now determine which other characters should be determined in this
|
| + step, because they will not change the equivalence classes at
|
| + this point. It is the set of all c which, for all equivalence
|
| + classes, have the same frequency of occurrence in every keyword
|
| + of the equivalence class. */
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + if (_occurrences[c] > 0 && determined[c]
|
| + && unchanged_partition (partition, c))
|
| + {
|
| + undetermined[c] = true;
|
| + determined[c] = false;
|
| + }
|
| +
|
| + /* main_c must be one of these. */
|
| + if (determined[chosen_c])
|
| + abort ();
|
| +
|
| + /* Now the set of changing characters of this step. */
|
| + unsigned int changing_count;
|
| +
|
| + changing_count = 0;
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + if (undetermined[c] && !step->_undetermined[c])
|
| + changing_count++;
|
| +
|
| + unsigned int *changing = new unsigned int[changing_count];
|
| + changing_count = 0;
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + if (undetermined[c] && !step->_undetermined[c])
|
| + changing[changing_count++] = c;
|
| +
|
| + step->_changing = changing;
|
| + step->_changing_count = changing_count;
|
| +
|
| + step->_asso_value_max = _asso_value_max;
|
| +
|
| + step->_expected_lower =
|
| + exp (static_cast<double>(chosen_possible_collisions)
|
| + / static_cast<double>(_max_hash_value));
|
| + step->_expected_upper =
|
| + exp (static_cast<double>(chosen_possible_collisions)
|
| + / static_cast<double>(_asso_value_max));
|
| +
|
| + delete_partition (partition);
|
| +
|
| + step->_next = steps;
|
| + steps = step;
|
| + }
|
| +
|
| + delete[] determined;
|
| + delete[] undetermined;
|
| + }
|
| +
|
| + if (option[DEBUG])
|
| + {
|
| + unsigned int stepno = 0;
|
| + for (Step *step = steps; step; step = step->_next)
|
| + {
|
| + stepno++;
|
| + fprintf (stderr, "Step %u chooses _asso_values[", stepno);
|
| + for (unsigned int i = 0; i < step->_changing_count; i++)
|
| + {
|
| + if (i > 0)
|
| + fprintf (stderr, ",");
|
| + fprintf (stderr, "'%c'", step->_changing[i]);
|
| + }
|
| + fprintf (stderr, "], expected number of iterations between %g and %g.\n",
|
| + step->_expected_lower, step->_expected_upper);
|
| + fprintf (stderr, "Keyword equivalence classes:\n");
|
| + for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
|
| + {
|
| + fprintf (stderr, "\n");
|
| + for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
|
| + {
|
| + KeywordExt *keyword = temp->first();
|
| + fprintf (stderr, " %.*s\n",
|
| + keyword->_allchars_length, keyword->_allchars);
|
| + }
|
| + }
|
| + fprintf (stderr, "\n");
|
| + }
|
| + }
|
| +
|
| + /* Initialize _asso_values[]. (The value given here matters only
|
| + for those c which occur in all keywords with equal multiplicity.) */
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + _asso_values[c] = 0;
|
| +
|
| + unsigned int stepno = 0;
|
| + for (Step *step = steps; step; step = step->_next)
|
| + {
|
| + stepno++;
|
| +
|
| + /* Initialize the asso_values[]. */
|
| + unsigned int k = step->_changing_count;
|
| + for (unsigned int i = 0; i < k; i++)
|
| + {
|
| + unsigned int c = step->_changing[i];
|
| + _asso_values[c] =
|
| + (_initial_asso_value < 0 ? rand () : _initial_asso_value)
|
| + & (step->_asso_value_max - 1);
|
| + }
|
| +
|
| + unsigned int iterations = 0;
|
| + DYNAMIC_ARRAY (iter, unsigned int, k);
|
| + for (unsigned int i = 0; i < k; i++)
|
| + iter[i] = 0;
|
| + unsigned int ii = (_jump != 0 ? k - 1 : 0);
|
| +
|
| + for (;;)
|
| + {
|
| + /* Test whether these asso_values[] lead to collisions among
|
| + the equivalence classes that should be collision-free. */
|
| + bool has_collision = false;
|
| + for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
|
| + {
|
| + /* Iteration Number array is a win, O(1) initialization time! */
|
| + _collision_detector->clear ();
|
| +
|
| + for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
|
| + {
|
| + KeywordExt *keyword = ptr->first();
|
| +
|
| + /* Compute the new hash code for the keyword, leaving apart
|
| + the yet undetermined asso_values[]. */
|
| + int hashcode;
|
| + {
|
| + int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
|
| + const unsigned int *p = keyword->_selchars;
|
| + int i = keyword->_selchars_length;
|
| + for (; i > 0; p++, i--)
|
| + if (!step->_undetermined[*p])
|
| + sum += _asso_values[*p];
|
| + hashcode = sum;
|
| + }
|
| +
|
| + /* See whether it collides with another keyword's hash code,
|
| + from the same equivalence class. */
|
| + if (_collision_detector->set_bit (hashcode))
|
| + {
|
| + has_collision = true;
|
| + break;
|
| + }
|
| + }
|
| +
|
| + /* Don't need to continue looking at the other equivalence
|
| + classes if we already have found a collision. */
|
| + if (has_collision)
|
| + break;
|
| + }
|
| +
|
| + iterations++;
|
| + if (!has_collision)
|
| + break;
|
| +
|
| + /* Try other asso_values[]. */
|
| + if (_jump != 0)
|
| + {
|
| + /* The way we try various values for
|
| + asso_values[step->_changing[0],...step->_changing[k-1]]
|
| + is like this:
|
| + for (bound = 0,1,...)
|
| + for (ii = 0,...,k-1)
|
| + iter[ii] := bound
|
| + iter[0..ii-1] := values <= bound
|
| + iter[ii+1..k-1] := values < bound
|
| + and
|
| + asso_values[step->_changing[i]] =
|
| + _initial_asso_value + iter[i] * _jump.
|
| + This makes it more likely to find small asso_values[].
|
| + */
|
| + unsigned int bound = iter[ii];
|
| + unsigned int i = 0;
|
| + while (i < ii)
|
| + {
|
| + unsigned int c = step->_changing[i];
|
| + iter[i]++;
|
| + _asso_values[c] =
|
| + (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
|
| + if (iter[i] <= bound)
|
| + goto found_next;
|
| + _asso_values[c] =
|
| + (_asso_values[c] - iter[i] * _jump)
|
| + & (step->_asso_value_max - 1);
|
| + iter[i] = 0;
|
| + i++;
|
| + }
|
| + i = ii + 1;
|
| + while (i < k)
|
| + {
|
| + unsigned int c = step->_changing[i];
|
| + iter[i]++;
|
| + _asso_values[c] =
|
| + (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
|
| + if (iter[i] < bound)
|
| + goto found_next;
|
| + _asso_values[c] =
|
| + (_asso_values[c] - iter[i] * _jump)
|
| + & (step->_asso_value_max - 1);
|
| + iter[i] = 0;
|
| + i++;
|
| + }
|
| + /* Switch from one ii to the next. */
|
| + {
|
| + unsigned int c = step->_changing[ii];
|
| + _asso_values[c] =
|
| + (_asso_values[c] - bound * _jump)
|
| + & (step->_asso_value_max - 1);
|
| + iter[ii] = 0;
|
| + }
|
| + /* Here all iter[i] == 0. */
|
| + ii++;
|
| + if (ii == k)
|
| + {
|
| + ii = 0;
|
| + bound++;
|
| + if (bound == step->_asso_value_max)
|
| + {
|
| + /* Out of search space! We can either backtrack, or
|
| + increase the available search space of this step.
|
| + It seems simpler to choose the latter solution. */
|
| + step->_asso_value_max = 2 * step->_asso_value_max;
|
| + if (step->_asso_value_max > _asso_value_max)
|
| + {
|
| + _asso_value_max = step->_asso_value_max;
|
| + /* Reinitialize _max_hash_value. */
|
| + _max_hash_value =
|
| + (option[NOLENGTH] ? 0 : _max_key_len)
|
| + + (_asso_value_max - 1) * _max_selchars_length;
|
| + /* Reinitialize _collision_detector. */
|
| + delete _collision_detector;
|
| + _collision_detector =
|
| + new Bool_Array (_max_hash_value + 1);
|
| + }
|
| + }
|
| + }
|
| + {
|
| + unsigned int c = step->_changing[ii];
|
| + iter[ii] = bound;
|
| + _asso_values[c] =
|
| + (_asso_values[c] + bound * _jump)
|
| + & (step->_asso_value_max - 1);
|
| + }
|
| + found_next: ;
|
| + }
|
| + else
|
| + {
|
| + /* Random. */
|
| + unsigned int c = step->_changing[ii];
|
| + _asso_values[c] =
|
| + (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
|
| + /* Next time, change the next c. */
|
| + ii++;
|
| + if (ii == k)
|
| + ii = 0;
|
| + }
|
| + }
|
| + FREE_DYNAMIC_ARRAY (iter);
|
| +
|
| + if (option[DEBUG])
|
| + {
|
| + fprintf (stderr, "Step %u chose _asso_values[", stepno);
|
| + for (unsigned int i = 0; i < step->_changing_count; i++)
|
| + {
|
| + if (i > 0)
|
| + fprintf (stderr, ",");
|
| + fprintf (stderr, "'%c'", step->_changing[i]);
|
| + }
|
| + fprintf (stderr, "] in %u iterations.\n", iterations);
|
| + }
|
| + }
|
| +
|
| + /* Free allocated memory. */
|
| + while (steps != NULL)
|
| + {
|
| + Step *step = steps;
|
| + steps = step->_next;
|
| + delete[] step->_changing;
|
| + delete[] step->_undetermined;
|
| + delete_partition (step->_partition);
|
| + delete step;
|
| + }
|
| +}
|
| +
|
| +/* Computes a keyword's hash value, relative to the current _asso_values[],
|
| + and stores it in keyword->_hash_value. */
|
| +
|
| +inline int
|
| +Search::compute_hash (KeywordExt *keyword) const
|
| +{
|
| + int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
|
| +
|
| + const unsigned int *p = keyword->_selchars;
|
| + int i = keyword->_selchars_length;
|
| + for (; i > 0; p++, i--)
|
| + sum += _asso_values[*p];
|
| +
|
| + return keyword->_hash_value = sum;
|
| +}
|
| +
|
| +/* Finds good _asso_values[]. */
|
| +
|
| +void
|
| +Search::find_good_asso_values ()
|
| +{
|
| + prepare_asso_values ();
|
| +
|
| + /* Search for good _asso_values[]. */
|
| + int asso_iteration;
|
| + if ((asso_iteration = option.get_asso_iterations ()) == 0)
|
| + /* Try only the given _initial_asso_value and _jump. */
|
| + find_asso_values ();
|
| + else
|
| + {
|
| + /* Try different pairs of _initial_asso_value and _jump, in the
|
| + following order:
|
| + (0, 1)
|
| + (1, 1)
|
| + (2, 1) (0, 3)
|
| + (3, 1) (1, 3)
|
| + (4, 1) (2, 3) (0, 5)
|
| + (5, 1) (3, 3) (1, 5)
|
| + ..... */
|
| + KeywordExt_List *saved_head = _head;
|
| + int best_initial_asso_value = 0;
|
| + int best_jump = 1;
|
| + int *best_asso_values = new int[_alpha_size];
|
| + int best_collisions = INT_MAX;
|
| + int best_max_hash_value = INT_MAX;
|
| +
|
| + _initial_asso_value = 0; _jump = 1;
|
| + for (;;)
|
| + {
|
| + /* Restore the keyword list in its original order. */
|
| + _head = copy_list (saved_head);
|
| + /* Find good _asso_values[]. */
|
| + find_asso_values ();
|
| + /* Test whether it is the best solution so far. */
|
| + int collisions = 0;
|
| + int max_hash_value = INT_MIN;
|
| + _collision_detector->clear ();
|
| + for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
|
| + {
|
| + KeywordExt *keyword = ptr->first();
|
| + int hashcode = compute_hash (keyword);
|
| + if (max_hash_value < hashcode)
|
| + max_hash_value = hashcode;
|
| + if (_collision_detector->set_bit (hashcode))
|
| + collisions++;
|
| + }
|
| + if (collisions < best_collisions
|
| + || (collisions == best_collisions
|
| + && max_hash_value < best_max_hash_value))
|
| + {
|
| + memcpy (best_asso_values, _asso_values,
|
| + _alpha_size * sizeof (_asso_values[0]));
|
| + best_collisions = collisions;
|
| + best_max_hash_value = max_hash_value;
|
| + }
|
| + /* Delete the copied keyword list. */
|
| + delete_list (_head);
|
| +
|
| + if (--asso_iteration == 0)
|
| + break;
|
| + /* Prepare for next iteration. */
|
| + if (_initial_asso_value >= 2)
|
| + _initial_asso_value -= 2, _jump += 2;
|
| + else
|
| + _initial_asso_value += _jump, _jump = 1;
|
| + }
|
| + _head = saved_head;
|
| + /* Install the best found asso_values. */
|
| + _initial_asso_value = best_initial_asso_value;
|
| + _jump = best_jump;
|
| + memcpy (_asso_values, best_asso_values,
|
| + _alpha_size * sizeof (_asso_values[0]));
|
| + delete[] best_asso_values;
|
| + /* The keywords' _hash_value fields are recomputed below. */
|
| + }
|
| +}
|
| +
|
| +/* ========================================================================= */
|
| +
|
| +/* Comparison function for sorting by increasing _hash_value. */
|
| +static bool
|
| +less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
|
| +{
|
| + return keyword1->_hash_value < keyword2->_hash_value;
|
| +}
|
| +
|
| +/* Sorts the keyword list by hash value. */
|
| +
|
| +void
|
| +Search::sort ()
|
| +{
|
| + _head = mergesort_list (_head, less_by_hash_value);
|
| +}
|
| +
|
| +void
|
| +Search::optimize ()
|
| +{
|
| + /* Preparations. */
|
| + prepare ();
|
| +
|
| + /* Step 1: Finding good byte positions. */
|
| + find_positions ();
|
| +
|
| + /* Step 2: Finding good alpha increments. */
|
| + find_alpha_inc ();
|
| +
|
| + /* Step 3: Finding good asso_values. */
|
| + find_good_asso_values ();
|
| +
|
| + /* Make one final check, just to make sure nothing weird happened.... */
|
| + _collision_detector->clear ();
|
| + for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
|
| + {
|
| + KeywordExt *curr = curr_ptr->first();
|
| + unsigned int hashcode = compute_hash (curr);
|
| + if (_collision_detector->set_bit (hashcode))
|
| + {
|
| + /* This shouldn't happen. proj1, proj2, proj3 must have been
|
| + computed to be injective on the given keyword set. */
|
| + fprintf (stderr,
|
| + "\nInternal error, unexpected duplicate hash code\n");
|
| + if (option[POSITIONS])
|
| + fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
|
| + else
|
| + fprintf (stderr, "try options -m or -r.\n\n");
|
| + exit (1);
|
| + }
|
| + }
|
| +
|
| + /* Sorts the keyword list by hash value. */
|
| + sort ();
|
| +
|
| + /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely
|
| + necessary, but speeds up the lookup function in many cases of lookup
|
| + failure: no string comparison is needed once the hash value of a string
|
| + is larger than the hash value of any keyword. */
|
| + int max_hash_value;
|
| + {
|
| + KeywordExt_List *temp;
|
| + for (temp = _head; temp->rest(); temp = temp->rest())
|
| + ;
|
| + max_hash_value = temp->first()->_hash_value;
|
| + }
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + if (_occurrences[c] == 0)
|
| + _asso_values[c] = max_hash_value + 1;
|
| +
|
| + /* Propagate unified asso_values. */
|
| + if (_alpha_unify)
|
| + for (unsigned int c = 0; c < _alpha_size; c++)
|
| + if (_alpha_unify[c] != c)
|
| + _asso_values[c] = _asso_values[_alpha_unify[c]];
|
| +}
|
| +
|
| +/* Prints out some diagnostics upon completion. */
|
| +
|
| +Search::~Search ()
|
| +{
|
| + delete _collision_detector;
|
| + if (option[DEBUG])
|
| + {
|
| + fprintf (stderr, "\ndumping occurrence and associated values tables\n");
|
| +
|
| + for (unsigned int i = 0; i < _alpha_size; i++)
|
| + if (_occurrences[i])
|
| + fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
|
| + i, _asso_values[i], i, _occurrences[i]);
|
| +
|
| + fprintf (stderr, "end table dumping\n");
|
| +
|
| + fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
|
| + "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
|
| + _list_len, _total_keys, _total_duplicates, _max_key_len);
|
| +
|
| + int field_width = _max_selchars_length;
|
| + fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
|
| + field_width, "selchars");
|
| + for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
|
| + {
|
| + fprintf (stderr, "%11d,%11d,%6d, ",
|
| + ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
|
| + if (field_width > ptr->first()->_selchars_length)
|
| + fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
|
| + for (int j = 0; j < ptr->first()->_selchars_length; j++)
|
| + putc (ptr->first()->_selchars[j], stderr);
|
| + fprintf (stderr, ", %.*s\n",
|
| + ptr->first()->_allchars_length, ptr->first()->_allchars);
|
| + }
|
| +
|
| + fprintf (stderr, "End dumping list.\n\n");
|
| + }
|
| + delete[] _asso_values;
|
| + delete[] _occurrences;
|
| + delete[] _alpha_unify;
|
| + delete[] _alpha_inc;
|
| +}
|
|
|
| Property changes on: gperf\src\gperf\3.0.1\gperf-3.0.1-src\src\search.cc
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|