1/* Search algorithm.
2 Copyright (C) 1989-1998, 2000, 2002, 2009, 2016 Free Software Foundation, Inc.
3 Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4 and Bruno Haible <bruno@clisp.org>.
5
6 This file is part of GNU GPERF.
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20
21/* Specification. */
22#include "search.h"
23
24#include <stdio.h>
25#include <stdlib.h> /* declares exit(), rand(), srand() */
26#include <string.h> /* declares memset(), memcmp() */
27#include <time.h> /* declares time() */
28#include <math.h> /* declares exp() */
29#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
30#include "options.h"
31#include "hash-table.h"
32#include "config.h"
33
34/* ============================== Portability ============================== */
35
36/* Assume ISO C++ 'for' scoping rule. */
37#define for if (0) ; else for
38
39/* Dynamically allocated array with dynamic extent:
40
41 Example:
42 DYNAMIC_ARRAY (my_array, int, n);
43 ...
44 FREE_DYNAMIC_ARRAY (my_array);
45
46 Attention: depending on your implementation my_array is either the array
47 itself or a pointer to the array! Always use my_array only as expression!
48 */
49#if HAVE_DYNAMIC_ARRAY
50 #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
51 #define FREE_DYNAMIC_ARRAY(var)
52#else
53 #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
54 #define FREE_DYNAMIC_ARRAY(var) delete[] var
55#endif
56
57/* ================================ Theory ================================= */
58
59/* The general form of the hash function is
60
61 hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
62 + len (keyword)
63
64 where Pos is a set of byte positions,
65 each alpha_inc[i] is a nonnegative integer,
66 each asso_values[c] is a nonnegative integer,
67 len (keyword) is the keyword's length if _hash_includes_len, or 0 otherwise.
68
69 Theorem 1: If all keywords are different, there is a set Pos such that
70 all tuples (keyword[i] : i in Pos) are different.
71
72 Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
73 are nonnegative integers alpha_inc[i] such that all multisets
74 {keyword[i] + alpha_inc[i] : i in Pos} are different.
75
76 Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
77
78 Theorem 3: If all multisets selchars[keyword] are different, there are
79 nonnegative integers asso_values[c] such that all hash values
80 sum (asso_values[c] : c in selchars[keyword]) are different.
81
82 Based on these three facts, we find the hash function in three steps:
83
84 Step 1 (Finding good byte positions):
85 Find a set Pos, as small as possible, such that all tuples
86 (keyword[i] : i in Pos) are different.
87
88 Step 2 (Finding good alpha increments):
89 Find nonnegative integers alpha_inc[i], as many of them as possible being
90 zero, and the others being as small as possible, such that all multisets
91 {keyword[i] + alpha_inc[i] : i in Pos} are different.
92
93 Step 3 (Finding good asso_values):
94 Find asso_values[c] such that all hash (keyword) are different.
95
96 In other words, each step finds a projection that is injective on the
97 given finite set:
98 proj1 : String --> Map (Pos --> N)
99 proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
100 proj3 : Map (Pos --> N) / S(Pos) --> N
101 where
102 N denotes the set of nonnegative integers,
103 Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
104 S(Pos) is the symmetric group over Pos.
105
106 This was the theory for !_hash_includes_len; if _hash_includes_len, slight
107 modifications apply:
108 proj1 : String --> Map (Pos --> N) x N
109 proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
110 proj3 : Map (Pos --> N) / S(Pos) x N --> N
111
112 For a case-insensitive hash function, the general form is
113
114 hash (keyword) =
115 sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
116 + len (keyword)
117
118 where alpha_unify[c] is chosen so that an upper/lower case change in
119 keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]].
120 */
121
122/* ==================== Initialization and Preparation ===================== */
123
124Search::Search (KeywordExt_List *list)
125 : _head (list)
126{
127}
128
129void
130Search::prepare ()
131{
132 /* Compute the total number of keywords. */
133 _total_keys = 0;
134 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
135 _total_keys++;
136
137 /* Compute the minimum and maximum keyword length. */
138 _max_key_len = INT_MIN;
139 _min_key_len = INT_MAX;
140 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
141 {
142 KeywordExt *keyword = temp->first();
143
144 if (_max_key_len < keyword->_allchars_length)
145 _max_key_len = keyword->_allchars_length;
146 if (_min_key_len > keyword->_allchars_length)
147 _min_key_len = keyword->_allchars_length;
148 }
149
150 /* Exit program if an empty string is used as keyword, since the comparison
151 expressions don't work correctly for looking up an empty string. */
152 if (_min_key_len == 0)
153 {
154 fprintf (stderr, "Empty input keyword is not allowed.\n"
155 "To recognize an empty input keyword, your code should check for\n"
156 "len == 0 before calling the gperf generated lookup function.\n");
157 exit (1);
158 }
159
160 /* Exit program if the characters in the keywords are not in the required
161 range. */
162 if (option[SEVENBIT])
163 {
164 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
165 {
166 KeywordExt *keyword = temp->first();
167
168 const char *k = keyword->_allchars;
169 for (int i = keyword->_allchars_length; i > 0; k++, i--)
170 if (!(static_cast<unsigned char>(*k) < 128))
171 {
172 fprintf (stderr, "Option --seven-bit has been specified,\n"
173 "but keyword \"%.*s\" contains non-ASCII characters.\n"
174 "Try removing option --seven-bit.\n",
175 keyword->_allchars_length, keyword->_allchars);
176 exit (1);
177 }
178 }
179 }
180
181 /* Determine whether the hash function shall include the length. */
182 _hash_includes_len = !(option[NOLENGTH] || (_min_key_len == _max_key_len));
183}
184
185/* ====================== Finding good byte positions ====================== */
186
187/* Computes the upper bound on the indices passed to asso_values[],
188 assuming no alpha_increments. */
189unsigned int
190Search::compute_alpha_size () const
191{
192 return (option[SEVENBIT] ? 128 : 256);
193}
194
195/* Computes the unification rules between different asso_values[c],
196 assuming no alpha_increments. */
197unsigned int *
198Search::compute_alpha_unify () const
199{
200 if (option[UPPERLOWER])
201 {
202 /* Uppercase to lowercase mapping. */
203 unsigned int alpha_size = compute_alpha_size();
204 unsigned int *alpha_unify = new unsigned int[alpha_size];
205 for (unsigned int c = 0; c < alpha_size; c++)
206 alpha_unify[c] = c;
207 for (unsigned int c = 'A'; c <= 'Z'; c++)
208 alpha_unify[c] = c + ('a'-'A');
209 return alpha_unify;
210 }
211 else
212 /* Identity mapping. */
213 return NULL;
214}
215
216/* Initializes each keyword's _selchars array. */
217void
218Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
219{
220 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
221 temp->first()->init_selchars_tuple(positions, alpha_unify);
222}
223
224/* Deletes each keyword's _selchars array. */
225void
226Search::delete_selchars () const
227{
228 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
229 temp->first()->delete_selchars();
230}
231
232/* Count the duplicate keywords that occur with a given set of positions.
233 In other words, it returns the difference
234 # K - # proj1 (K)
235 where K is the multiset of given keywords. */
236unsigned int
237Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
238{
239 /* Run through the keyword list and count the duplicates incrementally.
240 The result does not depend on the order of the keyword list, thanks to
241 the formula above. */
242 init_selchars_tuple (positions, alpha_unify);
243
244 unsigned int count = 0;
245 {
246 Hash_Table representatives (_total_keys, !_hash_includes_len);
247 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
248 {
249 KeywordExt *keyword = temp->first();
250 if (representatives.insert (keyword))
251 count++;
252 }
253 }
254
255 delete_selchars ();
256
257 return count;
258}
259
260/* Find good key positions. */
261
262void
263Search::find_positions ()
264{
265 /* If the user gave the key positions, we use them. */
266 if (option[POSITIONS])
267 {
268 _key_positions = option.get_key_positions();
269 return;
270 }
271
272 /* Compute preliminary alpha_unify table. */
273 unsigned int *alpha_unify = compute_alpha_unify ();
274
275 /* 1. Find positions that must occur in order to distinguish duplicates. */
276 Positions mandatory;
277
278 if (!option[DUP])
279 {
280 for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
281 {
282 KeywordExt *keyword1 = l1->first();
283 for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
284 {
285 KeywordExt *keyword2 = l2->first();
286
287 /* If keyword1 and keyword2 have the same length and differ
288 in just one position, and it is not the last character,
289 this position is mandatory. */
290 if (keyword1->_allchars_length == keyword2->_allchars_length)
291 {
292 int n = keyword1->_allchars_length;
293 int i;
294 for (i = 0; i < n - 1; i++)
295 {
296 unsigned char c1 = keyword1->_allchars[i];
297 unsigned char c2 = keyword2->_allchars[i];
298 if (option[UPPERLOWER])
299 {
300 if (c1 >= 'A' && c1 <= 'Z')
301 c1 += 'a' - 'A';
302 if (c2 >= 'A' && c2 <= 'Z')
303 c2 += 'a' - 'A';
304 }
305 if (c1 != c2)
306 break;
307 }
308 if (i < n - 1)
309 {
310 int j;
311 for (j = i + 1; j < n; j++)
312 {
313 unsigned char c1 = keyword1->_allchars[j];
314 unsigned char c2 = keyword2->_allchars[j];
315 if (option[UPPERLOWER])
316 {
317 if (c1 >= 'A' && c1 <= 'Z')
318 c1 += 'a' - 'A';
319 if (c2 >= 'A' && c2 <= 'Z')
320 c2 += 'a' - 'A';
321 }
322 if (c1 != c2)
323 break;
324 }
325 if (j >= n)
326 {
327 /* Position i is mandatory. */
328 if (!mandatory.contains (i))
329 mandatory.add (i);
330 }
331 }
332 }
333 }
334 }
335 }
336
337 /* 2. Add positions, as long as this decreases the duplicates count. */
338 int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
339 ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
340 Positions current = mandatory;
341 unsigned int current_duplicates_count =
342 count_duplicates_tuple (current, alpha_unify);
343 for (;;)
344 {
345 Positions best;
346 unsigned int best_duplicates_count = UINT_MAX;
347
348 for (int i = imax; i >= -1; i--)
349 if (!current.contains (i))
350 {
351 Positions tryal = current;
352 tryal.add (i);
353 unsigned int try_duplicates_count =
354 count_duplicates_tuple (tryal, alpha_unify);
355
356 /* We prefer 'try' to 'best' if it produces less duplicates,
357 or if it produces the same number of duplicates but with
358 a more efficient hash function. */
359 if (try_duplicates_count < best_duplicates_count
360 || (try_duplicates_count == best_duplicates_count && i >= 0))
361 {
362 best = tryal;
363 best_duplicates_count = try_duplicates_count;
364 }
365 }
366
367 /* Stop adding positions when it gives no improvement. */
368 if (best_duplicates_count >= current_duplicates_count)
369 break;
370
371 current = best;
372 current_duplicates_count = best_duplicates_count;
373 }
374
375 /* 3. Remove positions, as long as this doesn't increase the duplicates
376 count. */
377 for (;;)
378 {
379 Positions best;
380 unsigned int best_duplicates_count = UINT_MAX;
381
382 for (int i = imax; i >= -1; i--)
383 if (current.contains (i) && !mandatory.contains (i))
384 {
385 Positions tryal = current;
386 tryal.remove (i);
387 unsigned int try_duplicates_count =
388 count_duplicates_tuple (tryal, alpha_unify);
389
390 /* We prefer 'try' to 'best' if it produces less duplicates,
391 or if it produces the same number of duplicates but with
392 a more efficient hash function. */
393 if (try_duplicates_count < best_duplicates_count
394 || (try_duplicates_count == best_duplicates_count && i == -1))
395 {
396 best = tryal;
397 best_duplicates_count = try_duplicates_count;
398 }
399 }
400
401 /* Stop removing positions when it gives no improvement. */
402 if (best_duplicates_count > current_duplicates_count)
403 break;
404
405 current = best;
406 current_duplicates_count = best_duplicates_count;
407 }
408
409 /* 4. Replace two positions by one, as long as this doesn't increase the
410 duplicates count. */
411 for (;;)
412 {
413 Positions best;
414 unsigned int best_duplicates_count = UINT_MAX;
415
416 for (int i1 = imax; i1 >= -1; i1--)
417 if (current.contains (i1) && !mandatory.contains (i1))
418 {
419 for (int i2 = imax; i2 >= -1; i2--)
420 if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
421 {
422 for (int i3 = imax; i3 >= 0; i3--)
423 if (!current.contains (i3))
424 {
425 Positions tryal = current;
426 tryal.remove (i1);
427 tryal.remove (i2);
428 tryal.add (i3);
429 unsigned int try_duplicates_count =
430 count_duplicates_tuple (tryal, alpha_unify);
431
432 /* We prefer 'try' to 'best' if it produces less
433 duplicates, or if it produces the same number
434 of duplicates but with a more efficient hash
435 function. */
436 if (try_duplicates_count < best_duplicates_count
437 || (try_duplicates_count == best_duplicates_count
438 && (i1 == -1 || i2 == -1 || i3 >= 0)))
439 {
440 best = tryal;
441 best_duplicates_count = try_duplicates_count;
442 }
443 }
444 }
445 }
446
447 /* Stop removing positions when it gives no improvement. */
448 if (best_duplicates_count > current_duplicates_count)
449 break;
450
451 current = best;
452 current_duplicates_count = best_duplicates_count;
453 }
454
455 /* That's it. Hope it's good enough. */
456 _key_positions = current;
457
458 if (option[DEBUG])
459 {
460 /* Print the result. */
461 fprintf (stderr, "\nComputed positions: ");
462 PositionReverseIterator iter = _key_positions.reviterator();
463 bool seen_lastchar = false;
464 bool first = true;
465 for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
466 {
467 if (!first)
468 fprintf (stderr, ", ");
469 if (i == Positions::LASTCHAR)
470 seen_lastchar = true;
471 else
472 {
473 fprintf (stderr, "%d", i + 1);
474 first = false;
475 }
476 }
477 if (seen_lastchar)
478 {
479 if (!first)
480 fprintf (stderr, ", ");
481 fprintf (stderr, "$");
482 }
483 fprintf (stderr, "\n");
484 }
485
486 /* Free preliminary alpha_unify table. */
487 delete[] alpha_unify;
488}
489
490/* Count the duplicate keywords that occur with the found set of positions.
491 In other words, it returns the difference
492 # K - # proj1 (K)
493 where K is the multiset of given keywords. */
494unsigned int
495Search::count_duplicates_tuple () const
496{
497 unsigned int *alpha_unify = compute_alpha_unify ();
498 unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
499 delete[] alpha_unify;
500 return count;
501}
502
503/* ===================== Finding good alpha increments ===================== */
504
505/* Computes the upper bound on the indices passed to asso_values[]. */
506unsigned int
507Search::compute_alpha_size (const unsigned int *alpha_inc) const
508{
509 unsigned int max_alpha_inc = 0;
510 for (int i = 0; i < _max_key_len; i++)
511 if (max_alpha_inc < alpha_inc[i])
512 max_alpha_inc = alpha_inc[i];
513 return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
514}
515
516/* Computes the unification rules between different asso_values[c]. */
517unsigned int *
518Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
519{
520 if (option[UPPERLOWER])
521 {
522 /* Without alpha increments, we would simply unify
523 'A' -> 'a', ..., 'Z' -> 'z'.
524 But when a keyword contains at position i a character c,
525 we have the constraint
526 asso_values[tolower(c) + alpha_inc[i]] ==
527 asso_values[toupper(c) + alpha_inc[i]].
528 This introduces a unification
529 toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
530 Note that this unification can extend outside the range of
531 ASCII letters! But still every unified character pair is at
532 a distance of 'a'-'A' = 32, or (after chained unification)
533 at a multiple of 32. So in the end the alpha_unify vector has
534 the form c -> c + 32 * f(c) where f(c) is a nonnegative
535 integer. */
536 unsigned int alpha_size = compute_alpha_size (alpha_inc);
537
538 unsigned int *alpha_unify = new unsigned int[alpha_size];
539 for (unsigned int c = 0; c < alpha_size; c++)
540 alpha_unify[c] = c;
541
542 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
543 {
544 KeywordExt *keyword = temp->first();
545
546 /* Iterate through the selected character positions. */
547 PositionIterator iter = positions.iterator(keyword->_allchars_length);
548
549 for (int i; (i = iter.next ()) != PositionIterator::EOS; )
550 {
551 unsigned int c;
552 if (i == Positions::LASTCHAR)
553 c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
554 else if (i < keyword->_allchars_length)
555 c = static_cast<unsigned char>(keyword->_allchars[i]);
556 else
557 abort ();
558 if (c >= 'A' && c <= 'Z')
559 c += 'a' - 'A';
560 if (c >= 'a' && c <= 'z')
561 {
562 if (i != Positions::LASTCHAR)
563 c += alpha_inc[i];
564 /* Unify c with c - ('a'-'A'). */
565 unsigned int d = alpha_unify[c];
566 unsigned int b = c - ('a'-'A');
567 for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
568 alpha_unify[a] = d;
569 }
570 }
571 }
572 return alpha_unify;
573 }
574 else
575 /* Identity mapping. */
576 return NULL;
577}
578
579/* Initializes each keyword's _selchars array. */
580void
581Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
582{
583 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
584 temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
585}
586
587/* Count the duplicate keywords that occur with the given set of positions
588 and a given alpha_inc[] array.
589 In other words, it returns the difference
590 # K - # proj2 (proj1 (K))
591 where K is the multiset of given keywords. */
592unsigned int
593Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
594{
595 /* Run through the keyword list and count the duplicates incrementally.
596 The result does not depend on the order of the keyword list, thanks to
597 the formula above. */
598 unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
599 init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
600
601 unsigned int count = 0;
602 {
603 Hash_Table representatives (_total_keys, !_hash_includes_len);
604 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
605 {
606 KeywordExt *keyword = temp->first();
607 if (representatives.insert (keyword))
608 count++;
609 }
610 }
611
612 delete_selchars ();
613 delete[] alpha_unify;
614
615 return count;
616}
617
618/* Find good _alpha_inc[]. */
619
620void
621Search::find_alpha_inc ()
622{
623 /* The goal is to choose _alpha_inc[] such that it doesn't introduce
624 artificial duplicates.
625 In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
626 unsigned int duplicates_goal = count_duplicates_tuple ();
627
628 /* Start with zero increments. This is sufficient in most cases. */
629 unsigned int *current = new unsigned int [_max_key_len];
630 for (int i = 0; i < _max_key_len; i++)
631 current[i] = 0;
632 unsigned int current_duplicates_count = count_duplicates_multiset (current);
633
634 if (current_duplicates_count > duplicates_goal)
635 {
636 /* Look which _alpha_inc[i] we are free to increment. */
637 unsigned int nindices;
638 {
639 nindices = 0;
640 PositionIterator iter = _key_positions.iterator(_max_key_len);
641 for (;;)
642 {
643 int key_pos = iter.next ();
644 if (key_pos == PositionIterator::EOS)
645 break;
646 if (key_pos != Positions::LASTCHAR)
647 nindices++;
648 }
649 }
650
651 DYNAMIC_ARRAY (indices, unsigned int, nindices);
652 {
653 unsigned int j = 0;
654 PositionIterator iter = _key_positions.iterator(_max_key_len);
655 for (;;)
656 {
657 int key_pos = iter.next ();
658 if (key_pos == PositionIterator::EOS)
659 break;
660 if (key_pos != Positions::LASTCHAR)
661 indices[j++] = key_pos;
662 }
663 if (!(j == nindices))
664 abort ();
665 }
666
667 /* Perform several rounds of searching for a good alpha increment.
668 Each round reduces the number of artificial collisions by adding
669 an increment in a single key position. */
670 DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
671 DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
672 do
673 {
674 /* An increment of 1 is not always enough. Try higher increments
675 also. */
676 for (unsigned int inc = 1; ; inc++)
677 {
678 unsigned int best_duplicates_count = UINT_MAX;
679
680 for (unsigned int j = 0; j < nindices; j++)
681 {
682 memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
683 tryal[indices[j]] += inc;
684 unsigned int try_duplicates_count =
685 count_duplicates_multiset (tryal);
686
687 /* We prefer 'try' to 'best' if it produces less
688 duplicates. */
689 if (try_duplicates_count < best_duplicates_count)
690 {
691 memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
692 best_duplicates_count = try_duplicates_count;
693 }
694 }
695
696 /* Stop this round when we got an improvement. */
697 if (best_duplicates_count < current_duplicates_count)
698 {
699 memcpy (current, best, _max_key_len * sizeof (unsigned int));
700 current_duplicates_count = best_duplicates_count;
701 break;
702 }
703 }
704 }
705 while (current_duplicates_count > duplicates_goal);
706 FREE_DYNAMIC_ARRAY (tryal);
707 FREE_DYNAMIC_ARRAY (best);
708
709 if (option[DEBUG])
710 {
711 /* Print the result. */
712 fprintf (stderr, "\nComputed alpha increments: ");
713 bool first = true;
714 for (unsigned int j = nindices; j-- > 0; )
715 if (current[indices[j]] != 0)
716 {
717 if (!first)
718 fprintf (stderr, ", ");
719 fprintf (stderr, "%u:+%u",
720 indices[j] + 1, current[indices[j]]);
721 first = false;
722 }
723 fprintf (stderr, "\n");
724 }
725 FREE_DYNAMIC_ARRAY (indices);
726 }
727
728 _alpha_inc = current;
729 _alpha_size = compute_alpha_size (_alpha_inc);
730 _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
731}
732
733/* ======================= Finding good asso_values ======================== */
734
735/* Initializes the asso_values[] related parameters. */
736
737void
738Search::prepare_asso_values ()
739{
740 KeywordExt_List *temp;
741
742 /* Initialize each keyword's _selchars array. */
743 init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
744
745 /* Compute the maximum _selchars_length over all keywords. */
746 _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
747
748 /* Check for duplicates, i.e. keywords with the same _selchars array
749 (and - if _hash_includes_len - also the same length).
750 We deal with these by building an equivalence class, so that only
751 1 keyword is representative of the entire collection. Only this
752 representative remains in the keyword list; the others are accessible
753 through the _duplicate_link chain, starting at the representative.
754 This *greatly* simplifies processing during later stages of the program.
755 Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
756 {
757 _list_len = _total_keys;
758 _total_duplicates = 0;
759 /* Make hash table for efficiency. */
760 Hash_Table representatives (_list_len, !_hash_includes_len);
761
762 KeywordExt_List *prev = NULL; /* list node before temp */
763 for (temp = _head; temp; )
764 {
765 KeywordExt *keyword = temp->first();
766 KeywordExt *other_keyword = representatives.insert (keyword);
767 KeywordExt_List *garbage = NULL;
768
769 if (other_keyword)
770 {
771 _total_duplicates++;
772 _list_len--;
773 /* Remove keyword from the main list. */
774 prev->rest() = temp->rest();
775 garbage = temp;
776 /* And insert it on other_keyword's duplicate list. */
777 keyword->_duplicate_link = other_keyword->_duplicate_link;
778 other_keyword->_duplicate_link = keyword;
779
780 /* Complain if user hasn't enabled the duplicate option. */
781 if (!option[DUP] || option[DEBUG])
782 {
783 fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
784 keyword->_allchars_length, keyword->_allchars,
785 other_keyword->_allchars_length, other_keyword->_allchars);
786 for (int j = 0; j < keyword->_selchars_length; j++)
787 putc (keyword->_selchars[j], stderr);
788 fprintf (stderr, "\".\n");
789 }
790 }
791 else
792 {
793 keyword->_duplicate_link = NULL;
794 prev = temp;
795 }
796 temp = temp->rest();
797 if (garbage)
798 delete garbage;
799 }
800 if (option[DEBUG])
801 representatives.dump();
802 }
803
804 /* Exit program if duplicates exists and option[DUP] not set, since we
805 don't want to continue in this case. (We don't want to turn on
806 option[DUP] implicitly, because the generated code is usually much
807 slower. */
808 if (_total_duplicates)
809 {
810 if (option[DUP])
811 fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
812 _total_duplicates);
813 else
814 {
815 fprintf (stderr, "%d input keys have identical hash values,\n",
816 _total_duplicates);
817 if (option[POSITIONS])
818 fprintf (stderr, "try different key positions or use option -D.\n");
819 else
820 fprintf (stderr, "use option -D.\n");
821 exit (1);
822 }
823 }
824
825 /* Compute the occurrences of each character in the alphabet. */
826 _occurrences = new int[_alpha_size];
827 memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
828 for (temp = _head; temp; temp = temp->rest())
829 {
830 KeywordExt *keyword = temp->first();
831 const unsigned int *ptr = keyword->_selchars;
832 for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
833 _occurrences[*ptr]++;
834 }
835
836 /* Memory allocation. */
837 _asso_values = new int[_alpha_size];
838
839 int non_linked_length = _list_len;
840 unsigned int asso_value_max;
841
842 asso_value_max =
843 static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
844 /* Round up to the next power of two. This makes it easy to ensure
845 an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
846 being odd, it guarantees that Search::try_asso_value() will iterate
847 through different values for _asso_value[c]. */
848 if (asso_value_max == 0)
849 asso_value_max = 1;
850 asso_value_max |= asso_value_max >> 1;
851 asso_value_max |= asso_value_max >> 2;
852 asso_value_max |= asso_value_max >> 4;
853 asso_value_max |= asso_value_max >> 8;
854 asso_value_max |= asso_value_max >> 16;
855 asso_value_max++;
856 _asso_value_max = asso_value_max;
857
858 /* Given the bound for _asso_values[c], we have a bound for the possible
859 hash values, as computed in compute_hash(). */
860 _max_hash_value = (_hash_includes_len ? _max_key_len : 0)
861 + (_asso_value_max - 1) * _max_selchars_length;
862 /* Allocate a sparse bit vector for detection of collisions of hash
863 values. */
864 _collision_detector = new Bool_Array (_max_hash_value + 1);
865
866 if (option[DEBUG])
867 {
868 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
869 "\nmaximum size of generated hash table is %d\n",
870 non_linked_length, asso_value_max, _max_hash_value);
871
872 int field_width;
873
874 field_width = 0;
875 {
876 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
877 {
878 KeywordExt *keyword = temp->first();
879 if (field_width < keyword->_selchars_length)
880 field_width = keyword->_selchars_length;
881 }
882 }
883
884 fprintf (stderr, "\ndumping the keyword list without duplicates\n");
885 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
886 int i = 0;
887 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
888 {
889 KeywordExt *keyword = temp->first();
890 fprintf (stderr, "%9d, ", ++i);
891 if (field_width > keyword->_selchars_length)
892 fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
893 for (int j = 0; j < keyword->_selchars_length; j++)
894 putc (keyword->_selchars[j], stderr);
895 fprintf (stderr, ", %.*s\n",
896 keyword->_allchars_length, keyword->_allchars);
897 }
898 fprintf (stderr, "\nend of keyword list\n\n");
899 }
900
901 if (option[RANDOM] || option.get_jump () == 0)
902 /* We will use rand(), so initialize the random number generator. */
903 srand (static_cast<long>(time (0)));
904
905 _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
906 _jump = option.get_jump ();
907}
908
909/* Finds some _asso_values[] that fit. */
910
911/* The idea is to choose the _asso_values[] one by one, in a way that
912 a choice that has been made never needs to be undone later. This
913 means that we split the work into several steps. Each step chooses
914 one or more _asso_values[c]. The result of choosing one or more
915 _asso_values[c] is that the partitioning of the keyword set gets
916 broader.
917 Look at this partitioning: After every step, the _asso_values[] of a
918 certain set C of characters are undetermined. (At the beginning, C
919 is the set of characters c with _occurrences[c] > 0. At the end, C
920 is empty.) To each keyword K, we associate the multiset of _selchars
921 for which the _asso_values[] are undetermined:
922 K --> K->_selchars intersect C.
923 Consider two keywords equivalent if their value under this mapping is
924 the same. This introduces an equivalence relation on the set of
925 keywords. The equivalence classes partition the keyword set. (At the
926 beginning, the partition is the finest possible: each K is an equivalence
927 class by itself, because all K have a different _selchars. At the end,
928 all K have been merged into a single equivalence class.)
929 The partition before a step is always a refinement of the partition
930 after the step.
931 We choose the steps in such a way that the partition really becomes
932 broader at each step. (A step that only chooses an _asso_values[c]
933 without changing the partition is better merged with the previous step,
934 to avoid useless backtracking.) */
935
936struct EquivalenceClass
937{
938 /* The keywords in this equivalence class. */
939 KeywordExt_List * _keywords;
940 KeywordExt_List * _keywords_last;
941 /* The number of keywords in this equivalence class. */
942 unsigned int _cardinality;
943 /* The undetermined selected characters for the keywords in this
944 equivalence class, as a canonically reordered multiset. */
945 unsigned int * _undetermined_chars;
946 unsigned int _undetermined_chars_length;
947
948 EquivalenceClass * _next;
949};
950
951struct Step
952{
953 /* The characters whose values are being determined in this step. */
954 unsigned int _changing_count;
955 unsigned int * _changing;
956 /* Exclusive upper bound for the _asso_values[c] of this step.
957 A power of 2. */
958 unsigned int _asso_value_max;
959 /* The characters whose values will be determined after this step. */
960 bool * _undetermined;
961 /* The keyword set partition after this step. */
962 EquivalenceClass * _partition;
963 /* The expected number of iterations in this step. */
964 double _expected_lower;
965 double _expected_upper;
966
967 Step * _next;
968};
969
970static inline bool
971equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
972{
973 while (len > 0)
974 {
975 if (*ptr1 != *ptr2)
976 return false;
977 ptr1++;
978 ptr2++;
979 len--;
980 }
981 return true;
982}
983
984EquivalenceClass *
985Search::compute_partition (bool *undetermined) const
986{
987 EquivalenceClass *partition = NULL;
988 EquivalenceClass *partition_last = NULL;
989 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
990 {
991 KeywordExt *keyword = temp->first();
992
993 /* Compute the undetermined characters for this keyword. */
994 unsigned int *undetermined_chars =
995 new unsigned int[keyword->_selchars_length];
996 unsigned int undetermined_chars_length = 0;
997
998 for (int i = 0; i < keyword->_selchars_length; i++)
999 if (undetermined[keyword->_selchars[i]])
1000 undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
1001
1002 /* Look up the equivalence class to which this keyword belongs. */
1003 EquivalenceClass *equclass;
1004 for (equclass = partition; equclass; equclass = equclass->_next)
1005 if (equclass->_undetermined_chars_length == undetermined_chars_length
1006 && equals (equclass->_undetermined_chars, undetermined_chars,
1007 undetermined_chars_length))
1008 break;
1009 if (equclass == NULL)
1010 {
1011 equclass = new EquivalenceClass();
1012 equclass->_keywords = NULL;
1013 equclass->_keywords_last = NULL;
1014 equclass->_cardinality = 0;
1015 equclass->_undetermined_chars = undetermined_chars;
1016 equclass->_undetermined_chars_length = undetermined_chars_length;
1017 equclass->_next = NULL;
1018 if (partition)
1019 partition_last->_next = equclass;
1020 else
1021 partition = equclass;
1022 partition_last = equclass;
1023 }
1024 else
1025 delete[] undetermined_chars;
1026
1027 /* Add the keyword to the equivalence class. */
1028 KeywordExt_List *cons = new KeywordExt_List(keyword);
1029 if (equclass->_keywords)
1030 equclass->_keywords_last->rest() = cons;
1031 else
1032 equclass->_keywords = cons;
1033 equclass->_keywords_last = cons;
1034 equclass->_cardinality++;
1035 }
1036
1037 /* Free some of the allocated memory. The caller doesn't need it. */
1038 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1039 delete[] cls->_undetermined_chars;
1040
1041 return partition;
1042}
1043
1044static void
1045delete_partition (EquivalenceClass *partition)
1046{
1047 while (partition != NULL)
1048 {
1049 EquivalenceClass *equclass = partition;
1050 partition = equclass->_next;
1051 delete_list (equclass->_keywords);
1052 //delete[] equclass->_undetermined_chars; // already freed above
1053 delete equclass;
1054 }
1055}
1056
1057/* Compute the possible number of collisions when _asso_values[c] is
1058 chosen, leading to the given partition. */
1059unsigned int
1060Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1061{
1062 /* Every equivalence class p is split according to the frequency of
1063 occurrence of c, leading to equivalence classes p1, p2, ...
1064 This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions.
1065 Return the sum of this expression over all equivalence classes. */
1066 unsigned int sum = 0;
1067 unsigned int m = _max_selchars_length;
1068 DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
1069 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1070 {
1071 for (unsigned int i = 0; i <= m; i++)
1072 split_cardinalities[i] = 0;
1073
1074 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1075 {
1076 KeywordExt *keyword = temp->first();
1077
1078 unsigned int count = 0;
1079 for (int i = 0; i < keyword->_selchars_length; i++)
1080 if (keyword->_selchars[i] == c)
1081 count++;
1082
1083 split_cardinalities[count]++;
1084 }
1085
1086 sum += cls->_cardinality * cls->_cardinality;
1087 for (unsigned int i = 0; i <= m; i++)
1088 sum -= split_cardinalities[i] * split_cardinalities[i];
1089 }
1090 FREE_DYNAMIC_ARRAY (split_cardinalities);
1091 return sum;
1092}
1093
1094/* Test whether adding c to the undetermined characters changes the given
1095 partition. */
1096bool
1097Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1098{
1099 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1100 {
1101 unsigned int first_count = UINT_MAX;
1102
1103 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1104 {
1105 KeywordExt *keyword = temp->first();
1106
1107 unsigned int count = 0;
1108 for (int i = 0; i < keyword->_selchars_length; i++)
1109 if (keyword->_selchars[i] == c)
1110 count++;
1111
1112 if (temp == cls->_keywords)
1113 first_count = count;
1114 else if (count != first_count)
1115 /* c would split this equivalence class. */
1116 return false;
1117 }
1118 }
1119 return true;
1120}
1121
1122void
1123Search::find_asso_values ()
1124{
1125 Step *steps;
1126
1127 /* Determine the steps, starting with the last one. */
1128 {
1129 bool *undetermined;
1130 bool *determined;
1131
1132 steps = NULL;
1133
1134 undetermined = new bool[_alpha_size];
1135 for (unsigned int c = 0; c < _alpha_size; c++)
1136 undetermined[c] = false;
1137
1138 determined = new bool[_alpha_size];
1139 for (unsigned int c = 0; c < _alpha_size; c++)
1140 determined[c] = true;
1141
1142 for (;;)
1143 {
1144 /* Compute the partition that needs to be refined. */
1145 EquivalenceClass *partition = compute_partition (undetermined);
1146
1147 /* Determine the main character to be chosen in this step.
1148 Choosing such a character c has the effect of splitting every
1149 equivalence class (according the the frequency of occurrence of c).
1150 We choose the c with the minimum number of possible collisions,
1151 so that characters which lead to a large number of collisions get
1152 handled early during the search. */
1153 unsigned int chosen_c;
1154 unsigned int chosen_possible_collisions;
1155 {
1156 unsigned int best_c = 0;
1157 unsigned int best_possible_collisions = UINT_MAX;
1158 for (unsigned int c = 0; c < _alpha_size; c++)
1159 if (_occurrences[c] > 0 && determined[c])
1160 {
1161 unsigned int possible_collisions =
1162 count_possible_collisions (partition, c);
1163 if (possible_collisions < best_possible_collisions)
1164 {
1165 best_c = c;
1166 best_possible_collisions = possible_collisions;
1167 }
1168 }
1169 if (best_possible_collisions == UINT_MAX)
1170 {
1171 /* All c with _occurrences[c] > 0 are undetermined. We are
1172 are the starting situation and don't need any more step. */
1173 delete_partition (partition);
1174 break;
1175 }
1176 chosen_c = best_c;
1177 chosen_possible_collisions = best_possible_collisions;
1178 }
1179
1180 /* We need one more step. */
1181 Step *step = new Step();
1182
1183 step->_undetermined = new bool[_alpha_size];
1184 memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1185
1186 step->_partition = partition;
1187
1188 /* Now determine how the equivalence classes will be before this
1189 step. */
1190 undetermined[chosen_c] = true;
1191 partition = compute_partition (undetermined);
1192
1193 /* Now determine which other characters should be determined in this
1194 step, because they will not change the equivalence classes at
1195 this point. It is the set of all c which, for all equivalence
1196 classes, have the same frequency of occurrence in every keyword
1197 of the equivalence class. */
1198 for (unsigned int c = 0; c < _alpha_size; c++)
1199 if (_occurrences[c] > 0 && determined[c]
1200 && unchanged_partition (partition, c))
1201 {
1202 undetermined[c] = true;
1203 determined[c] = false;
1204 }
1205
1206 /* main_c must be one of these. */
1207 if (determined[chosen_c])
1208 abort ();
1209
1210 /* Now the set of changing characters of this step. */
1211 unsigned int changing_count;
1212
1213 changing_count = 0;
1214 for (unsigned int c = 0; c < _alpha_size; c++)
1215 if (undetermined[c] && !step->_undetermined[c])
1216 changing_count++;
1217
1218 unsigned int *changing = new unsigned int[changing_count];
1219 changing_count = 0;
1220 for (unsigned int c = 0; c < _alpha_size; c++)
1221 if (undetermined[c] && !step->_undetermined[c])
1222 changing[changing_count++] = c;
1223
1224 step->_changing = changing;
1225 step->_changing_count = changing_count;
1226
1227 step->_asso_value_max = _asso_value_max;
1228
1229 step->_expected_lower =
1230 exp (static_cast<double>(chosen_possible_collisions)
1231 / static_cast<double>(_max_hash_value));
1232 step->_expected_upper =
1233 exp (static_cast<double>(chosen_possible_collisions)
1234 / static_cast<double>(_asso_value_max));
1235
1236 delete_partition (partition);
1237
1238 step->_next = steps;
1239 steps = step;
1240 }
1241
1242 delete[] determined;
1243 delete[] undetermined;
1244 }
1245
1246 if (option[DEBUG])
1247 {
1248 unsigned int stepno = 0;
1249 for (Step *step = steps; step; step = step->_next)
1250 {
1251 stepno++;
1252 fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1253 for (unsigned int i = 0; i < step->_changing_count; i++)
1254 {
1255 if (i > 0)
1256 fprintf (stderr, ",");
1257 fprintf (stderr, "'%c'", step->_changing[i]);
1258 }
1259 fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1260 step->_expected_lower, step->_expected_upper);
1261 fprintf (stderr, "Keyword equivalence classes:\n");
1262 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1263 {
1264 fprintf (stderr, "\n");
1265 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1266 {
1267 KeywordExt *keyword = temp->first();
1268 fprintf (stderr, " %.*s\n",
1269 keyword->_allchars_length, keyword->_allchars);
1270 }
1271 }
1272 fprintf (stderr, "\n");
1273 }
1274 }
1275
1276 /* Initialize _asso_values[]. (The value given here matters only
1277 for those c which occur in all keywords with equal multiplicity.) */
1278 for (unsigned int c = 0; c < _alpha_size; c++)
1279 _asso_values[c] = 0;
1280
1281 unsigned int stepno = 0;
1282 for (Step *step = steps; step; step = step->_next)
1283 {
1284 stepno++;
1285
1286 /* Initialize the asso_values[]. */
1287 unsigned int k = step->_changing_count;
1288 for (unsigned int i = 0; i < k; i++)
1289 {
1290 unsigned int c = step->_changing[i];
1291 _asso_values[c] =
1292 (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1293 & (step->_asso_value_max - 1);
1294 }
1295
1296 unsigned int iterations = 0;
1297 DYNAMIC_ARRAY (iter, unsigned int, k);
1298 for (unsigned int i = 0; i < k; i++)
1299 iter[i] = 0;
1300 unsigned int ii = (_jump != 0 ? k - 1 : 0);
1301
1302 for (;;)
1303 {
1304 /* Test whether these asso_values[] lead to collisions among
1305 the equivalence classes that should be collision-free. */
1306 bool has_collision = false;
1307 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1308 {
1309 /* Iteration Number array is a win, O(1) initialization time! */
1310 _collision_detector->clear ();
1311
1312 for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1313 {
1314 KeywordExt *keyword = ptr->first();
1315
1316 /* Compute the new hash code for the keyword, leaving apart
1317 the yet undetermined asso_values[]. */
1318 int hashcode;
1319 {
1320 int sum = _hash_includes_len ? keyword->_allchars_length : 0;
1321 const unsigned int *p = keyword->_selchars;
1322 int i = keyword->_selchars_length;
1323 for (; i > 0; p++, i--)
1324 if (!step->_undetermined[*p])
1325 sum += _asso_values[*p];
1326 hashcode = sum;
1327 }
1328
1329 /* See whether it collides with another keyword's hash code,
1330 from the same equivalence class. */
1331 if (_collision_detector->set_bit (hashcode))
1332 {
1333 has_collision = true;
1334 break;
1335 }
1336 }
1337
1338 /* Don't need to continue looking at the other equivalence
1339 classes if we already have found a collision. */
1340 if (has_collision)
1341 break;
1342 }
1343
1344 iterations++;
1345 if (!has_collision)
1346 break;
1347
1348 /* Try other asso_values[]. */
1349 if (_jump != 0)
1350 {
1351 /* The way we try various values for
1352 asso_values[step->_changing[0],...step->_changing[k-1]]
1353 is like this:
1354 for (bound = 0,1,...)
1355 for (ii = 0,...,k-1)
1356 iter[ii] := bound
1357 iter[0..ii-1] := values <= bound
1358 iter[ii+1..k-1] := values < bound
1359 and
1360 asso_values[step->_changing[i]] =
1361 _initial_asso_value + iter[i] * _jump.
1362 This makes it more likely to find small asso_values[].
1363 */
1364 unsigned int bound = iter[ii];
1365 unsigned int i = 0;
1366 while (i < ii)
1367 {
1368 unsigned int c = step->_changing[i];
1369 iter[i]++;
1370 _asso_values[c] =
1371 (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1372 if (iter[i] <= bound)
1373 goto found_next;
1374 _asso_values[c] =
1375 (_asso_values[c] - iter[i] * _jump)
1376 & (step->_asso_value_max - 1);
1377 iter[i] = 0;
1378 i++;
1379 }
1380 i = ii + 1;
1381 while (i < k)
1382 {
1383 unsigned int c = step->_changing[i];
1384 iter[i]++;
1385 _asso_values[c] =
1386 (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1387 if (iter[i] < bound)
1388 goto found_next;
1389 _asso_values[c] =
1390 (_asso_values[c] - iter[i] * _jump)
1391 & (step->_asso_value_max - 1);
1392 iter[i] = 0;
1393 i++;
1394 }
1395 /* Switch from one ii to the next. */
1396 {
1397 unsigned int c = step->_changing[ii];
1398 _asso_values[c] =
1399 (_asso_values[c] - bound * _jump)
1400 & (step->_asso_value_max - 1);
1401 iter[ii] = 0;
1402 }
1403 /* Here all iter[i] == 0. */
1404 ii++;
1405 if (ii == k)
1406 {
1407 ii = 0;
1408 bound++;
1409 if (bound == step->_asso_value_max)
1410 {
1411 /* Out of search space! We can either backtrack, or
1412 increase the available search space of this step.
1413 It seems simpler to choose the latter solution. */
1414 step->_asso_value_max = 2 * step->_asso_value_max;
1415 if (step->_asso_value_max > _asso_value_max)
1416 {
1417 _asso_value_max = step->_asso_value_max;
1418 /* Reinitialize _max_hash_value. */
1419 _max_hash_value =
1420 (_hash_includes_len ? _max_key_len : 0)
1421 + (_asso_value_max - 1) * _max_selchars_length;
1422 /* Reinitialize _collision_detector. */
1423 delete _collision_detector;
1424 _collision_detector =
1425 new Bool_Array (_max_hash_value + 1);
1426 }
1427 }
1428 }
1429 {
1430 unsigned int c = step->_changing[ii];
1431 iter[ii] = bound;
1432 _asso_values[c] =
1433 (_asso_values[c] + bound * _jump)
1434 & (step->_asso_value_max - 1);
1435 }
1436 found_next: ;
1437 }
1438 else
1439 {
1440 /* Random. */
1441 unsigned int c = step->_changing[ii];
1442 _asso_values[c] =
1443 (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1444 /* Next time, change the next c. */
1445 ii++;
1446 if (ii == k)
1447 ii = 0;
1448 }
1449 }
1450 FREE_DYNAMIC_ARRAY (iter);
1451
1452 if (option[DEBUG])
1453 {
1454 fprintf (stderr, "Step %u chose _asso_values[", stepno);
1455 for (unsigned int i = 0; i < step->_changing_count; i++)
1456 {
1457 if (i > 0)
1458 fprintf (stderr, ",");
1459 fprintf (stderr, "'%c'", step->_changing[i]);
1460 }
1461 fprintf (stderr, "] in %u iterations.\n", iterations);
1462 }
1463 }
1464
1465 /* Free allocated memory. */
1466 while (steps != NULL)
1467 {
1468 Step *step = steps;
1469 steps = step->_next;
1470 delete[] step->_changing;
1471 delete[] step->_undetermined;
1472 delete_partition (step->_partition);
1473 delete step;
1474 }
1475}
1476
1477/* Computes a keyword's hash value, relative to the current _asso_values[],
1478 and stores it in keyword->_hash_value. */
1479
1480inline int
1481Search::compute_hash (KeywordExt *keyword) const
1482{
1483 int sum = _hash_includes_len ? keyword->_allchars_length : 0;
1484
1485 const unsigned int *p = keyword->_selchars;
1486 int i = keyword->_selchars_length;
1487 for (; i > 0; p++, i--)
1488 sum += _asso_values[*p];
1489
1490 return keyword->_hash_value = sum;
1491}
1492
1493/* Finds good _asso_values[]. */
1494
1495void
1496Search::find_good_asso_values ()
1497{
1498 prepare_asso_values ();
1499
1500 /* Search for good _asso_values[]. */
1501 int asso_iteration;
1502 if ((asso_iteration = option.get_asso_iterations ()) == 0)
1503 /* Try only the given _initial_asso_value and _jump. */
1504 find_asso_values ();
1505 else
1506 {
1507 /* Try different pairs of _initial_asso_value and _jump, in the
1508 following order:
1509 (0, 1)
1510 (1, 1)
1511 (2, 1) (0, 3)
1512 (3, 1) (1, 3)
1513 (4, 1) (2, 3) (0, 5)
1514 (5, 1) (3, 3) (1, 5)
1515 ..... */
1516 KeywordExt_List *saved_head = _head;
1517 int best_initial_asso_value = 0;
1518 int best_jump = 1;
1519 int *best_asso_values = new int[_alpha_size];
1520 int best_collisions = INT_MAX;
1521 int best_max_hash_value = INT_MAX;
1522
1523 _initial_asso_value = 0; _jump = 1;
1524 for (;;)
1525 {
1526 /* Restore the keyword list in its original order. */
1527 _head = copy_list (saved_head);
1528 /* Find good _asso_values[]. */
1529 find_asso_values ();
1530 /* Test whether it is the best solution so far. */
1531 int collisions = 0;
1532 int max_hash_value = INT_MIN;
1533 _collision_detector->clear ();
1534 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1535 {
1536 KeywordExt *keyword = ptr->first();
1537 int hashcode = compute_hash (keyword);
1538 if (max_hash_value < hashcode)
1539 max_hash_value = hashcode;
1540 if (_collision_detector->set_bit (hashcode))
1541 collisions++;
1542 }
1543 if (collisions < best_collisions
1544 || (collisions == best_collisions
1545 && max_hash_value < best_max_hash_value))
1546 {
1547 memcpy (best_asso_values, _asso_values,
1548 _alpha_size * sizeof (_asso_values[0]));
1549 best_collisions = collisions;
1550 best_max_hash_value = max_hash_value;
1551 }
1552 /* Delete the copied keyword list. */
1553 delete_list (_head);
1554
1555 if (--asso_iteration == 0)
1556 break;
1557 /* Prepare for next iteration. */
1558 if (_initial_asso_value >= 2)
1559 _initial_asso_value -= 2, _jump += 2;
1560 else
1561 _initial_asso_value += _jump, _jump = 1;
1562 }
1563 _head = saved_head;
1564 /* Install the best found asso_values. */
1565 _initial_asso_value = best_initial_asso_value;
1566 _jump = best_jump;
1567 memcpy (_asso_values, best_asso_values,
1568 _alpha_size * sizeof (_asso_values[0]));
1569 delete[] best_asso_values;
1570 /* The keywords' _hash_value fields are recomputed below. */
1571 }
1572}
1573
1574/* ========================================================================= */
1575
1576/* Comparison function for sorting by increasing _hash_value. */
1577static bool
1578less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1579{
1580 return keyword1->_hash_value < keyword2->_hash_value;
1581}
1582
1583/* Sorts the keyword list by hash value. */
1584
1585void
1586Search::sort ()
1587{
1588 _head = mergesort_list (_head, less_by_hash_value);
1589}
1590
1591void
1592Search::optimize ()
1593{
1594 /* Preparations. */
1595 prepare ();
1596
1597 /* Step 1: Finding good byte positions. */
1598 find_positions ();
1599
1600 /* Step 2: Finding good alpha increments. */
1601 find_alpha_inc ();
1602
1603 /* Step 3: Finding good asso_values. */
1604 find_good_asso_values ();
1605
1606 /* Make one final check, just to make sure nothing weird happened.... */
1607 _collision_detector->clear ();
1608 for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1609 {
1610 KeywordExt *curr = curr_ptr->first();
1611 unsigned int hashcode = compute_hash (curr);
1612 if (_collision_detector->set_bit (hashcode))
1613 {
1614 /* This shouldn't happen. proj1, proj2, proj3 must have been
1615 computed to be injective on the given keyword set. */
1616 fprintf (stderr,
1617 "\nInternal error, unexpected duplicate hash code\n");
1618 if (option[POSITIONS])
1619 fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1620 else
1621 fprintf (stderr, "try options -m or -r.\n\n");
1622 exit (1);
1623 }
1624 }
1625
1626 /* Sorts the keyword list by hash value. */
1627 sort ();
1628
1629 /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely
1630 necessary, but speeds up the lookup function in many cases of lookup
1631 failure: no string comparison is needed once the hash value of a string
1632 is larger than the hash value of any keyword. */
1633 int max_hash_value;
1634 {
1635 KeywordExt_List *temp;
1636 for (temp = _head; temp->rest(); temp = temp->rest())
1637 ;
1638 max_hash_value = temp->first()->_hash_value;
1639 }
1640 for (unsigned int c = 0; c < _alpha_size; c++)
1641 if (_occurrences[c] == 0)
1642 _asso_values[c] = max_hash_value + 1;
1643
1644 /* Propagate unified asso_values. */
1645 if (_alpha_unify)
1646 {
1647 for (unsigned int c = 0; c < _alpha_size; c++)
1648 if (_alpha_unify[c] != c)
1649 _asso_values[c] = _asso_values[_alpha_unify[c]];
1650 }
1651}
1652
1653/* Prints out some diagnostics upon completion. */
1654
1655Search::~Search ()
1656{
1657 delete _collision_detector;
1658 if (option[DEBUG])
1659 {
1660 fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1661
1662 for (unsigned int i = 0; i < _alpha_size; i++)
1663 if (_occurrences[i])
1664 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1665 i, _asso_values[i], i, _occurrences[i]);
1666
1667 fprintf (stderr, "end table dumping\n");
1668
1669 fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1670 "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1671 _list_len, _total_keys, _total_duplicates, _max_key_len);
1672
1673 int field_width = _max_selchars_length;
1674 fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1675 field_width, "selchars");
1676 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1677 {
1678 fprintf (stderr, "%11d,%11d,%6d, ",
1679 ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1680 if (field_width > ptr->first()->_selchars_length)
1681 fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1682 for (int j = 0; j < ptr->first()->_selchars_length; j++)
1683 putc (ptr->first()->_selchars[j], stderr);
1684 fprintf (stderr, ", %.*s\n",
1685 ptr->first()->_allchars_length, ptr->first()->_allchars);
1686 }
1687
1688 fprintf (stderr, "End dumping list.\n\n");
1689 }
1690 delete[] _asso_values;
1691 delete[] _occurrences;
1692 delete[] _alpha_unify;
1693 delete[] _alpha_inc;
1694}
1695