1 | /* Search algorithm. |
2 | Copyright (C) 1989-1998, 2000, 2002, 2009, 2016 Free Software Foundation, Inc. |
3 | Written by Douglas C. Schmidt <schmidt@ics.uci.edu> |
4 | and Bruno Haible <bruno@clisp.org>. |
5 | |
6 | This file is part of GNU GPERF. |
7 | |
8 | This program is free software: you can redistribute it and/or modify |
9 | it under the terms of the GNU General Public License as published by |
10 | the Free Software Foundation; either version 3 of the License, or |
11 | (at your option) any later version. |
12 | |
13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU General Public License for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with this program. If not, see <http://www.gnu.org/licenses/>. */ |
20 | |
21 | /* Specification. */ |
22 | #include "search.h" |
23 | |
24 | #include <stdio.h> |
25 | #include <stdlib.h> /* declares exit(), rand(), srand() */ |
26 | #include <string.h> /* declares memset(), memcmp() */ |
27 | #include <time.h> /* declares time() */ |
28 | #include <math.h> /* declares exp() */ |
29 | #include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ |
30 | #include "options.h" |
31 | #include "hash-table.h" |
32 | #include "config.h" |
33 | |
34 | /* ============================== Portability ============================== */ |
35 | |
36 | /* Assume ISO C++ 'for' scoping rule. */ |
37 | #define for if (0) ; else for |
38 | |
39 | /* Dynamically allocated array with dynamic extent: |
40 | |
41 | Example: |
42 | DYNAMIC_ARRAY (my_array, int, n); |
43 | ... |
44 | FREE_DYNAMIC_ARRAY (my_array); |
45 | |
   Attention: depending on the implementation, my_array is either the array
   itself or a pointer to the array! Always use my_array only as an expression!
48 | */ |
49 | #if HAVE_DYNAMIC_ARRAY |
50 | #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size] |
51 | #define FREE_DYNAMIC_ARRAY(var) |
52 | #else |
53 | #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size] |
54 | #define FREE_DYNAMIC_ARRAY(var) delete[] var |
55 | #endif |
56 | |
57 | /* ================================ Theory ================================= */ |
58 | |
59 | /* The general form of the hash function is |
60 | |
61 | hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) |
62 | + len (keyword) |
63 | |
64 | where Pos is a set of byte positions, |
65 | each alpha_inc[i] is a nonnegative integer, |
66 | each asso_values[c] is a nonnegative integer, |
67 | len (keyword) is the keyword's length if _hash_includes_len, or 0 otherwise. |
68 | |
69 | Theorem 1: If all keywords are different, there is a set Pos such that |
70 | all tuples (keyword[i] : i in Pos) are different. |
71 | |
72 | Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there |
73 | are nonnegative integers alpha_inc[i] such that all multisets |
74 | {keyword[i] + alpha_inc[i] : i in Pos} are different. |
75 | |
76 | Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. |
77 | |
78 | Theorem 3: If all multisets selchars[keyword] are different, there are |
79 | nonnegative integers asso_values[c] such that all hash values |
80 | sum (asso_values[c] : c in selchars[keyword]) are different. |
81 | |
82 | Based on these three facts, we find the hash function in three steps: |
83 | |
84 | Step 1 (Finding good byte positions): |
85 | Find a set Pos, as small as possible, such that all tuples |
86 | (keyword[i] : i in Pos) are different. |
87 | |
88 | Step 2 (Finding good alpha increments): |
89 | Find nonnegative integers alpha_inc[i], as many of them as possible being |
90 | zero, and the others being as small as possible, such that all multisets |
91 | {keyword[i] + alpha_inc[i] : i in Pos} are different. |
92 | |
93 | Step 3 (Finding good asso_values): |
94 | Find asso_values[c] such that all hash (keyword) are different. |
95 | |
96 | In other words, each step finds a projection that is injective on the |
97 | given finite set: |
98 | proj1 : String --> Map (Pos --> N) |
99 | proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) |
100 | proj3 : Map (Pos --> N) / S(Pos) --> N |
101 | where |
102 | N denotes the set of nonnegative integers, |
103 | Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and |
104 | S(Pos) is the symmetric group over Pos. |
105 | |
106 | This was the theory for !_hash_includes_len; if _hash_includes_len, slight |
107 | modifications apply: |
108 | proj1 : String --> Map (Pos --> N) x N |
109 | proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N |
110 | proj3 : Map (Pos --> N) / S(Pos) x N --> N |
111 | |
112 | For a case-insensitive hash function, the general form is |
113 | |
114 | hash (keyword) = |
115 | sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) |
116 | + len (keyword) |
117 | |
118 | where alpha_unify[c] is chosen so that an upper/lower case change in |
119 | keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. |
120 | */ |
121 | |
122 | /* ==================== Initialization and Preparation ===================== */ |
123 | |
/* Constructs a Search instance over the given keyword list.
   The list is borrowed, not copied: _head just stores the pointer, and all
   later phases iterate over the same list nodes.  */
Search::Search (KeywordExt_List *list)
  : _head (list)
{
}
128 | |
129 | void |
130 | Search::prepare () |
131 | { |
132 | /* Compute the total number of keywords. */ |
133 | _total_keys = 0; |
134 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
135 | _total_keys++; |
136 | |
137 | /* Compute the minimum and maximum keyword length. */ |
138 | _max_key_len = INT_MIN; |
139 | _min_key_len = INT_MAX; |
140 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
141 | { |
142 | KeywordExt *keyword = temp->first(); |
143 | |
144 | if (_max_key_len < keyword->_allchars_length) |
145 | _max_key_len = keyword->_allchars_length; |
146 | if (_min_key_len > keyword->_allchars_length) |
147 | _min_key_len = keyword->_allchars_length; |
148 | } |
149 | |
150 | /* Exit program if an empty string is used as keyword, since the comparison |
151 | expressions don't work correctly for looking up an empty string. */ |
152 | if (_min_key_len == 0) |
153 | { |
154 | fprintf (stderr, "Empty input keyword is not allowed.\n" |
155 | "To recognize an empty input keyword, your code should check for\n" |
156 | "len == 0 before calling the gperf generated lookup function.\n" ); |
157 | exit (1); |
158 | } |
159 | |
160 | /* Exit program if the characters in the keywords are not in the required |
161 | range. */ |
162 | if (option[SEVENBIT]) |
163 | { |
164 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
165 | { |
166 | KeywordExt *keyword = temp->first(); |
167 | |
168 | const char *k = keyword->_allchars; |
169 | for (int i = keyword->_allchars_length; i > 0; k++, i--) |
170 | if (!(static_cast<unsigned char>(*k) < 128)) |
171 | { |
172 | fprintf (stderr, "Option --seven-bit has been specified,\n" |
173 | "but keyword \"%.*s\" contains non-ASCII characters.\n" |
174 | "Try removing option --seven-bit.\n" , |
175 | keyword->_allchars_length, keyword->_allchars); |
176 | exit (1); |
177 | } |
178 | } |
179 | } |
180 | |
181 | /* Determine whether the hash function shall include the length. */ |
182 | _hash_includes_len = !(option[NOLENGTH] || (_min_key_len == _max_key_len)); |
183 | } |
184 | |
185 | /* ====================== Finding good byte positions ====================== */ |
186 | |
187 | /* Computes the upper bound on the indices passed to asso_values[], |
188 | assuming no alpha_increments. */ |
189 | unsigned int |
190 | Search::compute_alpha_size () const |
191 | { |
192 | return (option[SEVENBIT] ? 128 : 256); |
193 | } |
194 | |
195 | /* Computes the unification rules between different asso_values[c], |
196 | assuming no alpha_increments. */ |
197 | unsigned int * |
198 | Search::compute_alpha_unify () const |
199 | { |
200 | if (option[UPPERLOWER]) |
201 | { |
202 | /* Uppercase to lowercase mapping. */ |
203 | unsigned int alpha_size = compute_alpha_size(); |
204 | unsigned int *alpha_unify = new unsigned int[alpha_size]; |
205 | for (unsigned int c = 0; c < alpha_size; c++) |
206 | alpha_unify[c] = c; |
207 | for (unsigned int c = 'A'; c <= 'Z'; c++) |
208 | alpha_unify[c] = c + ('a'-'A'); |
209 | return alpha_unify; |
210 | } |
211 | else |
212 | /* Identity mapping. */ |
213 | return NULL; |
214 | } |
215 | |
216 | /* Initializes each keyword's _selchars array. */ |
217 | void |
218 | Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const |
219 | { |
220 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
221 | temp->first()->init_selchars_tuple(positions, alpha_unify); |
222 | } |
223 | |
224 | /* Deletes each keyword's _selchars array. */ |
225 | void |
226 | Search::delete_selchars () const |
227 | { |
228 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
229 | temp->first()->delete_selchars(); |
230 | } |
231 | |
232 | /* Count the duplicate keywords that occur with a given set of positions. |
233 | In other words, it returns the difference |
234 | # K - # proj1 (K) |
235 | where K is the multiset of given keywords. */ |
236 | unsigned int |
237 | Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const |
238 | { |
239 | /* Run through the keyword list and count the duplicates incrementally. |
240 | The result does not depend on the order of the keyword list, thanks to |
241 | the formula above. */ |
242 | init_selchars_tuple (positions, alpha_unify); |
243 | |
244 | unsigned int count = 0; |
245 | { |
246 | Hash_Table representatives (_total_keys, !_hash_includes_len); |
247 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
248 | { |
249 | KeywordExt *keyword = temp->first(); |
250 | if (representatives.insert (keyword)) |
251 | count++; |
252 | } |
253 | } |
254 | |
255 | delete_selchars (); |
256 | |
257 | return count; |
258 | } |
259 | |
260 | /* Find good key positions. */ |
261 | |
/* Finds a good set of byte positions (step 1 of the algorithm described
   at the top of this file): a set Pos, as small as possible, such that
   the tuples (keyword[i] : i in Pos) distinguish as many keywords as
   possible.  The result is stored in _key_positions.  */
void
Search::find_positions ()
{
  /* If the user gave the key positions, we use them.  */
  if (option[POSITIONS])
    {
      _key_positions = option.get_key_positions();
      return;
    }

  /* Compute preliminary alpha_unify table.  */
  unsigned int *alpha_unify = compute_alpha_unify ();

  /* 1. Find positions that must occur in order to distinguish duplicates.  */
  Positions mandatory;

  if (!option[DUP])
    {
      /* Examine every pair of keywords.  */
      for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
        {
          KeywordExt *keyword1 = l1->first();
          for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
            {
              KeywordExt *keyword2 = l2->first();

              /* If keyword1 and keyword2 have the same length and differ
                 in just one position, and it is not the last character,
                 this position is mandatory.  */
              if (keyword1->_allchars_length == keyword2->_allchars_length)
                {
                  int n = keyword1->_allchars_length;
                  int i;
                  /* Find the first position (before the last character)
                     where the keywords differ; case is folded when
                     option[UPPERLOWER] is set.  */
                  for (i = 0; i < n - 1; i++)
                    {
                      unsigned char c1 = keyword1->_allchars[i];
                      unsigned char c2 = keyword2->_allchars[i];
                      if (option[UPPERLOWER])
                        {
                          if (c1 >= 'A' && c1 <= 'Z')
                            c1 += 'a' - 'A';
                          if (c2 >= 'A' && c2 <= 'Z')
                            c2 += 'a' - 'A';
                        }
                      if (c1 != c2)
                        break;
                    }
                  if (i < n - 1)
                    {
                      int j;
                      /* Verify the keywords agree in every position
                         after i.  */
                      for (j = i + 1; j < n; j++)
                        {
                          unsigned char c1 = keyword1->_allchars[j];
                          unsigned char c2 = keyword2->_allchars[j];
                          if (option[UPPERLOWER])
                            {
                              if (c1 >= 'A' && c1 <= 'Z')
                                c1 += 'a' - 'A';
                              if (c2 >= 'A' && c2 <= 'Z')
                                c2 += 'a' - 'A';
                            }
                          if (c1 != c2)
                            break;
                        }
                      if (j >= n)
                        {
                          /* Position i is mandatory.  */
                          if (!mandatory.contains (i))
                            mandatory.add (i);
                        }
                    }
                }
            }
        }
    }

  /* 2. Add positions, as long as this decreases the duplicates count.
     imax is the largest usable 0-based position index; positions run
     from imax down to -1 (the last-character pseudo-position).  */
  int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
              ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
  Positions current = mandatory;
  unsigned int current_duplicates_count =
    count_duplicates_tuple (current, alpha_unify);
  for (;;)
    {
      Positions best;
      unsigned int best_duplicates_count = UINT_MAX;

      for (int i = imax; i >= -1; i--)
        if (!current.contains (i))
          {
            Positions tryal = current;
            tryal.add (i);
            unsigned int try_duplicates_count =
              count_duplicates_tuple (tryal, alpha_unify);

            /* We prefer 'try' to 'best' if it produces less duplicates,
               or if it produces the same number of duplicates but with
               a more efficient hash function.  */
            if (try_duplicates_count < best_duplicates_count
                || (try_duplicates_count == best_duplicates_count && i >= 0))
              {
                best = tryal;
                best_duplicates_count = try_duplicates_count;
              }
          }

      /* Stop adding positions when it gives no improvement.  */
      if (best_duplicates_count >= current_duplicates_count)
        break;

      current = best;
      current_duplicates_count = best_duplicates_count;
    }

  /* 3. Remove positions, as long as this doesn't increase the duplicates
     count.  */
  for (;;)
    {
      Positions best;
      unsigned int best_duplicates_count = UINT_MAX;

      for (int i = imax; i >= -1; i--)
        if (current.contains (i) && !mandatory.contains (i))
          {
            Positions tryal = current;
            tryal.remove (i);
            unsigned int try_duplicates_count =
              count_duplicates_tuple (tryal, alpha_unify);

            /* We prefer 'try' to 'best' if it produces less duplicates,
               or if it produces the same number of duplicates but with
               a more efficient hash function.  */
            if (try_duplicates_count < best_duplicates_count
                || (try_duplicates_count == best_duplicates_count && i == -1))
              {
                best = tryal;
                best_duplicates_count = try_duplicates_count;
              }
          }

      /* Stop removing positions when it gives no improvement.  */
      if (best_duplicates_count > current_duplicates_count)
        break;

      current = best;
      current_duplicates_count = best_duplicates_count;
    }

  /* 4. Replace two positions by one, as long as this doesn't increase the
     duplicates count.  */
  for (;;)
    {
      Positions best;
      unsigned int best_duplicates_count = UINT_MAX;

      /* Try removing the pair (i1, i2) and adding the single position i3.  */
      for (int i1 = imax; i1 >= -1; i1--)
        if (current.contains (i1) && !mandatory.contains (i1))
          {
            for (int i2 = imax; i2 >= -1; i2--)
              if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
                {
                  for (int i3 = imax; i3 >= 0; i3--)
                    if (!current.contains (i3))
                      {
                        Positions tryal = current;
                        tryal.remove (i1);
                        tryal.remove (i2);
                        tryal.add (i3);
                        unsigned int try_duplicates_count =
                          count_duplicates_tuple (tryal, alpha_unify);

                        /* We prefer 'try' to 'best' if it produces less
                           duplicates, or if it produces the same number
                           of duplicates but with a more efficient hash
                           function.  */
                        if (try_duplicates_count < best_duplicates_count
                            || (try_duplicates_count == best_duplicates_count
                                && (i1 == -1 || i2 == -1 || i3 >= 0)))
                          {
                            best = tryal;
                            best_duplicates_count = try_duplicates_count;
                          }
                      }
                }
          }

      /* Stop removing positions when it gives no improvement.  */
      if (best_duplicates_count > current_duplicates_count)
        break;

      current = best;
      current_duplicates_count = best_duplicates_count;
    }

  /* That's it.  Hope it's good enough.  */
  _key_positions = current;

  if (option[DEBUG])
    {
      /* Print the result, positions in increasing order, with '$'
         denoting the last-character position.  */
      fprintf (stderr, "\nComputed positions: ");
      PositionReverseIterator iter = _key_positions.reviterator();
      bool seen_lastchar = false;
      bool first = true;
      for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
        {
          if (!first)
            fprintf (stderr, ", ");
          if (i == Positions::LASTCHAR)
            seen_lastchar = true;
          else
            {
              fprintf (stderr, "%d", i + 1);
              first = false;
            }
        }
      if (seen_lastchar)
        {
          if (!first)
            fprintf (stderr, ", ");
          fprintf (stderr, "$");
        }
      fprintf (stderr, "\n");
    }

  /* Free preliminary alpha_unify table.  */
  delete[] alpha_unify;
}
489 | |
490 | /* Count the duplicate keywords that occur with the found set of positions. |
491 | In other words, it returns the difference |
492 | # K - # proj1 (K) |
493 | where K is the multiset of given keywords. */ |
494 | unsigned int |
495 | Search::count_duplicates_tuple () const |
496 | { |
497 | unsigned int *alpha_unify = compute_alpha_unify (); |
498 | unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); |
499 | delete[] alpha_unify; |
500 | return count; |
501 | } |
502 | |
503 | /* ===================== Finding good alpha increments ===================== */ |
504 | |
505 | /* Computes the upper bound on the indices passed to asso_values[]. */ |
506 | unsigned int |
507 | Search::compute_alpha_size (const unsigned int *alpha_inc) const |
508 | { |
509 | unsigned int max_alpha_inc = 0; |
510 | for (int i = 0; i < _max_key_len; i++) |
511 | if (max_alpha_inc < alpha_inc[i]) |
512 | max_alpha_inc = alpha_inc[i]; |
513 | return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; |
514 | } |
515 | |
516 | /* Computes the unification rules between different asso_values[c]. */ |
/* Computes the unification rules between different asso_values[c],
   taking the alpha increments into account.  Returns a newly allocated
   table (owned by the caller) when option UPPERLOWER is set, or NULL for
   the identity mapping otherwise.  */
unsigned int *
Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
{
  if (option[UPPERLOWER])
    {
      /* Without alpha increments, we would simply unify
           'A' -> 'a', ..., 'Z' -> 'z'.
         But when a keyword contains at position i a character c,
         we have the constraint
           asso_values[tolower(c) + alpha_inc[i]] ==
           asso_values[toupper(c) + alpha_inc[i]].
         This introduces a unification
           toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
         Note that this unification can extend outside the range of
         ASCII letters!  But still every unified character pair is at
         a distance of 'a'-'A' = 32, or (after chained unification)
         at a multiple of 32.  So in the end the alpha_unify vector has
         the form  c -> c + 32 * f(c)  where f(c) is a nonnegative
         integer.  */
      unsigned int alpha_size = compute_alpha_size (alpha_inc);

      /* Start from the identity mapping.  */
      unsigned int *alpha_unify = new unsigned int[alpha_size];
      for (unsigned int c = 0; c < alpha_size; c++)
        alpha_unify[c] = c;

      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
        {
          KeywordExt *keyword = temp->first();

          /* Iterate through the selected character positions.  */
          PositionIterator iter = positions.iterator(keyword->_allchars_length);

          for (int i; (i = iter.next ()) != PositionIterator::EOS; )
            {
              /* Fetch the byte at position i (LASTCHAR denotes the last
                 byte of the keyword).  */
              unsigned int c;
              if (i == Positions::LASTCHAR)
                c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
              else if (i < keyword->_allchars_length)
                c = static_cast<unsigned char>(keyword->_allchars[i]);
              else
                abort ();
              if (c >= 'A' && c <= 'Z')
                c += 'a' - 'A';
              if (c >= 'a' && c <= 'z')
                {
                  /* Alpha increments apply only to real positions, not to
                     the LASTCHAR pseudo-position.  */
                  if (i != Positions::LASTCHAR)
                    c += alpha_inc[i];
                  /* Unify c with c - ('a'-'A').  Walk down the existing
                     unification chain in steps of 32 and redirect every
                     entry that pointed at b to c's representative d, so
                     chained unifications stay consistent.  */
                  unsigned int d = alpha_unify[c];
                  unsigned int b = c - ('a'-'A');
                  for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
                    alpha_unify[a] = d;
                }
            }
        }
      return alpha_unify;
    }
  else
    /* Identity mapping.  */
    return NULL;
}
578 | |
579 | /* Initializes each keyword's _selchars array. */ |
580 | void |
581 | Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const |
582 | { |
583 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
584 | temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc); |
585 | } |
586 | |
587 | /* Count the duplicate keywords that occur with the given set of positions |
588 | and a given alpha_inc[] array. |
589 | In other words, it returns the difference |
590 | # K - # proj2 (proj1 (K)) |
591 | where K is the multiset of given keywords. */ |
592 | unsigned int |
593 | Search::count_duplicates_multiset (const unsigned int *alpha_inc) const |
594 | { |
595 | /* Run through the keyword list and count the duplicates incrementally. |
596 | The result does not depend on the order of the keyword list, thanks to |
597 | the formula above. */ |
598 | unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); |
599 | init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); |
600 | |
601 | unsigned int count = 0; |
602 | { |
603 | Hash_Table representatives (_total_keys, !_hash_includes_len); |
604 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
605 | { |
606 | KeywordExt *keyword = temp->first(); |
607 | if (representatives.insert (keyword)) |
608 | count++; |
609 | } |
610 | } |
611 | |
612 | delete_selchars (); |
613 | delete[] alpha_unify; |
614 | |
615 | return count; |
616 | } |
617 | |
618 | /* Find good _alpha_inc[]. */ |
619 | |
/* Finds good _alpha_inc[] values (step 2 of the algorithm described at
   the top of this file) and stores the results in _alpha_inc,
   _alpha_size and _alpha_unify.  */
void
Search::find_alpha_inc ()
{
  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
     artificial duplicates.
     In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
  unsigned int duplicates_goal = count_duplicates_tuple ();

  /* Start with zero increments.  This is sufficient in most cases.  */
  unsigned int *current = new unsigned int [_max_key_len];
  for (int i = 0; i < _max_key_len; i++)
    current[i] = 0;
  unsigned int current_duplicates_count = count_duplicates_multiset (current);

  if (current_duplicates_count > duplicates_goal)
    {
      /* Look which _alpha_inc[i] we are free to increment: only the
         selected positions other than LASTCHAR can carry an increment.  */
      unsigned int nindices;
      {
        nindices = 0;
        PositionIterator iter = _key_positions.iterator(_max_key_len);
        for (;;)
          {
            int key_pos = iter.next ();
            if (key_pos == PositionIterator::EOS)
              break;
            if (key_pos != Positions::LASTCHAR)
              nindices++;
          }
      }

      /* Collect those incrementable positions into indices[].  */
      DYNAMIC_ARRAY (indices, unsigned int, nindices);
      {
        unsigned int j = 0;
        PositionIterator iter = _key_positions.iterator(_max_key_len);
        for (;;)
          {
            int key_pos = iter.next ();
            if (key_pos == PositionIterator::EOS)
              break;
            if (key_pos != Positions::LASTCHAR)
              indices[j++] = key_pos;
          }
        /* Both passes over the iterator must agree on the count.  */
        if (!(j == nindices))
          abort ();
      }

      /* Perform several rounds of searching for a good alpha increment.
         Each round reduces the number of artificial collisions by adding
         an increment in a single key position.  */
      DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
      DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
      do
        {
          /* An increment of 1 is not always enough.  Try higher increments
             also.  */
          for (unsigned int inc = 1; ; inc++)
            {
              unsigned int best_duplicates_count = UINT_MAX;

              /* Try adding 'inc' at each candidate position in turn.  */
              for (unsigned int j = 0; j < nindices; j++)
                {
                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
                  tryal[indices[j]] += inc;
                  unsigned int try_duplicates_count =
                    count_duplicates_multiset (tryal);

                  /* We prefer 'try' to 'best' if it produces less
                     duplicates.  */
                  if (try_duplicates_count < best_duplicates_count)
                    {
                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
                      best_duplicates_count = try_duplicates_count;
                    }
                }

              /* Stop this round when we got an improvement.  */
              if (best_duplicates_count < current_duplicates_count)
                {
                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
                  current_duplicates_count = best_duplicates_count;
                  break;
                }
            }
        }
      while (current_duplicates_count > duplicates_goal);
      FREE_DYNAMIC_ARRAY (tryal);
      FREE_DYNAMIC_ARRAY (best);

      if (option[DEBUG])
        {
          /* Print the nonzero increments, lowest position first.  */
          fprintf (stderr, "\nComputed alpha increments: ");
          bool first = true;
          for (unsigned int j = nindices; j-- > 0; )
            if (current[indices[j]] != 0)
              {
                if (!first)
                  fprintf (stderr, ", ");
                fprintf (stderr, "%u:+%u",
                         indices[j] + 1, current[indices[j]]);
                first = false;
              }
          fprintf (stderr, "\n");
        }
      FREE_DYNAMIC_ARRAY (indices);
    }

  _alpha_inc = current;
  _alpha_size = compute_alpha_size (_alpha_inc);
  _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
}
732 | |
733 | /* ======================= Finding good asso_values ======================== */ |
734 | |
735 | /* Initializes the asso_values[] related parameters. */ |
736 | |
/* Initializes the asso_values[] related parameters: collapses keywords
   with identical selchars into equivalence classes, computes character
   occurrence counts, chooses _asso_value_max and _max_hash_value, and
   sets up the collision detector and the random seed.  */
void
Search::prepare_asso_values ()
{
  KeywordExt_List *temp;

  /* Initialize each keyword's _selchars array.  */
  init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);

  /* Compute the maximum _selchars_length over all keywords.  */
  _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();

  /* Check for duplicates, i.e. keywords with the same _selchars array
     (and - if _hash_includes_len - also the same length).
     We deal with these by building an equivalence class, so that only
     1 keyword is representative of the entire collection.  Only this
     representative remains in the keyword list; the others are accessible
     through the _duplicate_link chain, starting at the representative.
     This *greatly* simplifies processing during later stages of the program.
     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
  {
    _list_len = _total_keys;
    _total_duplicates = 0;
    /* Make hash table for efficiency.  */
    Hash_Table representatives (_list_len, !_hash_includes_len);

    KeywordExt_List *prev = NULL; /* list node before temp */
    for (temp = _head; temp; )
      {
        KeywordExt *keyword = temp->first();
        KeywordExt *other_keyword = representatives.insert (keyword);
        KeywordExt_List *garbage = NULL;

        if (other_keyword)
          {
            _total_duplicates++;
            _list_len--;
            /* Remove keyword from the main list.  (prev is non-NULL here:
               the first element cannot be a duplicate, since the table is
               empty when it is inserted.)  */
            prev->rest() = temp->rest();
            garbage = temp;
            /* And insert it on other_keyword's duplicate list.  */
            keyword->_duplicate_link = other_keyword->_duplicate_link;
            other_keyword->_duplicate_link = keyword;

            /* Complain if user hasn't enabled the duplicate option.  */
            if (!option[DUP] || option[DEBUG])
              {
                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
                         keyword->_allchars_length, keyword->_allchars,
                         other_keyword->_allchars_length, other_keyword->_allchars);
                for (int j = 0; j < keyword->_selchars_length; j++)
                  putc (keyword->_selchars[j], stderr);
                fprintf (stderr, "\".\n");
              }
          }
        else
          {
            keyword->_duplicate_link = NULL;
            prev = temp;
          }
        temp = temp->rest();
        /* The removed node can only be freed after temp has advanced
           past it.  */
        if (garbage)
          delete garbage;
      }
    if (option[DEBUG])
      representatives.dump();
  }

  /* Exit program if duplicates exists and option[DUP] not set, since we
     don't want to continue in this case.  (We don't want to turn on
     option[DUP] implicitly, because the generated code is usually much
     slower.)  */
  if (_total_duplicates)
    {
      if (option[DUP])
        fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
                 _total_duplicates);
      else
        {
          fprintf (stderr, "%d input keys have identical hash values,\n",
                   _total_duplicates);
          if (option[POSITIONS])
            fprintf (stderr, "try different key positions or use option -D.\n");
          else
            fprintf (stderr, "use option -D.\n");
          exit (1);
        }
    }

  /* Compute the occurrences of each character in the alphabet.  */
  _occurrences = new int[_alpha_size];
  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
  for (temp = _head; temp; temp = temp->rest())
    {
      KeywordExt *keyword = temp->first();
      const unsigned int *ptr = keyword->_selchars;
      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
        _occurrences[*ptr]++;
    }

  /* Memory allocation.  */
  _asso_values = new int[_alpha_size];

  int non_linked_length = _list_len;
  unsigned int asso_value_max;

  asso_value_max =
    static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
  /* Round up to the next power of two.  This makes it easy to ensure
     an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
     being odd, it guarantees that Search::try_asso_value() will iterate
     through different values for _asso_value[c].  */
  if (asso_value_max == 0)
    asso_value_max = 1;
  /* Smear the highest set bit into every lower bit position, then add 1.  */
  asso_value_max |= asso_value_max >> 1;
  asso_value_max |= asso_value_max >> 2;
  asso_value_max |= asso_value_max >> 4;
  asso_value_max |= asso_value_max >> 8;
  asso_value_max |= asso_value_max >> 16;
  asso_value_max++;
  _asso_value_max = asso_value_max;

  /* Given the bound for _asso_values[c], we have a bound for the possible
     hash values, as computed in compute_hash().  */
  _max_hash_value = (_hash_includes_len ? _max_key_len : 0)
                    + (_asso_value_max - 1) * _max_selchars_length;
  /* Allocate a sparse bit vector for detection of collisions of hash
     values.  */
  _collision_detector = new Bool_Array (_max_hash_value + 1);

  if (option[DEBUG])
    {
      fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
               "\nmaximum size of generated hash table is %d\n",
               non_linked_length, asso_value_max, _max_hash_value);

      int field_width;

      /* Width of the widest keysig, for column alignment below.  */
      field_width = 0;
      {
        for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
          {
            KeywordExt *keyword = temp->first();
            if (field_width < keyword->_selchars_length)
              field_width = keyword->_selchars_length;
          }
      }

      fprintf (stderr, "\ndumping the keyword list without duplicates\n");
      fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
      int i = 0;
      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
        {
          KeywordExt *keyword = temp->first();
          fprintf (stderr, "%9d, ", ++i);
          if (field_width > keyword->_selchars_length)
            fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
          for (int j = 0; j < keyword->_selchars_length; j++)
            putc (keyword->_selchars[j], stderr);
          fprintf (stderr, ", %.*s\n",
                   keyword->_allchars_length, keyword->_allchars);
        }
      fprintf (stderr, "\nend of keyword list\n\n");
    }

  if (option[RANDOM] || option.get_jump () == 0)
    /* We will use rand(), so initialize the random number generator.  */
    srand (static_cast<long>(time (0)));

  _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
  _jump = option.get_jump ();
}
908 | |
909 | /* Finds some _asso_values[] that fit. */ |
910 | |
911 | /* The idea is to choose the _asso_values[] one by one, in a way that |
912 | a choice that has been made never needs to be undone later. This |
913 | means that we split the work into several steps. Each step chooses |
914 | one or more _asso_values[c]. The result of choosing one or more |
915 | _asso_values[c] is that the partitioning of the keyword set gets |
916 | broader. |
917 | Look at this partitioning: After every step, the _asso_values[] of a |
918 | certain set C of characters are undetermined. (At the beginning, C |
919 | is the set of characters c with _occurrences[c] > 0. At the end, C |
920 | is empty.) To each keyword K, we associate the multiset of _selchars |
921 | for which the _asso_values[] are undetermined: |
922 | K --> K->_selchars intersect C. |
923 | Consider two keywords equivalent if their value under this mapping is |
924 | the same. This introduces an equivalence relation on the set of |
925 | keywords. The equivalence classes partition the keyword set. (At the |
926 | beginning, the partition is the finest possible: each K is an equivalence |
927 | class by itself, because all K have a different _selchars. At the end, |
928 | all K have been merged into a single equivalence class.) |
929 | The partition before a step is always a refinement of the partition |
930 | after the step. |
931 | We choose the steps in such a way that the partition really becomes |
932 | broader at each step. (A step that only chooses an _asso_values[c] |
933 | without changing the partition is better merged with the previous step, |
934 | to avoid useless backtracking.) */ |
935 | |
/* One class of the keyword partition: the set of keywords that currently
   have the same multiset of undetermined selected characters (see the
   partitioning description above).  */
struct EquivalenceClass
{
  /* The keywords in this equivalence class.  */
  KeywordExt_List * _keywords;
  /* Tail of the _keywords list, for constant-time append.  */
  KeywordExt_List * _keywords_last;
  /* The number of keywords in this equivalence class. */
  unsigned int _cardinality;
  /* The undetermined selected characters for the keywords in this
     equivalence class, as a canonically reordered multiset. */
  unsigned int * _undetermined_chars;
  unsigned int _undetermined_chars_length;

  /* The next class in the partition's linked list, or NULL.  */
  EquivalenceClass * _next;
};
950 | |
/* One step of the search.  Each step determines the _asso_values[] of a
   set of characters, given the values chosen in the previous steps.  */
struct Step
{
  /* The characters whose values are being determined in this step. */
  unsigned int _changing_count;
  unsigned int * _changing;
  /* Exclusive upper bound for the _asso_values[c] of this step.
     A power of 2. */
  unsigned int _asso_value_max;
  /* The characters whose values will be determined after this step. */
  bool * _undetermined;
  /* The keyword set partition after this step. */
  EquivalenceClass * _partition;
  /* The expected number of iterations in this step. */
  double _expected_lower;
  double _expected_upper;

  /* The next step in the list (execution order).  */
  Step * _next;
};
969 | |
/* Compares the first LEN elements of two arrays of unsigned ints.
   Returns true if they are elementwise equal.  */
static inline bool
equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
{
  for (unsigned int i = 0; i < len; i++)
    if (ptr1[i] != ptr2[i])
      return false;
  return true;
}
983 | |
984 | EquivalenceClass * |
985 | Search::compute_partition (bool *undetermined) const |
986 | { |
987 | EquivalenceClass *partition = NULL; |
988 | EquivalenceClass *partition_last = NULL; |
989 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
990 | { |
991 | KeywordExt *keyword = temp->first(); |
992 | |
993 | /* Compute the undetermined characters for this keyword. */ |
994 | unsigned int *undetermined_chars = |
995 | new unsigned int[keyword->_selchars_length]; |
996 | unsigned int undetermined_chars_length = 0; |
997 | |
998 | for (int i = 0; i < keyword->_selchars_length; i++) |
999 | if (undetermined[keyword->_selchars[i]]) |
1000 | undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; |
1001 | |
1002 | /* Look up the equivalence class to which this keyword belongs. */ |
1003 | EquivalenceClass *equclass; |
1004 | for (equclass = partition; equclass; equclass = equclass->_next) |
1005 | if (equclass->_undetermined_chars_length == undetermined_chars_length |
1006 | && equals (equclass->_undetermined_chars, undetermined_chars, |
1007 | undetermined_chars_length)) |
1008 | break; |
1009 | if (equclass == NULL) |
1010 | { |
1011 | equclass = new EquivalenceClass(); |
1012 | equclass->_keywords = NULL; |
1013 | equclass->_keywords_last = NULL; |
1014 | equclass->_cardinality = 0; |
1015 | equclass->_undetermined_chars = undetermined_chars; |
1016 | equclass->_undetermined_chars_length = undetermined_chars_length; |
1017 | equclass->_next = NULL; |
1018 | if (partition) |
1019 | partition_last->_next = equclass; |
1020 | else |
1021 | partition = equclass; |
1022 | partition_last = equclass; |
1023 | } |
1024 | else |
1025 | delete[] undetermined_chars; |
1026 | |
1027 | /* Add the keyword to the equivalence class. */ |
1028 | KeywordExt_List *cons = new KeywordExt_List(keyword); |
1029 | if (equclass->_keywords) |
1030 | equclass->_keywords_last->rest() = cons; |
1031 | else |
1032 | equclass->_keywords = cons; |
1033 | equclass->_keywords_last = cons; |
1034 | equclass->_cardinality++; |
1035 | } |
1036 | |
1037 | /* Free some of the allocated memory. The caller doesn't need it. */ |
1038 | for (EquivalenceClass *cls = partition; cls; cls = cls->_next) |
1039 | delete[] cls->_undetermined_chars; |
1040 | |
1041 | return partition; |
1042 | } |
1043 | |
1044 | static void |
1045 | delete_partition (EquivalenceClass *partition) |
1046 | { |
1047 | while (partition != NULL) |
1048 | { |
1049 | EquivalenceClass *equclass = partition; |
1050 | partition = equclass->_next; |
1051 | delete_list (equclass->_keywords); |
1052 | //delete[] equclass->_undetermined_chars; // already freed above |
1053 | delete equclass; |
1054 | } |
1055 | } |
1056 | |
1057 | /* Compute the possible number of collisions when _asso_values[c] is |
1058 | chosen, leading to the given partition. */ |
1059 | unsigned int |
1060 | Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const |
1061 | { |
1062 | /* Every equivalence class p is split according to the frequency of |
1063 | occurrence of c, leading to equivalence classes p1, p2, ... |
1064 | This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions. |
1065 | Return the sum of this expression over all equivalence classes. */ |
1066 | unsigned int sum = 0; |
1067 | unsigned int m = _max_selchars_length; |
1068 | DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1); |
1069 | for (EquivalenceClass *cls = partition; cls; cls = cls->_next) |
1070 | { |
1071 | for (unsigned int i = 0; i <= m; i++) |
1072 | split_cardinalities[i] = 0; |
1073 | |
1074 | for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) |
1075 | { |
1076 | KeywordExt *keyword = temp->first(); |
1077 | |
1078 | unsigned int count = 0; |
1079 | for (int i = 0; i < keyword->_selchars_length; i++) |
1080 | if (keyword->_selchars[i] == c) |
1081 | count++; |
1082 | |
1083 | split_cardinalities[count]++; |
1084 | } |
1085 | |
1086 | sum += cls->_cardinality * cls->_cardinality; |
1087 | for (unsigned int i = 0; i <= m; i++) |
1088 | sum -= split_cardinalities[i] * split_cardinalities[i]; |
1089 | } |
1090 | FREE_DYNAMIC_ARRAY (split_cardinalities); |
1091 | return sum; |
1092 | } |
1093 | |
1094 | /* Test whether adding c to the undetermined characters changes the given |
1095 | partition. */ |
1096 | bool |
1097 | Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const |
1098 | { |
1099 | for (EquivalenceClass *cls = partition; cls; cls = cls->_next) |
1100 | { |
1101 | unsigned int first_count = UINT_MAX; |
1102 | |
1103 | for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) |
1104 | { |
1105 | KeywordExt *keyword = temp->first(); |
1106 | |
1107 | unsigned int count = 0; |
1108 | for (int i = 0; i < keyword->_selchars_length; i++) |
1109 | if (keyword->_selchars[i] == c) |
1110 | count++; |
1111 | |
1112 | if (temp == cls->_keywords) |
1113 | first_count = count; |
1114 | else if (count != first_count) |
1115 | /* c would split this equivalence class. */ |
1116 | return false; |
1117 | } |
1118 | } |
1119 | return true; |
1120 | } |
1121 | |
/* Finds some _asso_values[] that fit, by first splitting the problem into
   steps (see the explanation above) and then solving each step through
   systematic or random trial of candidate values.  */
void
Search::find_asso_values ()
{
  Step *steps;

  /* Determine the steps, starting with the last one. */
  {
    bool *undetermined;
    bool *determined;

    steps = NULL;

    undetermined = new bool[_alpha_size];
    for (unsigned int c = 0; c < _alpha_size; c++)
      undetermined[c] = false;

    determined = new bool[_alpha_size];
    for (unsigned int c = 0; c < _alpha_size; c++)
      determined[c] = true;

    for (;;)
      {
        /* Compute the partition that needs to be refined. */
        EquivalenceClass *partition = compute_partition (undetermined);

        /* Determine the main character to be chosen in this step.
           Choosing such a character c has the effect of splitting every
           equivalence class (according to the frequency of occurrence of c).
           We choose the c with the minimum number of possible collisions,
           so that characters which lead to a large number of collisions get
           handled early during the search. */
        unsigned int chosen_c;
        unsigned int chosen_possible_collisions;
        {
          unsigned int best_c = 0;
          unsigned int best_possible_collisions = UINT_MAX;
          for (unsigned int c = 0; c < _alpha_size; c++)
            if (_occurrences[c] > 0 && determined[c])
              {
                unsigned int possible_collisions =
                  count_possible_collisions (partition, c);
                if (possible_collisions < best_possible_collisions)
                  {
                    best_c = c;
                    best_possible_collisions = possible_collisions;
                  }
              }
          if (best_possible_collisions == UINT_MAX)
            {
              /* All c with _occurrences[c] > 0 are undetermined.  We are
                 at the starting situation and don't need any more steps. */
              delete_partition (partition);
              break;
            }
          chosen_c = best_c;
          chosen_possible_collisions = best_possible_collisions;
        }

        /* We need one more step. */
        Step *step = new Step();

        step->_undetermined = new bool[_alpha_size];
        memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));

        step->_partition = partition;

        /* Now determine how the equivalence classes will be before this
           step. */
        undetermined[chosen_c] = true;
        partition = compute_partition (undetermined);

        /* Now determine which other characters should be determined in this
           step, because they will not change the equivalence classes at
           this point. It is the set of all c which, for all equivalence
           classes, have the same frequency of occurrence in every keyword
           of the equivalence class. */
        for (unsigned int c = 0; c < _alpha_size; c++)
          if (_occurrences[c] > 0 && determined[c]
              && unchanged_partition (partition, c))
            {
              undetermined[c] = true;
              determined[c] = false;
            }

        /* chosen_c must be one of these. */
        if (determined[chosen_c])
          abort ();

        /* Now the set of changing characters of this step: those that
           became undetermined just above. */
        unsigned int changing_count;

        changing_count = 0;
        for (unsigned int c = 0; c < _alpha_size; c++)
          if (undetermined[c] && !step->_undetermined[c])
            changing_count++;

        unsigned int *changing = new unsigned int[changing_count];
        changing_count = 0;
        for (unsigned int c = 0; c < _alpha_size; c++)
          if (undetermined[c] && !step->_undetermined[c])
            changing[changing_count++] = c;

        step->_changing = changing;
        step->_changing_count = changing_count;

        step->_asso_value_max = _asso_value_max;

        /* Rough estimates of the number of iterations this step will
           need, used only for the debug output below. */
        step->_expected_lower =
          exp (static_cast<double>(chosen_possible_collisions)
               / static_cast<double>(_max_hash_value));
        step->_expected_upper =
          exp (static_cast<double>(chosen_possible_collisions)
               / static_cast<double>(_asso_value_max));

        delete_partition (partition);

        /* Prepend: steps are determined last-to-first, so the head of
           the list is the first step to execute. */
        step->_next = steps;
        steps = step;
      }

    delete[] determined;
    delete[] undetermined;
  }

  if (option[DEBUG])
    {
      unsigned int stepno = 0;
      for (Step *step = steps; step; step = step->_next)
        {
          stepno++;
          fprintf (stderr, "Step %u chooses _asso_values[" , stepno);
          for (unsigned int i = 0; i < step->_changing_count; i++)
            {
              if (i > 0)
                fprintf (stderr, "," );
              fprintf (stderr, "'%c'" , step->_changing[i]);
            }
          fprintf (stderr, "], expected number of iterations between %g and %g.\n" ,
                   step->_expected_lower, step->_expected_upper);
          fprintf (stderr, "Keyword equivalence classes:\n" );
          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
            {
              fprintf (stderr, "\n" );
              for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
                {
                  KeywordExt *keyword = temp->first();
                  fprintf (stderr, "  %.*s\n" ,
                           keyword->_allchars_length, keyword->_allchars);
                }
            }
          fprintf (stderr, "\n" );
        }
    }

  /* Initialize _asso_values[]. (The value given here matters only
     for those c which occur in all keywords with equal multiplicity.) */
  for (unsigned int c = 0; c < _alpha_size; c++)
    _asso_values[c] = 0;

  /* Execute the steps in order, choosing the _asso_values[] of each
     step's changing characters so that no collisions occur inside any
     equivalence class of the step's partition. */
  unsigned int stepno = 0;
  for (Step *step = steps; step; step = step->_next)
    {
      stepno++;

      /* Initialize the asso_values[]. */
      unsigned int k = step->_changing_count;
      for (unsigned int i = 0; i < k; i++)
        {
          unsigned int c = step->_changing[i];
          _asso_values[c] =
            (_initial_asso_value < 0 ? rand () : _initial_asso_value)
            & (step->_asso_value_max - 1);
        }

      unsigned int iterations = 0;
      /* iter[i] is the number of jumps applied to _asso_values[_changing[i]]
         relative to its initial value.  */
      DYNAMIC_ARRAY (iter, unsigned int, k);
      for (unsigned int i = 0; i < k; i++)
        iter[i] = 0;
      unsigned int ii = (_jump != 0 ? k - 1 : 0);

      for (;;)
        {
          /* Test whether these asso_values[] lead to collisions among
             the equivalence classes that should be collision-free. */
          bool has_collision = false;
          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
            {
              /* Iteration Number array is a win, O(1) initialization time! */
              _collision_detector->clear ();

              for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
                {
                  KeywordExt *keyword = ptr->first();

                  /* Compute the new hash code for the keyword, leaving apart
                     the yet undetermined asso_values[]. */
                  int hashcode;
                  {
                    int sum = _hash_includes_len ? keyword->_allchars_length : 0;
                    const unsigned int *p = keyword->_selchars;
                    int i = keyword->_selchars_length;
                    for (; i > 0; p++, i--)
                      if (!step->_undetermined[*p])
                        sum += _asso_values[*p];
                    hashcode = sum;
                  }

                  /* See whether it collides with another keyword's hash code,
                     from the same equivalence class. */
                  if (_collision_detector->set_bit (hashcode))
                    {
                      has_collision = true;
                      break;
                    }
                }

              /* Don't need to continue looking at the other equivalence
                 classes if we already have found a collision. */
              if (has_collision)
                break;
            }

          iterations++;
          if (!has_collision)
            break;

          /* Try other asso_values[]. */
          if (_jump != 0)
            {
              /* The way we try various values for
                   asso_values[step->_changing[0],...step->_changing[k-1]]
                 is like this:
                 for (bound = 0,1,...)
                   for (ii = 0,...,k-1)
                     iter[ii] := bound
                     iter[0..ii-1] := values <= bound
                     iter[ii+1..k-1] := values < bound
                 and
                   asso_values[step->_changing[i]] =
                     _initial_asso_value + iter[i] * _jump.
                 This makes it more likely to find small asso_values[].
               */
              unsigned int bound = iter[ii];
              unsigned int i = 0;
              /* Odometer-style increment of iter[0..ii-1], values <= bound. */
              while (i < ii)
                {
                  unsigned int c = step->_changing[i];
                  iter[i]++;
                  _asso_values[c] =
                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
                  if (iter[i] <= bound)
                    goto found_next;
                  /* Overflowed: reset this digit and carry to the next. */
                  _asso_values[c] =
                    (_asso_values[c] - iter[i] * _jump)
                    & (step->_asso_value_max - 1);
                  iter[i] = 0;
                  i++;
                }
              i = ii + 1;
              /* Same for iter[ii+1..k-1], but with values < bound. */
              while (i < k)
                {
                  unsigned int c = step->_changing[i];
                  iter[i]++;
                  _asso_values[c] =
                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
                  if (iter[i] < bound)
                    goto found_next;
                  _asso_values[c] =
                    (_asso_values[c] - iter[i] * _jump)
                    & (step->_asso_value_max - 1);
                  iter[i] = 0;
                  i++;
                }
              /* Switch from one ii to the next. */
              {
                unsigned int c = step->_changing[ii];
                _asso_values[c] =
                  (_asso_values[c] - bound * _jump)
                  & (step->_asso_value_max - 1);
                iter[ii] = 0;
              }
              /* Here all iter[i] == 0. */
              ii++;
              if (ii == k)
                {
                  ii = 0;
                  bound++;
                  if (bound == step->_asso_value_max)
                    {
                      /* Out of search space!  We can either backtrack, or
                         increase the available search space of this step.
                         It seems simpler to choose the latter solution. */
                      step->_asso_value_max = 2 * step->_asso_value_max;
                      if (step->_asso_value_max > _asso_value_max)
                        {
                          _asso_value_max = step->_asso_value_max;
                          /* Reinitialize _max_hash_value. */
                          _max_hash_value =
                            (_hash_includes_len ? _max_key_len : 0)
                            + (_asso_value_max - 1) * _max_selchars_length;
                          /* Reinitialize _collision_detector. */
                          delete _collision_detector;
                          _collision_detector =
                            new Bool_Array (_max_hash_value + 1);
                        }
                    }
                }
              {
                unsigned int c = step->_changing[ii];
                iter[ii] = bound;
                _asso_values[c] =
                  (_asso_values[c] + bound * _jump)
                  & (step->_asso_value_max - 1);
              }
             found_next: ;
            }
          else
            {
              /* Random. */
              unsigned int c = step->_changing[ii];
              _asso_values[c] =
                (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
              /* Next time, change the next c. */
              ii++;
              if (ii == k)
                ii = 0;
            }
        }
      FREE_DYNAMIC_ARRAY (iter);

      if (option[DEBUG])
        {
          fprintf (stderr, "Step %u chose _asso_values[" , stepno);
          for (unsigned int i = 0; i < step->_changing_count; i++)
            {
              if (i > 0)
                fprintf (stderr, "," );
              fprintf (stderr, "'%c'" , step->_changing[i]);
            }
          fprintf (stderr, "] in %u iterations.\n" , iterations);
        }
    }

  /* Free allocated memory. */
  while (steps != NULL)
    {
      Step *step = steps;
      steps = step->_next;
      delete[] step->_changing;
      delete[] step->_undetermined;
      delete_partition (step->_partition);
      delete step;
    }
}
1476 | |
1477 | /* Computes a keyword's hash value, relative to the current _asso_values[], |
1478 | and stores it in keyword->_hash_value. */ |
1479 | |
1480 | inline int |
1481 | Search::compute_hash (KeywordExt *keyword) const |
1482 | { |
1483 | int sum = _hash_includes_len ? keyword->_allchars_length : 0; |
1484 | |
1485 | const unsigned int *p = keyword->_selchars; |
1486 | int i = keyword->_selchars_length; |
1487 | for (; i > 0; p++, i--) |
1488 | sum += _asso_values[*p]; |
1489 | |
1490 | return keyword->_hash_value = sum; |
1491 | } |
1492 | |
/* Finds good _asso_values[], either with a single search run or — when
   the user asked for several iterations — by trying multiple
   (_initial_asso_value, _jump) pairs and keeping the best result. */

void
Search::find_good_asso_values ()
{
  prepare_asso_values ();

  /* Search for good _asso_values[]. */
  int asso_iteration;
  if ((asso_iteration = option.get_asso_iterations ()) == 0)
    /* Try only the given _initial_asso_value and _jump. */
    find_asso_values ();
  else
    {
      /* Try different pairs of _initial_asso_value and _jump, in the
         following order:
           (0, 1)
           (1, 1)
           (2, 1) (0, 3)
           (3, 1) (1, 3)
           (4, 1) (2, 3) (0, 5)
           (5, 1) (3, 3) (1, 5)
           ..... */
      KeywordExt_List *saved_head = _head;
      int best_initial_asso_value = 0;
      int best_jump = 1;
      int *best_asso_values = new int[_alpha_size];
      int best_collisions = INT_MAX;
      int best_max_hash_value = INT_MAX;

      _initial_asso_value = 0; _jump = 1;
      for (;;)
        {
          /* Restore the keyword list in its original order. */
          _head = copy_list (saved_head);
          /* Find good _asso_values[]. */
          find_asso_values ();
          /* Test whether it is the best solution so far: fewest
             collisions, then smallest maximum hash value. */
          int collisions = 0;
          int max_hash_value = INT_MIN;
          _collision_detector->clear ();
          for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
            {
              KeywordExt *keyword = ptr->first();
              int hashcode = compute_hash (keyword);
              if (max_hash_value < hashcode)
                max_hash_value = hashcode;
              if (_collision_detector->set_bit (hashcode))
                collisions++;
            }
          if (collisions < best_collisions
              || (collisions == best_collisions
                  && max_hash_value < best_max_hash_value))
            {
              memcpy (best_asso_values, _asso_values,
                      _alpha_size * sizeof (_asso_values[0]));
              best_collisions = collisions;
              best_max_hash_value = max_hash_value;
            }
          /* Delete the copied keyword list. */
          delete_list (_head);

          if (--asso_iteration == 0)
            break;
          /* Prepare for next iteration. */
          if (_initial_asso_value >= 2)
            _initial_asso_value -= 2, _jump += 2;
          else
            _initial_asso_value += _jump, _jump = 1;
        }
      _head = saved_head;
      /* Install the best found asso_values.
         NOTE(review): best_initial_asso_value and best_jump are never
         updated in the loop above, so _initial_asso_value/_jump are reset
         to (0, 1) here.  This looks harmless since the winning
         _asso_values[] themselves are installed below — confirm that no
         later reader depends on these two fields. */
      _initial_asso_value = best_initial_asso_value;
      _jump = best_jump;
      memcpy (_asso_values, best_asso_values,
              _alpha_size * sizeof (_asso_values[0]));
      delete[] best_asso_values;
      /* The keywords' _hash_value fields are recomputed below. */
    }
}
1573 | |
1574 | /* ========================================================================= */ |
1575 | |
1576 | /* Comparison function for sorting by increasing _hash_value. */ |
1577 | static bool |
1578 | less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2) |
1579 | { |
1580 | return keyword1->_hash_value < keyword2->_hash_value; |
1581 | } |
1582 | |
/* Sorts the keyword list by hash value. */

void
Search::sort ()
{
  /* Merge-sort the linked list in order of increasing _hash_value,
     using the comparison function above. */
  _head = mergesort_list (_head, less_by_hash_value);
}
1590 | |
/* Runs the whole search: byte positions, alpha increments, asso_values,
   then a final sanity check, sorting, and post-processing of the
   _asso_values[] table.  */
void
Search::optimize ()
{
  /* Preparations. */
  prepare ();

  /* Step 1: Finding good byte positions. */
  find_positions ();

  /* Step 2: Finding good alpha increments. */
  find_alpha_inc ();

  /* Step 3: Finding good asso_values. */
  find_good_asso_values ();

  /* Make one final check, just to make sure nothing weird happened.... */
  _collision_detector->clear ();
  for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
    {
      KeywordExt *curr = curr_ptr->first();
      unsigned int hashcode = compute_hash (curr);
      if (_collision_detector->set_bit (hashcode))
        {
          /* This shouldn't happen. proj1, proj2, proj3 must have been
             computed to be injective on the given keyword set. */
          fprintf (stderr,
                   "\nInternal error, unexpected duplicate hash code\n" );
          if (option[POSITIONS])
            fprintf (stderr, "try options -m or -r, or use new key positions.\n\n" );
          else
            fprintf (stderr, "try options -m or -r.\n\n" );
          exit (1);
        }
    }

  /* Sorts the keyword list by hash value. */
  sort ();

  /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely
     necessary, but speeds up the lookup function in many cases of lookup
     failure: no string comparison is needed once the hash value of a string
     is larger than the hash value of any keyword. */
  int max_hash_value;
  {
    /* The list is now sorted by hash value, so the maximum is at the
       tail. */
    KeywordExt_List *temp;
    for (temp = _head; temp->rest(); temp = temp->rest())
      ;
    max_hash_value = temp->first()->_hash_value;
  }
  for (unsigned int c = 0; c < _alpha_size; c++)
    if (_occurrences[c] == 0)
      _asso_values[c] = max_hash_value + 1;

  /* Propagate unified asso_values: a character that was unified onto
     another one gets that character's associated value. */
  if (_alpha_unify)
    {
      for (unsigned int c = 0; c < _alpha_size; c++)
        if (_alpha_unify[c] != c)
          _asso_values[c] = _asso_values[_alpha_unify[c]];
    }
}
1652 | |
/* Destructor.  Prints out some diagnostics upon completion (in debug
   mode) and releases the arrays owned by the Search object. */

Search::~Search ()
{
  delete _collision_detector;
  if (option[DEBUG])
    {
      fprintf (stderr, "\ndumping occurrence and associated values tables\n" );

      /* Dump asso_values[] and occurrences[] for every character that
         actually occurs in some keyword. */
      for (unsigned int i = 0; i < _alpha_size; i++)
        if (_occurrences[i])
          fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n" ,
                   i, _asso_values[i], i, _occurrences[i]);

      fprintf (stderr, "end table dumping\n" );

      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n" ,
               _list_len, _total_keys, _total_duplicates, _max_key_len);

      /* Dump each keyword with its hash value, length, index and
         selchars, right-aligned in a field wide enough for the longest
         selchars sequence. */
      int field_width = _max_selchars_length;
      fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n" ,
               field_width, "selchars" );
      for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
        {
          fprintf (stderr, "%11d,%11d,%6d, " ,
                   ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
          if (field_width > ptr->first()->_selchars_length)
            fprintf (stderr, "%*s" , field_width - ptr->first()->_selchars_length, "" );
          for (int j = 0; j < ptr->first()->_selchars_length; j++)
            putc (ptr->first()->_selchars[j], stderr);
          fprintf (stderr, ", %.*s\n" ,
                   ptr->first()->_allchars_length, ptr->first()->_allchars);
        }

      fprintf (stderr, "End dumping list.\n\n" );
    }
  /* Release the arrays allocated during the search. */
  delete[] _asso_values;
  delete[] _occurrences;
  delete[] _alpha_unify;
  delete[] _alpha_inc;
}
1695 | |