1 | /* Search algorithm. |
2 | Copyright (C) 1989-1998, 2000, 2002, 2009, 2016 Free Software Foundation, Inc. |
3 | Written by Douglas C. Schmidt <schmidt@ics.uci.edu> |
4 | and Bruno Haible <bruno@clisp.org>. |
5 | |
6 | This file is part of GNU GPERF. |
7 | |
8 | This program is free software: you can redistribute it and/or modify |
9 | it under the terms of the GNU General Public License as published by |
10 | the Free Software Foundation; either version 3 of the License, or |
11 | (at your option) any later version. |
12 | |
13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU General Public License for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with this program. If not, see <http://www.gnu.org/licenses/>. */ |
20 | |
21 | /* Specification. */ |
22 | #include "search.h" |
23 | |
24 | #include <stdio.h> |
25 | #include <stdlib.h> /* declares exit(), rand(), srand() */ |
26 | #include <string.h> /* declares memset(), memcmp() */ |
27 | #include <time.h> /* declares time() */ |
28 | #include <math.h> /* declares exp() */ |
29 | #include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ |
30 | #include "options.h" |
31 | #include "hash-table.h" |
32 | #include "config.h" |
33 | |
34 | /* ============================== Portability ============================== */ |
35 | |
36 | /* Assume ISO C++ 'for' scoping rule. */ |
37 | #define for if (0) ; else for |
38 | |
39 | /* Dynamically allocated array with dynamic extent: |
40 | |
41 | Example: |
42 | DYNAMIC_ARRAY (my_array, int, n); |
43 | ... |
44 | FREE_DYNAMIC_ARRAY (my_array); |
45 | |
   Attention: depending on the implementation, my_array is either the array
   itself or a pointer to the array! Always use my_array only as an expression!
48 | */ |
49 | #if HAVE_DYNAMIC_ARRAY |
50 | #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size] |
51 | #define FREE_DYNAMIC_ARRAY(var) |
52 | #else |
53 | #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size] |
54 | #define FREE_DYNAMIC_ARRAY(var) delete[] var |
55 | #endif |
56 | |
57 | /* ================================ Theory ================================= */ |
58 | |
59 | /* The general form of the hash function is |
60 | |
61 | hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) |
62 | + len (keyword) |
63 | |
64 | where Pos is a set of byte positions, |
65 | each alpha_inc[i] is a nonnegative integer, |
66 | each asso_values[c] is a nonnegative integer, |
67 | len (keyword) is the keyword's length if _hash_includes_len, or 0 otherwise. |
68 | |
69 | Theorem 1: If all keywords are different, there is a set Pos such that |
70 | all tuples (keyword[i] : i in Pos) are different. |
71 | |
72 | Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there |
73 | are nonnegative integers alpha_inc[i] such that all multisets |
74 | {keyword[i] + alpha_inc[i] : i in Pos} are different. |
75 | |
76 | Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. |
77 | |
78 | Theorem 3: If all multisets selchars[keyword] are different, there are |
79 | nonnegative integers asso_values[c] such that all hash values |
80 | sum (asso_values[c] : c in selchars[keyword]) are different. |
81 | |
82 | Based on these three facts, we find the hash function in three steps: |
83 | |
84 | Step 1 (Finding good byte positions): |
85 | Find a set Pos, as small as possible, such that all tuples |
86 | (keyword[i] : i in Pos) are different. |
87 | |
88 | Step 2 (Finding good alpha increments): |
89 | Find nonnegative integers alpha_inc[i], as many of them as possible being |
90 | zero, and the others being as small as possible, such that all multisets |
91 | {keyword[i] + alpha_inc[i] : i in Pos} are different. |
92 | |
93 | Step 3 (Finding good asso_values): |
94 | Find asso_values[c] such that all hash (keyword) are different. |
95 | |
96 | In other words, each step finds a projection that is injective on the |
97 | given finite set: |
98 | proj1 : String --> Map (Pos --> N) |
99 | proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) |
100 | proj3 : Map (Pos --> N) / S(Pos) --> N |
101 | where |
102 | N denotes the set of nonnegative integers, |
103 | Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and |
104 | S(Pos) is the symmetric group over Pos. |
105 | |
106 | This was the theory for !_hash_includes_len; if _hash_includes_len, slight |
107 | modifications apply: |
108 | proj1 : String --> Map (Pos --> N) x N |
109 | proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N |
110 | proj3 : Map (Pos --> N) / S(Pos) x N --> N |
111 | |
112 | For a case-insensitive hash function, the general form is |
113 | |
114 | hash (keyword) = |
115 | sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) |
116 | + len (keyword) |
117 | |
118 | where alpha_unify[c] is chosen so that an upper/lower case change in |
119 | keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. |
120 | */ |
121 | |
122 | /* ==================== Initialization and Preparation ===================== */ |
123 | |
/* Constructs a Search instance over the given keyword list.
   The list is borrowed, not copied: _head just stores the pointer, and all
   later phases iterate over the same list nodes.  */
Search::Search (KeywordExt_List *list)
  : _head (list)
{
}
128 | |
129 | void |
130 | Search::prepare () |
131 | { |
132 | /* Compute the total number of keywords. */ |
133 | _total_keys = 0; |
134 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
135 | _total_keys++; |
136 | |
137 | /* Compute the minimum and maximum keyword length. */ |
138 | _max_key_len = INT_MIN; |
139 | _min_key_len = INT_MAX; |
140 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
141 | { |
142 | KeywordExt *keyword = temp->first(); |
143 | |
144 | if (_max_key_len < keyword->_allchars_length) |
145 | _max_key_len = keyword->_allchars_length; |
146 | if (_min_key_len > keyword->_allchars_length) |
147 | _min_key_len = keyword->_allchars_length; |
148 | } |
149 | |
150 | /* Exit program if an empty string is used as keyword, since the comparison |
151 | expressions don't work correctly for looking up an empty string. */ |
152 | if (_min_key_len == 0) |
153 | { |
154 | fprintf (stderr, "Empty input keyword is not allowed.\n" |
155 | "To recognize an empty input keyword, your code should check for\n" |
156 | "len == 0 before calling the gperf generated lookup function.\n" ); |
157 | exit (1); |
158 | } |
159 | |
160 | /* Exit program if the characters in the keywords are not in the required |
161 | range. */ |
162 | if (option[SEVENBIT]) |
163 | { |
164 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
165 | { |
166 | KeywordExt *keyword = temp->first(); |
167 | |
168 | const char *k = keyword->_allchars; |
169 | for (int i = keyword->_allchars_length; i > 0; k++, i--) |
170 | if (!(static_cast<unsigned char>(*k) < 128)) |
171 | { |
172 | fprintf (stderr, "Option --seven-bit has been specified,\n" |
173 | "but keyword \"%.*s\" contains non-ASCII characters.\n" |
174 | "Try removing option --seven-bit.\n" , |
175 | keyword->_allchars_length, keyword->_allchars); |
176 | exit (1); |
177 | } |
178 | } |
179 | } |
180 | |
181 | /* Determine whether the hash function shall include the length. */ |
182 | _hash_includes_len = !(option[NOLENGTH] || (_min_key_len == _max_key_len)); |
183 | } |
184 | |
185 | /* ====================== Finding good byte positions ====================== */ |
186 | |
187 | /* Computes the upper bound on the indices passed to asso_values[], |
188 | assuming no alpha_increments. */ |
189 | unsigned int |
190 | Search::compute_alpha_size () const |
191 | { |
192 | return (option[SEVENBIT] ? 128 : 256); |
193 | } |
194 | |
195 | /* Computes the unification rules between different asso_values[c], |
196 | assuming no alpha_increments. */ |
197 | unsigned int * |
198 | Search::compute_alpha_unify () const |
199 | { |
200 | if (option[UPPERLOWER]) |
201 | { |
202 | /* Uppercase to lowercase mapping. */ |
203 | unsigned int alpha_size = compute_alpha_size(); |
204 | unsigned int *alpha_unify = new unsigned int[alpha_size]; |
205 | for (unsigned int c = 0; c < alpha_size; c++) |
206 | alpha_unify[c] = c; |
207 | for (unsigned int c = 'A'; c <= 'Z'; c++) |
208 | alpha_unify[c] = c + ('a'-'A'); |
209 | return alpha_unify; |
210 | } |
211 | else |
212 | /* Identity mapping. */ |
213 | return NULL; |
214 | } |
215 | |
216 | /* Initializes each keyword's _selchars array. */ |
217 | void |
218 | Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const |
219 | { |
220 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
221 | temp->first()->init_selchars_tuple(positions, alpha_unify); |
222 | } |
223 | |
224 | /* Deletes each keyword's _selchars array. */ |
225 | void |
226 | Search::delete_selchars () const |
227 | { |
228 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
229 | temp->first()->delete_selchars(); |
230 | } |
231 | |
232 | /* Count the duplicate keywords that occur with a given set of positions. |
233 | In other words, it returns the difference |
234 | # K - # proj1 (K) |
235 | where K is the multiset of given keywords. */ |
236 | unsigned int |
237 | Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const |
238 | { |
239 | /* Run through the keyword list and count the duplicates incrementally. |
240 | The result does not depend on the order of the keyword list, thanks to |
241 | the formula above. */ |
242 | init_selchars_tuple (positions, alpha_unify); |
243 | |
244 | unsigned int count = 0; |
245 | { |
246 | Hash_Table representatives (_total_keys, !_hash_includes_len); |
247 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
248 | { |
249 | KeywordExt *keyword = temp->first(); |
250 | if (representatives.insert (keyword)) |
251 | count++; |
252 | } |
253 | } |
254 | |
255 | delete_selchars (); |
256 | |
257 | return count; |
258 | } |
259 | |
260 | /* Find good key positions. */ |
261 | |
/* Finds a good set of byte positions (step 1 of the algorithm described
   at the top of this file): a set Pos, as small as possible, such that
   the tuples (keyword[i] : i in Pos) distinguish as many keywords as
   possible.  The result is stored in _key_positions.  */
void
Search::find_positions ()
{
  /* If the user gave the key positions, we use them.  */
  if (option[POSITIONS])
    {
      _key_positions = option.get_key_positions();
      return;
    }

  /* Compute preliminary alpha_unify table.  */
  unsigned int *alpha_unify = compute_alpha_unify ();

  /* 1. Find positions that must occur in order to distinguish duplicates.  */
  Positions mandatory;

  if (!option[DUP])
    {
      /* Examine every pair of keywords.  */
      for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
        {
          KeywordExt *keyword1 = l1->first();
          for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
            {
              KeywordExt *keyword2 = l2->first();

              /* If keyword1 and keyword2 have the same length and differ
                 in just one position, and it is not the last character,
                 this position is mandatory.  */
              if (keyword1->_allchars_length == keyword2->_allchars_length)
                {
                  int n = keyword1->_allchars_length;
                  int i;
                  /* Find the first position (before the last character)
                     where the keywords differ; case is folded when
                     option[UPPERLOWER] is set.  */
                  for (i = 0; i < n - 1; i++)
                    {
                      unsigned char c1 = keyword1->_allchars[i];
                      unsigned char c2 = keyword2->_allchars[i];
                      if (option[UPPERLOWER])
                        {
                          if (c1 >= 'A' && c1 <= 'Z')
                            c1 += 'a' - 'A';
                          if (c2 >= 'A' && c2 <= 'Z')
                            c2 += 'a' - 'A';
                        }
                      if (c1 != c2)
                        break;
                    }
                  if (i < n - 1)
                    {
                      int j;
                      /* Verify the keywords agree in every position
                         after i.  */
                      for (j = i + 1; j < n; j++)
                        {
                          unsigned char c1 = keyword1->_allchars[j];
                          unsigned char c2 = keyword2->_allchars[j];
                          if (option[UPPERLOWER])
                            {
                              if (c1 >= 'A' && c1 <= 'Z')
                                c1 += 'a' - 'A';
                              if (c2 >= 'A' && c2 <= 'Z')
                                c2 += 'a' - 'A';
                            }
                          if (c1 != c2)
                            break;
                        }
                      if (j >= n)
                        {
                          /* Position i is mandatory.  */
                          if (!mandatory.contains (i))
                            mandatory.add (i);
                        }
                    }
                }
            }
        }
    }

  /* 2. Add positions, as long as this decreases the duplicates count.
     imax is the largest usable 0-based position index; positions run
     from imax down to -1 (the last-character pseudo-position).  */
  int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
              ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
  Positions current = mandatory;
  unsigned int current_duplicates_count =
    count_duplicates_tuple (current, alpha_unify);
  for (;;)
    {
      Positions best;
      unsigned int best_duplicates_count = UINT_MAX;

      for (int i = imax; i >= -1; i--)
        if (!current.contains (i))
          {
            Positions tryal = current;
            tryal.add (i);
            unsigned int try_duplicates_count =
              count_duplicates_tuple (tryal, alpha_unify);

            /* We prefer 'try' to 'best' if it produces less duplicates,
               or if it produces the same number of duplicates but with
               a more efficient hash function.  */
            if (try_duplicates_count < best_duplicates_count
                || (try_duplicates_count == best_duplicates_count && i >= 0))
              {
                best = tryal;
                best_duplicates_count = try_duplicates_count;
              }
          }

      /* Stop adding positions when it gives no improvement.  */
      if (best_duplicates_count >= current_duplicates_count)
        break;

      current = best;
      current_duplicates_count = best_duplicates_count;
    }

  /* 3. Remove positions, as long as this doesn't increase the duplicates
     count.  */
  for (;;)
    {
      Positions best;
      unsigned int best_duplicates_count = UINT_MAX;

      for (int i = imax; i >= -1; i--)
        if (current.contains (i) && !mandatory.contains (i))
          {
            Positions tryal = current;
            tryal.remove (i);
            unsigned int try_duplicates_count =
              count_duplicates_tuple (tryal, alpha_unify);

            /* We prefer 'try' to 'best' if it produces less duplicates,
               or if it produces the same number of duplicates but with
               a more efficient hash function.  */
            if (try_duplicates_count < best_duplicates_count
                || (try_duplicates_count == best_duplicates_count && i == -1))
              {
                best = tryal;
                best_duplicates_count = try_duplicates_count;
              }
          }

      /* Stop removing positions when it gives no improvement.  */
      if (best_duplicates_count > current_duplicates_count)
        break;

      current = best;
      current_duplicates_count = best_duplicates_count;
    }

  /* 4. Replace two positions by one, as long as this doesn't increase the
     duplicates count.  */
  for (;;)
    {
      Positions best;
      unsigned int best_duplicates_count = UINT_MAX;

      /* Try removing the pair (i1, i2) and adding the single position i3.  */
      for (int i1 = imax; i1 >= -1; i1--)
        if (current.contains (i1) && !mandatory.contains (i1))
          {
            for (int i2 = imax; i2 >= -1; i2--)
              if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
                {
                  for (int i3 = imax; i3 >= 0; i3--)
                    if (!current.contains (i3))
                      {
                        Positions tryal = current;
                        tryal.remove (i1);
                        tryal.remove (i2);
                        tryal.add (i3);
                        unsigned int try_duplicates_count =
                          count_duplicates_tuple (tryal, alpha_unify);

                        /* We prefer 'try' to 'best' if it produces less
                           duplicates, or if it produces the same number
                           of duplicates but with a more efficient hash
                           function.  */
                        if (try_duplicates_count < best_duplicates_count
                            || (try_duplicates_count == best_duplicates_count
                                && (i1 == -1 || i2 == -1 || i3 >= 0)))
                          {
                            best = tryal;
                            best_duplicates_count = try_duplicates_count;
                          }
                      }
                }
          }

      /* Stop removing positions when it gives no improvement.  */
      if (best_duplicates_count > current_duplicates_count)
        break;

      current = best;
      current_duplicates_count = best_duplicates_count;
    }

  /* That's it.  Hope it's good enough.  */
  _key_positions = current;

  if (option[DEBUG])
    {
      /* Print the result, positions in increasing order, with '$'
         denoting the last-character position.  */
      fprintf (stderr, "\nComputed positions: ");
      PositionReverseIterator iter = _key_positions.reviterator();
      bool seen_lastchar = false;
      bool first = true;
      for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
        {
          if (!first)
            fprintf (stderr, ", ");
          if (i == Positions::LASTCHAR)
            seen_lastchar = true;
          else
            {
              fprintf (stderr, "%d", i + 1);
              first = false;
            }
        }
      if (seen_lastchar)
        {
          if (!first)
            fprintf (stderr, ", ");
          fprintf (stderr, "$");
        }
      fprintf (stderr, "\n");
    }

  /* Free preliminary alpha_unify table.  */
  delete[] alpha_unify;
}
489 | |
490 | /* Count the duplicate keywords that occur with the found set of positions. |
491 | In other words, it returns the difference |
492 | # K - # proj1 (K) |
493 | where K is the multiset of given keywords. */ |
494 | unsigned int |
495 | Search::count_duplicates_tuple () const |
496 | { |
497 | unsigned int *alpha_unify = compute_alpha_unify (); |
498 | unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); |
499 | delete[] alpha_unify; |
500 | return count; |
501 | } |
502 | |
503 | /* ===================== Finding good alpha increments ===================== */ |
504 | |
505 | /* Computes the upper bound on the indices passed to asso_values[]. */ |
506 | unsigned int |
507 | Search::compute_alpha_size (const unsigned int *alpha_inc) const |
508 | { |
509 | unsigned int max_alpha_inc = 0; |
510 | for (int i = 0; i < _max_key_len; i++) |
511 | if (max_alpha_inc < alpha_inc[i]) |
512 | max_alpha_inc = alpha_inc[i]; |
513 | return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; |
514 | } |
515 | |
516 | /* Computes the unification rules between different asso_values[c]. */ |
/* Computes the unification rules between different asso_values[c],
   taking the alpha increments into account.  Returns a newly allocated
   table (owned by the caller) when option UPPERLOWER is set, or NULL for
   the identity mapping otherwise.  */
unsigned int *
Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
{
  if (option[UPPERLOWER])
    {
      /* Without alpha increments, we would simply unify
           'A' -> 'a', ..., 'Z' -> 'z'.
         But when a keyword contains at position i a character c,
         we have the constraint
           asso_values[tolower(c) + alpha_inc[i]] ==
           asso_values[toupper(c) + alpha_inc[i]].
         This introduces a unification
           toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
         Note that this unification can extend outside the range of
         ASCII letters!  But still every unified character pair is at
         a distance of 'a'-'A' = 32, or (after chained unification)
         at a multiple of 32.  So in the end the alpha_unify vector has
         the form  c -> c + 32 * f(c)  where f(c) is a nonnegative
         integer.  */
      unsigned int alpha_size = compute_alpha_size (alpha_inc);

      /* Start from the identity mapping.  */
      unsigned int *alpha_unify = new unsigned int[alpha_size];
      for (unsigned int c = 0; c < alpha_size; c++)
        alpha_unify[c] = c;

      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
        {
          KeywordExt *keyword = temp->first();

          /* Iterate through the selected character positions.  */
          PositionIterator iter = positions.iterator(keyword->_allchars_length);

          for (int i; (i = iter.next ()) != PositionIterator::EOS; )
            {
              /* Fetch the byte at position i (LASTCHAR denotes the last
                 byte of the keyword).  */
              unsigned int c;
              if (i == Positions::LASTCHAR)
                c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
              else if (i < keyword->_allchars_length)
                c = static_cast<unsigned char>(keyword->_allchars[i]);
              else
                abort ();
              if (c >= 'A' && c <= 'Z')
                c += 'a' - 'A';
              if (c >= 'a' && c <= 'z')
                {
                  /* Alpha increments apply only to real positions, not to
                     the LASTCHAR pseudo-position.  */
                  if (i != Positions::LASTCHAR)
                    c += alpha_inc[i];
                  /* Unify c with c - ('a'-'A').  Walk down the existing
                     unification chain in steps of 32 and redirect every
                     entry that pointed at b to c's representative d, so
                     chained unifications stay consistent.  */
                  unsigned int d = alpha_unify[c];
                  unsigned int b = c - ('a'-'A');
                  for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
                    alpha_unify[a] = d;
                }
            }
        }
      return alpha_unify;
    }
  else
    /* Identity mapping.  */
    return NULL;
}
578 | |
579 | /* Initializes each keyword's _selchars array. */ |
580 | void |
581 | Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const |
582 | { |
583 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
584 | temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc); |
585 | } |
586 | |
587 | /* Count the duplicate keywords that occur with the given set of positions |
588 | and a given alpha_inc[] array. |
589 | In other words, it returns the difference |
590 | # K - # proj2 (proj1 (K)) |
591 | where K is the multiset of given keywords. */ |
592 | unsigned int |
593 | Search::count_duplicates_multiset (const unsigned int *alpha_inc) const |
594 | { |
595 | /* Run through the keyword list and count the duplicates incrementally. |
596 | The result does not depend on the order of the keyword list, thanks to |
597 | the formula above. */ |
598 | unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); |
599 | init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); |
600 | |
601 | unsigned int count = 0; |
602 | { |
603 | Hash_Table representatives (_total_keys, !_hash_includes_len); |
604 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
605 | { |
606 | KeywordExt *keyword = temp->first(); |
607 | if (representatives.insert (keyword)) |
608 | count++; |
609 | } |
610 | } |
611 | |
612 | delete_selchars (); |
613 | delete[] alpha_unify; |
614 | |
615 | return count; |
616 | } |
617 | |
618 | /* Find good _alpha_inc[]. */ |
619 | |
/* Finds good _alpha_inc[] values (step 2 of the algorithm described at
   the top of this file) and stores the results in _alpha_inc,
   _alpha_size and _alpha_unify.  */
void
Search::find_alpha_inc ()
{
  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
     artificial duplicates.
     In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
  unsigned int duplicates_goal = count_duplicates_tuple ();

  /* Start with zero increments.  This is sufficient in most cases.  */
  unsigned int *current = new unsigned int [_max_key_len];
  for (int i = 0; i < _max_key_len; i++)
    current[i] = 0;
  unsigned int current_duplicates_count = count_duplicates_multiset (current);

  if (current_duplicates_count > duplicates_goal)
    {
      /* Look which _alpha_inc[i] we are free to increment: only the
         selected positions other than LASTCHAR can carry an increment.  */
      unsigned int nindices;
      {
        nindices = 0;
        PositionIterator iter = _key_positions.iterator(_max_key_len);
        for (;;)
          {
            int key_pos = iter.next ();
            if (key_pos == PositionIterator::EOS)
              break;
            if (key_pos != Positions::LASTCHAR)
              nindices++;
          }
      }

      /* Collect those incrementable positions into indices[].  */
      DYNAMIC_ARRAY (indices, unsigned int, nindices);
      {
        unsigned int j = 0;
        PositionIterator iter = _key_positions.iterator(_max_key_len);
        for (;;)
          {
            int key_pos = iter.next ();
            if (key_pos == PositionIterator::EOS)
              break;
            if (key_pos != Positions::LASTCHAR)
              indices[j++] = key_pos;
          }
        /* Both passes over the iterator must agree on the count.  */
        if (!(j == nindices))
          abort ();
      }

      /* Perform several rounds of searching for a good alpha increment.
         Each round reduces the number of artificial collisions by adding
         an increment in a single key position.  */
      DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
      DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
      do
        {
          /* An increment of 1 is not always enough.  Try higher increments
             also.  */
          for (unsigned int inc = 1; ; inc++)
            {
              unsigned int best_duplicates_count = UINT_MAX;

              /* Try adding 'inc' at each candidate position in turn.  */
              for (unsigned int j = 0; j < nindices; j++)
                {
                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
                  tryal[indices[j]] += inc;
                  unsigned int try_duplicates_count =
                    count_duplicates_multiset (tryal);

                  /* We prefer 'try' to 'best' if it produces less
                     duplicates.  */
                  if (try_duplicates_count < best_duplicates_count)
                    {
                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
                      best_duplicates_count = try_duplicates_count;
                    }
                }

              /* Stop this round when we got an improvement.  */
              if (best_duplicates_count < current_duplicates_count)
                {
                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
                  current_duplicates_count = best_duplicates_count;
                  break;
                }
            }
        }
      while (current_duplicates_count > duplicates_goal);
      FREE_DYNAMIC_ARRAY (tryal);
      FREE_DYNAMIC_ARRAY (best);

      if (option[DEBUG])
        {
          /* Print the nonzero increments, lowest position first.  */
          fprintf (stderr, "\nComputed alpha increments: ");
          bool first = true;
          for (unsigned int j = nindices; j-- > 0; )
            if (current[indices[j]] != 0)
              {
                if (!first)
                  fprintf (stderr, ", ");
                fprintf (stderr, "%u:+%u",
                         indices[j] + 1, current[indices[j]]);
                first = false;
              }
          fprintf (stderr, "\n");
        }
      FREE_DYNAMIC_ARRAY (indices);
    }

  _alpha_inc = current;
  _alpha_size = compute_alpha_size (_alpha_inc);
  _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
}
732 | |
733 | /* ======================= Finding good asso_values ======================== */ |
734 | |
735 | /* Initializes the asso_values[] related parameters. */ |
736 | |
/* Initializes the asso_values[] related parameters: collapses keywords
   with identical selchars into equivalence classes, computes character
   occurrence counts, chooses _asso_value_max and _max_hash_value, and
   sets up the collision detector and the random seed.  */
void
Search::prepare_asso_values ()
{
  KeywordExt_List *temp;

  /* Initialize each keyword's _selchars array.  */
  init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);

  /* Compute the maximum _selchars_length over all keywords.  */
  _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();

  /* Check for duplicates, i.e. keywords with the same _selchars array
     (and - if _hash_includes_len - also the same length).
     We deal with these by building an equivalence class, so that only
     1 keyword is representative of the entire collection.  Only this
     representative remains in the keyword list; the others are accessible
     through the _duplicate_link chain, starting at the representative.
     This *greatly* simplifies processing during later stages of the program.
     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
  {
    _list_len = _total_keys;
    _total_duplicates = 0;
    /* Make hash table for efficiency.  */
    Hash_Table representatives (_list_len, !_hash_includes_len);

    KeywordExt_List *prev = NULL; /* list node before temp */
    for (temp = _head; temp; )
      {
        KeywordExt *keyword = temp->first();
        KeywordExt *other_keyword = representatives.insert (keyword);
        KeywordExt_List *garbage = NULL;

        if (other_keyword)
          {
            _total_duplicates++;
            _list_len--;
            /* Remove keyword from the main list.  (prev is non-NULL here:
               the first element cannot be a duplicate, since the table is
               empty when it is inserted.)  */
            prev->rest() = temp->rest();
            garbage = temp;
            /* And insert it on other_keyword's duplicate list.  */
            keyword->_duplicate_link = other_keyword->_duplicate_link;
            other_keyword->_duplicate_link = keyword;

            /* Complain if user hasn't enabled the duplicate option.  */
            if (!option[DUP] || option[DEBUG])
              {
                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
                         keyword->_allchars_length, keyword->_allchars,
                         other_keyword->_allchars_length, other_keyword->_allchars);
                for (int j = 0; j < keyword->_selchars_length; j++)
                  putc (keyword->_selchars[j], stderr);
                fprintf (stderr, "\".\n");
              }
          }
        else
          {
            keyword->_duplicate_link = NULL;
            prev = temp;
          }
        temp = temp->rest();
        /* The removed node can only be freed after temp has advanced
           past it.  */
        if (garbage)
          delete garbage;
      }
    if (option[DEBUG])
      representatives.dump();
  }

  /* Exit program if duplicates exists and option[DUP] not set, since we
     don't want to continue in this case.  (We don't want to turn on
     option[DUP] implicitly, because the generated code is usually much
     slower.)  */
  if (_total_duplicates)
    {
      if (option[DUP])
        fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
                 _total_duplicates);
      else
        {
          fprintf (stderr, "%d input keys have identical hash values,\n",
                   _total_duplicates);
          if (option[POSITIONS])
            fprintf (stderr, "try different key positions or use option -D.\n");
          else
            fprintf (stderr, "use option -D.\n");
          exit (1);
        }
    }

  /* Compute the occurrences of each character in the alphabet.  */
  _occurrences = new int[_alpha_size];
  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
  for (temp = _head; temp; temp = temp->rest())
    {
      KeywordExt *keyword = temp->first();
      const unsigned int *ptr = keyword->_selchars;
      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
        _occurrences[*ptr]++;
    }

  /* Memory allocation.  */
  _asso_values = new int[_alpha_size];

  int non_linked_length = _list_len;
  unsigned int asso_value_max;

  asso_value_max =
    static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
  /* Round up to the next power of two.  This makes it easy to ensure
     an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
     being odd, it guarantees that Search::try_asso_value() will iterate
     through different values for _asso_value[c].  */
  if (asso_value_max == 0)
    asso_value_max = 1;
  /* Smear the highest set bit into every lower bit position, then add 1.  */
  asso_value_max |= asso_value_max >> 1;
  asso_value_max |= asso_value_max >> 2;
  asso_value_max |= asso_value_max >> 4;
  asso_value_max |= asso_value_max >> 8;
  asso_value_max |= asso_value_max >> 16;
  asso_value_max++;
  _asso_value_max = asso_value_max;

  /* Given the bound for _asso_values[c], we have a bound for the possible
     hash values, as computed in compute_hash().  */
  _max_hash_value = (_hash_includes_len ? _max_key_len : 0)
                    + (_asso_value_max - 1) * _max_selchars_length;
  /* Allocate a sparse bit vector for detection of collisions of hash
     values.  */
  _collision_detector = new Bool_Array (_max_hash_value + 1);

  if (option[DEBUG])
    {
      fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
               "\nmaximum size of generated hash table is %d\n",
               non_linked_length, asso_value_max, _max_hash_value);

      int field_width;

      /* Width of the widest keysig, for column alignment below.  */
      field_width = 0;
      {
        for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
          {
            KeywordExt *keyword = temp->first();
            if (field_width < keyword->_selchars_length)
              field_width = keyword->_selchars_length;
          }
      }

      fprintf (stderr, "\ndumping the keyword list without duplicates\n");
      fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
      int i = 0;
      for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
        {
          KeywordExt *keyword = temp->first();
          fprintf (stderr, "%9d, ", ++i);
          if (field_width > keyword->_selchars_length)
            fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
          for (int j = 0; j < keyword->_selchars_length; j++)
            putc (keyword->_selchars[j], stderr);
          fprintf (stderr, ", %.*s\n",
                   keyword->_allchars_length, keyword->_allchars);
        }
      fprintf (stderr, "\nend of keyword list\n\n");
    }

  if (option[RANDOM] || option.get_jump () == 0)
    /* We will use rand(), so initialize the random number generator.  */
    srand (static_cast<long>(time (0)));

  _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
  _jump = option.get_jump ();
}
908 | |
909 | /* Finds some _asso_values[] that fit. */ |
910 | |
911 | /* The idea is to choose the _asso_values[] one by one, in a way that |
912 | a choice that has been made never needs to be undone later. This |
913 | means that we split the work into several steps. Each step chooses |
914 | one or more _asso_values[c]. The result of choosing one or more |
915 | _asso_values[c] is that the partitioning of the keyword set gets |
916 | broader. |
917 | Look at this partitioning: After every step, the _asso_values[] of a |
918 | certain set C of characters are undetermined. (At the beginning, C |
919 | is the set of characters c with _occurrences[c] > 0. At the end, C |
920 | is empty.) To each keyword K, we associate the multiset of _selchars |
921 | for which the _asso_values[] are undetermined: |
922 | K --> K->_selchars intersect C. |
923 | Consider two keywords equivalent if their value under this mapping is |
924 | the same. This introduces an equivalence relation on the set of |
925 | keywords. The equivalence classes partition the keyword set. (At the |
926 | beginning, the partition is the finest possible: each K is an equivalence |
927 | class by itself, because all K have a different _selchars. At the end, |
928 | all K have been merged into a single equivalence class.) |
929 | The partition before a step is always a refinement of the partition |
930 | after the step. |
931 | We choose the steps in such a way that the partition really becomes |
932 | broader at each step. (A step that only chooses an _asso_values[c] |
933 | without changing the partition is better merged with the previous step, |
934 | to avoid useless backtracking.) */ |
935 | |
/* One class of the keyword partition: the set of keywords that currently
   have the same multiset of undetermined selected characters (see the
   partitioning description above).  */
struct EquivalenceClass
{
  /* The keywords in this equivalence class.  */
  KeywordExt_List * _keywords;
  /* Tail of the _keywords list, for constant-time append.  */
  KeywordExt_List * _keywords_last;
  /* The number of keywords in this equivalence class. */
  unsigned int _cardinality;
  /* The undetermined selected characters for the keywords in this
     equivalence class, as a canonically reordered multiset. */
  unsigned int * _undetermined_chars;
  unsigned int _undetermined_chars_length;

  /* The next class in the partition's linked list, or NULL.  */
  EquivalenceClass * _next;
};
950 | |
/* One step of the search.  Each step determines the _asso_values[] of a
   set of characters, given the values chosen in the previous steps.  */
struct Step
{
  /* The characters whose values are being determined in this step. */
  unsigned int _changing_count;
  unsigned int * _changing;
  /* Exclusive upper bound for the _asso_values[c] of this step.
     A power of 2. */
  unsigned int _asso_value_max;
  /* The characters whose values will be determined after this step. */
  bool * _undetermined;
  /* The keyword set partition after this step. */
  EquivalenceClass * _partition;
  /* The expected number of iterations in this step. */
  double _expected_lower;
  double _expected_upper;

  /* The next step in the list (execution order).  */
  Step * _next;
};
969 | |
/* Compares the first LEN elements of two arrays of unsigned ints.
   Returns true if they are elementwise equal.  */
static inline bool
equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
{
  for (unsigned int i = 0; i < len; i++)
    if (ptr1[i] != ptr2[i])
      return false;
  return true;
}
983 | |
984 | EquivalenceClass * |
985 | Search::compute_partition (bool *undetermined) const |
986 | { |
987 | EquivalenceClass *partition = NULL; |
988 | EquivalenceClass *partition_last = NULL; |
989 | for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) |
990 | { |
991 | KeywordExt *keyword = temp->first(); |
992 | |
993 | /* Compute the undetermined characters for this keyword. */ |
994 | unsigned int *undetermined_chars = |
995 | new unsigned int[keyword->_selchars_length]; |
996 | unsigned int undetermined_chars_length = 0; |
997 | |
998 | for (int i = 0; i < keyword->_selchars_length; i++) |
999 | if (undetermined[keyword->_selchars[i]]) |
1000 | undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; |
1001 | |
1002 | /* Look up the equivalence class to which this keyword belongs. */ |
1003 | EquivalenceClass *equclass; |
1004 | for (equclass = partition; equclass; equclass = equclass->_next) |
1005 | if (equclass->_undetermined_chars_length == undetermined_chars_length |
1006 | && equals (equclass->_undetermined_chars, undetermined_chars, |
1007 | undetermined_chars_length)) |
1008 | break; |
1009 | if (equclass == NULL) |
1010 | { |
1011 | equclass = new EquivalenceClass(); |
1012 | equclass->_keywords = NULL; |
1013 | equclass->_keywords_last = NULL; |
1014 | equclass->_cardinality = 0; |
1015 | equclass->_undetermined_chars = undetermined_chars; |
1016 | equclass->_undetermined_chars_length = undetermined_chars_length; |
1017 | equclass->_next = NULL; |
1018 | if (partition) |
1019 | partition_last->_next = equclass; |
1020 | else |
1021 | partition = equclass; |
1022 | partition_last = equclass; |
1023 | } |
1024 | else |
1025 | delete[] undetermined_chars; |
1026 | |
1027 | /* Add the keyword to the equivalence class. */ |
1028 | KeywordExt_List *cons = new KeywordExt_List(keyword); |
1029 | if (equclass->_keywords) |
1030 | equclass->_keywords_last->rest() = cons; |
1031 | else |
1032 | equclass->_keywords = cons; |
1033 | equclass->_keywords_last = cons; |
1034 | equclass->_cardinality++; |
1035 | } |
1036 | |
1037 | /* Free some of the allocated memory. The caller doesn't need it. */ |
1038 | for (EquivalenceClass *cls = partition; cls; cls = cls->_next) |
1039 | delete[] cls->_undetermined_chars; |
1040 | |
1041 | return partition; |
1042 | } |
1043 | |
1044 | static void |
1045 | delete_partition (EquivalenceClass *partition) |
1046 | { |
1047 | while (partition != NULL) |
1048 | { |
1049 | EquivalenceClass *equclass = partition; |
1050 | partition = equclass->_next; |
1051 | delete_list (equclass->_keywords); |
1052 | //delete[] equclass->_undetermined_chars; // already freed above |
1053 | delete equclass; |
1054 | } |
1055 | } |
1056 | |
1057 | /* Compute the possible number of collisions when _asso_values[c] is |
1058 | chosen, leading to the given partition. */ |
1059 | unsigned int |
1060 | Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const |
1061 | { |
1062 | /* Every equivalence class p is split according to the frequency of |
1063 | occurrence of c, leading to equivalence classes p1, p2, ... |
1064 | This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions. |
1065 | Return the sum of this expression over all equivalence classes. */ |
1066 | unsigned int sum = 0; |
1067 | unsigned int m = _max_selchars_length; |
1068 | DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1); |
1069 | for (EquivalenceClass *cls = partition; cls; cls = cls->_next) |
1070 | { |
1071 | for (unsigned int i = 0; i <= m; i++) |
1072 | split_cardinalities[i] = 0; |
1073 | |
1074 | for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) |
1075 | { |
1076 | KeywordExt *keyword = temp->first(); |
1077 | |
1078 | unsigned int count = 0; |
1079 | for (int i = 0; i < keyword->_selchars_length; i++) |
1080 | if (keyword->_selchars[i] == c) |
1081 | count++; |
1082 | |
1083 | split_cardinalities[count]++; |
1084 | } |
1085 | |
1086 | sum += cls->_cardinality * cls->_cardinality; |
1087 | for (unsigned int i = 0; i <= m; i++) |
1088 | sum -= split_cardinalities[i] * split_cardinalities[i]; |
1089 | } |
1090 | FREE_DYNAMIC_ARRAY (split_cardinalities); |
1091 | return sum; |
1092 | } |
1093 | |
1094 | /* Test whether adding c to the undetermined characters changes the given |
1095 | partition. */ |
1096 | bool |
1097 | Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const |
1098 | { |
1099 | for (EquivalenceClass *cls = partition; cls; cls = cls->_next) |
1100 | { |
1101 | unsigned int first_count = UINT_MAX; |
1102 | |
1103 | for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest()) |
1104 | { |
1105 | KeywordExt *keyword = temp->first(); |
1106 | |
1107 | unsigned int count = 0; |
1108 | for (int i = 0; i < keyword->_selchars_length; i++) |
1109 | if (keyword->_selchars[i] == c) |
1110 | count++; |
1111 | |
1112 | if (temp == cls->_keywords) |
1113 | first_count = count; |
1114 | else if (count != first_count) |
1115 | /* c would split this equivalence class. */ |
1116 | return false; |
1117 | } |
1118 | } |
1119 | return true; |
1120 | } |
1121 | |
/* Finds some _asso_values[] that fit, by first splitting the problem into
   steps (see the explanation above) and then solving each step through
   systematic or random trial of candidate values.  */
void
Search::find_asso_values ()
{
  Step *steps;

  /* Determine the steps, starting with the last one. */
  {
    bool *undetermined;
    bool *determined;

    steps = NULL;

    undetermined = new bool[_alpha_size];
    for (unsigned int c = 0; c < _alpha_size; c++)
      undetermined[c] = false;

    determined = new bool[_alpha_size];
    for (unsigned int c = 0; c < _alpha_size; c++)
      determined[c] = true;

    for (;;)
      {
        /* Compute the partition that needs to be refined. */
        EquivalenceClass *partition = compute_partition (undetermined);

        /* Determine the main character to be chosen in this step.
           Choosing such a character c has the effect of splitting every
           equivalence class (according to the frequency of occurrence of c).
           We choose the c with the minimum number of possible collisions,
           so that characters which lead to a large number of collisions get
           handled early during the search. */
        unsigned int chosen_c;
        unsigned int chosen_possible_collisions;
        {
          unsigned int best_c = 0;
          unsigned int best_possible_collisions = UINT_MAX;
          for (unsigned int c = 0; c < _alpha_size; c++)
            if (_occurrences[c] > 0 && determined[c])
              {
                unsigned int possible_collisions =
                  count_possible_collisions (partition, c);
                if (possible_collisions < best_possible_collisions)
                  {
                    best_c = c;
                    best_possible_collisions = possible_collisions;
                  }
              }
          if (best_possible_collisions == UINT_MAX)
            {
              /* All c with _occurrences[c] > 0 are undetermined.  We are
                 at the starting situation and don't need any more steps. */
              delete_partition (partition);
              break;
            }
          chosen_c = best_c;
          chosen_possible_collisions = best_possible_collisions;
        }

        /* We need one more step. */
        Step *step = new Step();

        step->_undetermined = new bool[_alpha_size];
        memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));

        step->_partition = partition;

        /* Now determine how the equivalence classes will be before this
           step. */
        undetermined[chosen_c] = true;
        partition = compute_partition (undetermined);

        /* Now determine which other characters should be determined in this
           step, because they will not change the equivalence classes at
           this point. It is the set of all c which, for all equivalence
           classes, have the same frequency of occurrence in every keyword
           of the equivalence class. */
        for (unsigned int c = 0; c < _alpha_size; c++)
          if (_occurrences[c] > 0 && determined[c]
              && unchanged_partition (partition, c))
            {
              undetermined[c] = true;
              determined[c] = false;
            }

        /* chosen_c must be one of these. */
        if (determined[chosen_c])
          abort ();

        /* Now the set of changing characters of this step: those that
           became undetermined just above. */
        unsigned int changing_count;

        changing_count = 0;
        for (unsigned int c = 0; c < _alpha_size; c++)
          if (undetermined[c] && !step->_undetermined[c])
            changing_count++;

        unsigned int *changing = new unsigned int[changing_count];
        changing_count = 0;
        for (unsigned int c = 0; c < _alpha_size; c++)
          if (undetermined[c] && !step->_undetermined[c])
            changing[changing_count++] = c;

        step->_changing = changing;
        step->_changing_count = changing_count;

        step->_asso_value_max = _asso_value_max;

        /* Rough estimates of the number of iterations this step will
           need, used only for the debug output below. */
        step->_expected_lower =
          exp (static_cast<double>(chosen_possible_collisions)
               / static_cast<double>(_max_hash_value));
        step->_expected_upper =
          exp (static_cast<double>(chosen_possible_collisions)
               / static_cast<double>(_asso_value_max));

        delete_partition (partition);

        /* Prepend: steps are determined last-to-first, so the head of
           the list is the first step to execute. */
        step->_next = steps;
        steps = step;
      }

    delete[] determined;
    delete[] undetermined;
  }

  if (option[DEBUG])
    {
      unsigned int stepno = 0;
      for (Step *step = steps; step; step = step->_next)
        {
          stepno++;
          fprintf (stderr, "Step %u chooses _asso_values[" , stepno);
          for (unsigned int i = 0; i < step->_changing_count; i++)
            {
              if (i > 0)
                fprintf (stderr, "," );
              fprintf (stderr, "'%c'" , step->_changing[i]);
            }
          fprintf (stderr, "], expected number of iterations between %g and %g.\n" ,
                   step->_expected_lower, step->_expected_upper);
          fprintf (stderr, "Keyword equivalence classes:\n" );
          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
            {
              fprintf (stderr, "\n" );
              for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
                {
                  KeywordExt *keyword = temp->first();
                  fprintf (stderr, "  %.*s\n" ,
                           keyword->_allchars_length, keyword->_allchars);
                }
            }
          fprintf (stderr, "\n" );
        }
    }

  /* Initialize _asso_values[]. (The value given here matters only
     for those c which occur in all keywords with equal multiplicity.) */
  for (unsigned int c = 0; c < _alpha_size; c++)
    _asso_values[c] = 0;

  /* Execute the steps in order, choosing the _asso_values[] of each
     step's changing characters so that no collisions occur inside any
     equivalence class of the step's partition. */
  unsigned int stepno = 0;
  for (Step *step = steps; step; step = step->_next)
    {
      stepno++;

      /* Initialize the asso_values[]. */
      unsigned int k = step->_changing_count;
      for (unsigned int i = 0; i < k; i++)
        {
          unsigned int c = step->_changing[i];
          _asso_values[c] =
            (_initial_asso_value < 0 ? rand () : _initial_asso_value)
            & (step->_asso_value_max - 1);
        }

      unsigned int iterations = 0;
      /* iter[i] is the number of jumps applied to _asso_values[_changing[i]]
         relative to its initial value.  */
      DYNAMIC_ARRAY (iter, unsigned int, k);
      for (unsigned int i = 0; i < k; i++)
        iter[i] = 0;
      unsigned int ii = (_jump != 0 ? k - 1 : 0);

      for (;;)
        {
          /* Test whether these asso_values[] lead to collisions among
             the equivalence classes that should be collision-free. */
          bool has_collision = false;
          for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
            {
              /* Iteration Number array is a win, O(1) initialization time! */
              _collision_detector->clear ();

              for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
                {
                  KeywordExt *keyword = ptr->first();

                  /* Compute the new hash code for the keyword, leaving apart
                     the yet undetermined asso_values[]. */
                  int hashcode;
                  {
                    int sum = _hash_includes_len ? keyword->_allchars_length : 0;
                    const unsigned int *p = keyword->_selchars;
                    int i = keyword->_selchars_length;
                    for (; i > 0; p++, i--)
                      if (!step->_undetermined[*p])
                        sum += _asso_values[*p];
                    hashcode = sum;
                  }

                  /* See whether it collides with another keyword's hash code,
                     from the same equivalence class. */
                  if (_collision_detector->set_bit (hashcode))
                    {
                      has_collision = true;
                      break;
                    }
                }

              /* Don't need to continue looking at the other equivalence
                 classes if we already have found a collision. */
              if (has_collision)
                break;
            }

          iterations++;
          if (!has_collision)
            break;

          /* Try other asso_values[]. */
          if (_jump != 0)
            {
              /* The way we try various values for
                   asso_values[step->_changing[0],...step->_changing[k-1]]
                 is like this:
                 for (bound = 0,1,...)
                   for (ii = 0,...,k-1)
                     iter[ii] := bound
                     iter[0..ii-1] := values <= bound
                     iter[ii+1..k-1] := values < bound
                 and
                   asso_values[step->_changing[i]] =
                     _initial_asso_value + iter[i] * _jump.
                 This makes it more likely to find small asso_values[].
               */
              unsigned int bound = iter[ii];
              unsigned int i = 0;
              /* Odometer-style increment of iter[0..ii-1], values <= bound. */
              while (i < ii)
                {
                  unsigned int c = step->_changing[i];
                  iter[i]++;
                  _asso_values[c] =
                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
                  if (iter[i] <= bound)
                    goto found_next;
                  /* Overflowed: reset this digit and carry to the next. */
                  _asso_values[c] =
                    (_asso_values[c] - iter[i] * _jump)
                    & (step->_asso_value_max - 1);
                  iter[i] = 0;
                  i++;
                }
              i = ii + 1;
              /* Same for iter[ii+1..k-1], but with values < bound. */
              while (i < k)
                {
                  unsigned int c = step->_changing[i];
                  iter[i]++;
                  _asso_values[c] =
                    (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
                  if (iter[i] < bound)
                    goto found_next;
                  _asso_values[c] =
                    (_asso_values[c] - iter[i] * _jump)
                    & (step->_asso_value_max - 1);
                  iter[i] = 0;
                  i++;
                }
              /* Switch from one ii to the next. */
              {
                unsigned int c = step->_changing[ii];
                _asso_values[c] =
                  (_asso_values[c] - bound * _jump)
                  & (step->_asso_value_max - 1);
                iter[ii] = 0;
              }
              /* Here all iter[i] == 0. */
              ii++;
              if (ii == k)
                {
                  ii = 0;
                  bound++;
                  if (bound == step->_asso_value_max)
                    {
                      /* Out of search space!  We can either backtrack, or
                         increase the available search space of this step.
                         It seems simpler to choose the latter solution. */
                      step->_asso_value_max = 2 * step->_asso_value_max;
                      if (step->_asso_value_max > _asso_value_max)
                        {
                          _asso_value_max = step->_asso_value_max;
                          /* Reinitialize _max_hash_value. */
                          _max_hash_value =
                            (_hash_includes_len ? _max_key_len : 0)
                            + (_asso_value_max - 1) * _max_selchars_length;
                          /* Reinitialize _collision_detector. */
                          delete _collision_detector;
                          _collision_detector =
                            new Bool_Array (_max_hash_value + 1);
                        }
                    }
                }
              {
                unsigned int c = step->_changing[ii];
                iter[ii] = bound;
                _asso_values[c] =
                  (_asso_values[c] + bound * _jump)
                  & (step->_asso_value_max - 1);
              }
             found_next: ;
            }
          else
            {
              /* Random. */
              unsigned int c = step->_changing[ii];
              _asso_values[c] =
                (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
              /* Next time, change the next c. */
              ii++;
              if (ii == k)
                ii = 0;
            }
        }
      FREE_DYNAMIC_ARRAY (iter);

      if (option[DEBUG])
        {
          fprintf (stderr, "Step %u chose _asso_values[" , stepno);
          for (unsigned int i = 0; i < step->_changing_count; i++)
            {
              if (i > 0)
                fprintf (stderr, "," );
              fprintf (stderr, "'%c'" , step->_changing[i]);
            }
          fprintf (stderr, "] in %u iterations.\n" , iterations);
        }
    }

  /* Free allocated memory. */
  while (steps != NULL)
    {
      Step *step = steps;
      steps = step->_next;
      delete[] step->_changing;
      delete[] step->_undetermined;
      delete_partition (step->_partition);
      delete step;
    }
}
1476 | |
1477 | /* Computes a keyword's hash value, relative to the current _asso_values[], |
1478 | and stores it in keyword->_hash_value. */ |
1479 | |
1480 | inline int |
1481 | Search::compute_hash (KeywordExt *keyword) const |
1482 | { |
1483 | int sum = _hash_includes_len ? keyword->_allchars_length : 0; |
1484 | |
1485 | const unsigned int *p = keyword->_selchars; |
1486 | int i = keyword->_selchars_length; |
1487 | for (; i > 0; p++, i--) |
1488 | sum += _asso_values[*p]; |
1489 | |
1490 | return keyword->_hash_value = sum; |
1491 | } |
1492 | |
/* Finds good _asso_values[], either with a single search run or — when
   the user asked for several iterations — by trying multiple
   (_initial_asso_value, _jump) pairs and keeping the best result. */

void
Search::find_good_asso_values ()
{
  prepare_asso_values ();

  /* Search for good _asso_values[]. */
  int asso_iteration;
  if ((asso_iteration = option.get_asso_iterations ()) == 0)
    /* Try only the given _initial_asso_value and _jump. */
    find_asso_values ();
  else
    {
      /* Try different pairs of _initial_asso_value and _jump, in the
         following order:
           (0, 1)
           (1, 1)
           (2, 1) (0, 3)
           (3, 1) (1, 3)
           (4, 1) (2, 3) (0, 5)
           (5, 1) (3, 3) (1, 5)
           ..... */
      KeywordExt_List *saved_head = _head;
      int best_initial_asso_value = 0;
      int best_jump = 1;
      int *best_asso_values = new int[_alpha_size];
      int best_collisions = INT_MAX;
      int best_max_hash_value = INT_MAX;

      _initial_asso_value = 0; _jump = 1;
      for (;;)
        {
          /* Restore the keyword list in its original order. */
          _head = copy_list (saved_head);
          /* Find good _asso_values[]. */
          find_asso_values ();
          /* Test whether it is the best solution so far: fewest
             collisions, then smallest maximum hash value. */
          int collisions = 0;
          int max_hash_value = INT_MIN;
          _collision_detector->clear ();
          for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
            {
              KeywordExt *keyword = ptr->first();
              int hashcode = compute_hash (keyword);
              if (max_hash_value < hashcode)
                max_hash_value = hashcode;
              if (_collision_detector->set_bit (hashcode))
                collisions++;
            }
          if (collisions < best_collisions
              || (collisions == best_collisions
                  && max_hash_value < best_max_hash_value))
            {
              memcpy (best_asso_values, _asso_values,
                      _alpha_size * sizeof (_asso_values[0]));
              best_collisions = collisions;
              best_max_hash_value = max_hash_value;
            }
          /* Delete the copied keyword list. */
          delete_list (_head);

          if (--asso_iteration == 0)
            break;
          /* Prepare for next iteration. */
          if (_initial_asso_value >= 2)
            _initial_asso_value -= 2, _jump += 2;
          else
            _initial_asso_value += _jump, _jump = 1;
        }
      _head = saved_head;
      /* Install the best found asso_values.
         NOTE(review): best_initial_asso_value and best_jump are never
         updated in the loop above, so _initial_asso_value/_jump are reset
         to (0, 1) here.  This looks harmless since the winning
         _asso_values[] themselves are installed below — confirm that no
         later reader depends on these two fields. */
      _initial_asso_value = best_initial_asso_value;
      _jump = best_jump;
      memcpy (_asso_values, best_asso_values,
              _alpha_size * sizeof (_asso_values[0]));
      delete[] best_asso_values;
      /* The keywords' _hash_value fields are recomputed below. */
    }
}
1573 | |
1574 | /* ========================================================================= */ |
1575 | |
1576 | /* Comparison function for sorting by increasing _hash_value. */ |
1577 | static bool |
1578 | less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2) |
1579 | { |
1580 | return keyword1->_hash_value < keyword2->_hash_value; |
1581 | } |
1582 | |
/* Sorts the keyword list by hash value. */

void
Search::sort ()
{
  /* Merge-sort the linked list in order of increasing _hash_value,
     using the comparison function above. */
  _head = mergesort_list (_head, less_by_hash_value);
}
1590 | |
/* Runs the whole search: byte positions, alpha increments, asso_values,
   then a final sanity check, sorting, and post-processing of the
   _asso_values[] table.  */
void
Search::optimize ()
{
  /* Preparations. */
  prepare ();

  /* Step 1: Finding good byte positions. */
  find_positions ();

  /* Step 2: Finding good alpha increments. */
  find_alpha_inc ();

  /* Step 3: Finding good asso_values. */
  find_good_asso_values ();

  /* Make one final check, just to make sure nothing weird happened.... */
  _collision_detector->clear ();
  for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
    {
      KeywordExt *curr = curr_ptr->first();
      unsigned int hashcode = compute_hash (curr);
      if (_collision_detector->set_bit (hashcode))
        {
          /* This shouldn't happen. proj1, proj2, proj3 must have been
             computed to be injective on the given keyword set. */
          fprintf (stderr,
                   "\nInternal error, unexpected duplicate hash code\n" );
          if (option[POSITIONS])
            fprintf (stderr, "try options -m or -r, or use new key positions.\n\n" );
          else
            fprintf (stderr, "try options -m or -r.\n\n" );
          exit (1);
        }
    }

  /* Sorts the keyword list by hash value. */
  sort ();

  /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely
     necessary, but speeds up the lookup function in many cases of lookup
     failure: no string comparison is needed once the hash value of a string
     is larger than the hash value of any keyword. */
  int max_hash_value;
  {
    /* The list is now sorted by hash value, so the maximum is at the
       tail. */
    KeywordExt_List *temp;
    for (temp = _head; temp->rest(); temp = temp->rest())
      ;
    max_hash_value = temp->first()->_hash_value;
  }
  for (unsigned int c = 0; c < _alpha_size; c++)
    if (_occurrences[c] == 0)
      _asso_values[c] = max_hash_value + 1;

  /* Propagate unified asso_values: a character that was unified onto
     another one gets that character's associated value. */
  if (_alpha_unify)
    {
      for (unsigned int c = 0; c < _alpha_size; c++)
        if (_alpha_unify[c] != c)
          _asso_values[c] = _asso_values[_alpha_unify[c]];
    }
}
1652 | |
/* Destructor.  Prints out some diagnostics upon completion (in debug
   mode) and releases the arrays owned by the Search object. */

Search::~Search ()
{
  delete _collision_detector;
  if (option[DEBUG])
    {
      fprintf (stderr, "\ndumping occurrence and associated values tables\n" );

      /* Dump asso_values[] and occurrences[] for every character that
         actually occurs in some keyword. */
      for (unsigned int i = 0; i < _alpha_size; i++)
        if (_occurrences[i])
          fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n" ,
                   i, _asso_values[i], i, _occurrences[i]);

      fprintf (stderr, "end table dumping\n" );

      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n" ,
               _list_len, _total_keys, _total_duplicates, _max_key_len);

      /* Dump each keyword with its hash value, length, index and
         selchars, right-aligned in a field wide enough for the longest
         selchars sequence. */
      int field_width = _max_selchars_length;
      fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n" ,
               field_width, "selchars" );
      for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
        {
          fprintf (stderr, "%11d,%11d,%6d, " ,
                   ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
          if (field_width > ptr->first()->_selchars_length)
            fprintf (stderr, "%*s" , field_width - ptr->first()->_selchars_length, "" );
          for (int j = 0; j < ptr->first()->_selchars_length; j++)
            putc (ptr->first()->_selchars[j], stderr);
          fprintf (stderr, ", %.*s\n" ,
                   ptr->first()->_allchars_length, ptr->first()->_allchars);
        }

      fprintf (stderr, "End dumping list.\n\n" );
    }
  /* Release the arrays allocated during the search. */
  delete[] _asso_values;
  delete[] _occurrences;
  delete[] _alpha_unify;
  delete[] _alpha_inc;
}
1695 | |