1/* Byte-wise substring search, using the Two-Way algorithm.
2 Copyright (C) 2008-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Written by Eric Blake <ebb9@byu.net>, 2008.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19/* Before including this file, you need to include <config.h> and
20 <string.h>, and define:
21 RESULT_TYPE A macro that expands to the return type.
22 AVAILABLE(h, h_l, j, n_l)
23 A macro that returns nonzero if there are
24 at least N_L bytes left starting at H[J].
25 H is 'unsigned char *', H_L, J, and N_L
26 are 'size_t'; H_L is an lvalue. For
27 NUL-terminated searches, H_L can be
28 modified each iteration to avoid having
29 to compute the end of H up front.
30
31 For case-insensitivity, you may optionally define:
32 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
33 characters of P1 and P2 are equal.
34 CANON_ELEMENT(c) A macro that canonicalizes an element right after
35 it has been fetched from one of the two strings.
36 The argument is an 'unsigned char'; the result
37 must be an 'unsigned char' as well.
38
39 This file undefines the macros documented above, and defines
40 LONG_NEEDLE_THRESHOLD.
41*/
42
43#include <limits.h>
44#include <stdint.h>
45
46/* We use the Two-Way string matching algorithm (also known as
47 Chrochemore-Perrin), which guarantees linear complexity with
48 constant space. Additionally, for long needles, we also use a bad
49 character shift table similar to the Boyer-Moore algorithm to
50 achieve improved (potentially sub-linear) performance.
51
52 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
53 http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55*/
56
57/* Point at which computing a bad-byte shift table is likely to be
58 worthwhile. Small needles should not compute a table, since it
59 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60 speedup no greater than a factor of NEEDLE_LEN. The larger the
61 needle, the better the potential performance gain. On the other
62 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63 memory required for the table is prohibitive. */
64#if CHAR_BIT < 10
65# define LONG_NEEDLE_THRESHOLD 32U
66#else
67# define LONG_NEEDLE_THRESHOLD SIZE_MAX
68#endif
69
70#ifndef MAX
71# define MAX(a, b) ((a < b) ? (b) : (a))
72#endif
73
74#ifndef CANON_ELEMENT
75# define CANON_ELEMENT(c) c
76#endif
77#ifndef CMP_FUNC
78# define CMP_FUNC memcmp
79#endif
80
81/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
82 Return the index of the first byte in the right half, and set
83 *PERIOD to the global period of the right half.
84
85 The global period of a string is the smallest index (possibly its
86 length) at which all remaining bytes in the string are repetitions
87 of the prefix (the last repetition may be a subset of the prefix).
88
89 When NEEDLE is factored into two halves, a local period is the
90 length of the smallest word that shares a suffix with the left half
91 and shares a prefix with the right half. All factorizations of a
92 non-empty NEEDLE have a local period of at least 1 and no greater
93 than NEEDLE_LEN.
94
95 A critical factorization has the property that the local period
96 equals the global period. All strings have at least one critical
97 factorization with the left half smaller than the global period.
98 And while some strings have more than one critical factorization,
99 it is provable that with an ordered alphabet, at least one of the
100 critical factorizations corresponds to a maximal suffix.
101
102 Given an ordered alphabet, a critical factorization can be computed
103 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
104 shorter of two ordered maximal suffixes. The ordered maximal
105 suffixes are determined by lexicographic comparison while tracking
106 periodicity. */
107static size_t
108critical_factorization (const unsigned char *needle, size_t needle_len,
109 size_t *period)
110{
111 /* Index of last byte of left half, or SIZE_MAX. */
112 size_t max_suffix, max_suffix_rev;
113 size_t j; /* Index into NEEDLE for current candidate suffix. */
114 size_t k; /* Offset into current period. */
115 size_t p; /* Intermediate period. */
116 unsigned char a, b; /* Current comparison bytes. */
117
118 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119 out 0-length needles. */
120 if (needle_len < 3)
121 {
122 *period = 1;
123 return needle_len - 1;
124 }
125
126 /* Invariants:
127 0 <= j < NEEDLE_LEN - 1
128 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129 min(max_suffix, max_suffix_rev) < global period of NEEDLE
130 1 <= p <= global period of NEEDLE
131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
132 1 <= k <= p
133 */
134
135 /* Perform lexicographic search. */
136 max_suffix = SIZE_MAX;
137 j = 0;
138 k = p = 1;
139 while (j + k < needle_len)
140 {
141 a = CANON_ELEMENT (needle[j + k]);
142 b = CANON_ELEMENT (needle[max_suffix + k]);
143 if (a < b)
144 {
145 /* Suffix is smaller, period is entire prefix so far. */
146 j += k;
147 k = 1;
148 p = j - max_suffix;
149 }
150 else if (a == b)
151 {
152 /* Advance through repetition of the current period. */
153 if (k != p)
154 ++k;
155 else
156 {
157 j += p;
158 k = 1;
159 }
160 }
161 else /* b < a */
162 {
163 /* Suffix is larger, start over from current location. */
164 max_suffix = j++;
165 k = p = 1;
166 }
167 }
168 *period = p;
169
170 /* Perform reverse lexicographic search. */
171 max_suffix_rev = SIZE_MAX;
172 j = 0;
173 k = p = 1;
174 while (j + k < needle_len)
175 {
176 a = CANON_ELEMENT (needle[j + k]);
177 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
178 if (b < a)
179 {
180 /* Suffix is smaller, period is entire prefix so far. */
181 j += k;
182 k = 1;
183 p = j - max_suffix_rev;
184 }
185 else if (a == b)
186 {
187 /* Advance through repetition of the current period. */
188 if (k != p)
189 ++k;
190 else
191 {
192 j += p;
193 k = 1;
194 }
195 }
196 else /* a < b */
197 {
198 /* Suffix is larger, start over from current location. */
199 max_suffix_rev = j++;
200 k = p = 1;
201 }
202 }
203
204 /* Choose the shorter suffix. Return the index of the first byte of
205 the right half, rather than the last byte of the left half.
206
207 For some examples, 'banana' has two critical factorizations, both
208 exposed by the two lexicographic extreme suffixes of 'anana' and
209 'nana', where both suffixes have a period of 2. On the other
210 hand, with 'aab' and 'bba', both strings have a single critical
211 factorization of the last byte, with the suffix having a period
212 of 1. While the maximal lexicographic suffix of 'aab' is 'b',
213 the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214 critical factorization. Conversely, the maximal reverse
215 lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216 'aab'. The shorter suffix of the two will always be a critical
217 factorization. */
218 if (max_suffix_rev + 1 < max_suffix + 1)
219 return max_suffix + 1;
220 *period = p;
221 return max_suffix_rev + 1;
222}
223
224/* Return the first location of non-empty NEEDLE within HAYSTACK, or
225 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
226 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227 Performance is guaranteed to be linear, with an initialization cost
228 of 2 * NEEDLE_LEN comparisons.
229
230 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
234static RETURN_TYPE
235two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
236 const unsigned char *needle, size_t needle_len)
237{
238 size_t i; /* Index into current byte of NEEDLE. */
239 size_t j; /* Index into current window of HAYSTACK. */
240 size_t period; /* The period of the right half of needle. */
241 size_t suffix; /* The index of the right half of needle. */
242
243 /* Factor the needle into two halves, such that the left half is
244 smaller than the global period, and the right half is
245 periodic (with a period as large as NEEDLE_LEN - suffix). */
246 suffix = critical_factorization (needle, needle_len, &period);
247
248 /* Perform the search. Each iteration compares the right half
249 first. */
250 if (CMP_FUNC (needle, needle + period, suffix) == 0)
251 {
252 /* Entire needle is periodic; a mismatch in the left half can
253 only advance by the period, so use memory to avoid rescanning
254 known occurrences of the period in the right half. */
255 size_t memory = 0;
256 j = 0;
257 while (AVAILABLE (haystack, haystack_len, j, needle_len))
258 {
259 /* Scan for matches in right half. */
260 i = MAX (suffix, memory);
261 while (i < needle_len && (CANON_ELEMENT (needle[i])
262 == CANON_ELEMENT (haystack[i + j])))
263 ++i;
264 if (needle_len <= i)
265 {
266 /* Scan for matches in left half. */
267 i = suffix - 1;
268 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
269 == CANON_ELEMENT (haystack[i + j])))
270 --i;
271 if (i + 1 < memory + 1)
272 return (RETURN_TYPE) (haystack + j);
273 /* No match, so remember how many repetitions of period
274 on the right half were scanned. */
275 j += period;
276 memory = needle_len - period;
277 }
278 else
279 {
280 j += i - suffix + 1;
281 memory = 0;
282 }
283 }
284 }
285 else
286 {
287 /* The two halves of needle are distinct; no extra memory is
288 required, and any mismatch results in a maximal shift. */
289 period = MAX (suffix, needle_len - suffix) + 1;
290 j = 0;
291 while (AVAILABLE (haystack, haystack_len, j, needle_len))
292 {
293 /* Scan for matches in right half. */
294 i = suffix;
295 while (i < needle_len && (CANON_ELEMENT (needle[i])
296 == CANON_ELEMENT (haystack[i + j])))
297 ++i;
298 if (needle_len <= i)
299 {
300 /* Scan for matches in left half. */
301 i = suffix - 1;
302 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
303 == CANON_ELEMENT (haystack[i + j])))
304 --i;
305 if (i == SIZE_MAX)
306 return (RETURN_TYPE) (haystack + j);
307 j += period;
308 }
309 else
310 j += i - suffix + 1;
311 }
312 }
313 return NULL;
314}
315
316/* Return the first location of non-empty NEEDLE within HAYSTACK, or
317 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
318 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319 Performance is guaranteed to be linear, with an initialization cost
320 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
321
322 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327 sublinear performance is not possible. */
328static RETURN_TYPE
329two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330 const unsigned char *needle, size_t needle_len)
331{
332 size_t i; /* Index into current byte of NEEDLE. */
333 size_t j; /* Index into current window of HAYSTACK. */
334 size_t period; /* The period of the right half of needle. */
335 size_t suffix; /* The index of the right half of needle. */
336 size_t shift_table[1U << CHAR_BIT]; /* See below. */
337
338 /* Factor the needle into two halves, such that the left half is
339 smaller than the global period, and the right half is
340 periodic (with a period as large as NEEDLE_LEN - suffix). */
341 suffix = critical_factorization (needle, needle_len, &period);
342
343 /* Populate shift_table. For each possible byte value c,
344 shift_table[c] is the distance from the last occurrence of c to
345 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
347 for (i = 0; i < 1U << CHAR_BIT; i++)
348 shift_table[i] = needle_len;
349 for (i = 0; i < needle_len; i++)
350 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
351
352 /* Perform the search. Each iteration compares the right half
353 first. */
354 if (CMP_FUNC (needle, needle + period, suffix) == 0)
355 {
356 /* Entire needle is periodic; a mismatch in the left half can
357 only advance by the period, so use memory to avoid rescanning
358 known occurrences of the period in the right half. */
359 size_t memory = 0;
360 size_t shift;
361 j = 0;
362 while (AVAILABLE (haystack, haystack_len, j, needle_len))
363 {
364 /* Check the last byte first; if it does not match, then
365 shift to the next possible match location. */
366 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
367 if (0 < shift)
368 {
369 if (memory && shift < period)
370 {
371 /* Since needle is periodic, but the last period has
372 a byte out of place, there can be no match until
373 after the mismatch. */
374 shift = needle_len - period;
375 }
376 memory = 0;
377 j += shift;
378 continue;
379 }
380 /* Scan for matches in right half. The last byte has
381 already been matched, by virtue of the shift table. */
382 i = MAX (suffix, memory);
383 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
384 == CANON_ELEMENT (haystack[i + j])))
385 ++i;
386 if (needle_len - 1 <= i)
387 {
388 /* Scan for matches in left half. */
389 i = suffix - 1;
390 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
391 == CANON_ELEMENT (haystack[i + j])))
392 --i;
393 if (i + 1 < memory + 1)
394 return (RETURN_TYPE) (haystack + j);
395 /* No match, so remember how many repetitions of period
396 on the right half were scanned. */
397 j += period;
398 memory = needle_len - period;
399 }
400 else
401 {
402 j += i - suffix + 1;
403 memory = 0;
404 }
405 }
406 }
407 else
408 {
409 /* The two halves of needle are distinct; no extra memory is
410 required, and any mismatch results in a maximal shift. */
411 size_t shift;
412 period = MAX (suffix, needle_len - suffix) + 1;
413 j = 0;
414 while (AVAILABLE (haystack, haystack_len, j, needle_len))
415 {
416 /* Check the last byte first; if it does not match, then
417 shift to the next possible match location. */
418 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
419 if (0 < shift)
420 {
421 j += shift;
422 continue;
423 }
424 /* Scan for matches in right half. The last byte has
425 already been matched, by virtue of the shift table. */
426 i = suffix;
427 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
428 == CANON_ELEMENT (haystack[i + j])))
429 ++i;
430 if (needle_len - 1 <= i)
431 {
432 /* Scan for matches in left half. */
433 i = suffix - 1;
434 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
435 == CANON_ELEMENT (haystack[i + j])))
436 --i;
437 if (i == SIZE_MAX)
438 return (RETURN_TYPE) (haystack + j);
439 j += period;
440 }
441 else
442 j += i - suffix + 1;
443 }
444 }
445 return NULL;
446}
447
448#undef AVAILABLE
449#undef CANON_ELEMENT
450#undef CMP_FUNC
451#undef MAX
452#undef RETURN_TYPE
453