1#pragma once
2
3#include <Common/Exception.h>
4#include <Common/StringUtils/StringUtils.h>
5#include <Common/UTF8Helpers.h>
6#include <Core/Defines.h>
7#include <ext/range.h>
8#include <Poco/UTF8Encoding.h>
9#include <Poco/Unicode.h>
10#include <stdint.h>
11#include <string.h>
12
13#ifdef __SSE2__
14 #include <emmintrin.h>
15#endif
16
17#ifdef __SSE4_1__
18 #include <smmintrin.h>
19#endif
20
21
22namespace DB
23{
24
25namespace ErrorCodes
26{
27 extern const int UNSUPPORTED_PARAMETER;
28 extern const int BAD_ARGUMENTS;
29}
30
31
32/** Variants for searching a substring in a string.
33 * In most cases, performance is less than Volnitsky (see Volnitsky.h).
34 */
35
36
37struct StringSearcherBase
38{
39#ifdef __SSE2__
40 static constexpr auto n = sizeof(__m128i);
41 const int page_size = getpagesize();
42
43 bool pageSafe(const void * const ptr) const
44 {
45 return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <= page_size - n;
46 }
47#endif
48};
49
50
51/// Performs case-sensitive and case-insensitive search of UTF-8 strings
52template <bool CaseSensitive, bool ASCII> class StringSearcher;
53
54/// Case-insensitive UTF-8 searcher
55template <>
56class StringSearcher<false, false> : private StringSearcherBase
57{
58private:
59 using UTF8SequenceBuffer = UInt8[6];
60
61 /// substring to be searched for
62 const UInt8 * const needle;
63 const size_t needle_size;
64 const UInt8 * const needle_end = needle + needle_size;
65 /// lower and uppercase variants of the first octet of the first character in `needle`
66 bool first_needle_symbol_is_ascii{};
67 UInt8 l{};
68 UInt8 u{};
69
70#ifdef __SSE4_1__
71 /// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
72 __m128i patl, patu;
73 /// lower and uppercase vectors of first 16 characters of `needle`
74 __m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128();
75 int cachemask{};
76 size_t cache_valid_len{};
77 size_t cache_actual_len{};
78#endif
79
80public:
81 StringSearcher(const char * const needle_, const size_t needle_size_)
82 : needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_size{needle_size_}
83 {
84 if (0 == needle_size)
85 return;
86
87 static const Poco::UTF8Encoding utf8;
88 UTF8SequenceBuffer l_seq, u_seq;
89
90 if (*needle < 0x80u)
91 {
92 first_needle_symbol_is_ascii = true;
93 l = std::tolower(*needle);
94 u = std::toupper(*needle);
95 }
96 else
97 {
98 const auto first_u32 = utf8.convert(needle);
99 const auto first_l_u32 = Poco::Unicode::toLower(first_u32);
100 const auto first_u_u32 = Poco::Unicode::toUpper(first_u32);
101
102 /// lower and uppercase variants of the first octet of the first character in `needle`
103 utf8.convert(first_l_u32, l_seq, sizeof(l_seq));
104 l = l_seq[0];
105 utf8.convert(first_u_u32, u_seq, sizeof(u_seq));
106 u = u_seq[0];
107 }
108
109#ifdef __SSE4_1__
110 /// for detecting leftmost position of the first symbol
111 patl = _mm_set1_epi8(l);
112 patu = _mm_set1_epi8(u);
113 /// lower and uppercase vectors of first 16 octets of `needle`
114
115 auto needle_pos = needle;
116
117 for (size_t i = 0; i < n;)
118 {
119 if (needle_pos == needle_end)
120 {
121 cachel = _mm_srli_si128(cachel, 1);
122 cacheu = _mm_srli_si128(cacheu, 1);
123 ++i;
124
125 continue;
126 }
127
128 const auto src_len = UTF8::seqLength(*needle_pos);
129 const auto c_u32 = utf8.convert(needle_pos);
130
131 const auto c_l_u32 = Poco::Unicode::toLower(c_u32);
132 const auto c_u_u32 = Poco::Unicode::toUpper(c_u32);
133
134 const auto dst_l_len = static_cast<UInt8>(utf8.convert(c_l_u32, l_seq, sizeof(l_seq)));
135 const auto dst_u_len = static_cast<UInt8>(utf8.convert(c_u_u32, u_seq, sizeof(u_seq)));
136
137 /// @note Unicode standard states it is a rare but possible occasion
138 if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
139 throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
140
141 cache_actual_len += src_len;
142 if (cache_actual_len < n)
143 cache_valid_len += src_len;
144
145 for (size_t j = 0; j < src_len && i < n; ++j, ++i)
146 {
147 cachel = _mm_srli_si128(cachel, 1);
148 cacheu = _mm_srli_si128(cacheu, 1);
149
150 if (needle_pos != needle_end)
151 {
152 cachel = _mm_insert_epi8(cachel, l_seq[j], n - 1);
153 cacheu = _mm_insert_epi8(cacheu, u_seq[j], n - 1);
154
155 cachemask |= 1 << i;
156 ++needle_pos;
157 }
158 }
159 }
160#endif
161 }
162
163 ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const
164 {
165 static const Poco::UTF8Encoding utf8;
166
167#ifdef __SSE4_1__
168 if (pageSafe(pos))
169 {
170 const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
171 const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
172 const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
173 const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
174 const auto mask = _mm_movemask_epi8(v_against_l_or_u);
175
176 if (0xffff == cachemask)
177 {
178 if (mask == cachemask)
179 {
180 pos += cache_valid_len;
181 auto needle_pos = needle + cache_valid_len;
182
183 while (needle_pos < needle_end &&
184 Poco::Unicode::toLower(utf8.convert(pos)) ==
185 Poco::Unicode::toLower(utf8.convert(needle_pos)))
186 {
187 /// @note assuming sequences for lowercase and uppercase have exact same length
188 const auto len = UTF8::seqLength(*pos);
189 pos += len;
190 needle_pos += len;
191 }
192
193 if (needle_pos == needle_end)
194 return true;
195 }
196 }
197 else if ((mask & cachemask) == cachemask)
198 return true;
199
200 return false;
201 }
202#endif
203
204 if (*pos == l || *pos == u)
205 {
206 pos += first_needle_symbol_is_ascii;
207 auto needle_pos = needle + first_needle_symbol_is_ascii;
208
209 while (needle_pos < needle_end &&
210 Poco::Unicode::toLower(utf8.convert(pos)) ==
211 Poco::Unicode::toLower(utf8.convert(needle_pos)))
212 {
213 const auto len = UTF8::seqLength(*pos);
214 pos += len;
215 needle_pos += len;
216 }
217
218 if (needle_pos == needle_end)
219 return true;
220 }
221
222 return false;
223 }
224
225 const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
226 {
227 if (0 == needle_size)
228 return haystack;
229
230 static const Poco::UTF8Encoding utf8;
231
232 while (haystack < haystack_end)
233 {
234#ifdef __SSE4_1__
235 if (haystack + n <= haystack_end && pageSafe(haystack))
236 {
237 const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
238 const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
239 const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
240 const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
241
242 const auto mask = _mm_movemask_epi8(v_against_l_or_u);
243
244 if (mask == 0)
245 {
246 haystack += n;
247 UTF8::syncForward(haystack, haystack_end);
248 continue;
249 }
250
251 const auto offset = __builtin_ctz(mask);
252 haystack += offset;
253
254 if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
255 {
256 const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
257 const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
258 const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
259 const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
260 const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset);
261
262 if (0xffff == cachemask)
263 {
264 if (mask_offset == cachemask)
265 {
266 auto haystack_pos = haystack + cache_valid_len;
267 auto needle_pos = needle + cache_valid_len;
268
269 while (haystack_pos < haystack_end && needle_pos < needle_end &&
270 Poco::Unicode::toLower(utf8.convert(haystack_pos)) ==
271 Poco::Unicode::toLower(utf8.convert(needle_pos)))
272 {
273 /// @note assuming sequences for lowercase and uppercase have exact same length
274 const auto len = UTF8::seqLength(*haystack_pos);
275 haystack_pos += len;
276 needle_pos += len;
277 }
278
279 if (needle_pos == needle_end)
280 return haystack;
281 }
282 }
283 else if ((mask_offset & cachemask) == cachemask)
284 return haystack;
285
286 /// first octet was ok, but not the first 16, move to start of next sequence and reapply
287 haystack += UTF8::seqLength(*haystack);
288 continue;
289 }
290 }
291#endif
292
293 if (haystack == haystack_end)
294 return haystack_end;
295
296 if (*haystack == l || *haystack == u)
297 {
298 auto haystack_pos = haystack + first_needle_symbol_is_ascii;
299 auto needle_pos = needle + first_needle_symbol_is_ascii;
300
301 while (haystack_pos < haystack_end && needle_pos < needle_end &&
302 Poco::Unicode::toLower(utf8.convert(haystack_pos)) ==
303 Poco::Unicode::toLower(utf8.convert(needle_pos)))
304 {
305 const auto len = UTF8::seqLength(*haystack_pos);
306 haystack_pos += len;
307 needle_pos += len;
308 }
309
310 if (needle_pos == needle_end)
311 return haystack;
312 }
313
314 /// advance to the start of the next sequence
315 haystack += UTF8::seqLength(*haystack);
316 }
317
318 return haystack_end;
319 }
320
321 const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
322 {
323 return search(haystack, haystack + haystack_size);
324 }
325};
326
327
328/// Case-insensitive ASCII searcher
329template <>
330class StringSearcher<false, true> : private StringSearcherBase
331{
332private:
333 /// string to be searched for
334 const UInt8 * const needle;
335 const UInt8 * const needle_end;
336 /// lower and uppercase variants of the first character in `needle`
337 UInt8 l{};
338 UInt8 u{};
339
340#ifdef __SSE4_1__
341 /// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
342 __m128i patl, patu;
343 /// lower and uppercase vectors of first 16 characters of `needle`
344 __m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128();
345 int cachemask{};
346#endif
347
348public:
349 StringSearcher(const char * const needle_, const size_t needle_size)
350 : needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_end{needle + needle_size}
351 {
352 if (0 == needle_size)
353 return;
354
355 l = static_cast<UInt8>(std::tolower(*needle));
356 u = static_cast<UInt8>(std::toupper(*needle));
357
358#ifdef __SSE4_1__
359 patl = _mm_set1_epi8(l);
360 patu = _mm_set1_epi8(u);
361
362 auto needle_pos = needle;
363
364 for (const auto i : ext::range(0, n))
365 {
366 cachel = _mm_srli_si128(cachel, 1);
367 cacheu = _mm_srli_si128(cacheu, 1);
368
369 if (needle_pos != needle_end)
370 {
371 cachel = _mm_insert_epi8(cachel, std::tolower(*needle_pos), n - 1);
372 cacheu = _mm_insert_epi8(cacheu, std::toupper(*needle_pos), n - 1);
373 cachemask |= 1 << i;
374 ++needle_pos;
375 }
376 }
377#endif
378 }
379
380 ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const
381 {
382#ifdef __SSE4_1__
383 if (pageSafe(pos))
384 {
385 const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
386 const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
387 const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
388 const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
389 const auto mask = _mm_movemask_epi8(v_against_l_or_u);
390
391 if (0xffff == cachemask)
392 {
393 if (mask == cachemask)
394 {
395 pos += n;
396 auto needle_pos = needle + n;
397
398 while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos))
399 {
400 ++pos;
401 ++needle_pos;
402 }
403
404 if (needle_pos == needle_end)
405 return true;
406 }
407 }
408 else if ((mask & cachemask) == cachemask)
409 return true;
410
411 return false;
412 }
413#endif
414
415 if (*pos == l || *pos == u)
416 {
417 ++pos;
418 auto needle_pos = needle + 1;
419
420 while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos))
421 {
422 ++pos;
423 ++needle_pos;
424 }
425
426 if (needle_pos == needle_end)
427 return true;
428 }
429
430 return false;
431 }
432
433 const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
434 {
435 if (needle == needle_end)
436 return haystack;
437
438 while (haystack < haystack_end)
439 {
440#ifdef __SSE4_1__
441 if (haystack + n <= haystack_end && pageSafe(haystack))
442 {
443 const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
444 const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
445 const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
446 const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
447
448 const auto mask = _mm_movemask_epi8(v_against_l_or_u);
449
450 if (mask == 0)
451 {
452 haystack += n;
453 continue;
454 }
455
456 const auto offset = __builtin_ctz(mask);
457 haystack += offset;
458
459 if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
460 {
461 const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
462 const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
463 const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
464 const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
465 const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset);
466
467 if (0xffff == cachemask)
468 {
469 if (mask_offset == cachemask)
470 {
471 auto haystack_pos = haystack + n;
472 auto needle_pos = needle + n;
473
474 while (haystack_pos < haystack_end && needle_pos < needle_end &&
475 std::tolower(*haystack_pos) == std::tolower(*needle_pos))
476 {
477 ++haystack_pos;
478 ++needle_pos;
479 }
480
481 if (needle_pos == needle_end)
482 return haystack;
483 }
484 }
485 else if ((mask_offset & cachemask) == cachemask)
486 return haystack;
487
488 ++haystack;
489 continue;
490 }
491 }
492#endif
493
494 if (haystack == haystack_end)
495 return haystack_end;
496
497 if (*haystack == l || *haystack == u)
498 {
499 auto haystack_pos = haystack + 1;
500 auto needle_pos = needle + 1;
501
502 while (haystack_pos < haystack_end && needle_pos < needle_end &&
503 std::tolower(*haystack_pos) == std::tolower(*needle_pos))
504 {
505 ++haystack_pos;
506 ++needle_pos;
507 }
508
509 if (needle_pos == needle_end)
510 return haystack;
511 }
512
513 ++haystack;
514 }
515
516 return haystack_end;
517 }
518
519 const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
520 {
521 return search(haystack, haystack + haystack_size);
522 }
523};
524
525
526/// Case-sensitive searcher (both ASCII and UTF-8)
527template <bool ASCII>
528class StringSearcher<true, ASCII> : private StringSearcherBase
529{
530private:
531 /// string to be searched for
532 const UInt8 * const needle;
533 const UInt8 * const needle_end;
534 /// first character in `needle`
535 UInt8 first{};
536
537#ifdef __SSE4_1__
538 /// vector filled `first` for determining leftmost position of the first symbol
539 __m128i pattern;
540 /// vector of first 16 characters of `needle`
541 __m128i cache = _mm_setzero_si128();
542 int cachemask{};
543#endif
544
545public:
546 StringSearcher(const char * const needle_, const size_t needle_size)
547 : needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_end{needle + needle_size}
548 {
549 if (0 == needle_size)
550 return;
551
552 first = *needle;
553
554#ifdef __SSE4_1__
555 pattern = _mm_set1_epi8(first);
556
557 auto needle_pos = needle;
558
559 for (const auto i : ext::range(0, n))
560 {
561 cache = _mm_srli_si128(cache, 1);
562
563 if (needle_pos != needle_end)
564 {
565 cache = _mm_insert_epi8(cache, *needle_pos, n - 1);
566 cachemask |= 1 << i;
567 ++needle_pos;
568 }
569 }
570#endif
571 }
572
573 ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const
574 {
575#ifdef __SSE4_1__
576 if (pageSafe(pos))
577 {
578 const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
579 const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache);
580 const auto mask = _mm_movemask_epi8(v_against_cache);
581
582 if (0xffff == cachemask)
583 {
584 if (mask == cachemask)
585 {
586 pos += n;
587 auto needle_pos = needle + n;
588
589 while (needle_pos < needle_end && *pos == *needle_pos)
590 ++pos, ++needle_pos;
591
592 if (needle_pos == needle_end)
593 return true;
594 }
595 }
596 else if ((mask & cachemask) == cachemask)
597 return true;
598
599 return false;
600 }
601#endif
602
603 if (*pos == first)
604 {
605 ++pos;
606 auto needle_pos = needle + 1;
607
608 while (needle_pos < needle_end && *pos == *needle_pos)
609 ++pos, ++needle_pos;
610
611 if (needle_pos == needle_end)
612 return true;
613 }
614
615 return false;
616 }
617
618 const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
619 {
620 if (needle == needle_end)
621 return haystack;
622
623 while (haystack < haystack_end)
624 {
625#ifdef __SSE4_1__
626 if (haystack + n <= haystack_end && pageSafe(haystack))
627 {
628 /// find first character
629 const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
630 const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
631
632 const auto mask = _mm_movemask_epi8(v_against_pattern);
633
634 /// first character not present in 16 octets starting at `haystack`
635 if (mask == 0)
636 {
637 haystack += n;
638 continue;
639 }
640
641 const auto offset = __builtin_ctz(mask);
642 haystack += offset;
643
644 if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
645 {
646 /// check for first 16 octets
647 const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
648 const auto v_against_cache = _mm_cmpeq_epi8(v_haystack_offset, cache);
649 const auto mask_offset = _mm_movemask_epi8(v_against_cache);
650
651 if (0xffff == cachemask)
652 {
653 if (mask_offset == cachemask)
654 {
655 auto haystack_pos = haystack + n;
656 auto needle_pos = needle + n;
657
658 while (haystack_pos < haystack_end && needle_pos < needle_end &&
659 *haystack_pos == *needle_pos)
660 ++haystack_pos, ++needle_pos;
661
662 if (needle_pos == needle_end)
663 return haystack;
664 }
665 }
666 else if ((mask_offset & cachemask) == cachemask)
667 return haystack;
668
669 ++haystack;
670 continue;
671 }
672 }
673#endif
674
675 if (haystack == haystack_end)
676 return haystack_end;
677
678 if (*haystack == first)
679 {
680 auto haystack_pos = haystack + 1;
681 auto needle_pos = needle + 1;
682
683 while (haystack_pos < haystack_end && needle_pos < needle_end &&
684 *haystack_pos == *needle_pos)
685 ++haystack_pos, ++needle_pos;
686
687 if (needle_pos == needle_end)
688 return haystack;
689 }
690
691 ++haystack;
692 }
693
694 return haystack_end;
695 }
696
697 const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
698 {
699 return search(haystack, haystack + haystack_size);
700 }
701};
702
703// Searches for needle surrounded by token-separators.
704// Separators are anything inside ASCII (0-128) and not alphanum.
705// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
706// should work just fine. But any Unicode whitespace is not considered a token separtor.
707template <typename StringSearcher>
708class TokenSearcher
709{
710 StringSearcher searcher;
711 size_t needle_size;
712
713public:
714 TokenSearcher(const char * const needle_, const size_t needle_size_)
715 : searcher{needle_, needle_size_},
716 needle_size(needle_size_)
717 {
718 if (std::any_of(reinterpret_cast<const UInt8 *>(needle_), reinterpret_cast<const UInt8 *>(needle_) + needle_size_, isTokenSeparator))
719 {
720 throw Exception{"Needle must not contain whitespace or separator characters", ErrorCodes::BAD_ARGUMENTS};
721 }
722
723 }
724
725 ALWAYS_INLINE bool compare(const UInt8 * haystack, const UInt8 * haystack_end, const UInt8 * pos) const
726 {
727 // use searcher only if pos is in the beginning of token and pos + searcher.needle_size is end of token.
728 if (isToken(haystack, haystack_end, pos))
729 return searcher.compare(haystack, haystack_end, pos);
730
731 return false;
732 }
733
734 const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
735 {
736 // use searcher.search(), then verify that returned value is a token
737 // if it is not, skip it and re-run
738
739 const UInt8 * pos = haystack;
740 while (pos < haystack_end)
741 {
742 pos = searcher.search(pos, haystack_end);
743 if (pos == haystack_end || isToken(haystack, haystack_end, pos))
744 return pos;
745
746 // assuming that heendle does not contain any token separators.
747 pos += needle_size;
748 }
749 return haystack_end;
750 }
751
752 const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
753 {
754 return search(haystack, haystack + haystack_size);
755 }
756
757 ALWAYS_INLINE bool isToken(const UInt8 * haystack, const UInt8 * const haystack_end, const UInt8* p) const
758 {
759 return (p == haystack || isTokenSeparator(*(p - 1)))
760 && (p + needle_size >= haystack_end || isTokenSeparator(*(p + needle_size)));
761 }
762
763 ALWAYS_INLINE static bool isTokenSeparator(const UInt8 c)
764 {
765 if (isAlphaNumericASCII(c) || !isASCII(c))
766 return false;
767
768 return true;
769 }
770};
771
772
773using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>;
774using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
775using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
776using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
777using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
778using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
779
780
781/** Uses functions from libc.
782 * It makes sense to use only with short haystacks when cheap initialization is required.
783 * There is no option for case-insensitive search for UTF-8 strings.
784 * It is required that strings are zero-terminated.
785 */
786
787struct LibCASCIICaseSensitiveStringSearcher
788{
789 const char * const needle;
790
791 LibCASCIICaseSensitiveStringSearcher(const char * const needle_, const size_t /* needle_size */)
792 : needle(needle_) {}
793
794 const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
795 {
796 auto res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
797 if (!res)
798 return haystack_end;
799 return reinterpret_cast<const UInt8 *>(res);
800 }
801
802 const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
803 {
804 return search(haystack, haystack + haystack_size);
805 }
806};
807
808struct LibCASCIICaseInsensitiveStringSearcher
809{
810 const char * const needle;
811
812 LibCASCIICaseInsensitiveStringSearcher(const char * const needle_, const size_t /* needle_size */)
813 : needle(needle_) {}
814
815 const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
816 {
817 auto res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
818 if (!res)
819 return haystack_end;
820 return reinterpret_cast<const UInt8 *>(res);
821 }
822
823 const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
824 {
825 return search(haystack, haystack + haystack_size);
826 }
827};
828
829
830}
831