1 | #pragma once |
2 | |
3 | #include <Common/Exception.h> |
4 | #include <Common/StringUtils/StringUtils.h> |
5 | #include <Common/UTF8Helpers.h> |
6 | #include <Core/Defines.h> |
7 | #include <ext/range.h> |
8 | #include <Poco/UTF8Encoding.h> |
9 | #include <Poco/Unicode.h> |
10 | #include <stdint.h> |
11 | #include <string.h> |
12 | |
13 | #ifdef __SSE2__ |
14 | #include <emmintrin.h> |
15 | #endif |
16 | |
17 | #ifdef __SSE4_1__ |
18 | #include <smmintrin.h> |
19 | #endif |
20 | |
21 | |
22 | namespace DB |
23 | { |
24 | |
25 | namespace ErrorCodes |
26 | { |
27 | extern const int UNSUPPORTED_PARAMETER; |
28 | extern const int BAD_ARGUMENTS; |
29 | } |
30 | |
31 | |
32 | /** Variants for searching a substring in a string. |
33 | * In most cases, performance is less than Volnitsky (see Volnitsky.h). |
34 | */ |
35 | |
36 | |
37 | struct StringSearcherBase |
38 | { |
39 | #ifdef __SSE2__ |
40 | static constexpr auto n = sizeof(__m128i); |
41 | const int page_size = getpagesize(); |
42 | |
43 | bool pageSafe(const void * const ptr) const |
44 | { |
45 | return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <= page_size - n; |
46 | } |
47 | #endif |
48 | }; |
49 | |
50 | |
51 | /// Performs case-sensitive and case-insensitive search of UTF-8 strings |
52 | template <bool CaseSensitive, bool ASCII> class StringSearcher; |
53 | |
54 | /// Case-insensitive UTF-8 searcher |
55 | template <> |
56 | class StringSearcher<false, false> : private StringSearcherBase |
57 | { |
58 | private: |
59 | using UTF8SequenceBuffer = UInt8[6]; |
60 | |
61 | /// substring to be searched for |
62 | const UInt8 * const needle; |
63 | const size_t needle_size; |
64 | const UInt8 * const needle_end = needle + needle_size; |
65 | /// lower and uppercase variants of the first octet of the first character in `needle` |
66 | bool first_needle_symbol_is_ascii{}; |
67 | UInt8 l{}; |
68 | UInt8 u{}; |
69 | |
70 | #ifdef __SSE4_1__ |
71 | /// vectors filled with `l` and `u`, for determining leftmost position of the first symbol |
72 | __m128i patl, patu; |
73 | /// lower and uppercase vectors of first 16 characters of `needle` |
74 | __m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128(); |
75 | int cachemask{}; |
76 | size_t cache_valid_len{}; |
77 | size_t cache_actual_len{}; |
78 | #endif |
79 | |
80 | public: |
81 | StringSearcher(const char * const needle_, const size_t needle_size_) |
82 | : needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_size{needle_size_} |
83 | { |
84 | if (0 == needle_size) |
85 | return; |
86 | |
87 | static const Poco::UTF8Encoding utf8; |
88 | UTF8SequenceBuffer l_seq, u_seq; |
89 | |
90 | if (*needle < 0x80u) |
91 | { |
92 | first_needle_symbol_is_ascii = true; |
93 | l = std::tolower(*needle); |
94 | u = std::toupper(*needle); |
95 | } |
96 | else |
97 | { |
98 | const auto first_u32 = utf8.convert(needle); |
99 | const auto first_l_u32 = Poco::Unicode::toLower(first_u32); |
100 | const auto first_u_u32 = Poco::Unicode::toUpper(first_u32); |
101 | |
102 | /// lower and uppercase variants of the first octet of the first character in `needle` |
103 | utf8.convert(first_l_u32, l_seq, sizeof(l_seq)); |
104 | l = l_seq[0]; |
105 | utf8.convert(first_u_u32, u_seq, sizeof(u_seq)); |
106 | u = u_seq[0]; |
107 | } |
108 | |
109 | #ifdef __SSE4_1__ |
110 | /// for detecting leftmost position of the first symbol |
111 | patl = _mm_set1_epi8(l); |
112 | patu = _mm_set1_epi8(u); |
113 | /// lower and uppercase vectors of first 16 octets of `needle` |
114 | |
115 | auto needle_pos = needle; |
116 | |
117 | for (size_t i = 0; i < n;) |
118 | { |
119 | if (needle_pos == needle_end) |
120 | { |
121 | cachel = _mm_srli_si128(cachel, 1); |
122 | cacheu = _mm_srli_si128(cacheu, 1); |
123 | ++i; |
124 | |
125 | continue; |
126 | } |
127 | |
128 | const auto src_len = UTF8::seqLength(*needle_pos); |
129 | const auto c_u32 = utf8.convert(needle_pos); |
130 | |
131 | const auto c_l_u32 = Poco::Unicode::toLower(c_u32); |
132 | const auto c_u_u32 = Poco::Unicode::toUpper(c_u32); |
133 | |
134 | const auto dst_l_len = static_cast<UInt8>(utf8.convert(c_l_u32, l_seq, sizeof(l_seq))); |
135 | const auto dst_u_len = static_cast<UInt8>(utf8.convert(c_u_u32, u_seq, sizeof(u_seq))); |
136 | |
137 | /// @note Unicode standard states it is a rare but possible occasion |
138 | if (!(dst_l_len == dst_u_len && dst_u_len == src_len)) |
139 | throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported" , ErrorCodes::UNSUPPORTED_PARAMETER}; |
140 | |
141 | cache_actual_len += src_len; |
142 | if (cache_actual_len < n) |
143 | cache_valid_len += src_len; |
144 | |
145 | for (size_t j = 0; j < src_len && i < n; ++j, ++i) |
146 | { |
147 | cachel = _mm_srli_si128(cachel, 1); |
148 | cacheu = _mm_srli_si128(cacheu, 1); |
149 | |
150 | if (needle_pos != needle_end) |
151 | { |
152 | cachel = _mm_insert_epi8(cachel, l_seq[j], n - 1); |
153 | cacheu = _mm_insert_epi8(cacheu, u_seq[j], n - 1); |
154 | |
155 | cachemask |= 1 << i; |
156 | ++needle_pos; |
157 | } |
158 | } |
159 | } |
160 | #endif |
161 | } |
162 | |
163 | ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const |
164 | { |
165 | static const Poco::UTF8Encoding utf8; |
166 | |
167 | #ifdef __SSE4_1__ |
168 | if (pageSafe(pos)) |
169 | { |
170 | const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)); |
171 | const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); |
172 | const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu); |
173 | const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u); |
174 | const auto mask = _mm_movemask_epi8(v_against_l_or_u); |
175 | |
176 | if (0xffff == cachemask) |
177 | { |
178 | if (mask == cachemask) |
179 | { |
180 | pos += cache_valid_len; |
181 | auto needle_pos = needle + cache_valid_len; |
182 | |
183 | while (needle_pos < needle_end && |
184 | Poco::Unicode::toLower(utf8.convert(pos)) == |
185 | Poco::Unicode::toLower(utf8.convert(needle_pos))) |
186 | { |
187 | /// @note assuming sequences for lowercase and uppercase have exact same length |
188 | const auto len = UTF8::seqLength(*pos); |
189 | pos += len; |
190 | needle_pos += len; |
191 | } |
192 | |
193 | if (needle_pos == needle_end) |
194 | return true; |
195 | } |
196 | } |
197 | else if ((mask & cachemask) == cachemask) |
198 | return true; |
199 | |
200 | return false; |
201 | } |
202 | #endif |
203 | |
204 | if (*pos == l || *pos == u) |
205 | { |
206 | pos += first_needle_symbol_is_ascii; |
207 | auto needle_pos = needle + first_needle_symbol_is_ascii; |
208 | |
209 | while (needle_pos < needle_end && |
210 | Poco::Unicode::toLower(utf8.convert(pos)) == |
211 | Poco::Unicode::toLower(utf8.convert(needle_pos))) |
212 | { |
213 | const auto len = UTF8::seqLength(*pos); |
214 | pos += len; |
215 | needle_pos += len; |
216 | } |
217 | |
218 | if (needle_pos == needle_end) |
219 | return true; |
220 | } |
221 | |
222 | return false; |
223 | } |
224 | |
225 | const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const |
226 | { |
227 | if (0 == needle_size) |
228 | return haystack; |
229 | |
230 | static const Poco::UTF8Encoding utf8; |
231 | |
232 | while (haystack < haystack_end) |
233 | { |
234 | #ifdef __SSE4_1__ |
235 | if (haystack + n <= haystack_end && pageSafe(haystack)) |
236 | { |
237 | const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack)); |
238 | const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); |
239 | const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu); |
240 | const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u); |
241 | |
242 | const auto mask = _mm_movemask_epi8(v_against_l_or_u); |
243 | |
244 | if (mask == 0) |
245 | { |
246 | haystack += n; |
247 | UTF8::syncForward(haystack, haystack_end); |
248 | continue; |
249 | } |
250 | |
251 | const auto offset = __builtin_ctz(mask); |
252 | haystack += offset; |
253 | |
254 | if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) |
255 | { |
256 | const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack)); |
257 | const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel); |
258 | const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu); |
259 | const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset); |
260 | const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset); |
261 | |
262 | if (0xffff == cachemask) |
263 | { |
264 | if (mask_offset == cachemask) |
265 | { |
266 | auto haystack_pos = haystack + cache_valid_len; |
267 | auto needle_pos = needle + cache_valid_len; |
268 | |
269 | while (haystack_pos < haystack_end && needle_pos < needle_end && |
270 | Poco::Unicode::toLower(utf8.convert(haystack_pos)) == |
271 | Poco::Unicode::toLower(utf8.convert(needle_pos))) |
272 | { |
273 | /// @note assuming sequences for lowercase and uppercase have exact same length |
274 | const auto len = UTF8::seqLength(*haystack_pos); |
275 | haystack_pos += len; |
276 | needle_pos += len; |
277 | } |
278 | |
279 | if (needle_pos == needle_end) |
280 | return haystack; |
281 | } |
282 | } |
283 | else if ((mask_offset & cachemask) == cachemask) |
284 | return haystack; |
285 | |
286 | /// first octet was ok, but not the first 16, move to start of next sequence and reapply |
287 | haystack += UTF8::seqLength(*haystack); |
288 | continue; |
289 | } |
290 | } |
291 | #endif |
292 | |
293 | if (haystack == haystack_end) |
294 | return haystack_end; |
295 | |
296 | if (*haystack == l || *haystack == u) |
297 | { |
298 | auto haystack_pos = haystack + first_needle_symbol_is_ascii; |
299 | auto needle_pos = needle + first_needle_symbol_is_ascii; |
300 | |
301 | while (haystack_pos < haystack_end && needle_pos < needle_end && |
302 | Poco::Unicode::toLower(utf8.convert(haystack_pos)) == |
303 | Poco::Unicode::toLower(utf8.convert(needle_pos))) |
304 | { |
305 | const auto len = UTF8::seqLength(*haystack_pos); |
306 | haystack_pos += len; |
307 | needle_pos += len; |
308 | } |
309 | |
310 | if (needle_pos == needle_end) |
311 | return haystack; |
312 | } |
313 | |
314 | /// advance to the start of the next sequence |
315 | haystack += UTF8::seqLength(*haystack); |
316 | } |
317 | |
318 | return haystack_end; |
319 | } |
320 | |
321 | const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const |
322 | { |
323 | return search(haystack, haystack + haystack_size); |
324 | } |
325 | }; |
326 | |
327 | |
328 | /// Case-insensitive ASCII searcher |
329 | template <> |
330 | class StringSearcher<false, true> : private StringSearcherBase |
331 | { |
332 | private: |
333 | /// string to be searched for |
334 | const UInt8 * const needle; |
335 | const UInt8 * const needle_end; |
336 | /// lower and uppercase variants of the first character in `needle` |
337 | UInt8 l{}; |
338 | UInt8 u{}; |
339 | |
340 | #ifdef __SSE4_1__ |
341 | /// vectors filled with `l` and `u`, for determining leftmost position of the first symbol |
342 | __m128i patl, patu; |
343 | /// lower and uppercase vectors of first 16 characters of `needle` |
344 | __m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128(); |
345 | int cachemask{}; |
346 | #endif |
347 | |
348 | public: |
349 | StringSearcher(const char * const needle_, const size_t needle_size) |
350 | : needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_end{needle + needle_size} |
351 | { |
352 | if (0 == needle_size) |
353 | return; |
354 | |
355 | l = static_cast<UInt8>(std::tolower(*needle)); |
356 | u = static_cast<UInt8>(std::toupper(*needle)); |
357 | |
358 | #ifdef __SSE4_1__ |
359 | patl = _mm_set1_epi8(l); |
360 | patu = _mm_set1_epi8(u); |
361 | |
362 | auto needle_pos = needle; |
363 | |
364 | for (const auto i : ext::range(0, n)) |
365 | { |
366 | cachel = _mm_srli_si128(cachel, 1); |
367 | cacheu = _mm_srli_si128(cacheu, 1); |
368 | |
369 | if (needle_pos != needle_end) |
370 | { |
371 | cachel = _mm_insert_epi8(cachel, std::tolower(*needle_pos), n - 1); |
372 | cacheu = _mm_insert_epi8(cacheu, std::toupper(*needle_pos), n - 1); |
373 | cachemask |= 1 << i; |
374 | ++needle_pos; |
375 | } |
376 | } |
377 | #endif |
378 | } |
379 | |
380 | ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const |
381 | { |
382 | #ifdef __SSE4_1__ |
383 | if (pageSafe(pos)) |
384 | { |
385 | const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)); |
386 | const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); |
387 | const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu); |
388 | const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u); |
389 | const auto mask = _mm_movemask_epi8(v_against_l_or_u); |
390 | |
391 | if (0xffff == cachemask) |
392 | { |
393 | if (mask == cachemask) |
394 | { |
395 | pos += n; |
396 | auto needle_pos = needle + n; |
397 | |
398 | while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos)) |
399 | { |
400 | ++pos; |
401 | ++needle_pos; |
402 | } |
403 | |
404 | if (needle_pos == needle_end) |
405 | return true; |
406 | } |
407 | } |
408 | else if ((mask & cachemask) == cachemask) |
409 | return true; |
410 | |
411 | return false; |
412 | } |
413 | #endif |
414 | |
415 | if (*pos == l || *pos == u) |
416 | { |
417 | ++pos; |
418 | auto needle_pos = needle + 1; |
419 | |
420 | while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos)) |
421 | { |
422 | ++pos; |
423 | ++needle_pos; |
424 | } |
425 | |
426 | if (needle_pos == needle_end) |
427 | return true; |
428 | } |
429 | |
430 | return false; |
431 | } |
432 | |
433 | const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const |
434 | { |
435 | if (needle == needle_end) |
436 | return haystack; |
437 | |
438 | while (haystack < haystack_end) |
439 | { |
440 | #ifdef __SSE4_1__ |
441 | if (haystack + n <= haystack_end && pageSafe(haystack)) |
442 | { |
443 | const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack)); |
444 | const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); |
445 | const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu); |
446 | const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u); |
447 | |
448 | const auto mask = _mm_movemask_epi8(v_against_l_or_u); |
449 | |
450 | if (mask == 0) |
451 | { |
452 | haystack += n; |
453 | continue; |
454 | } |
455 | |
456 | const auto offset = __builtin_ctz(mask); |
457 | haystack += offset; |
458 | |
459 | if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) |
460 | { |
461 | const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack)); |
462 | const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel); |
463 | const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu); |
464 | const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset); |
465 | const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset); |
466 | |
467 | if (0xffff == cachemask) |
468 | { |
469 | if (mask_offset == cachemask) |
470 | { |
471 | auto haystack_pos = haystack + n; |
472 | auto needle_pos = needle + n; |
473 | |
474 | while (haystack_pos < haystack_end && needle_pos < needle_end && |
475 | std::tolower(*haystack_pos) == std::tolower(*needle_pos)) |
476 | { |
477 | ++haystack_pos; |
478 | ++needle_pos; |
479 | } |
480 | |
481 | if (needle_pos == needle_end) |
482 | return haystack; |
483 | } |
484 | } |
485 | else if ((mask_offset & cachemask) == cachemask) |
486 | return haystack; |
487 | |
488 | ++haystack; |
489 | continue; |
490 | } |
491 | } |
492 | #endif |
493 | |
494 | if (haystack == haystack_end) |
495 | return haystack_end; |
496 | |
497 | if (*haystack == l || *haystack == u) |
498 | { |
499 | auto haystack_pos = haystack + 1; |
500 | auto needle_pos = needle + 1; |
501 | |
502 | while (haystack_pos < haystack_end && needle_pos < needle_end && |
503 | std::tolower(*haystack_pos) == std::tolower(*needle_pos)) |
504 | { |
505 | ++haystack_pos; |
506 | ++needle_pos; |
507 | } |
508 | |
509 | if (needle_pos == needle_end) |
510 | return haystack; |
511 | } |
512 | |
513 | ++haystack; |
514 | } |
515 | |
516 | return haystack_end; |
517 | } |
518 | |
519 | const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const |
520 | { |
521 | return search(haystack, haystack + haystack_size); |
522 | } |
523 | }; |
524 | |
525 | |
526 | /// Case-sensitive searcher (both ASCII and UTF-8) |
527 | template <bool ASCII> |
528 | class StringSearcher<true, ASCII> : private StringSearcherBase |
529 | { |
530 | private: |
531 | /// string to be searched for |
532 | const UInt8 * const needle; |
533 | const UInt8 * const needle_end; |
534 | /// first character in `needle` |
535 | UInt8 first{}; |
536 | |
537 | #ifdef __SSE4_1__ |
538 | /// vector filled `first` for determining leftmost position of the first symbol |
539 | __m128i pattern; |
540 | /// vector of first 16 characters of `needle` |
541 | __m128i cache = _mm_setzero_si128(); |
542 | int cachemask{}; |
543 | #endif |
544 | |
545 | public: |
546 | StringSearcher(const char * const needle_, const size_t needle_size) |
547 | : needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_end{needle + needle_size} |
548 | { |
549 | if (0 == needle_size) |
550 | return; |
551 | |
552 | first = *needle; |
553 | |
554 | #ifdef __SSE4_1__ |
555 | pattern = _mm_set1_epi8(first); |
556 | |
557 | auto needle_pos = needle; |
558 | |
559 | for (const auto i : ext::range(0, n)) |
560 | { |
561 | cache = _mm_srli_si128(cache, 1); |
562 | |
563 | if (needle_pos != needle_end) |
564 | { |
565 | cache = _mm_insert_epi8(cache, *needle_pos, n - 1); |
566 | cachemask |= 1 << i; |
567 | ++needle_pos; |
568 | } |
569 | } |
570 | #endif |
571 | } |
572 | |
573 | ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const |
574 | { |
575 | #ifdef __SSE4_1__ |
576 | if (pageSafe(pos)) |
577 | { |
578 | const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)); |
579 | const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache); |
580 | const auto mask = _mm_movemask_epi8(v_against_cache); |
581 | |
582 | if (0xffff == cachemask) |
583 | { |
584 | if (mask == cachemask) |
585 | { |
586 | pos += n; |
587 | auto needle_pos = needle + n; |
588 | |
589 | while (needle_pos < needle_end && *pos == *needle_pos) |
590 | ++pos, ++needle_pos; |
591 | |
592 | if (needle_pos == needle_end) |
593 | return true; |
594 | } |
595 | } |
596 | else if ((mask & cachemask) == cachemask) |
597 | return true; |
598 | |
599 | return false; |
600 | } |
601 | #endif |
602 | |
603 | if (*pos == first) |
604 | { |
605 | ++pos; |
606 | auto needle_pos = needle + 1; |
607 | |
608 | while (needle_pos < needle_end && *pos == *needle_pos) |
609 | ++pos, ++needle_pos; |
610 | |
611 | if (needle_pos == needle_end) |
612 | return true; |
613 | } |
614 | |
615 | return false; |
616 | } |
617 | |
618 | const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const |
619 | { |
620 | if (needle == needle_end) |
621 | return haystack; |
622 | |
623 | while (haystack < haystack_end) |
624 | { |
625 | #ifdef __SSE4_1__ |
626 | if (haystack + n <= haystack_end && pageSafe(haystack)) |
627 | { |
628 | /// find first character |
629 | const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack)); |
630 | const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern); |
631 | |
632 | const auto mask = _mm_movemask_epi8(v_against_pattern); |
633 | |
634 | /// first character not present in 16 octets starting at `haystack` |
635 | if (mask == 0) |
636 | { |
637 | haystack += n; |
638 | continue; |
639 | } |
640 | |
641 | const auto offset = __builtin_ctz(mask); |
642 | haystack += offset; |
643 | |
644 | if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) |
645 | { |
646 | /// check for first 16 octets |
647 | const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack)); |
648 | const auto v_against_cache = _mm_cmpeq_epi8(v_haystack_offset, cache); |
649 | const auto mask_offset = _mm_movemask_epi8(v_against_cache); |
650 | |
651 | if (0xffff == cachemask) |
652 | { |
653 | if (mask_offset == cachemask) |
654 | { |
655 | auto haystack_pos = haystack + n; |
656 | auto needle_pos = needle + n; |
657 | |
658 | while (haystack_pos < haystack_end && needle_pos < needle_end && |
659 | *haystack_pos == *needle_pos) |
660 | ++haystack_pos, ++needle_pos; |
661 | |
662 | if (needle_pos == needle_end) |
663 | return haystack; |
664 | } |
665 | } |
666 | else if ((mask_offset & cachemask) == cachemask) |
667 | return haystack; |
668 | |
669 | ++haystack; |
670 | continue; |
671 | } |
672 | } |
673 | #endif |
674 | |
675 | if (haystack == haystack_end) |
676 | return haystack_end; |
677 | |
678 | if (*haystack == first) |
679 | { |
680 | auto haystack_pos = haystack + 1; |
681 | auto needle_pos = needle + 1; |
682 | |
683 | while (haystack_pos < haystack_end && needle_pos < needle_end && |
684 | *haystack_pos == *needle_pos) |
685 | ++haystack_pos, ++needle_pos; |
686 | |
687 | if (needle_pos == needle_end) |
688 | return haystack; |
689 | } |
690 | |
691 | ++haystack; |
692 | } |
693 | |
694 | return haystack_end; |
695 | } |
696 | |
697 | const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const |
698 | { |
699 | return search(haystack, haystack + haystack_size); |
700 | } |
701 | }; |
702 | |
703 | // Searches for needle surrounded by token-separators. |
704 | // Separators are anything inside ASCII (0-128) and not alphanum. |
705 | // Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings |
706 | // should work just fine. But any Unicode whitespace is not considered a token separtor. |
707 | template <typename StringSearcher> |
708 | class TokenSearcher |
709 | { |
710 | StringSearcher searcher; |
711 | size_t needle_size; |
712 | |
713 | public: |
714 | TokenSearcher(const char * const needle_, const size_t needle_size_) |
715 | : searcher{needle_, needle_size_}, |
716 | needle_size(needle_size_) |
717 | { |
718 | if (std::any_of(reinterpret_cast<const UInt8 *>(needle_), reinterpret_cast<const UInt8 *>(needle_) + needle_size_, isTokenSeparator)) |
719 | { |
720 | throw Exception{"Needle must not contain whitespace or separator characters" , ErrorCodes::BAD_ARGUMENTS}; |
721 | } |
722 | |
723 | } |
724 | |
725 | ALWAYS_INLINE bool compare(const UInt8 * haystack, const UInt8 * haystack_end, const UInt8 * pos) const |
726 | { |
727 | // use searcher only if pos is in the beginning of token and pos + searcher.needle_size is end of token. |
728 | if (isToken(haystack, haystack_end, pos)) |
729 | return searcher.compare(haystack, haystack_end, pos); |
730 | |
731 | return false; |
732 | } |
733 | |
734 | const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const |
735 | { |
736 | // use searcher.search(), then verify that returned value is a token |
737 | // if it is not, skip it and re-run |
738 | |
739 | const UInt8 * pos = haystack; |
740 | while (pos < haystack_end) |
741 | { |
742 | pos = searcher.search(pos, haystack_end); |
743 | if (pos == haystack_end || isToken(haystack, haystack_end, pos)) |
744 | return pos; |
745 | |
746 | // assuming that heendle does not contain any token separators. |
747 | pos += needle_size; |
748 | } |
749 | return haystack_end; |
750 | } |
751 | |
752 | const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const |
753 | { |
754 | return search(haystack, haystack + haystack_size); |
755 | } |
756 | |
757 | ALWAYS_INLINE bool isToken(const UInt8 * haystack, const UInt8 * const haystack_end, const UInt8* p) const |
758 | { |
759 | return (p == haystack || isTokenSeparator(*(p - 1))) |
760 | && (p + needle_size >= haystack_end || isTokenSeparator(*(p + needle_size))); |
761 | } |
762 | |
763 | ALWAYS_INLINE static bool isTokenSeparator(const UInt8 c) |
764 | { |
765 | if (isAlphaNumericASCII(c) || !isASCII(c)) |
766 | return false; |
767 | |
768 | return true; |
769 | } |
770 | }; |
771 | |
772 | |
773 | using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>; |
774 | using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>; |
775 | using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>; |
776 | using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>; |
777 | using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>; |
778 | using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>; |
779 | |
780 | |
781 | /** Uses functions from libc. |
782 | * It makes sense to use only with short haystacks when cheap initialization is required. |
783 | * There is no option for case-insensitive search for UTF-8 strings. |
784 | * It is required that strings are zero-terminated. |
785 | */ |
786 | |
787 | struct LibCASCIICaseSensitiveStringSearcher |
788 | { |
789 | const char * const needle; |
790 | |
791 | LibCASCIICaseSensitiveStringSearcher(const char * const needle_, const size_t /* needle_size */) |
792 | : needle(needle_) {} |
793 | |
794 | const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const |
795 | { |
796 | auto res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle)); |
797 | if (!res) |
798 | return haystack_end; |
799 | return reinterpret_cast<const UInt8 *>(res); |
800 | } |
801 | |
802 | const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const |
803 | { |
804 | return search(haystack, haystack + haystack_size); |
805 | } |
806 | }; |
807 | |
808 | struct LibCASCIICaseInsensitiveStringSearcher |
809 | { |
810 | const char * const needle; |
811 | |
812 | LibCASCIICaseInsensitiveStringSearcher(const char * const needle_, const size_t /* needle_size */) |
813 | : needle(needle_) {} |
814 | |
815 | const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const |
816 | { |
817 | auto res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle)); |
818 | if (!res) |
819 | return haystack_end; |
820 | return reinterpret_cast<const UInt8 *>(res); |
821 | } |
822 | |
823 | const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const |
824 | { |
825 | return search(haystack, haystack + haystack_size); |
826 | } |
827 | }; |
828 | |
829 | |
830 | } |
831 | |