/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Shufti: character class acceleration.
 *
 * Utilises the SSSE3 pshufb shuffle instruction.
 */
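
/*
 * A note on the lookup scheme used throughout this file (an editorial
 * sketch of the invariant, mirroring shuftiFwdSlow below): a byte c is in
 * the character class iff the two nibble tables agree, i.e.
 *
 *     (lo[c & 0xf] & hi[c >> 4]) != 0
 *
 * pshufb evaluates sixteen (or more) such table lookups in parallel, one
 * per byte lane.
 */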

#include "shufti.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/unaligned.h"

#ifdef DEBUG
#include <ctype.h>

#define DUMP_MSK(_t)                                \
static UNUSED                                       \
void dumpMsk##_t(m##_t msk) {                       \
    u8 *mskAsU8 = (u8 *)&msk;                       \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        for (int j = 0; j < 8; j++) {               \
            if ((c >> (7 - j)) & 0x1)               \
                printf("1");                        \
            else                                    \
                printf("0");                        \
        }                                           \
        printf(" ");                                \
    }                                               \
}                                                   \
static UNUSED                                       \
void dumpMsk##_t##AsChars(m##_t msk) {              \
    u8 *mskAsU8 = (u8 *)&msk;                       \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        if (isprint(c))                             \
            printf("%c", c);                        \
        else                                        \
            printf(".");                            \
    }                                               \
}

#endif

/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (; buf < buf_end; ++buf) {
        u8 c = *buf;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf;
}

/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (buf_end--; buf_end >= buf; buf_end--) {
        u8 c = *buf_end;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf_end;
}
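
/*
 * Worked example of the table encoding (illustrative only; the real masks
 * are built by the compiler, outside this file). To accelerate the class
 * [0-9] (bytes 0x30..0x39), pick a class bit, say bit 0, and set it in
 * hi[3] and in lo[0]..lo[9]. Then '5' (0x35) gives lo[5] & hi[3] == 1, a
 * match, while ':' (0x3a) gives lo[0xa] == 0 and '%' (0x25) gives
 * hi[2] == 0, so neither matches.
 */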

#if !defined(HAVE_AVX2)
/* Normal SSSE3 shufti */

#ifdef DEBUG
DUMP_MSK(128)
#endif

#define GET_LO_4(chars) and128(chars, low4bits)
#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
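
/*
 * Note on GET_HI_4: clearing the low nibble with andnot before the 64-bit
 * shift guarantees every lane of the pshufb index stays in 0..15. Bits
 * shifted in from the neighbouring byte are already zero, and no lane can
 * end up with bit 7 set, which would make pshufb zero that lane.
 */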

static really_inline
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
          const m128 compare) {
    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif
    return movemask128(eq128(t, compare));
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}
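
/*
 * Note the inverted convention: block() is called with zeroes as the
 * comparand, so a set bit in z means "this byte did NOT match the class".
 * firstMatch() hence scans for the first clear bit, and an all-ones mask
 * means no match at all.
 */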

static really_inline
const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}
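
/*
 * Usage sketch (hypothetical caller; build_class_masks is an illustrative
 * placeholder, as mask construction happens at compile time elsewhere):
 *
 *     m128 lo, hi;
 *     build_class_masks(&lo, &hi);  // hypothetical: fill the nibble tables
 *     const u8 *hit = shuftiExec(lo, hi, data, data + len);
 *     if (hit != data + len) {
 *         // *hit is the first byte of the buffer that is in the class
 *     }
 *
 * As with the slow path, a return value of buf_end means "no match".
 */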

static really_inline
const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) {
#ifdef DEBUG
    DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n");
#endif

    u32 z = movemask128(eq128(t, compare));
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}
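
/*
 * Only 16 of the 32 bits in z carry data, so ~z & 0xffff confines the
 * match bits to the low half: clz32 then lands in [16, 32), which is what
 * the assert above checks, and buf + (31 - pos) converts the leading-zero
 * count into the offset of the last matching byte.
 */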

static really_inline
const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif

    return lastMatch(buf, t, zeroes);
}

const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf.
    chars = loadu128(buf);
    rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
                    m128 chars, const u8 *buf, const m128 low4bits,
                    const m128 ones) {
    m128 chars_lo = GET_LO_4(chars);
    m128 chars_hi = GET_HI_4(chars);
    m128 c_lo = pshufb_m128(mask1_lo, chars_lo);
    m128 c_hi = pshufb_m128(mask1_hi, chars_hi);
    m128 t = or128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif

    m128 c2_lo = pshufb_m128(mask2_lo, chars_lo);
    m128 c2_hi = pshufb_m128(mask2_hi, chars_hi);
    m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk128(t2); printf("\n");
#endif

    u32 z = movemask128(eq128(t2, ones));
    DEBUG_PRINTF("    z: 0x%08x\n", z);
    return firstMatch(buf, z);
}
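
/*
 * The byte shift is what pairs the two characters: the second-character
 * lookup is pulled back one lane, so a pair "ab" is decided at the
 * position of 'a'. The sense is inverted relative to single shufti: the
 * double masks are built so that set bits mark misses, hence the OR
 * accumulation and the comparison against ones rather than zeroes.
 */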

const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);

    const m128 ones = ones128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                       lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    chars = loadu128(buf_end - 16);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf_end - 16, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#elif !defined(HAVE_AVX512)
// AVX2 - 256 wide shuftis

#ifdef DEBUG
DUMP_MSK(256)
#endif

#define GET_LO_4(chars) and256(chars, low4bits)
#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)

static really_inline
u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
          const m256 compare) {
    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    return movemask256(eq256(t, compare));
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    DEBUG_PRINTF("z 0x%08x\n", z);
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        DEBUG_PRINTF("match @ pos %u\n", pos);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
                        const m256 low4bits) {
    // do the hi and lo shuffles in the one avx register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf = pshufb_m256(mask, c);
    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
    // only the low 16 bits of z are valid; mark the rest as "no match"
    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));

    return firstMatch(buf, z);
}
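
/*
 * The short path saves shuffles by stacking both nibble lookups into one
 * AVX register: the high lane of c holds the high nibbles (looked up in
 * mask_hi), the low lane holds the low nibbles (looked up in mask_lo), so
 * a single pshufb_m256 does both table lookups before the two halves are
 * ANDed back together.
 */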

static really_inline
const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m256 low4bits) {
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask = combine2x128(mask_hi, mask_lo);
    m128 chars = loadu128(buf);
    const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf_end - 16);
    rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 low4bits = set32x8(0xf);

    if (buf_end - buf <= 32) {
        return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
    }

    const m256 zeroes = zeroes256();
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
                      zeroes);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits,
                  zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    u32 z = movemask256(eq256(t, zeroes));
    return lastMatch(buf, z);
}

static really_inline
const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
                        const m256 low4bits) {
    // do the hi and lo shuffles in the one avx register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf = pshufb_m256(mask, c);
    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
    // only the low 16 bits of z are valid; mark the rest as "no match"
    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));

    return lastMatch(buf, z);
}

static really_inline
const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m256 low4bits) {
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask = combine2x128(mask_hi, mask_lo);

    m128 chars = loadu128(buf_end - 16);
    const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf);
    rv = revBlockShort(mask, chars, buf, low4bits);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 low4bits = set32x8(0xf);

    if (buf_end - buf <= 32) {
        return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
    }

    const m256 zeroes = zeroes256();
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits,
                  zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
                      zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
                    m256 chars, const u8 *buf, const m256 low4bits,
                    const m256 ones) {
    DEBUG_PRINTF("buf %p\n", buf);
    m256 chars_lo = GET_LO_4(chars);
    m256 chars_hi = GET_HI_4(chars);
    m256 c_lo = pshufb_m256(mask1_lo, chars_lo);
    m256 c_hi = pshufb_m256(mask1_hi, chars_hi);
    m256 t = or256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    m256 c2_lo = pshufb_m256(mask2_lo, chars_lo);
    m256 c2_hi = pshufb_m256(mask2_hi, chars_hi);
    m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk256(t2); printf("\n");
#endif
    u32 z = movemask256(eq256(t2, ones));

    return firstMatch(buf, z);
}

static really_inline
const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
                         const m256 low4bits) {
    // do the hi and lo shuffles in the one avx register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf1 = pshufb_m256(mask1, c);
    m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1);
    m256 t0 = or256(c_shuf1, c_shuf2);
    m128 t = or128(movdq_hi(t0), cast256to128(t0));
    // only the low 16 bits of z are valid; mark the rest as "no match"
    u32 z = 0xffff0000U | movemask128(eq128(t, ones128()));

    return firstMatch(buf, z);
}

static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
                            m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
    const m256 low4bits = set32x8(0xf);
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
    const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
    m128 chars = loadu128(buf);
    const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf_end - 16);
    rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }
    return buf_end;
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);

    if (buf_end - buf < 32) {
        return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf,
                                 buf_end);
    }

    const m256 ones = ones256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask1_lo = set2x128(mask1_lo);
    const m256 wide_mask1_hi = set2x128(mask1_hi);
    const m256 wide_mask2_lo = set2x128(mask2_lo);
    const m256 wide_mask2_hi = set2x128(mask2_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                       wide_mask2_hi, lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    chars = loadu256(buf_end - 32);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 32, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#else // defined(HAVE_AVX512)

#ifdef DEBUG
DUMP_MSK(512)
#endif

static really_inline
u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits,
           const m512 compare) {
    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
    m512 c_hi = pshufb_m512(mask_hi,
                            rshift64_m512(andnot512(low4bits, chars), 4));
    m512 t = and512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
#endif

    return eq512mask(t, compare);
}

static really_inline
const u8 *firstMatch64(const u8 *buf, u64a z) {
    DEBUG_PRINTF("z 0x%016llx\n", z);
    if (unlikely(z != ~0ULL)) {
        u32 pos = ctz64(~z);
        DEBUG_PRINTF("match @ pos %u\n", pos);
        assert(pos < 64);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
                      const m512 low4bits, const m512 zeroes) {
    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch64(buf, z);
}

static really_inline
const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m512 low4bits,
                         const m512 zeroes) {
    DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // load mask
    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_maskz_m512(k, buf);

    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    // reuse the load mask to indicate valid bytes
    return firstMatch64(buf, z | ~k);
}
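
/*
 * The AVX-512 short path replaces the overlapping-read tricks of the
 * narrower implementations with a masked load: k holds one bit per valid
 * byte, loadu_maskz_m512 zeroes the rest, and OR-ing ~k into z marks the
 * lanes outside the buffer as non-matching before the scan.
 */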

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
    DEBUG_PRINTF("b %.64s\n", buf);

    const m512 low4bits = set64x8(0xf);
    const m512 zeroes = zeroes512();
    const m512 wide_mask_lo = set4x128(mask_lo);
    const m512 wide_mask_hi = set4x128(mask_hi);
    const u8 *rv;

    // small cases.
    if (buf_end - buf <= 64) {
        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
                            zeroes);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    if ((uintptr_t)buf % 64) {
        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf,
                            ROUNDUP_PTR(buf, 64), low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }

    const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64);
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
                         zeroes);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    if (buf == buf_end) {
        goto done;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits,
                     zeroes);
    if (rv) {
        return rv;
    }
done:
    return buf_end;
}

static really_inline
const u8 *lastMatch64(const u8 *buf, u64a z) {
    DEBUG_PRINTF("z 0x%016llx\n", z);
    if (unlikely(z != ~0ULL)) {
        u32 pos = clz64(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (63 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
                          const u8 *buf_end, const m512 low4bits,
                          const m512 zeroes) {
    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // load mask
    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_maskz_m512(k, buf);

    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    // reuse the load mask to indicate valid bytes
    return lastMatch64(buf, z | ~k);
}

static really_inline
const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
                      const m512 low4bits, const m512 zeroes) {
    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
    m512 c_hi = pshufb_m512(mask_hi,
                            rshift64_m512(andnot512(low4bits, chars), 4));
    m512 t = and512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
#endif

    u64a z = eq512mask(t, zeroes);
    return lastMatch64(buf, z);
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end);
    assert(buf && buf_end);
    assert(buf < buf_end);

    const m512 low4bits = set64x8(0xf);
    const m512 zeroes = zeroes512();
    const m512 wide_mask_lo = set4x128(mask_lo);
    const m512 wide_mask_hi = set4x128(mask_hi);
    const u8 *rv;

    if (buf_end - buf < 64) {
        rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
                             zeroes);
        return rv ? rv : buf - 1;
    }

    if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) {
        // peel off unaligned portion
        assert(buf_end - buf >= 64);
        DEBUG_PRINTF("start\n");
        rv = rshortShufti512(wide_mask_lo, wide_mask_hi,
                             ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits,
                             zeroes);
        if (rv) {
            return rv;
        }
        buf_end = ROUNDDOWN_PTR(buf_end, 64);
    }

    const u8 *last_block = ROUNDUP_PTR(buf, 64);
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
                         zeroes);
        if (rv) {
            return rv;
        }
    }
    if (buf_end == buf) {
        goto done;
    }
    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf.
    m512 chars = loadu512(buf);
    rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
done:
    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi,
                    m512 chars, const u8 *buf, const m512 low4bits,
                    const m512 ones, __mmask64 k) {
    DEBUG_PRINTF("buf %p %.64s\n", buf, buf);
    m512 chars_lo = and512(chars, low4bits);
    m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4);
    m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo);
    m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi);
    m512 t = or512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
#endif

    m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo);
    m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi);
    m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk512(t2); printf("\n");
#endif
    u64a z = eq512mask(t2, ones);

    return firstMatch64(buf, z | ~k);
}

static really_inline
const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo,
                               m512 mask2_hi, const u8 *buf, const u8 *buf_end,
                               const m512 low4bits, const m512 ones) {
    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_mask_m512(ones, k, buf);

    const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf,
                             low4bits, ones, k);

    return rv;
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);

    const m512 ones = ones512();
    const m512 low4bits = set64x8(0xf);
    const m512 wide_mask1_lo = set4x128(mask1_lo);
    const m512 wide_mask1_hi = set4x128(mask1_hi);
    const m512 wide_mask2_lo = set4x128(mask2_lo);
    const m512 wide_mask2_hi = set4x128(mask2_hi);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                                  wide_mask2_hi, buf, buf_end, low4bits, ones);
        DEBUG_PRINTF("rv %p\n", rv);
        return rv ? rv : buf_end;
    }

    // Preconditioning: most of the time our buffer won't be aligned.
    if ((uintptr_t)buf % 64) {
        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                                  wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64),
                                  low4bits, ones);
        if (rv) {
            return rv;
        }

        buf = ROUNDUP_PTR(buf, 64);
    }

    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                       wide_mask2_hi, lchars, buf, low4bits, ones, ~0);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 64, low4bits, ones, ~0);
    if (rv) {
        return rv;
    }

    return buf_end;
}
#endif