/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Truffle: matches bytes against an arbitrary character class using three
 * shuffles.
 */


#include "ue2common.h"
#include "truffle.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"

#if !defined(HAVE_AVX2)

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    }

    return NULL; // no match
}
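
/*
 * Illustrative sketch (guarded by the hypothetical TRUFFLE_REFERENCE_SKETCH
 * macro; not part of the normal build): the z masks produced by block() below
 * use an inverted convention -- a set bit means "no match" at that byte
 * position, a clear bit means "match". This shows how firstMatch() and
 * lastMatch() recover positions from such a mask.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static void truffleMaskConventionSketch(void) {
    const u8 buf[16] = { 0 };
    // Suppose bytes 3 and 9 matched: clear those bits, leave the rest set.
    u32 z = 0xffff & ~((1U << 3) | (1U << 9));
    assert(firstMatch(buf, z) == buf + 3);
    assert(lastMatch(buf, z) == buf + 9);
    // An all-ones mask means no byte matched anywhere.
    assert(firstMatch(buf, 0xffff) == NULL);
}
#endif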

static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {

    m128 highconst = _mm_set1_epi8(0x80);
    // Repeated power-of-two table: byte i of each 64-bit lane holds 1 << i.
    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);

    // Lookup 1: index the highclear table with the low nibble; pshufb returns
    // zero for any byte of v that has its high bit set.
    m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
    // Lookup 2: flip the high bit so the highset table is consulted for bytes
    // >= 0x80 and zeroed for bytes < 0x80.
    m128 t1 = xor128(v, highconst);
    m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1);
    // Lookup 3: bits 4..6 of each byte select which bit of the chosen table
    // entry must be set.
    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
    m128 shuf3 = pshufb_m128(shuf_mask_hi, t2);
    // Non-zero where the byte is in the class.
    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
    // z gets a bit set for every byte position that does NOT match.
    m128 tmp2 = eq128(tmp, zeroes128());
    u32 z = movemask128(tmp2);

    return z;
}
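
/*
 * Illustrative scalar model of the classification that block() performs per
 * byte (guarded by the hypothetical TRUFFLE_REFERENCE_SKETCH macro; not part
 * of the normal build). The low nibble of each byte indexes one of the two
 * 16-byte tables -- shuf_mask_lo_highclear for bytes < 0x80,
 * shuf_mask_lo_highset for bytes >= 0x80 -- and bits 4..6 of the byte select
 * which bit of that entry must be set for the byte to belong to the class.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static int truffleClassHasByteSketch(const u8 *lo_highclear,
                                     const u8 *lo_highset, u8 c) {
    const u8 *table = (c & 0x80) ? lo_highset : lo_highclear;
    u8 bit = (u8)(1U << ((c >> 4) & 0x7));
    return (table[c & 0xf] & bit) != 0;
}
#endif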

static
const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // Force the positions beyond the valid length to "no match" (a set bit in
    // z means no match at that byte).
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}
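
/*
 * Worked example of the tail mask above (guarded by the hypothetical
 * TRUFFLE_REFERENCE_SKETCH macro; not part of the normal build): for a 5-byte
 * tail, the mask has bits 5..15 set, so ORing it into z reports "no match"
 * for the positions beyond buf_end.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static void truffleTailMaskSketch(void) {
    uintptr_t len = 5;
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    assert(mask == 0xffe0);
}
#endif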

static really_inline
const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 16) {
        return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf,
                           buf_end);
    }

    size_t min = (size_t)buf % 16; // bytes by which buf is misaligned
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }
    // Step to the next 16-byte boundary; the bytes in between were already
    // scanned by the unaligned block above.
    buf += (16 - min);

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }

    return buf_end;
}
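
/*
 * The two low-nibble tables consumed above are built on the compile side,
 * outside this file. The hypothetical helper below (guarded by
 * TRUFFLE_REFERENCE_SKETCH; not part of the normal build) sketches the
 * encoding that block() expects: for every byte c in the class, set bit
 * ((c >> 4) & 0x7) of entry (c & 0xf) in the table selected by the byte's
 * high bit.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static void truffleBuildMasksSketch(const u8 *class_bytes, size_t count,
                                    u8 *lo_highclear /* 16 bytes, zeroed */,
                                    u8 *lo_highset /* 16 bytes, zeroed */) {
    for (size_t i = 0; i < count; i++) {
        u8 c = class_bytes[i];
        u8 *table = (c & 0x80) ? lo_highset : lo_highclear;
        table[c & 0xf] |= (u8)(1U << ((c >> 4) & 0x7));
    }
}
#endif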

static
const u8 *truffleRevMini(m128 shuf_mask_lo_highclear,
                         m128 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 16) {
        return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf,
                              buf_end);
    }

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }
    // Round buf_end down to a 16-byte boundary so the main loop can use
    // aligned loads.
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 16 bytes and get an accurate
    // picture down to buf.
    chars = loadu128(buf);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }

    return buf - 1; // no match: reverse scans return one before the start
}

#elif !defined(HAVE_AVX512)

// AVX2

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        assert(pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {

    m256 highconst = _mm256_set1_epi8(0x80);
    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);

    // Same three-shuffle scheme as the 128-bit block(); the tables are
    // duplicated across both 128-bit lanes, so the in-lane pshufb is fine.
    m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
    m256 t1 = xor256(v, highconst);
    m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1);
    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
    m256 shuf3 = pshufb_m256(shuf_mask_hi, t2);
    m256 tmp = and256(or256(shuf1, shuf2), shuf3);
    m256 tmp2 = eq256(tmp, zeroes256());
    u32 z = movemask256(tmp2);

    return z;
}

static
const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // Force the positions beyond the valid length to "no match" (a set bit in
    // z means no match at that byte).
    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}

static really_inline
const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 32) {
        return truffleMini(wide_clear, wide_set, buf, buf_end);
    }

    size_t min = (size_t)buf % 32; // bytes by which buf is misaligned
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    // Step to the next 32-byte boundary; the bytes in between were already
    // scanned by the unaligned block above.
    buf += (32 - min);

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static
const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
                         m256 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}


const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 32) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_clear, wide_set, chars,
                  buf_end - 32);
    if (rv) {
        return rv;
    }
    // Round buf_end down to a 32-byte boundary so the main loop can use
    // aligned loads.
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 32 bytes and get an accurate
    // picture down to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#else // AVX512

static really_inline
const u8 *lastMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = clz64(~z);
        assert(pos < 64);
        return buf + (63 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = ctz64(~z);
        assert(pos < 64);
        DEBUG_PRINTF("pos %llu\n", pos);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) {
    m512 highconst = set64x8(0x80);
    m512 shuf_mask_hi = set8x64(0x8040201008040201);

    // Same three-shuffle scheme as the narrower variants; eq512mask yields
    // the 64-bit "no match" mask directly, so no movemask step is needed.
    m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v);
    m512 t1 = xor512(v, highconst);
    m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1);
    m512 t2 = andnot512(highconst, rshift64_m512(v, 4));
    m512 shuf3 = pshufb_m512(shuf_mask_hi, t2);
    m512 tmp = and512(or512(shuf1, shuf2), shuf3);
    u64a z = eq512mask(tmp, zeroes512());

    return z;
}

static really_inline
const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // Enable only the first len lanes; the masked load zeroes the rest and
    // ORing ~mask into z forces them to report "no match".
    __mmask64 mask = (~0ULL) >> (64 - len);

    m512 chars = loadu_maskz_m512(mask, buf);

    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);

    const u8 *rv = firstMatch(buf, z | ~mask);

    return rv;
}

static really_inline
const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = truffleMini(wide_clear, wide_set, buf, buf_end);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);
    if ((uintptr_t)buf % 64) {
        // Preconditioning: most of the time our buffer won't be aligned.
        rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64));
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }
    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                         const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 64);

    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 chars = loadu_maskz_m512(mask, buf);
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z);
    const u8 *rv = lastMatch(buf, z | ~mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 64) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    m512 chars = loadu512(buf_end - 64);
    rv = revBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64);

    const u8 *last_block = buf + 64;
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 64 bytes and get an accurate
    // picture down to buf.
    chars = loadu512(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#endif
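
/*
 * Usage sketch (guarded by the hypothetical TRUFFLE_REFERENCE_SKETCH macro;
 * not part of the normal build): the two 16-byte tables are loaded into m128
 * values and handed to truffleExec(), which returns a pointer to the first
 * byte of the class or buf_end if the class does not occur in the buffer.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static const u8 *truffleUsageSketch(const u8 *lo_highclear /* 16 bytes */,
                                    const u8 *lo_highset /* 16 bytes */,
                                    const u8 *data, size_t len) {
    assert(len > 0); // truffleExec requires a non-empty buffer
    m128 mask_clear = loadu128(lo_highclear);
    m128 mask_set = loadu128(lo_highset);
    const u8 *hit = truffleExec(mask_clear, mask_set, data, data + len);
    return hit == data + len ? NULL : hit; // buf_end return means "no match"
}
#endif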