1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "fdr.h" |
30 | #include "fdr_confirm.h" |
31 | #include "fdr_confirm_runtime.h" |
32 | #include "fdr_internal.h" |
33 | #include "fdr_loadval.h" |
34 | #include "flood_runtime.h" |
35 | #include "scratch.h" |
36 | #include "teddy.h" |
37 | #include "teddy_internal.h" |
38 | #include "util/arch.h" |
39 | #include "util/simd_utils.h" |
40 | #include "util/uniform_ops.h" |
41 | |
42 | /** \brief number of bytes processed in each iteration */ |
43 | #define ITER_BYTES 16 |
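/* note: ITER_BYTES must equal sizeof(m128); this is asserted in
 * createStartZone(). */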
44 | |
45 | /** \brief total zone buffer size */ |
46 | #define ZONE_TOTAL_SIZE 64 |
47 | |
48 | /** \brief maximum number of allowed zones */ |
49 | #define ZONE_MAX 3 |
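/* A scan is split into at most ZONE_MAX zones: a start zone, an optional main
 * zone and an end zone; short scans use a single short zone instead (see
 * prepareZones()). */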
50 | |
51 | /** \brief zone information. |
52 | * |
53 | * Zone represents a region of data to scan in FDR. |
54 | * |
 * The incoming buffer is split into multiple zones to ensure two properties:
 * 1: that we can safely read 8 bytes behind the current position to generate
 *    a conf hash
 * 2: that we can read the 3 bytes after the current byte (domain > 8)
58 | */ |
59 | struct zone { |
60 | /** \brief copied buffer, used only when it is a boundary zone. */ |
61 | u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; |
62 | |
    /** \brief shift amount for the FDR state to avoid unwanted matches. */
64 | u8 shift; |
65 | |
    /** \brief if boundary zone, start points into the zone buffer after the
     * pre-padding. Otherwise, it points into the main buffer. */
68 | const u8 *start; |
69 | |
    /** \brief if boundary zone, end points to the end of the zone buffer.
     * Otherwise, it points into the main buffer. */
72 | const u8 *end; |
73 | |
74 | /** \brief the amount to adjust to go from a pointer in the zones region |
75 | * (between start and end) to a pointer in the original data buffer. */ |
76 | ptrdiff_t zone_pointer_adjust; |
77 | |
    /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
     * otherwise the end of the zone buf. floodPtr always points inside the
     * same buffer as the start pointer. */
81 | const u8 *floodPtr; |
82 | }; |
83 | |
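/* zone_or_mask[n] (for the shift values actually used, 0..15) has its first n
 * bytes set to 0xff. ORing it into the shifted state suppresses confirms in
 * the first n bytes of a zone, which hold already-scanned data or garbage;
 * see the use of zone_or_mask[shift] in fdr_engine_exec(). */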
84 | static |
85 | const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { |
86 | { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
87 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
88 | { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
89 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
90 | { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
91 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
92 | { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, |
93 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
94 | { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, |
95 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
96 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, |
97 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
98 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, |
99 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
100 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, |
101 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
102 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
103 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
104 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
105 | 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
106 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
107 | 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
108 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
109 | 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
110 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
111 | 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, |
112 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
113 | 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, |
114 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
115 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, |
116 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
117 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, |
118 | { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
119 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } |
120 | }; |
121 | |
122 | /* compilers don't reliably synthesize the 32-bit ANDN instruction here, |
123 | * so we force its generation. |
124 | */ |
125 | static really_inline |
126 | u64a andn(const u32 a, const u8 *b) { |
127 | u64a r; |
128 | #if defined(HAVE_BMI) && !defined(NO_ASM) |
129 | __asm__ ("andn\t%2,%1,%k0" : "=r" (r) : "r" (a), "m" (*(const u32 *)b)); |
130 | #else |
131 | r = unaligned_load_u32(b) & ~a; |
132 | #endif |
133 | return r; |
134 | } |
135 | |
/* Generates an initial state mask based on the last byte or two of history
 * rather than being all-accepting. If there is no history to consider, the
 * state is generated based on the minimum length of each bucket in order to
 * prevent unwanted confirms.
140 | */ |
141 | static really_inline |
142 | m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, |
143 | const struct zone *z) { |
144 | m128 s; |
145 | if (len_history) { |
146 | /* +1: the zones ensure that we can read the byte at z->end */ |
147 | u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); |
148 | tmp &= fdr->domainMask; |
149 | s = load_m128_from_u64a(ft + tmp); |
150 | s = rshiftbyte_m128(s, 1); |
151 | } else { |
152 | s = fdr->start; |
153 | } |
154 | return s; |
155 | } |
156 | |
157 | static really_inline |
158 | void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, |
159 | UNUSED const u8 *end_ptr, u32 domain_mask_flipped, |
160 | const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { |
161 | /* +1: the zones ensure that we can read the byte at z->end */ |
162 | assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); |
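    /* For each of the 16 positions in this block: reach_i is the
     * domain-masked value at itPtr + i, and ft[reach_i] is an 8-byte bucket
     * mask for that value. Each mask is shifted left into its byte lane
     * (position mod 8) and ORed into the running state; after each half of
     * the block the low 64 bits are extracted and inverted to form a confirm
     * word, in which a set bit marks a candidate for do_confirm_fdr(). */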
163 | u64a reach0 = andn(domain_mask_flipped, itPtr); |
164 | u64a reach1 = andn(domain_mask_flipped, itPtr + 1); |
165 | u64a reach2 = andn(domain_mask_flipped, itPtr + 2); |
166 | u64a reach3 = andn(domain_mask_flipped, itPtr + 3); |
167 | |
168 | m128 st0 = load_m128_from_u64a(ft + reach0); |
169 | m128 st1 = load_m128_from_u64a(ft + reach1); |
170 | m128 st2 = load_m128_from_u64a(ft + reach2); |
171 | m128 st3 = load_m128_from_u64a(ft + reach3); |
172 | |
173 | u64a reach4 = andn(domain_mask_flipped, itPtr + 4); |
174 | u64a reach5 = andn(domain_mask_flipped, itPtr + 5); |
175 | u64a reach6 = andn(domain_mask_flipped, itPtr + 6); |
176 | u64a reach7 = andn(domain_mask_flipped, itPtr + 7); |
177 | |
178 | m128 st4 = load_m128_from_u64a(ft + reach4); |
179 | m128 st5 = load_m128_from_u64a(ft + reach5); |
180 | m128 st6 = load_m128_from_u64a(ft + reach6); |
181 | m128 st7 = load_m128_from_u64a(ft + reach7); |
182 | |
183 | st1 = lshiftbyte_m128(st1, 1); |
184 | st2 = lshiftbyte_m128(st2, 2); |
185 | st3 = lshiftbyte_m128(st3, 3); |
186 | st4 = lshiftbyte_m128(st4, 4); |
187 | st5 = lshiftbyte_m128(st5, 5); |
188 | st6 = lshiftbyte_m128(st6, 6); |
189 | st7 = lshiftbyte_m128(st7, 7); |
190 | |
191 | st0 = or128(st0, st1); |
192 | st2 = or128(st2, st3); |
193 | st4 = or128(st4, st5); |
194 | st6 = or128(st6, st7); |
195 | st0 = or128(st0, st2); |
196 | st4 = or128(st4, st6); |
197 | st0 = or128(st0, st4); |
198 | *s = or128(*s, st0); |
199 | |
200 | *conf0 = movq(*s); |
201 | *s = rshiftbyte_m128(*s, 8); |
202 | *conf0 ^= ~0ULL; |
203 | |
204 | u64a reach8 = andn(domain_mask_flipped, itPtr + 8); |
205 | u64a reach9 = andn(domain_mask_flipped, itPtr + 9); |
206 | u64a reach10 = andn(domain_mask_flipped, itPtr + 10); |
207 | u64a reach11 = andn(domain_mask_flipped, itPtr + 11); |
208 | |
209 | m128 st8 = load_m128_from_u64a(ft + reach8); |
210 | m128 st9 = load_m128_from_u64a(ft + reach9); |
211 | m128 st10 = load_m128_from_u64a(ft + reach10); |
212 | m128 st11 = load_m128_from_u64a(ft + reach11); |
213 | |
214 | u64a reach12 = andn(domain_mask_flipped, itPtr + 12); |
215 | u64a reach13 = andn(domain_mask_flipped, itPtr + 13); |
216 | u64a reach14 = andn(domain_mask_flipped, itPtr + 14); |
217 | u64a reach15 = andn(domain_mask_flipped, itPtr + 15); |
218 | |
219 | m128 st12 = load_m128_from_u64a(ft + reach12); |
220 | m128 st13 = load_m128_from_u64a(ft + reach13); |
221 | m128 st14 = load_m128_from_u64a(ft + reach14); |
222 | m128 st15 = load_m128_from_u64a(ft + reach15); |
223 | |
224 | st9 = lshiftbyte_m128(st9, 1); |
225 | st10 = lshiftbyte_m128(st10, 2); |
226 | st11 = lshiftbyte_m128(st11, 3); |
227 | st12 = lshiftbyte_m128(st12, 4); |
228 | st13 = lshiftbyte_m128(st13, 5); |
229 | st14 = lshiftbyte_m128(st14, 6); |
230 | st15 = lshiftbyte_m128(st15, 7); |
231 | |
232 | st8 = or128(st8, st9); |
233 | st10 = or128(st10, st11); |
234 | st12 = or128(st12, st13); |
235 | st14 = or128(st14, st15); |
236 | st8 = or128(st8, st10); |
237 | st12 = or128(st12, st14); |
238 | st8 = or128(st8, st12); |
239 | *s = or128(*s, st8); |
240 | |
241 | *conf8 = movq(*s); |
242 | *s = rshiftbyte_m128(*s, 8); |
243 | *conf8 ^= ~0ULL; |
244 | } |
245 | |
246 | static really_inline |
247 | void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, |
248 | UNUSED const u8 *end_ptr, u32 domain_mask_flipped, |
249 | const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { |
250 | assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); |
251 | u64a reach0 = andn(domain_mask_flipped, itPtr); |
252 | u64a reach2 = andn(domain_mask_flipped, itPtr + 2); |
253 | u64a reach4 = andn(domain_mask_flipped, itPtr + 4); |
254 | u64a reach6 = andn(domain_mask_flipped, itPtr + 6); |
255 | |
256 | m128 st0 = load_m128_from_u64a(ft + reach0); |
257 | m128 st2 = load_m128_from_u64a(ft + reach2); |
258 | m128 st4 = load_m128_from_u64a(ft + reach4); |
259 | m128 st6 = load_m128_from_u64a(ft + reach6); |
260 | |
261 | u64a reach8 = andn(domain_mask_flipped, itPtr + 8); |
262 | u64a reach10 = andn(domain_mask_flipped, itPtr + 10); |
263 | u64a reach12 = andn(domain_mask_flipped, itPtr + 12); |
264 | u64a reach14 = andn(domain_mask_flipped, itPtr + 14); |
265 | |
266 | m128 st8 = load_m128_from_u64a(ft + reach8); |
267 | m128 st10 = load_m128_from_u64a(ft + reach10); |
268 | m128 st12 = load_m128_from_u64a(ft + reach12); |
269 | m128 st14 = load_m128_from_u64a(ft + reach14); |
270 | |
271 | st2 = lshiftbyte_m128(st2, 2); |
272 | st4 = lshiftbyte_m128(st4, 4); |
273 | st6 = lshiftbyte_m128(st6, 6); |
274 | |
275 | *s = or128(*s, st0); |
276 | *s = or128(*s, st2); |
277 | *s = or128(*s, st4); |
278 | *s = or128(*s, st6); |
279 | |
280 | *conf0 = movq(*s); |
281 | *s = rshiftbyte_m128(*s, 8); |
282 | *conf0 ^= ~0ULL; |
283 | |
284 | st10 = lshiftbyte_m128(st10, 2); |
285 | st12 = lshiftbyte_m128(st12, 4); |
286 | st14 = lshiftbyte_m128(st14, 6); |
287 | |
288 | *s = or128(*s, st8); |
289 | *s = or128(*s, st10); |
290 | *s = or128(*s, st12); |
291 | *s = or128(*s, st14); |
292 | |
293 | *conf8 = movq(*s); |
294 | *s = rshiftbyte_m128(*s, 8); |
295 | *conf8 ^= ~0ULL; |
296 | } |
297 | |
298 | static really_inline |
299 | void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, |
300 | UNUSED const u8 *end_ptr, u32 domain_mask_flipped, |
301 | const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { |
302 | assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); |
303 | u64a reach0 = andn(domain_mask_flipped, itPtr); |
304 | u64a reach4 = andn(domain_mask_flipped, itPtr + 4); |
305 | u64a reach8 = andn(domain_mask_flipped, itPtr + 8); |
306 | u64a reach12 = andn(domain_mask_flipped, itPtr + 12); |
307 | |
308 | m128 st0 = load_m128_from_u64a(ft + reach0); |
309 | m128 st4 = load_m128_from_u64a(ft + reach4); |
310 | m128 st8 = load_m128_from_u64a(ft + reach8); |
311 | m128 st12 = load_m128_from_u64a(ft + reach12); |
312 | |
313 | st4 = lshiftbyte_m128(st4, 4); |
314 | st12 = lshiftbyte_m128(st12, 4); |
315 | |
316 | *s = or128(*s, st0); |
317 | *s = or128(*s, st4); |
318 | *conf0 = movq(*s); |
319 | *s = rshiftbyte_m128(*s, 8); |
320 | *conf0 ^= ~0ULL; |
321 | |
322 | *s = or128(*s, st8); |
323 | *s = or128(*s, st12); |
324 | *conf8 = movq(*s); |
325 | *s = rshiftbyte_m128(*s, 8); |
326 | *conf8 ^= ~0ULL; |
327 | } |
328 | |
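/* Run confirmation on a confirm word produced by a get_conf_* function: each
 * set bit encodes a (byte offset within the block, bucket) candidate whose
 * literals are verified against the actual data via the FDRConfirm structures
 * reachable from confBase. */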
329 | static really_inline |
330 | void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, |
331 | const u32 *confBase, const struct FDR_Runtime_Args *a, |
332 | const u8 *ptr, u32 *last_match_id, struct zone *z) { |
333 | const u8 bucket = 8; |
334 | |
335 | if (likely(!*conf)) { |
336 | return; |
337 | } |
338 | |
    /* ptr currently refers to a location in the zone's buffer; we also need a
     * pointer into the original (main) buffer for the final string compare.
341 | */ |
342 | const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); |
343 | |
344 | const u8 *confLoc = ptr; |
345 | |
346 | do { |
347 | u32 bit = findAndClearLSB_64(conf); |
348 | u32 byte = bit / bucket + offset; |
349 | u32 bitRem = bit % bucket; |
350 | u32 idx = bitRem; |
351 | u32 cf = confBase[idx]; |
352 | if (!cf) { |
353 | continue; |
354 | } |
355 | const struct FDRConfirm *fdrc = (const struct FDRConfirm *) |
356 | ((const u8 *)confBase + cf); |
357 | if (!(fdrc->groups & *control)) { |
358 | continue; |
359 | } |
360 | u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); |
361 | confWithBit(fdrc, a, ptr_main - a->buf + byte, control, |
362 | last_match_id, confVal, conf, bit); |
363 | } while (unlikely(!!*conf)); |
364 | } |
365 | |
366 | static really_inline |
367 | void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) { |
368 | #ifdef DEBUG |
369 | DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n" , zone_id, z->buf); |
370 | DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n" , |
371 | z->start, z->end, z->shift); |
372 | DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n" , |
373 | z->zone_pointer_adjust, z->floodPtr); |
374 | DEBUG_PRINTF("zone buf:" ); |
375 | for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) { |
376 | if (i % 8 == 0) { |
377 | printf("_" ); |
378 | } |
379 | if (z->buf[i]) { |
380 | printf("%02x" , z->buf[i]); |
381 | } else { |
382 | printf(".." ); |
383 | } |
384 | } |
385 | printf("\n" ); |
386 | #endif |
387 | }; |
388 | |
389 | /** |
 * \brief Updates attributes for the non-boundary (main) zone.
391 | */ |
392 | static really_inline |
393 | void createMainZone(const u8 *flood, const u8 *begin, const u8 *end, |
394 | struct zone *z) { |
395 | z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */ |
396 | z->start = begin; |
397 | z->end = end; |
398 | z->floodPtr = flood; |
399 | z->shift = 0; |
400 | } |
401 | |
402 | /** |
403 | * \brief Create zone for short cases (<= ITER_BYTES). |
404 | * |
405 | * For this case we need to copy everything into the zone's internal buffer. |
406 | * |
407 | * We need to ensure that we run over real data if it exists (in history or |
 * before zone begin). We also need to ensure that the 8 bytes before any data
 * being matched can be read (to perform a conf hash).
410 | * |
411 | * We also need to ensure that the data at z->end can be read. |
412 | * |
413 | * Hence, the zone consists of: |
414 | * 16 bytes of history, |
 *     1 - 24 bytes of data from the buffer (ending at end),
416 | * 1 byte of final padding |
417 | */ |
418 | static really_inline |
419 | void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin, |
420 | const u8 *end, struct zone *z) { |
    /* the floodPtr for boundary zones is set to the end of the zone buf (its
     * maximum) so that the flood checks are never triggered inside the zone. */
423 | z->floodPtr = z->buf + ZONE_TOTAL_SIZE; |
424 | |
425 | ptrdiff_t z_len = end - begin; |
426 | assert(z_len > 0); |
427 | assert(z_len <= ITER_BYTES); |
428 | |
429 | z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */ |
430 | |
431 | static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */ |
432 | |
433 | /* we are guaranteed to always have 16 initialised bytes at the end of |
434 | * the history buffer (they may be garbage coming from the stream state |
435 | * preceding hbuf, but bytes that don't correspond to actual history |
436 | * shouldn't affect computations). */ |
437 | *(m128 *)z->buf = loadu128(hend - sizeof(m128)); |
438 | |
439 | /* The amount of data we have to copy from main buffer. */ |
440 | size_t copy_len = MIN((size_t)(end - buf), |
441 | ITER_BYTES + sizeof(CONF_TYPE)); |
442 | |
443 | u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET; |
444 | switch (copy_len) { |
445 | case 1: |
446 | *zone_data = *(end - 1); |
447 | break; |
448 | case 2: |
449 | *(u16 *)zone_data = unaligned_load_u16(end - 2); |
450 | break; |
451 | case 3: |
452 | *(u16 *)zone_data = unaligned_load_u16(end - 3); |
453 | *(zone_data + 2) = *(end - 1); |
454 | break; |
455 | case 4: |
456 | *(u32 *)zone_data = unaligned_load_u32(end - 4); |
457 | break; |
458 | case 5: |
459 | case 6: |
460 | case 7: |
461 | /* perform copy with 2 overlapping 4-byte chunks from buf. */ |
462 | *(u32 *)zone_data = unaligned_load_u32(end - copy_len); |
463 | unaligned_store_u32(zone_data + copy_len - sizeof(u32), |
464 | unaligned_load_u32(end - sizeof(u32))); |
465 | break; |
466 | case 8: |
467 | *(u64a *)zone_data = unaligned_load_u64a(end - 8); |
468 | break; |
469 | case 9: |
470 | case 10: |
471 | case 11: |
472 | case 12: |
473 | case 13: |
474 | case 14: |
475 | case 15: |
476 | /* perform copy with 2 overlapping 8-byte chunks from buf. */ |
477 | *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); |
478 | unaligned_store_u64a(zone_data + copy_len - sizeof(u64a), |
479 | unaligned_load_u64a(end - sizeof(u64a))); |
480 | break; |
481 | case 16: |
482 | /* copy 16-bytes from buf. */ |
483 | *(m128 *)zone_data = loadu128(end - 16); |
484 | break; |
485 | default: |
486 | assert(copy_len <= sizeof(m128) + sizeof(u64a)); |
487 | |
488 | /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks. |
489 | */ |
490 | *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); |
491 | storeu128(zone_data + copy_len - sizeof(m128), |
492 | loadu128(end - sizeof(m128))); |
493 | break; |
494 | } |
495 | |
496 | /* set the start and end location of the zone buf |
497 | * to be scanned */ |
498 | u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len; |
499 | assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES); |
500 | |
    /* write the post-padding byte; this is required for domain > 8 due to
     * overhang */
503 | assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64); |
504 | *z_end = 0; |
505 | |
506 | z->end = z_end; |
507 | z->start = z_end - ITER_BYTES; |
508 | z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); |
509 | assert(z->start + z->shift == z_end - z_len); |
510 | } |
511 | |
512 | /** |
513 | * \brief Create a zone for the start region. |
514 | * |
515 | * This function requires that there is > ITER_BYTES of data in the buffer to |
516 | * scan. The start zone itself is always responsible for scanning exactly |
517 | * ITER_BYTES of data - there are no warmup/junk bytes scanned. |
518 | * |
519 | * This zone ensures that the byte at z->end can be read and corresponds to |
520 | * the next byte of data. |
521 | * |
522 | * 8 bytes of history data are provided before z->start to allow proper hash |
 * generation in streaming mode. If buf != begin, up to 8 bytes of data
 * prior to begin are also provided.
525 | * |
526 | * Although we are not interested in bare literals which start before begin |
527 | * if buf != begin, lookarounds associated with the literal may require |
528 | * the data prior to begin for hash purposes. |
529 | */ |
530 | static really_inline |
531 | void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin, |
532 | struct zone *z) { |
533 | assert(ITER_BYTES == sizeof(m128)); |
534 | assert(sizeof(CONF_TYPE) == 8); |
535 | static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE); |
536 | |
537 | const u8 *end = begin + ITER_BYTES; |
538 | |
539 | /* set floodPtr to the end of zone buf to avoid checks in start zone */ |
540 | z->floodPtr = z->buf + ZONE_TOTAL_SIZE; |
541 | |
542 | z->shift = 0; /* we are processing ITER_BYTES of real data */ |
543 | |
544 | /* we are guaranteed to always have 16 initialised bytes at the end of the |
545 | * history buffer (they may be garbage coming from the stream state |
546 | * preceding hbuf, but bytes that don't correspond to actual history |
547 | * shouldn't affect computations). However, for start zones, history is only |
548 | * required for conf hash purposes so we only need 8 bytes */ |
549 | unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a))); |
550 | |
551 | /* The amount of data we have to copy from main buffer. */ |
552 | size_t copy_len = MIN((size_t)(end - buf), |
553 | ITER_BYTES + sizeof(CONF_TYPE)); |
554 | assert(copy_len >= 16); |
555 | |
    /* copy the post-padding byte; this is required for domain > 8 due to
     * overhang. The start zone requires that there is data after it, so it
     * is safe to dereference end */
559 | z->buf[ZONE_START_BEGIN + copy_len] = *end; |
560 | |
561 | /* set the start and end location of the zone buf to be scanned */ |
562 | u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len; |
563 | z->end = z_end; |
564 | z->start = z_end - ITER_BYTES; |
565 | |
566 | /* copy the first 8 bytes of the valid region */ |
567 | unaligned_store_u64a(z->buf + ZONE_START_BEGIN, |
568 | unaligned_load_u64a(end - copy_len)); |
569 | |
570 | /* copy the last 16 bytes, may overlap with the previous 8 byte write */ |
571 | storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); |
572 | |
573 | z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); |
574 | |
575 | assert(ZONE_START_BEGIN + copy_len + 3 < 64); |
576 | } |
577 | |
578 | /** |
579 | * \brief Create a zone for the end region. |
580 | * |
 * This function requires that there is > ITER_BYTES of data in the buffer to
 * scan. The end zone is responsible for scanning the <= ITER_BYTES rump of
 * data plus an optional additional ITER_BYTES. The main zone cannot handle
 * the last 3 bytes of the buffer, so the end zone must take over a full
 * ITER_BYTES from the main zone when fewer than 3 bytes would otherwise be
 * left to scan; the main zone size is reduced by ITER_BYTES in this case.
587 | * |
588 | * This zone ensures that the byte at z->end can be read by filling it with a |
589 | * padding character. |
590 | * |
 * Up to 8 bytes of data prior to begin are also provided for the purposes of
 * generating hashes. History is not copied, as all locations which require
 * history for generating a hash are the responsibility of the start zone.
594 | */ |
595 | static really_inline |
596 | void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, |
597 | struct zone *z) { |
    /* the floodPtr for boundary zones is set to the end of the zone buf (its
     * maximum) so that the flood checks are never triggered inside the zone. */
600 | z->floodPtr = z->buf + ZONE_TOTAL_SIZE; |
601 | |
602 | ptrdiff_t z_len = end - begin; |
603 | assert(z_len > 0); |
604 | size_t iter_bytes_second = 0; |
605 | size_t z_len_first = z_len; |
606 | if (z_len > ITER_BYTES) { |
607 | z_len_first = z_len - ITER_BYTES; |
608 | iter_bytes_second = ITER_BYTES; |
609 | } |
610 | z->shift = ITER_BYTES - z_len_first; |
611 | |
612 | const u8 *end_first = end - iter_bytes_second; |
613 | /* The amount of data we have to copy from main buffer for the |
614 | * first iteration. */ |
615 | size_t copy_len_first = MIN((size_t)(end_first - buf), |
616 | ITER_BYTES + sizeof(CONF_TYPE)); |
617 | assert(copy_len_first >= 16); |
618 | |
619 | size_t total_copy_len = copy_len_first + iter_bytes_second; |
620 | assert(total_copy_len + 3 < 64); |
621 | |
    /* write the post-padding byte; this is required for domain > 8 due to
     * overhang */
624 | z->buf[total_copy_len] = 0; |
625 | |
626 | /* set the start and end location of the zone buf |
627 | * to be scanned */ |
628 | u8 *z_end = z->buf + total_copy_len; |
629 | z->end = z_end; |
630 | z->start = z_end - ITER_BYTES - iter_bytes_second; |
631 | assert(z->start + z->shift == z_end - z_len); |
632 | |
633 | u8 *z_end_first = z_end - iter_bytes_second; |
634 | /* copy the first 8 bytes of the valid region */ |
635 | unaligned_store_u64a(z->buf, |
636 | unaligned_load_u64a(end_first - copy_len_first)); |
637 | |
638 | /* copy the last 16 bytes, may overlap with the previous 8 byte write */ |
639 | storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); |
640 | if (iter_bytes_second) { |
641 | storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); |
642 | } |
643 | |
644 | z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); |
645 | } |
646 | |
647 | /** |
648 | * \brief Prepare zones. |
649 | * |
 * This function prepares the zones from the actual buffer plus some padding
 * bytes. The ITER_BYTES of real data in each zone are preceded by bytes from
 * the main buf and/or the history buf, and followed by padding bytes,
 * possibly taken from the main buf if available.
654 | */ |
655 | static really_inline |
656 | size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, |
657 | size_t start, const u8 *flood, struct zone *zoneArr) { |
658 | const u8 *ptr = buf + start; |
659 | size_t remaining = len - start; |
660 | |
661 | if (remaining <= ITER_BYTES) { |
662 | /* enough bytes to make only one zone */ |
663 | createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); |
664 | return 1; |
665 | } |
666 | |
667 | /* enough bytes to make more than one zone */ |
668 | |
669 | size_t numZone = 0; |
670 | createStartZone(buf, hend, ptr, &zoneArr[numZone++]); |
671 | ptr += ITER_BYTES; |
672 | |
673 | assert(ptr < buf + len); |
674 | |
    /* find the maximum buffer location that the main zone can scan
     * - must be a multiple of ITER_BYTES, and
     * - cannot contain the last 3 bytes (the FDR main loop reads up to 3
     *   bytes past the end of each ITER_BYTES block, which must not run
     *   past the end of the buffer)
     */
680 | const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES); |
681 | |
    /* create a main zone if at least one full multiple of ITER_BYTES is
     * available */
683 | if (main_end > ptr) { |
684 | createMainZone(flood, ptr, main_end, &zoneArr[numZone++]); |
685 | ptr = main_end; |
686 | } |
687 | /* create a zone with rest of the data from the main buffer */ |
688 | createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]); |
689 | return numZone; |
690 | } |
691 | |
692 | #define INVALID_MATCH_ID (~0U) |
693 | |
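/* Scan a single zone in ITER_BYTES blocks: check for floods, gather the
 * confirm words for the low and high 8 bytes of each block via get_conf_fn,
 * then run do_confirm_fdr() on any candidate bits. Returns HWLM_TERMINATED
 * from the enclosing function if the callback requests
 * HWLM_TERMINATE_MATCHING. */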
694 | #define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ |
695 | do { \ |
696 | const u8 *tryFloodDetect = zz->floodPtr; \ |
697 | const u8 *start_ptr = zz->start; \ |
698 | const u8 *end_ptr = zz->end; \ |
699 | \ |
700 | for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ |
701 | itPtr += ITER_BYTES) { \ |
702 | if (unlikely(itPtr > tryFloodDetect)) { \ |
703 | tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ |
704 | &floodBackoff, &control, \ |
705 | ITER_BYTES); \ |
706 | if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ |
707 | return HWLM_TERMINATED; \ |
708 | } \ |
709 | } \ |
710 | __builtin_prefetch(itPtr + ITER_BYTES); \ |
711 | u64a conf0; \ |
712 | u64a conf8; \ |
713 | get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ |
714 | ft, &conf0, &conf8, &s); \ |
715 | do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ |
716 | &last_match_id, zz); \ |
717 | do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ |
718 | &last_match_id, zz); \ |
719 | if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ |
720 | return HWLM_TERMINATED; \ |
721 | } \ |
722 | } /* end for loop */ \ |
723 | } while (0) \ |
724 | |
725 | static never_inline |
726 | hwlm_error_t fdr_engine_exec(const struct FDR *fdr, |
727 | const struct FDR_Runtime_Args *a, |
728 | hwlm_group_t control) { |
729 | assert(ISALIGNED_CL(fdr)); |
730 | |
731 | u32 floodBackoff = FLOOD_BACKOFF_START; |
732 | u32 last_match_id = INVALID_MATCH_ID; |
733 | u32 domain_mask_flipped = ~fdr->domainMask; |
734 | u8 stride = fdr->stride; |
735 | const u64a *ft = |
736 | (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); |
737 | assert(ISALIGNED_CL(ft)); |
738 | const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset); |
739 | assert(ISALIGNED_CL(confBase)); |
740 | struct zone zones[ZONE_MAX]; |
741 | assert(fdr->domain > 8 && fdr->domain < 16); |
742 | |
743 | size_t numZone = prepareZones(a->buf, a->len, |
744 | a->buf_history + a->len_history, |
745 | a->start_offset, a->firstFloodDetect, zones); |
746 | assert(numZone <= ZONE_MAX); |
747 | m128 state = getInitState(fdr, a->len_history, ft, &zones[0]); |
748 | |
749 | for (size_t curZone = 0; curZone < numZone; curZone++) { |
750 | struct zone *z = &zones[curZone]; |
751 | dumpZoneInfo(z, curZone); |
752 | |
753 | /* When a zone contains less data than is processed in an iteration |
754 | * of FDR_MAIN_LOOP(), we need to scan over some extra data. |
755 | * |
756 | * We have chosen to scan this extra data at the start of the |
757 | * iteration. The extra data is either data we have already scanned or |
         * garbage (if it is earlier than offset 0).
759 | * |
760 | * As a result we need to shift the incoming state back so that it will |
761 | * properly line up with the data being scanned. |
762 | * |
         * We also need to forbid reporting any matches in the data being
         * rescanned, as they have already been reported (or lie over garbage,
         * but later stages should also provide that safety guarantee).
766 | */ |
767 | |
768 | u8 shift = z->shift; |
769 | |
770 | state = variable_byte_shift_m128(state, shift); |
771 | |
772 | state = or128(state, load128(zone_or_mask[shift])); |
773 | |
774 | switch (stride) { |
775 | case 1: |
776 | FDR_MAIN_LOOP(z, state, get_conf_stride_1); |
777 | break; |
778 | case 2: |
779 | FDR_MAIN_LOOP(z, state, get_conf_stride_2); |
780 | break; |
781 | case 4: |
782 | FDR_MAIN_LOOP(z, state, get_conf_stride_4); |
783 | break; |
784 | default: |
785 | break; |
786 | } |
787 | } |
788 | |
789 | return HWLM_SUCCESS; |
790 | } |
791 | |
792 | #if defined(HAVE_AVX2) |
793 | #define ONLY_AVX2(func) func |
794 | #else |
795 | #define ONLY_AVX2(func) NULL |
796 | #endif |
797 | |
798 | typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, |
799 | const struct FDR_Runtime_Args *a, |
800 | hwlm_group_t control); |
801 | |
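/* Engine dispatch table, indexed by fdr->engineID: slot 0 is the FDR engine
 * itself, the remaining slots are Teddy variants (the fat Teddy entries are
 * only available when built with AVX2 support). */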
802 | static const FDRFUNCTYPE funcs[] = { |
803 | fdr_engine_exec, |
804 | NULL, /* old: fast teddy */ |
805 | NULL, /* old: fast teddy */ |
806 | ONLY_AVX2(fdr_exec_fat_teddy_msks1), |
807 | ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck), |
808 | ONLY_AVX2(fdr_exec_fat_teddy_msks2), |
809 | ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck), |
810 | ONLY_AVX2(fdr_exec_fat_teddy_msks3), |
811 | ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck), |
812 | ONLY_AVX2(fdr_exec_fat_teddy_msks4), |
813 | ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck), |
814 | fdr_exec_teddy_msks1, |
815 | fdr_exec_teddy_msks1_pck, |
816 | fdr_exec_teddy_msks2, |
817 | fdr_exec_teddy_msks2_pck, |
818 | fdr_exec_teddy_msks3, |
819 | fdr_exec_teddy_msks3_pck, |
820 | fdr_exec_teddy_msks4, |
821 | fdr_exec_teddy_msks4_pck, |
822 | }; |
823 | |
824 | #define FAKE_HISTORY_SIZE 16 |
825 | static const u8 fake_history[FAKE_HISTORY_SIZE]; |
826 | |
827 | hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, |
828 | size_t start, HWLMCallback cb, |
829 | struct hs_scratch *scratch, hwlm_group_t groups) { |
830 | // We guarantee (for safezone construction) that it is safe to read 16 |
831 | // bytes before the end of the history buffer. |
832 | const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE; |
833 | |
834 | const struct FDR_Runtime_Args a = { |
835 | buf, |
836 | len, |
837 | hbuf, |
838 | 0, |
839 | start, |
840 | cb, |
841 | scratch, |
842 | nextFloodDetect(buf, len, FLOOD_BACKOFF_START), |
843 | 0 |
844 | }; |
845 | if (unlikely(a.start_offset >= a.len)) { |
846 | return HWLM_SUCCESS; |
847 | } else { |
848 | assert(funcs[fdr->engineID]); |
849 | return funcs[fdr->engineID](fdr, &a, groups); |
850 | } |
851 | } |
852 | |
853 | hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, |
854 | size_t hlen, const u8 *buf, size_t len, |
855 | size_t start, HWLMCallback cb, |
856 | struct hs_scratch *scratch, |
857 | hwlm_group_t groups) { |
858 | struct FDR_Runtime_Args a = { |
859 | buf, |
860 | len, |
861 | hbuf, |
862 | hlen, |
863 | start, |
864 | cb, |
865 | scratch, |
866 | nextFloodDetect(buf, len, FLOOD_BACKOFF_START), |
867 | /* we are guaranteed to always have 16 initialised bytes at the end of |
868 | * the history buffer (they may be garbage). */ |
869 | hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0 |
870 | }; |
871 | |
872 | hwlm_error_t ret; |
873 | if (unlikely(a.start_offset >= a.len)) { |
874 | ret = HWLM_SUCCESS; |
875 | } else { |
876 | assert(funcs[fdr->engineID]); |
877 | ret = funcs[fdr->engineID](fdr, &a, groups); |
878 | } |
879 | |
880 | return ret; |
881 | } |
882 | |