/*
 * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: common runtime procedures.
 */

#ifndef TEDDY_RUNTIME_COMMON_H_
#define TEDDY_RUNTIME_COMMON_H_

#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#if defined(HAVE_AVX2)
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
#endif
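
/*
 * TEDDY_CONF_TYPE carries a chunk of confirm bits extracted from the SIMD
 * match vector (one bit per candidate byte/bucket position in that chunk);
 * it is sized to the native word so that the find-and-clear-LSB iteration in
 * do_confWithBit_teddy stays cheap.
 */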
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif

#define CHECK_HWLM_TERMINATE_MATCHING                                       \
do {                                                                        \
    if (unlikely(control == HWLM_TERMINATE_MATCHING)) {                     \
        return HWLM_TERMINATED;                                             \
    }                                                                       \
} while (0);

#define CHECK_FLOOD                                                         \
do {                                                                        \
    if (unlikely(ptr > tryFloodDetect)) {                                   \
        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
                                     &floodBackoff, &control, iterBytes);   \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while (0);
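
/*
 * Illustrative usage sketch (not part of this header): both macros expand
 * inside a Teddy scan routine and rely on that routine's locals being named
 * fdr, a, ptr, tryFloodDetect, floodBackoff, control and iterBytes, e.g.
 *
 *     for (const u8 *ptr = start; ptr < end; ptr += iterBytes) {
 *         CHECK_FLOOD; // may advance ptr/tryFloodDetect over flood regions
 *         // ... load a block, apply the shuffle masks, confirm matches ...
 *         CHECK_HWLM_TERMINATE_MATCHING; // stop if a callback asked us to
 *     }
 */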

/*
 * \brief Copy a block of [0,15] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad128.
 */
static really_inline
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    default:
        /* Perform copy with two overlapping 8-byte chunks. */
        assert(len < 16);
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    }
}

// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
//     m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//     then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//         lo         ptr                  hi           hi
//         |----------|-------|------------|............|
//    -start          0 -start+offset MIN(avail,16)
// p_mask             ffff..ff00.........00ffff.........
// ptr < lo:
//     only start zone.
//              history
//       ptr          lo                   hi           hi
//         |----------|-------|------------|............|
//         0      start  start+offset  end(<=16)
// p_mask  ffff.....ffffff..ff00.........00ffff.........
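//
// Worked example (illustrative, derived from the note above): if ptr == lo
// (start == 0), start_offset == 0 and only avail == 5 bytes remain before hi,
// the load is loadu128(p_mask_arr[5] + 16), i.e. n == 5 and m == 0: bytes
// 0..4 of the mask are 0x00 (valid) and bytes 5..15 are 0xff (poisoned).
// With avail >= 16 and start_offset == 1 it is loadu128(p_mask_arr[15] + 15),
// i.e. one leading 0xff byte followed by fifteen 0x00 bytes.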
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[16];
        m128 val128;
    } u;
    u.val128 = zeroes128();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
            assert(start_offset - start <= 16);
            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
                               + 16 - start_offset + start);
            return loadu128(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
                           + 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
                           + 16 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val128;
}
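
/*
 * Usage note (an illustrative sketch based on how the Teddy scan kernels
 * consume this helper, not something defined in this header): the poison
 * mask is typically OR'd into the per-block match vector, e.g.
 *
 *     m128 p_mask;
 *     m128 val = vectoredLoad128(&p_mask, ptr, start_offset, lo, hi,
 *                                hbuf, hlen, nMasks);
 *     m128 r = prep(maskBase, val); // nibble-shuffle stage (defined elsewhere)
 *     r = or128(r, p_mask);         // poisoned bytes become all-ones
 *
 * so that, under Teddy's all-ones == "no candidate" convention, byte
 * positions outside the valid window can never reach confirmation.
 */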

#if defined(HAVE_AVX2)
/*
 * \brief Copy a block of [0,31] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad256.
 */
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
        /* Perform copy with two overlapping 8-byte chunks. */
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 16:
        storeu128(dst, loadu128(src));
        break;
    default:
        /* Perform copy with two overlapping 16-byte chunks. */
        assert(len < 32);
        storeu128(dst + len - 16, loadu128(src + len - 16));
        storeu128(dst, loadu128(src));
        break;
    }
}

// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
//     m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//     then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//         lo         ptr                  hi           hi
//         |----------|-------|------------|............|
//    -start          0 -start+offset MIN(avail,32)
// p_mask             ffff..ff00.........00ffff.........
// ptr < lo:
//     only start zone.
//              history
//       ptr          lo                   hi           hi
//         |----------|-------|------------|............|
//         0      start  start+offset  end(<=32)
// p_mask  ffff.....ffffff..ff00.........00ffff.........
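//
// Worked example (illustrative): with start == 0, start_offset == 0 and
// avail == 20, the load is loadu256(p_mask_arr256[20] + 32), giving 20 valid
// 0x00 bytes followed by 12 poison 0xff bytes.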
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[32];
        m256 val256;
    } u;
    u.val256 = zeroes256();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 32) {
            assert(start_offset - start <= 32);
            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
                               + 32 - start_offset + start);
            return loadu256(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
                           + 32 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
                           + 32 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val256;
}
#endif // HAVE_AVX2

#if defined(HAVE_AVX512)
// Note: p_mask is an output param that initialises a poison mask.
// u64a k = ones_u64a << n' >> m'; // m' <= n'
// *p_mask = set_mask_m512(~k);
// means p_mask consists of:
//     (n' - m') poison bytes of 0xff at the beginning,
//     followed by (64 - n') valid bytes of 0x00,
//     then the remaining m' poison bytes of 0xff.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//         lo         ptr                  hi           hi
//         |----------|-------|------------|............|
//    -start          0 -start+offset MIN(avail,64)
// p_mask             ffff..ff00.........00ffff.........
// ptr < lo:
//     only start zone.
//              history
//       ptr          lo                   hi           hi
//         |----------|-------|------------|............|
//         0      start  start+offset  end(<=64)
// p_mask  ffff.....ffffff..ff00.........00ffff.........
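//
// Worked example (illustrative): with start == 0, start_offset == 0 and
// avail == 10, k = ones_u64a << 54 >> 54, i.e. the low 10 bits set, so
// set_mask_m512(~k) leaves bytes 0..9 valid (0x00) and poisons bytes
// 10..63 (0xff).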
static really_inline
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
                     const u32 nMasks) {
    m512 val;

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 64) {
            assert(start_offset - start <= 64);
            u64a k = ones_u64a << (start_offset - start);
            *p_mask = set_mask_m512(~k);
            return loadu512(ptr);
        }
        assert(start_offset - start <= avail);
        u64a k = ones_u64a << (64 - avail + start_offset - start)
                 >> (64 - avail);
        *p_mask = set_mask_m512(~k);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(hlen, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
        val = loadu_maskz_m512(j, &hbuf[hlen - start]);
        uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
        *p_mask = set_mask_m512(~k);
        copy_start = start;
        copy_len = end - start;
    }

    assert(copy_len < 64);
    assert(copy_len > 0);
    u64a j = ones_u64a >> (64 - copy_len) << copy_start;
    val = loadu_mask_m512(val, j, ptr);

    return val;
}
#endif // HAVE_AVX512
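
/*
 * Load the eight bytes of input ending at ptr[byte] that literal confirmation
 * operates on. When we are vectoring and the confirm window starts before the
 * scanned buffer, the missing leading bytes are stitched in from the stream
 * history kept in a->histBytes.
 */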
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                CautionReason reason) {
    u64a confVal = 0;
    const u8 *buf = a->buf;
    size_t len = a->len;
    const u8 *confirm_loc = ptr + byte - 7;
    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
        confVal = lv_u64a(confirm_loc, buf, buf + len);
    } else { // r == VECTORING, confirm_loc < buf
        u64a histBytes = a->histBytes;
        confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
        // stitch together confVal and history
        u32 overhang = buf - confirm_loc;
        histBytes >>= 64 - (overhang * 8);
        confVal |= histBytes;
    }
    return confVal;
}
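
/*
 * Iterate over the set bits of a Teddy confirm word. Each bit names a
 * candidate (byte offset, bucket) pair for this block: the bucket's
 * FDRConfirm structure is located via confBase and full literal confirmation
 * is run on it, updating *control and *last_match as matches are reported.
 */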
static really_inline
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                          const u32 *confBase, CautionReason reason,
                          const struct FDR_Runtime_Args *a, const u8 *ptr,
                          hwlmcb_rv_t *control, u32 *last_match) {
    do {
        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
        u32 byte = bit / bucket + offset;
        u32 idx = bit % bucket;
        u32 cf = confBase[idx];
        if (!cf) {
            continue;
        }
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
        if (!(fdrc->groups & *control)) {
            continue;
        }
        u64a tmp = 0;
        u64a confVal = getConfVal(a, ptr, byte, reason);
        confWithBit(fdrc, a, ptr - a->buf + byte, control,
                    last_match, confVal, &tmp, 0);
    } while (unlikely(*conf));
}
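
/*
 * The nibble mask tables are laid out immediately after the Teddy header,
 * which is padded out to a cacheline boundary.
 */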
static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
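
/*
 * The reinforced mask table follows the 2 * numMask nibble masks, again
 * rounded up to a cacheline boundary.
 */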
static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
    return (const u64a *)((const u8 *)getMaskBase(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}
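
/*
 * Confirm structures live at the offset recorded in the Teddy header by the
 * Teddy/FDR compile stage.
 */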
static really_inline
const u32 *getConfBase(const struct Teddy *teddy) {
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);
}

#endif /* TEDDY_RUNTIME_COMMON_H_ */