/*
 * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: common runtime procedures.
 */

#ifndef TEDDY_RUNTIME_COMMON_H_
#define TEDDY_RUNTIME_COMMON_H_

#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#if defined(HAVE_AVX2)
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
#endif

#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif

#define CHECK_HWLM_TERMINATE_MATCHING                                       \
do {                                                                        \
    if (unlikely(control == HWLM_TERMINATE_MATCHING)) {                     \
        return HWLM_TERMINATED;                                             \
    }                                                                       \
} while (0);

#define CHECK_FLOOD                                                         \
do {                                                                        \
    if (unlikely(ptr > tryFloodDetect)) {                                   \
        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
                                     &floodBackoff, &control, iterBytes);   \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while (0);

/*
 * \brief Copy a block of [0,15] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad128.
 */
static really_inline
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
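        /* e.g. for len == 6 this writes bytes [2,6) and then bytes [0,4);
         * the overlapping middle bytes are simply written twice. */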
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    default:
        /* Perform copy with two overlapping 8-byte chunks. */
        assert(len < 16);
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    }
}

// Note: p_mask is an output param that receives a poison mask.
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
//   m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//   then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for the end/short zone, ptr == lo and start_offset == 0
//     for the start zone, see below
//         lo        ptr                  hi            hi
//         |----------|-------|----------------|............|
//            -start  0       -start+offset    MIN(avail,16)
// p_mask   ffff..ff0000...........00ffff..........
// ptr < lo:
//     start zone only.
//              history
//         ptr              lo             hi            hi
//         |----------|-------|----------------|............|
//         0          start   start+offset     end(<=16)
// p_mask   ffff.....ffffff..ff0000...........00ffff..........
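// For illustration (hypothetical values): with start == 0, start_offset == 2
// and avail == 10, the partial-block branch below loads
// p_mask_arr[10 - 2] + 16 - 2, i.e. a mask of 2 poison bytes 0xff, then
// 8 valid bytes 0x00, then 6 poison bytes 0xff.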
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[16];
        m128 val128;
    } u;
    u.val128 = zeroes128();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
            assert(start_offset - start <= 16);
            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
                               + 16 - start_offset + start);
            return loadu128(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
                           + 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
                           + 16 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val128;
}

#if defined(HAVE_AVX2)
/*
 * \brief Copy a block of [0,31] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad256.
 */
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
        /* Perform copy with two overlapping 8-byte chunks. */
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 16:
        storeu128(dst, loadu128(src));
        break;
    default:
        /* Perform copy with two overlapping 16-byte chunks. */
        assert(len < 32);
        storeu128(dst + len - 16, loadu128(src + len - 16));
        storeu128(dst, loadu128(src));
        break;
    }
}

// Note: p_mask is an output param that receives a poison mask.
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
//   m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//   then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for the end/short zone, ptr == lo and start_offset == 0
//     for the start zone, see below
//         lo        ptr                  hi            hi
//         |----------|-------|----------------|............|
//            -start  0       -start+offset    MIN(avail,32)
// p_mask   ffff..ff0000...........00ffff..........
// ptr < lo:
//     start zone only.
//              history
//         ptr              lo             hi            hi
//         |----------|-------|----------------|............|
//         0          start   start+offset     end(<=32)
// p_mask   ffff.....ffffff..ff0000...........00ffff..........
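// For illustration (hypothetical values): with start == 0, start_offset == 0
// and avail == 20, the partial-block branch below loads
// p_mask_arr256[20] + 32, i.e. a mask of 20 valid bytes 0x00 followed by
// 12 poison bytes 0xff.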
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[32];
        m256 val256;
    } u;
    u.val256 = zeroes256();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 32) {
            assert(start_offset - start <= 32);
            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
                               + 32 - start_offset + start);
            return loadu256(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
                           + 32 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
                           + 32 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val256;
}
#endif // HAVE_AVX2

#if defined(HAVE_AVX512)
// Note: p_mask is an output param that receives a poison mask.
// u64a k = ones_u64a << n' >> m';  // m' < n'
// *p_mask = set_mask_m512(~k);
// means p_mask consists of:
//   (n' - m') poison bytes 0xff at the beginning,
//   followed by (64 - n') valid bytes 0x00,
//   then the remaining m' poison bytes 0xff.
// ptr >= lo:
//     no history.
//     for the end/short zone, ptr == lo and start_offset == 0
//     for the start zone, see below
//         lo        ptr                  hi            hi
//         |----------|-------|----------------|............|
//            -start  0       -start+offset    MIN(avail,64)
// p_mask   ffff..ff0000...........00ffff..........
// ptr < lo:
//     start zone only.
//              history
//         ptr              lo             hi            hi
//         |----------|-------|----------------|............|
//         0          start   start+offset     end(<=64)
// p_mask   ffff.....ffffff..ff0000...........00ffff..........
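// For illustration (hypothetical values): with start == 0, start_offset == 2
// and avail == 10, the partial-block branch below computes
// k = ones_u64a << 56 >> 54, so ~k has bits 0..1 and 10..63 set and the
// resulting p_mask poisons bytes [0,2) and [10,64) while leaving the eight
// valid bytes [2,10) clear.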
static really_inline
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
                     const u32 nMasks) {
    m512 val;

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 64) {
            assert(start_offset - start <= 64);
            u64a k = ones_u64a << (start_offset - start);
            *p_mask = set_mask_m512(~k);
            return loadu512(ptr);
        }
        assert(start_offset - start <= avail);
        u64a k = ones_u64a << (64 - avail + start_offset - start)
                 >> (64 - avail);
        *p_mask = set_mask_m512(~k);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(hlen, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
        val = loadu_maskz_m512(j, &hbuf[hlen - start]);
        uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
        *p_mask = set_mask_m512(~k);
        copy_start = start;
        copy_len = end - start;
    }

    assert(copy_len < 64);
    assert(copy_len > 0);
    u64a j = ones_u64a >> (64 - copy_len) << copy_start;
    val = loadu_mask_m512(val, j, ptr);

    return val;
}
#endif // HAVE_AVX512

static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                CautionReason reason) {
    u64a confVal = 0;
    const u8 *buf = a->buf;
    size_t len = a->len;
    const u8 *confirm_loc = ptr + byte - 7;
    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
        confVal = lv_u64a(confirm_loc, buf, buf + len);
    } else { // reason == VECTORING, confirm_loc < buf
        u64a histBytes = a->histBytes;
        confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
        // stitch together confVal and history
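        // e.g. with overhang == 3 (confirm_loc three bytes before buf), the
        // top three bytes of histBytes are shifted down into the low three
        // bytes of confVal, filling in the bytes that precede the buffer.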
        u32 overhang = buf - confirm_loc;
        histBytes >>= 64 - (overhang * 8);
        confVal |= histBytes;
    }
    return confVal;
}

static really_inline
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                          const u32 *confBase, CautionReason reason,
                          const struct FDR_Runtime_Args *a, const u8 *ptr,
                          hwlmcb_rv_t *control, u32 *last_match) {
    do {
        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
        u32 byte = bit / bucket + offset;
        u32 idx = bit % bucket;
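        // For illustration (assuming bucket == 8): bit 19 decodes to byte
        // 19 / 8 + offset == offset + 2 of this block, in bucket 19 % 8 == 3.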
        u32 cf = confBase[idx];
        if (!cf) {
            continue;
        }
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
        if (!(fdrc->groups & *control)) {
            continue;
        }
        u64a tmp = 0;
        u64a confVal = getConfVal(a, ptr, byte, reason);
        confWithBit(fdrc, a, ptr - a->buf + byte, control,
                    last_match, confVal, &tmp, 0);
    } while (unlikely(*conf));
}

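// Accessors into the Teddy bytecode. The layout implied by the offsets used
// below is: the struct Teddy header, then the mask table (2 * numMask m128
// values starting at the next cache-line boundary), then the reinforced mask
// table, with the confirm structures located at teddy->confOffset.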
static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}

static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
    return (const u64a *)((const u8 *)getMaskBase(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}

static really_inline
const u32 *getConfBase(const struct Teddy *teddy) {
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);
}

#endif /* TEDDY_RUNTIME_COMMON_H_ */