/*
 * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: common runtime procedures.
 */

#ifndef TEDDY_RUNTIME_COMMON_H_
#define TEDDY_RUNTIME_COMMON_H_

#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#if defined(HAVE_AVX2)
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
#endif
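
/*
 * TEDDY_CONF_TYPE carries a chunk of confirm bits extracted from the SIMD
 * match vector (one bit per candidate byte/bucket position in that chunk);
 * it is sized to the native word so that the find-and-clear-LSB iteration in
 * do_confWithBit_teddy stays cheap.
 */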
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif

#define CHECK_HWLM_TERMINATE_MATCHING                                       \
do {                                                                        \
    if (unlikely(control == HWLM_TERMINATE_MATCHING)) {                     \
        return HWLM_TERMINATED;                                             \
    }                                                                       \
} while (0);

#define CHECK_FLOOD                                                         \
do {                                                                        \
    if (unlikely(ptr > tryFloodDetect)) {                                   \
        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
                                     &floodBackoff, &control, iterBytes);   \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while (0);
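
/*
 * Illustrative usage sketch (not part of this header): both macros expand
 * inside a Teddy scan routine and rely on that routine's locals being named
 * fdr, a, ptr, tryFloodDetect, floodBackoff, control and iterBytes, e.g.
 *
 *     for (const u8 *ptr = start; ptr < end; ptr += iterBytes) {
 *         CHECK_FLOOD; // may advance ptr/tryFloodDetect over flood regions
 *         // ... load a block, apply the shuffle masks, confirm matches ...
 *         CHECK_HWLM_TERMINATE_MATCHING; // stop if a callback asked us to
 *     }
 */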

/*
 * \brief Copy a block of [0,15] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad128.
 */
static really_inline
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    default:
        /* Perform copy with two overlapping 8-byte chunks. */
        assert(len < 16);
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    }
}

// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
//     m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//     then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//         lo         ptr                  hi           hi
//         |----------|-------|------------|............|
//    -start          0 -start+offset MIN(avail,16)
// p_mask             ffff..ff00.........00ffff.........
// ptr < lo:
//     only start zone.
//              history
//       ptr          lo                   hi           hi
//         |----------|-------|------------|............|
//         0      start  start+offset  end(<=16)
// p_mask  ffff.....ffffff..ff00.........00ffff.........
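//
// Worked example (illustrative, derived from the note above): if ptr == lo
// (start == 0), start_offset == 0 and only avail == 5 bytes remain before hi,
// the load is loadu128(p_mask_arr[5] + 16), i.e. n == 5 and m == 0: bytes
// 0..4 of the mask are 0x00 (valid) and bytes 5..15 are 0xff (poisoned).
// With avail >= 16 and start_offset == 1 it is loadu128(p_mask_arr[15] + 15),
// i.e. one leading 0xff byte followed by fifteen 0x00 bytes.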
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[16];
        m128 val128;
    } u;
    u.val128 = zeroes128();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
            assert(start_offset - start <= 16);
            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
                               + 16 - start_offset + start);
            return loadu128(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
                           + 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
                           + 16 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val128;
}
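
/*
 * Usage note (an illustrative sketch based on how the Teddy scan kernels
 * consume this helper, not something defined in this header): the poison
 * mask is typically OR'd into the per-block match vector, e.g.
 *
 *     m128 p_mask;
 *     m128 val = vectoredLoad128(&p_mask, ptr, start_offset, lo, hi,
 *                                hbuf, hlen, nMasks);
 *     m128 r = prep(maskBase, val); // nibble-shuffle stage (defined elsewhere)
 *     r = or128(r, p_mask);         // poisoned bytes become all-ones
 *
 * so that, under Teddy's all-ones == "no candidate" convention, byte
 * positions outside the valid window can never reach confirmation.
 */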

#if defined(HAVE_AVX2)
/*
 * \brief Copy a block of [0,31] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad256.
 */
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
        /* Perform copy with two overlapping 8-byte chunks. */
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 16:
        storeu128(dst, loadu128(src));
        break;
    default:
        /* Perform copy with two overlapping 16-byte chunks. */
        assert(len < 32);
        storeu128(dst + len - 16, loadu128(src + len - 16));
        storeu128(dst, loadu128(src));
        break;
    }
}

// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
//     m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//     then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//         lo         ptr                  hi           hi
//         |----------|-------|------------|............|
//    -start          0 -start+offset MIN(avail,32)
// p_mask             ffff..ff00.........00ffff.........
// ptr < lo:
//     only start zone.
//              history
//       ptr          lo                   hi           hi
//         |----------|-------|------------|............|
//         0      start  start+offset  end(<=32)
// p_mask  ffff.....ffffff..ff00.........00ffff.........
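//
// Worked example (illustrative): with start == 0, start_offset == 0 and
// avail == 20, the load is loadu256(p_mask_arr256[20] + 32), giving 20 valid
// 0x00 bytes followed by 12 poison 0xff bytes.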
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[32];
        m256 val256;
    } u;
    u.val256 = zeroes256();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 32) {
            assert(start_offset - start <= 32);
            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
                               + 32 - start_offset + start);
            return loadu256(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
                           + 32 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
                           + 32 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val256;
}
#endif // HAVE_AVX2

#if defined(HAVE_AVX512)
// Note: p_mask is an output param that initialises a poison mask.
// u64a k = ones_u64a << n' >> m'; // m' <= n'
// *p_mask = set_mask_m512(~k);
// means p_mask consists of:
//     (n' - m') poison bytes of 0xff at the beginning,
//     followed by (64 - n') valid bytes of 0x00,
//     then the remaining m' poison bytes of 0xff.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//         lo         ptr                  hi           hi
//         |----------|-------|------------|............|
//    -start          0 -start+offset MIN(avail,64)
// p_mask             ffff..ff00.........00ffff.........
// ptr < lo:
//     only start zone.
//              history
//       ptr          lo                   hi           hi
//         |----------|-------|------------|............|
//         0      start  start+offset  end(<=64)
// p_mask  ffff.....ffffff..ff00.........00ffff.........
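//
// Worked example (illustrative): with start == 0, start_offset == 0 and
// avail == 10, k = ones_u64a << 54 >> 54, i.e. the low 10 bits set, so
// set_mask_m512(~k) leaves bytes 0..9 valid (0x00) and poisons bytes
// 10..63 (0xff).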
static really_inline
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
                     const u32 nMasks) {
    m512 val;

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 64) {
            assert(start_offset - start <= 64);
            u64a k = ones_u64a << (start_offset - start);
            *p_mask = set_mask_m512(~k);
            return loadu512(ptr);
        }
        assert(start_offset - start <= avail);
        u64a k = ones_u64a << (64 - avail + start_offset - start)
                 >> (64 - avail);
        *p_mask = set_mask_m512(~k);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(hlen, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
        val = loadu_maskz_m512(j, &hbuf[hlen - start]);
        uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
        *p_mask = set_mask_m512(~k);
        copy_start = start;
        copy_len = end - start;
    }

    assert(copy_len < 64);
    assert(copy_len > 0);
    u64a j = ones_u64a >> (64 - copy_len) << copy_start;
    val = loadu_mask_m512(val, j, ptr);

    return val;
}
#endif // HAVE_AVX512
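
/*
 * Load the eight bytes of input ending at ptr[byte] that literal confirmation
 * operates on. When we are vectoring and the confirm window starts before the
 * scanned buffer, the missing leading bytes are stitched in from the stream
 * history kept in a->histBytes.
 */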
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                CautionReason reason) {
    u64a confVal = 0;
    const u8 *buf = a->buf;
    size_t len = a->len;
    const u8 *confirm_loc = ptr + byte - 7;
    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
        confVal = lv_u64a(confirm_loc, buf, buf + len);
    } else { // r == VECTORING, confirm_loc < buf
        u64a histBytes = a->histBytes;
        confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
        // stitch together confVal and history
        u32 overhang = buf - confirm_loc;
        histBytes >>= 64 - (overhang * 8);
        confVal |= histBytes;
    }
    return confVal;
}
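
/*
 * Iterate over the set bits of a Teddy confirm word. Each bit names a
 * candidate (byte offset, bucket) pair for this block: the bucket's
 * FDRConfirm structure is located via confBase and full literal confirmation
 * is run on it, updating *control and *last_match as matches are reported.
 */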
static really_inline
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                          const u32 *confBase, CautionReason reason,
                          const struct FDR_Runtime_Args *a, const u8 *ptr,
                          hwlmcb_rv_t *control, u32 *last_match) {
    do {
        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
        u32 byte = bit / bucket + offset;
        u32 idx = bit % bucket;
        u32 cf = confBase[idx];
        if (!cf) {
            continue;
        }
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
        if (!(fdrc->groups & *control)) {
            continue;
        }
        u64a tmp = 0;
        u64a confVal = getConfVal(a, ptr, byte, reason);
        confWithBit(fdrc, a, ptr - a->buf + byte, control,
                    last_match, confVal, &tmp, 0);
    } while (unlikely(*conf));
}
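
/*
 * The nibble mask tables are laid out immediately after the Teddy header,
 * which is padded out to a cacheline boundary.
 */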
static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
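
/*
 * The reinforced mask table follows the 2 * numMask nibble masks, again
 * rounded up to a cacheline boundary.
 */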
static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
    return (const u64a *)((const u8 *)getMaskBase(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}
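
/*
 * Confirm structures live at the offset recorded in the Teddy header by the
 * Teddy/FDR compile stage.
 */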
static really_inline
const u32 *getConfBase(const struct Teddy *teddy) {
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);
}

#endif /* TEDDY_RUNTIME_COMMON_H_ */