/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: SSSE3 engine runtime.
 */

#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/simd_utils.h"

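/*
 * Partial-load masks for the cautious head/tail blocks. Row n carries n
 * 0x00 bytes (starting at offset 16) surrounded by 0xff, so an unaligned
 * 16-byte load from the right offset within a row yields a mask whose valid
 * lanes are clear. The scanner ORs this mask into the shuffle result;
 * since a cleared bit is what marks a match candidate, out-of-range lanes
 * are forced to all-ones and can never be reported. The exact indexing
 * lives in the vectoredLoad*() helpers in teddy_runtime_common.h.
 */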
const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};

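/*
 * Check one GPR-sized lane of a confirm vector. A lane that is still
 * all-ones had no candidate in any bucket; otherwise the lane is inverted
 * (so candidates become set bits) and handed to the confirm function,
 * which walks the bits and runs full literal confirmation. For example, a
 * 64-bit chunk of 0xffffffffffffff00 inverts to 0xff: all eight bucket
 * bits of the chunk's first byte are candidates. `confBase`, `a`, `ptr`,
 * `control` and `last_match` come from the expansion context inside
 * FDR_EXEC_TEDDY below.
 */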
#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn)                  \
do {                                                                        \
    if (unlikely(chunk != ones_u64a)) {                                     \
        chunk = ~chunk;                                                     \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
                &control, &last_match);                                     \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while(0)

#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn)                  \
do {                                                                        \
    if (unlikely(chunk != ones_u32)) {                                      \
        chunk = ~chunk;                                                     \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
                &control, &last_match);                                     \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while(0)

#if defined(HAVE_AVX512) // AVX512 reinforced teddy

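/*
 * Confirm stage for the 512-bit path, only entered when the result vector
 * differs from all-ones. The vector is split into four 128-bit lanes and
 * then into GPR-sized chunks (64-bit on 64-bit targets, 32-bit otherwise),
 * each checked at its byte offset within the 64-byte block.
 */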
#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff512(var, ones512()))) {                                \
        m128 p128_0 = extract128from512(var, 0);                            \
        m128 p128_1 = extract128from512(var, 1);                            \
        m128 p128_2 = extract128from512(var, 2);                            \
        m128 p128_3 = extract128from512(var, 3);                            \
        u64a part1 = movq(p128_0);                                          \
        u64a part2 = movq(rshiftbyte_m128(p128_0, 8));                      \
        u64a part3 = movq(p128_1);                                          \
        u64a part4 = movq(rshiftbyte_m128(p128_1, 8));                      \
        u64a part5 = movq(p128_2);                                          \
        u64a part6 = movq(rshiftbyte_m128(p128_2, 8));                      \
        u64a part7 = movq(p128_3);                                          \
        u64a part8 = movq(rshiftbyte_m128(p128_3, 8));                      \
        CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn);         \
        CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn);         \
        CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn);         \
        CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn);         \
        CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn);         \
    }                                                                       \
} while(0)
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff512(var, ones512()))) {                                \
        m128 p128_0 = extract128from512(var, 0);                            \
        m128 p128_1 = extract128from512(var, 1);                            \
        m128 p128_2 = extract128from512(var, 2);                            \
        m128 p128_3 = extract128from512(var, 3);                            \
        u32 part1 = movd(p128_0);                                           \
        u32 part2 = movd(rshiftbyte_m128(p128_0, 4));                       \
        u32 part3 = movd(rshiftbyte_m128(p128_0, 8));                       \
        u32 part4 = movd(rshiftbyte_m128(p128_0, 12));                      \
        u32 part5 = movd(p128_1);                                           \
        u32 part6 = movd(rshiftbyte_m128(p128_1, 4));                       \
        u32 part7 = movd(rshiftbyte_m128(p128_1, 8));                       \
        u32 part8 = movd(rshiftbyte_m128(p128_1, 12));                      \
        u32 part9 = movd(p128_2);                                           \
        u32 part10 = movd(rshiftbyte_m128(p128_2, 4));                      \
        u32 part11 = movd(rshiftbyte_m128(p128_2, 8));                      \
        u32 part12 = movd(rshiftbyte_m128(p128_2, 12));                     \
        u32 part13 = movd(p128_3);                                          \
        u32 part14 = movd(rshiftbyte_m128(p128_3, 4));                      \
        u32 part15 = movd(rshiftbyte_m128(p128_3, 8));                      \
        u32 part16 = movd(rshiftbyte_m128(p128_3, 12));                     \
        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
        CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn);         \
        CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn);         \
        CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn);         \
        CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn);         \
        CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn);        \
        CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn);        \
        CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn);        \
        CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn);        \
        CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn);        \
        CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn);        \
        CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn);        \
    }                                                                       \
} while(0)
#endif

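/*
 * Reinforcement. lshift128_m512 (used by SHIFT_OR_M2..M4 below) shifts
 * zeroes into the low bytes of each 128-bit lane, which would make every
 * lane-boundary byte a spurious candidate for multi-mask models. To patch
 * this, PREP_SHUF_MASK looks up a 64-bit "reinforced" mask keyed on the
 * byte just before each lane (c_0 carries byte 63 over from the previous
 * block) and ORs it into the low quadword of that lane. The contexts are
 * initialised to 0x100, a spare table row for the no-history case.
 */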
#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
    m512 lo = and512(val, *lo_mask);                                        \
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)

#define PREP_SHUF_MASK                                                      \
    PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr));                          \
    *c_16 = *(ptr + 15);                                                    \
    *c_32 = *(ptr + 31);                                                    \
    *c_48 = *(ptr + 47);                                                    \
    m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\
                           0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\
    *c_0 = *(ptr + 63)

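/*
 * SHIFT_OR_M<n> builds the combined result for an n-mask model: each
 * additional mask pair is shuffled the same way as the first, shifted left
 * by one more byte within each 128-bit lane, and OR'd in, so a bucket bit
 * survives as 0 (a candidate) only if all n per-position masks leave it 0.
 */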
#define SHIFT_OR_M1                                                         \
    or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))

#define SHIFT_OR_M2                                                         \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo),                \
                               pshufb_m512(dup_mask[3], hi)),               \
                         1), SHIFT_OR_M1)

#define SHIFT_OR_M3                                                         \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo),                \
                               pshufb_m512(dup_mask[5], hi)),               \
                         2), SHIFT_OR_M2)

#define SHIFT_OR_M4                                                         \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo),                \
                               pshufb_m512(dup_mask[7], hi)),               \
                         3), SHIFT_OR_M3)

static really_inline
m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M1;
}

static really_inline
m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M2;
}

static really_inline
m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M3;
}

static really_inline
m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M4;
}

static really_inline
m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M1, r_msk);
}

static really_inline
m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M2, r_msk);
}

static really_inline
m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M3, r_msk);
}

static really_inline
m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M4, r_msk);
}

#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                               \
    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)

#define PREP_CONF_FN(ptr, n)                                                \
    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base,               \
                         &c_0, &c_16, &c_32, &c_48)

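/*
 * PREPARE_MASKS(n) hoists mask setup out of the scan loop: the 2*n
 * 128-bit nibble masks (one lo/hi pair per mask position) are broadcast
 * across all lanes of the wide vector once.
 */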
#define PREPARE_MASKS_1                                                     \
    dup_mask[0] = set4x128(maskBase[0]);                                    \
    dup_mask[1] = set4x128(maskBase[1]);

#define PREPARE_MASKS_2                                                     \
    PREPARE_MASKS_1                                                         \
    dup_mask[2] = set4x128(maskBase[2]);                                    \
    dup_mask[3] = set4x128(maskBase[3]);

#define PREPARE_MASKS_3                                                     \
    PREPARE_MASKS_2                                                         \
    dup_mask[4] = set4x128(maskBase[4]);                                    \
    dup_mask[5] = set4x128(maskBase[5]);

#define PREPARE_MASKS_4                                                     \
    PREPARE_MASKS_3                                                         \
    dup_mask[6] = set4x128(maskBase[6]);                                    \
    dup_mask[7] = set4x128(maskBase[7]);

#define PREPARE_MASKS(n)                                                    \
    m512 lo_mask = set64x8(0xf);                                            \
    m512 dup_mask[n * 2];                                                   \
    PREPARE_MASKS_##n

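/*
 * Shared scan skeleton, parameterised on mask count and confirm function:
 *  - a cautious, possibly partial head block (vectoredLoad512 + p_mask);
 *  - one aligned warm-up block, still confirmed as VECTORING;
 *  - the main loop, two 64-byte blocks per iteration, with prefetch and
 *    flood detection;
 *  - an aligned tail block, then a final cautious partial block.
 */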
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                     \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 128;                                           \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m128 *maskBase = getMaskBase(teddy);                              \
    PREPARE_MASKS(n_msk);                                                   \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);           \
    u32 c_0 = 0x100;                                                        \
    u32 c_16 = 0x100;                                                       \
    u32 c_32 = 0x100;                                                       \
    u32 c_48 = 0x100;                                                       \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 64;                                               \
        m512 p_mask;                                                        \
        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,         \
                                     a->buf, buf_end,                       \
                                     a->buf_history, a->len_history, n_msk);\
        m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 64;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 64 <= buf_end) {                                              \
        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 64;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk);                           \
        CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn);                   \
    }                                                                       \
                                                                            \
    if (ptr + 64 <= buf_end) {                                              \
        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        ptr += 64;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 64 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m512 p_mask;                                                        \
        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,         \
                                     a->buf_history, a->len_history, n_msk);\
        m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy

#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff256(var, ones256()))) {                                \
        m128 lo = movdq_lo(var);                                            \
        m128 hi = movdq_hi(var);                                            \
        u64a part1 = movq(lo);                                              \
        u64a part2 = movq(rshiftbyte_m128(lo, 8));                          \
        u64a part3 = movq(hi);                                              \
        u64a part4 = movq(rshiftbyte_m128(hi, 8));                          \
        CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn);         \
    }                                                                       \
} while(0)
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff256(var, ones256()))) {                                \
        m128 lo = movdq_lo(var);                                            \
        m128 hi = movdq_hi(var);                                            \
        u32 part1 = movd(lo);                                               \
        u32 part2 = movd(rshiftbyte_m128(lo, 4));                           \
        u32 part3 = movd(rshiftbyte_m128(lo, 8));                           \
        u32 part4 = movd(rshiftbyte_m128(lo, 12));                          \
        u32 part5 = movd(hi);                                               \
        u32 part6 = movd(rshiftbyte_m128(hi, 4));                           \
        u32 part7 = movd(rshiftbyte_m128(hi, 8));                           \
        u32 part8 = movd(rshiftbyte_m128(hi, 12));                          \
        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
        CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn);         \
        CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn);         \
        CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn);         \
    }                                                                       \
} while(0)
#endif

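/*
 * The 256-bit machinery below mirrors the AVX512 path at half width: two
 * reinforcement context bytes (c_0, c_128) instead of four, 32-byte
 * blocks, and a 64-byte loop iteration in FDR_EXEC_TEDDY.
 */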
#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
    m256 lo = and256(val, *lo_mask);                                        \
    m256 hi = and256(rshift64_m256(val, 4), *lo_mask)

#define PREP_SHUF_MASK                                                      \
    PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr));                          \
    *c_128 = *(ptr + 15);                                                   \
    m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
    *c_0 = *(ptr + 31)

#define SHIFT_OR_M1                                                         \
    or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi))

#define SHIFT_OR_M2                                                         \
    or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo),                \
                               pshufb_m256(dup_mask[3], hi)),               \
                         1), SHIFT_OR_M1)

#define SHIFT_OR_M3                                                         \
    or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo),                \
                               pshufb_m256(dup_mask[5], hi)),               \
                         2), SHIFT_OR_M2)

#define SHIFT_OR_M4                                                         \
    or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo),                \
                               pshufb_m256(dup_mask[7], hi)),               \
                         3), SHIFT_OR_M3)

static really_inline
m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M1;
}

static really_inline
m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M2;
}

static really_inline
m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M3;
}

static really_inline
m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M4;
}

static really_inline
m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M1, r_msk);
}

static really_inline
m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M2, r_msk);
}

static really_inline
m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M3, r_msk);
}

static really_inline
m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M4, r_msk);
}

#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                               \
    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)

#define PREP_CONF_FN(ptr, n)                                                \
    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)

#define PREPARE_MASKS_1                                                     \
    dup_mask[0] = set2x128(maskBase[0]);                                    \
    dup_mask[1] = set2x128(maskBase[1]);

#define PREPARE_MASKS_2                                                     \
    PREPARE_MASKS_1                                                         \
    dup_mask[2] = set2x128(maskBase[2]);                                    \
    dup_mask[3] = set2x128(maskBase[3]);

#define PREPARE_MASKS_3                                                     \
    PREPARE_MASKS_2                                                         \
    dup_mask[4] = set2x128(maskBase[4]);                                    \
    dup_mask[5] = set2x128(maskBase[5]);

#define PREPARE_MASKS_4                                                     \
    PREPARE_MASKS_3                                                         \
    dup_mask[6] = set2x128(maskBase[6]);                                    \
    dup_mask[7] = set2x128(maskBase[7]);

#define PREPARE_MASKS(n)                                                    \
    m256 lo_mask = set32x8(0xf);                                            \
    m256 dup_mask[n * 2];                                                   \
    PREPARE_MASKS_##n

#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                     \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 64;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m128 *maskBase = getMaskBase(teddy);                              \
    PREPARE_MASKS(n_msk);                                                   \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);           \
    u32 c_0 = 0x100;                                                        \
    u32 c_128 = 0x100;                                                      \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 32;                                               \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,         \
                                     a->buf, buf_end,                       \
                                     a->buf_history, a->len_history, n_msk);\
        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 32 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk);                           \
        CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn);                   \
    }                                                                       \
                                                                            \
    if (ptr + 32 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 32 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,         \
                                     a->buf_history, a->len_history, n_msk);\
        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#else // not defined HAVE_AVX2

#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff128(var, ones128()))) {                                \
        u64a lo = movq(var);                                                \
        u64a hi = movq(rshiftbyte_m128(var, 8));                            \
        CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn);                 \
        CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn);             \
    }                                                                       \
} while(0)
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff128(var, ones128()))) {                                \
        u32 part1 = movd(var);                                              \
        u32 part2 = movd(rshiftbyte_m128(var, 4));                          \
        u32 part3 = movd(rshiftbyte_m128(var, 8));                          \
        u32 part4 = movd(rshiftbyte_m128(var, 12));                         \
        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
    }                                                                       \
} while(0)
#endif

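/*
 * 128-bit fallback path: no reinforcement table. Instead, each mask
 * position keeps the previous block's shuffle result (old_1..old_3) and
 * stitches it to the current one with palignr, so the bytes shifted in at
 * the block boundary are real history rather than zeroes.
 */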
static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    return or128(pshufb_m128(maskBase[0 * 2], lo),
                 pshufb_m128(maskBase[0 * 2 + 1], hi));
}

static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r = prep_conf_teddy_m1(maskBase, val);

    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
                       pshufb_m128(maskBase[1 * 2 + 1], hi));
    m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1);
    *old_1 = res_1;
    return or128(r, res_shifted_1);
}

static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
                        m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r = prep_conf_teddy_m2(maskBase, old_1, val);

    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
                       pshufb_m128(maskBase[2 * 2 + 1], hi));
    m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2);
    *old_2 = res_2;
    return or128(r, res_shifted_2);
}

static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
                        m128 *old_3, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);

    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
                       pshufb_m128(maskBase[3 * 2 + 1], hi));
    m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3);
    *old_3 = res_3;
    return or128(r, res_shifted_3);
}

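/*
 * Per-model carried state: an n-mask model needs n-1 "old result"
 * vectors, zeroed so that the first block sees empty history.
 */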
#define FDR_EXEC_TEDDY_RES_OLD_1

#define FDR_EXEC_TEDDY_RES_OLD_2                                            \
    m128 res_old_1 = zeroes128();

#define FDR_EXEC_TEDDY_RES_OLD_3                                            \
    m128 res_old_1 = zeroes128();                                           \
    m128 res_old_2 = zeroes128();

#define FDR_EXEC_TEDDY_RES_OLD_4                                            \
    m128 res_old_1 = zeroes128();                                           \
    m128 res_old_2 = zeroes128();                                           \
    m128 res_old_3 = zeroes128();

#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n

#define PREP_CONF_FN_1(mask_base, val)                                      \
    prep_conf_teddy_m1(mask_base, val)

#define PREP_CONF_FN_2(mask_base, val)                                      \
    prep_conf_teddy_m2(mask_base, &res_old_1, val)

#define PREP_CONF_FN_3(mask_base, val)                                      \
    prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)

#define PREP_CONF_FN_4(mask_base, val)                                      \
    prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)

#define PREP_CONF_FN(mask_base, val, n)                                     \
    PREP_CONF_FN_##n(mask_base, val)

#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                     \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 32;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m128 *maskBase = getMaskBase(teddy);                              \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    FDR_EXEC_TEDDY_RES_OLD(n_msk);                                          \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 16;                                               \
        m128 p_mask;                                                        \
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,         \
                                     a->buf, buf_end,                       \
                                     a->buf_history, a->len_history, n_msk);\
        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                    \
        r_0 = or128(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);             \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);             \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk);        \
        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn);                   \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);             \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 16 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m128 p_mask;                                                        \
        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,         \
                                     a->buf_history, a->len_history, n_msk);\
        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                    \
        r_0 = or128(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#endif // HAVE_AVX2 HAVE_AVX512

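/*
 * Exported entry points, one per mask count (1-4), each in a plain and a
 * packed (_pck) flavour; all of them expand the shared skeleton with
 * do_confWithBit_teddy as the confirm function.
 */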
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}