/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: SSSE3/AVX2/AVX512 engine runtime.
 */

#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/simd_utils.h"

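/*
 * Byte masks used for partial (cautious) vector loads near buffer
 * boundaries, consumed by the vectoredLoad*() helpers in
 * teddy_runtime_common.h. Teddy's shuffle results use set bits to mean
 * "no match", so OR-ing a mask byte of 0xff into a lane suppresses any
 * match at that position, while 0x00 leaves it untouched. Row n holds n
 * zero bytes starting at offset 16; an unaligned load taken at the right
 * offset within a row therefore yields a window with exactly n valid
 * positions.
 */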
const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};

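/*
 * Confirm one 64-bit (resp. 32-bit) slice of the match vector. Bits are set
 * where no match is possible, so a slice equal to all-ones needs no further
 * work; otherwise invert it so that set bits mark candidates and hand it to
 * the confirm function, which verifies the literal and fires the match
 * callback. If the callback asks to stop matching,
 * CHECK_HWLM_TERMINATE_MATCHING returns early.
 */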
#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn)                  \
do {                                                                        \
    if (unlikely(chunk != ones_u64a)) {                                     \
        chunk = ~chunk;                                                     \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
                &control, &last_match);                                     \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while(0)

#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn)                  \
do {                                                                        \
    if (unlikely(chunk != ones_u32)) {                                      \
        chunk = ~chunk;                                                     \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
                &control, &last_match);                                     \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while(0)

#if defined(HAVE_AVX512) // AVX512 reinforced teddy

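/*
 * Break the 512-bit match vector into word-sized chunks (64-bit on 64-bit
 * targets, 32-bit otherwise) and run confirmation on each chunk at its byte
 * offset within the 64-byte block.
 */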
#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff512(var, ones512()))) {                                \
        m128 p128_0 = extract128from512(var, 0);                            \
        m128 p128_1 = extract128from512(var, 1);                            \
        m128 p128_2 = extract128from512(var, 2);                            \
        m128 p128_3 = extract128from512(var, 3);                            \
        u64a part1 = movq(p128_0);                                          \
        u64a part2 = movq(rshiftbyte_m128(p128_0, 8));                      \
        u64a part3 = movq(p128_1);                                          \
        u64a part4 = movq(rshiftbyte_m128(p128_1, 8));                      \
        u64a part5 = movq(p128_2);                                          \
        u64a part6 = movq(rshiftbyte_m128(p128_2, 8));                      \
        u64a part7 = movq(p128_3);                                          \
        u64a part8 = movq(rshiftbyte_m128(p128_3, 8));                      \
        CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn);         \
        CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn);         \
        CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn);         \
        CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn);         \
        CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn);         \
    }                                                                       \
} while(0)
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff512(var, ones512()))) {                                \
        m128 p128_0 = extract128from512(var, 0);                            \
        m128 p128_1 = extract128from512(var, 1);                            \
        m128 p128_2 = extract128from512(var, 2);                            \
        m128 p128_3 = extract128from512(var, 3);                            \
        u32 part1 = movd(p128_0);                                           \
        u32 part2 = movd(rshiftbyte_m128(p128_0, 4));                       \
        u32 part3 = movd(rshiftbyte_m128(p128_0, 8));                       \
        u32 part4 = movd(rshiftbyte_m128(p128_0, 12));                      \
        u32 part5 = movd(p128_1);                                           \
        u32 part6 = movd(rshiftbyte_m128(p128_1, 4));                       \
        u32 part7 = movd(rshiftbyte_m128(p128_1, 8));                       \
        u32 part8 = movd(rshiftbyte_m128(p128_1, 12));                      \
        u32 part9 = movd(p128_2);                                           \
        u32 part10 = movd(rshiftbyte_m128(p128_2, 4));                      \
        u32 part11 = movd(rshiftbyte_m128(p128_2, 8));                      \
        u32 part12 = movd(rshiftbyte_m128(p128_2, 12));                     \
        u32 part13 = movd(p128_3);                                          \
        u32 part14 = movd(rshiftbyte_m128(p128_3, 4));                      \
        u32 part15 = movd(rshiftbyte_m128(p128_3, 8));                      \
        u32 part16 = movd(rshiftbyte_m128(p128_3, 12));                     \
        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
        CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn);         \
        CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn);         \
        CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn);         \
        CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn);         \
        CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn);        \
        CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn);        \
        CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn);        \
        CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn);        \
        CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn);        \
        CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn);        \
        CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn);        \
    }                                                                       \
} while(0)
#endif

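/*
 * Split each input byte into its low and high nibbles, which index the
 * per-bucket PSHUFB tables. The full PREP_SHUF_MASK additionally builds the
 * reinforced mask r_msk: each 16-byte lane is keyed on the byte immediately
 * preceding it, with c_0 carrying the final byte of the previous 64-byte
 * block across iterations (initially 0x100, the "no previous byte" entry).
 */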
#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
    m512 lo = and512(val, *lo_mask);                                        \
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)

#define PREP_SHUF_MASK                                                      \
    PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr));                          \
    *c_16 = *(ptr + 15);                                                    \
    *c_32 = *(ptr + 31);                                                    \
    *c_48 = *(ptr + 47);                                                    \
    m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\
                           0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\
    *c_0 = *(ptr + 63)

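/*
 * OR together the shuffle results for up to four masks. The result for mask
 * k is shifted left by k bytes within each 128-bit lane so that all masks
 * line up on the byte where a candidate literal would end.
 */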
#define SHIFT_OR_M1                                                         \
    or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))

#define SHIFT_OR_M2                                                         \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo),                \
                               pshufb_m512(dup_mask[3], hi)),               \
                         1), SHIFT_OR_M1)

#define SHIFT_OR_M3                                                         \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo),                \
                               pshufb_m512(dup_mask[5], hi)),               \
                         2), SHIFT_OR_M2)

#define SHIFT_OR_M4                                                         \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo),                \
                               pshufb_m512(dup_mask[7], hi)),               \
                         3), SHIFT_OR_M3)

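/*
 * Two families of preparation routines: the no_reinforcement variants take
 * a value already fetched by a cautious vectored load (used at buffer
 * edges), while the plain variants load directly from ptr and OR in the
 * reinforced mask.
 */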
static really_inline
m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M1;
}

static really_inline
m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M2;
}

static really_inline
m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M3;
}

static really_inline
m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask,
                                         const m512 *dup_mask,
                                         const m512 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M4;
}

static really_inline
m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M1, r_msk);
}

static really_inline
m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M2, r_msk);
}

static really_inline
m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M3, r_msk);
}

static really_inline
m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    PREP_SHUF_MASK;
    return or512(SHIFT_OR_M4, r_msk);
}

#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                               \
    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)

#define PREP_CONF_FN(ptr, n)                                                \
    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base,               \
                         &c_0, &c_16, &c_32, &c_48)

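/*
 * Broadcast each 128-bit nibble table across all four lanes of a 512-bit
 * register: two tables (low and high nibble) per Teddy mask.
 */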
#define PREPARE_MASKS_1                                                     \
    dup_mask[0] = set4x128(maskBase[0]);                                    \
    dup_mask[1] = set4x128(maskBase[1]);

#define PREPARE_MASKS_2                                                     \
    PREPARE_MASKS_1                                                         \
    dup_mask[2] = set4x128(maskBase[2]);                                    \
    dup_mask[3] = set4x128(maskBase[3]);

#define PREPARE_MASKS_3                                                     \
    PREPARE_MASKS_2                                                         \
    dup_mask[4] = set4x128(maskBase[4]);                                    \
    dup_mask[5] = set4x128(maskBase[5]);

#define PREPARE_MASKS_4                                                     \
    PREPARE_MASKS_3                                                         \
    dup_mask[6] = set4x128(maskBase[6]);                                    \
    dup_mask[7] = set4x128(maskBase[7]);

#define PREPARE_MASKS(n)                                                    \
    m512 lo_mask = set64x8(0xf);                                            \
    m512 dup_mask[n * 2];                                                   \
    PREPARE_MASKS_##n

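/*
 * Main scan skeleton: a cautious (VECTORING) head block to reach 64-byte
 * alignment, one aligned block, then the main loop taking 128 bytes per
 * iteration with prefetch and flood detection, one more aligned block, and
 * finally a cautious tail for whatever remains.
 */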
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                     \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 128;                                           \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m128 *maskBase = getMaskBase(teddy);                              \
    PREPARE_MASKS(n_msk);                                                   \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);           \
    u32 c_0 = 0x100;                                                        \
    u32 c_16 = 0x100;                                                       \
    u32 c_32 = 0x100;                                                       \
    u32 c_48 = 0x100;                                                       \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 64;                                               \
        m512 p_mask;                                                        \
        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,         \
                                     a->buf, buf_end,                       \
                                     a->buf_history, a->len_history, n_msk);\
        m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 64;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 64 <= buf_end) {                                              \
        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 64;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk);                           \
        CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn);                   \
    }                                                                       \
                                                                            \
    if (ptr + 64 <= buf_end) {                                              \
        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        ptr += 64;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 64 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m512 p_mask;                                                        \
        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,         \
                                     a->buf_history, a->len_history, n_msk);\
        m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy

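/*
 * The AVX2 engine mirrors the AVX512 path at 256-bit width: 64-byte
 * iterations, a single reinforcement boundary per block (c_128 keys the
 * upper 128-bit lane, c_0 the lower), and the same confirm chunking.
 */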
#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff256(var, ones256()))) {                                \
        m128 lo = movdq_lo(var);                                            \
        m128 hi = movdq_hi(var);                                            \
        u64a part1 = movq(lo);                                              \
        u64a part2 = movq(rshiftbyte_m128(lo, 8));                          \
        u64a part3 = movq(hi);                                              \
        u64a part4 = movq(rshiftbyte_m128(hi, 8));                          \
        CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn);         \
    }                                                                       \
} while(0)
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff256(var, ones256()))) {                                \
        m128 lo = movdq_lo(var);                                            \
        m128 hi = movdq_hi(var);                                            \
        u32 part1 = movd(lo);                                               \
        u32 part2 = movd(rshiftbyte_m128(lo, 4));                           \
        u32 part3 = movd(rshiftbyte_m128(lo, 8));                           \
        u32 part4 = movd(rshiftbyte_m128(lo, 12));                          \
        u32 part5 = movd(hi);                                               \
        u32 part6 = movd(rshiftbyte_m128(hi, 4));                           \
        u32 part7 = movd(rshiftbyte_m128(hi, 8));                           \
        u32 part8 = movd(rshiftbyte_m128(hi, 12));                          \
        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
        CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn);         \
        CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn);         \
        CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn);         \
        CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn);         \
    }                                                                       \
} while(0)
#endif

#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
    m256 lo = and256(val, *lo_mask);                                        \
    m256 hi = and256(rshift64_m256(val, 4), *lo_mask)

#define PREP_SHUF_MASK                                                      \
    PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr));                          \
    *c_128 = *(ptr + 15);                                                   \
    m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
    *c_0 = *(ptr + 31)

#define SHIFT_OR_M1                                                         \
    or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi))

#define SHIFT_OR_M2                                                         \
    or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo),                \
                               pshufb_m256(dup_mask[3], hi)),               \
                         1), SHIFT_OR_M1)

#define SHIFT_OR_M3                                                         \
    or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo),                \
                               pshufb_m256(dup_mask[5], hi)),               \
                         2), SHIFT_OR_M2)

#define SHIFT_OR_M4                                                         \
    or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo),                \
                               pshufb_m256(dup_mask[7], hi)),               \
                         3), SHIFT_OR_M3)

static really_inline
m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M1;
}

static really_inline
m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M2;
}

static really_inline
m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M3;
}

static really_inline
m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask,
                                         const m256 *dup_mask,
                                         const m256 val) {
    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
    return SHIFT_OR_M4;
}

static really_inline
m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M1, r_msk);
}

static really_inline
m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M2, r_msk);
}

static really_inline
m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M3, r_msk);
}

static really_inline
m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
                        const u8 *ptr, const u64a *r_msk_base,
                        u32 *c_0, u32 *c_128) {
    PREP_SHUF_MASK;
    return or256(SHIFT_OR_M4, r_msk);
}

#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                               \
    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)

#define PREP_CONF_FN(ptr, n)                                                \
    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)

#define PREPARE_MASKS_1                                                     \
    dup_mask[0] = set2x128(maskBase[0]);                                    \
    dup_mask[1] = set2x128(maskBase[1]);

#define PREPARE_MASKS_2                                                     \
    PREPARE_MASKS_1                                                         \
    dup_mask[2] = set2x128(maskBase[2]);                                    \
    dup_mask[3] = set2x128(maskBase[3]);

#define PREPARE_MASKS_3                                                     \
    PREPARE_MASKS_2                                                         \
    dup_mask[4] = set2x128(maskBase[4]);                                    \
    dup_mask[5] = set2x128(maskBase[5]);

#define PREPARE_MASKS_4                                                     \
    PREPARE_MASKS_3                                                         \
    dup_mask[6] = set2x128(maskBase[6]);                                    \
    dup_mask[7] = set2x128(maskBase[7]);

#define PREPARE_MASKS(n)                                                    \
    m256 lo_mask = set32x8(0xf);                                            \
    m256 dup_mask[n * 2];                                                   \
    PREPARE_MASKS_##n

#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                     \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 64;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m128 *maskBase = getMaskBase(teddy);                              \
    PREPARE_MASKS(n_msk);                                                   \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);           \
    u32 c_0 = 0x100;                                                        \
    u32 c_128 = 0x100;                                                      \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 32;                                               \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,         \
                                     a->buf, buf_end,                       \
                                     a->buf_history, a->len_history, n_msk);\
        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 32 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk);                           \
        CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn);                   \
    }                                                                       \
                                                                            \
    if (ptr + 32 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 32 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,         \
                                     a->buf_history, a->len_history, n_msk);\
        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);             \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#else // not defined HAVE_AVX2

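/*
 * The SSSE3 engine does not use reinforced masks. Instead, each mask's
 * shuffle result from the previous 16-byte block is kept in old_n state and
 * stitched to the current result with palignr, so literals spanning a block
 * boundary are still found.
 */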
#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff128(var, ones128()))) {                                \
        u64a lo = movq(var);                                                \
        u64a hi = movq(rshiftbyte_m128(var, 8));                            \
        CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn);                 \
        CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn);             \
    }                                                                       \
} while(0)
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(diff128(var, ones128()))) {                                \
        u32 part1 = movd(var);                                              \
        u32 part2 = movd(rshiftbyte_m128(var, 4));                          \
        u32 part3 = movd(rshiftbyte_m128(var, 8));                          \
        u32 part4 = movd(rshiftbyte_m128(var, 12));                         \
        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
    }                                                                       \
} while(0)
#endif

static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    return or128(pshufb_m128(maskBase[0 * 2], lo),
                 pshufb_m128(maskBase[0 * 2 + 1], hi));
}

static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r = prep_conf_teddy_m1(maskBase, val);

    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
                       pshufb_m128(maskBase[1 * 2 + 1], hi));
    m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1);
    *old_1 = res_1;
    return or128(r, res_shifted_1);
}

static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
                        m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r = prep_conf_teddy_m2(maskBase, old_1, val);

    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
                       pshufb_m128(maskBase[2 * 2 + 1], hi));
    m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2);
    *old_2 = res_2;
    return or128(r, res_shifted_2);
}

static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
                        m128 *old_3, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift64_m128(val, 4), mask);
    m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);

    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
                       pshufb_m128(maskBase[3 * 2 + 1], hi));
    m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3);
    *old_3 = res_3;
    return or128(r, res_shifted_3);
}

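/* Declare the cross-block carry state needed for n masks; mask 1 shifts
 * nothing, so it needs none. */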
#define FDR_EXEC_TEDDY_RES_OLD_1

#define FDR_EXEC_TEDDY_RES_OLD_2                                            \
    m128 res_old_1 = zeroes128();

#define FDR_EXEC_TEDDY_RES_OLD_3                                            \
    m128 res_old_1 = zeroes128();                                           \
    m128 res_old_2 = zeroes128();

#define FDR_EXEC_TEDDY_RES_OLD_4                                            \
    m128 res_old_1 = zeroes128();                                           \
    m128 res_old_2 = zeroes128();                                           \
    m128 res_old_3 = zeroes128();

#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n

#define PREP_CONF_FN_1(mask_base, val)                                      \
    prep_conf_teddy_m1(mask_base, val)

#define PREP_CONF_FN_2(mask_base, val)                                      \
    prep_conf_teddy_m2(mask_base, &res_old_1, val)

#define PREP_CONF_FN_3(mask_base, val)                                      \
    prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)

#define PREP_CONF_FN_4(mask_base, val)                                      \
    prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)

#define PREP_CONF_FN(mask_base, val, n)                                     \
    PREP_CONF_FN_##n(mask_base, val)

#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                     \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 32;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m128 *maskBase = getMaskBase(teddy);                              \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    FDR_EXEC_TEDDY_RES_OLD(n_msk);                                          \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 16;                                               \
        m128 p_mask;                                                        \
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,         \
                                     a->buf, buf_end,                       \
                                     a->buf_history, a->len_history, n_msk);\
        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                    \
        r_0 = or128(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);             \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);             \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk);        \
        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn);                   \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);             \
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                    \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 16 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m128 p_mask;                                                        \
        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,         \
                                     a->buf_history, a->len_history, n_msk);\
        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                    \
        r_0 = or128(r_0, p_mask);                                           \
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                       \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#endif // HAVE_AVX2 HAVE_AVX512

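/*
 * Exported entry points: one per mask count (1-4), each with a _pck variant
 * for packed confirm layouts. All of them expand the same FDR_EXEC_TEDDY
 * skeleton with do_confWithBit_teddy as the confirm function.
 */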
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}