1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "fdr.h" |
30 | #include "fdr_confirm.h" |
31 | #include "fdr_confirm_runtime.h" |
32 | #include "fdr_internal.h" |
33 | #include "fdr_loadval.h" |
34 | #include "flood_runtime.h" |
35 | #include "scratch.h" |
36 | #include "teddy.h" |
37 | #include "teddy_internal.h" |
38 | #include "util/arch.h" |
39 | #include "util/simd_utils.h" |
40 | #include "util/uniform_ops.h" |
41 | |
42 | /** \brief number of bytes processed in each iteration */ |
43 | #define ITER_BYTES 16 |
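/* note: ITER_BYTES must equal sizeof(m128); this is asserted in
 * createStartZone(). */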
44 | |
45 | /** \brief total zone buffer size */ |
46 | #define ZONE_TOTAL_SIZE 64 |
47 | |
48 | /** \brief maximum number of allowed zones */ |
49 | #define ZONE_MAX 3 |
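/* A scan is split into at most ZONE_MAX zones: a start zone, an optional main
 * zone and an end zone; short scans use a single short zone instead (see
 * prepareZones()). */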
50 | |
51 | /** \brief zone information. |
52 | * |
53 | * Zone represents a region of data to scan in FDR. |
54 | * |
 * The incoming buffer is split into multiple zones to ensure two properties:
 * 1: that we can safely read 8 bytes behind the current position to generate
 *    a conf hash
 * 2: that we can read the 3 bytes after the current byte (domain > 8)
58 | */ |
59 | struct zone { |
60 | /** \brief copied buffer, used only when it is a boundary zone. */ |
61 | u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; |
62 | |
    /** \brief shift amount for the FDR state to avoid unwanted matches. */
64 | u8 shift; |
65 | |
    /** \brief if boundary zone, start points into the zone buffer after the
     * pre-padding. Otherwise, it points into the main buffer. */
68 | const u8 *start; |
69 | |
    /** \brief if boundary zone, end points to the end of the zone buffer.
     * Otherwise, it points into the main buffer. */
72 | const u8 *end; |
73 | |
74 | /** \brief the amount to adjust to go from a pointer in the zones region |
75 | * (between start and end) to a pointer in the original data buffer. */ |
76 | ptrdiff_t zone_pointer_adjust; |
77 | |
    /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
     * otherwise the end of the zone buf. floodPtr always points inside the
     * same buffer as the start pointer. */
81 | const u8 *floodPtr; |
82 | }; |
83 | |
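/* zone_or_mask[n] (for the shift values actually used, 0..15) has its first n
 * bytes set to 0xff. ORing it into the shifted state suppresses confirms in
 * the first n bytes of a zone, which hold already-scanned data or garbage;
 * see the use of zone_or_mask[shift] in fdr_engine_exec(). */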
84 | static |
85 | const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { |
86 | { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
87 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
88 | { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
89 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
90 | { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
91 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
92 | { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, |
93 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
94 | { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, |
95 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
96 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, |
97 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
98 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, |
99 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
100 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, |
101 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
102 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
103 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
104 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
105 | 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
106 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
107 | 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
108 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
109 | 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, |
110 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
111 | 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, |
112 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
113 | 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, |
114 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
115 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, |
116 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
117 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, |
118 | { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
119 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } |
120 | }; |
121 | |
122 | /* compilers don't reliably synthesize the 32-bit ANDN instruction here, |
123 | * so we force its generation. |
124 | */ |
125 | static really_inline |
126 | u64a andn(const u32 a, const u8 *b) { |
127 | u64a r; |
128 | #if defined(HAVE_BMI) && !defined(NO_ASM) |
129 | __asm__ ("andn\t%2,%1,%k0" : "=r" (r) : "r" (a), "m" (*(const u32 *)b)); |
130 | #else |
131 | r = unaligned_load_u32(b) & ~a; |
132 | #endif |
133 | return r; |
134 | } |
135 | |
/* Generates an initial state mask based on the last byte or two of history
 * rather than being all-accepting. If there is no history to consider, the
 * state is generated based on the minimum length of each bucket in order to
 * prevent unwanted confirms.
140 | */ |
141 | static really_inline |
142 | m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, |
143 | const struct zone *z) { |
144 | m128 s; |
145 | if (len_history) { |
146 | /* +1: the zones ensure that we can read the byte at z->end */ |
147 | u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); |
148 | tmp &= fdr->domainMask; |
149 | s = load_m128_from_u64a(ft + tmp); |
150 | s = rshiftbyte_m128(s, 1); |
151 | } else { |
152 | s = fdr->start; |
153 | } |
154 | return s; |
155 | } |
156 | |
157 | static really_inline |
158 | void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, |
159 | UNUSED const u8 *end_ptr, u32 domain_mask_flipped, |
160 | const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { |
161 | /* +1: the zones ensure that we can read the byte at z->end */ |
162 | assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); |
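    /* For each of the 16 positions in this block: reach_i is the
     * domain-masked value at itPtr + i, and ft[reach_i] is an 8-byte bucket
     * mask for that value. Each mask is shifted left into its byte lane
     * (position mod 8) and ORed into the running state; after each half of
     * the block the low 64 bits are extracted and inverted to form a confirm
     * word, in which a set bit marks a candidate for do_confirm_fdr(). */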
163 | u64a reach0 = andn(domain_mask_flipped, itPtr); |
164 | u64a reach1 = andn(domain_mask_flipped, itPtr + 1); |
165 | u64a reach2 = andn(domain_mask_flipped, itPtr + 2); |
166 | u64a reach3 = andn(domain_mask_flipped, itPtr + 3); |
167 | |
168 | m128 st0 = load_m128_from_u64a(ft + reach0); |
169 | m128 st1 = load_m128_from_u64a(ft + reach1); |
170 | m128 st2 = load_m128_from_u64a(ft + reach2); |
171 | m128 st3 = load_m128_from_u64a(ft + reach3); |
172 | |
173 | u64a reach4 = andn(domain_mask_flipped, itPtr + 4); |
174 | u64a reach5 = andn(domain_mask_flipped, itPtr + 5); |
175 | u64a reach6 = andn(domain_mask_flipped, itPtr + 6); |
176 | u64a reach7 = andn(domain_mask_flipped, itPtr + 7); |
177 | |
178 | m128 st4 = load_m128_from_u64a(ft + reach4); |
179 | m128 st5 = load_m128_from_u64a(ft + reach5); |
180 | m128 st6 = load_m128_from_u64a(ft + reach6); |
181 | m128 st7 = load_m128_from_u64a(ft + reach7); |
182 | |
183 | st1 = lshiftbyte_m128(st1, 1); |
184 | st2 = lshiftbyte_m128(st2, 2); |
185 | st3 = lshiftbyte_m128(st3, 3); |
186 | st4 = lshiftbyte_m128(st4, 4); |
187 | st5 = lshiftbyte_m128(st5, 5); |
188 | st6 = lshiftbyte_m128(st6, 6); |
189 | st7 = lshiftbyte_m128(st7, 7); |
190 | |
191 | st0 = or128(st0, st1); |
192 | st2 = or128(st2, st3); |
193 | st4 = or128(st4, st5); |
194 | st6 = or128(st6, st7); |
195 | st0 = or128(st0, st2); |
196 | st4 = or128(st4, st6); |
197 | st0 = or128(st0, st4); |
198 | *s = or128(*s, st0); |
199 | |
200 | *conf0 = movq(*s); |
201 | *s = rshiftbyte_m128(*s, 8); |
202 | *conf0 ^= ~0ULL; |
203 | |
204 | u64a reach8 = andn(domain_mask_flipped, itPtr + 8); |
205 | u64a reach9 = andn(domain_mask_flipped, itPtr + 9); |
206 | u64a reach10 = andn(domain_mask_flipped, itPtr + 10); |
207 | u64a reach11 = andn(domain_mask_flipped, itPtr + 11); |
208 | |
209 | m128 st8 = load_m128_from_u64a(ft + reach8); |
210 | m128 st9 = load_m128_from_u64a(ft + reach9); |
211 | m128 st10 = load_m128_from_u64a(ft + reach10); |
212 | m128 st11 = load_m128_from_u64a(ft + reach11); |
213 | |
214 | u64a reach12 = andn(domain_mask_flipped, itPtr + 12); |
215 | u64a reach13 = andn(domain_mask_flipped, itPtr + 13); |
216 | u64a reach14 = andn(domain_mask_flipped, itPtr + 14); |
217 | u64a reach15 = andn(domain_mask_flipped, itPtr + 15); |
218 | |
219 | m128 st12 = load_m128_from_u64a(ft + reach12); |
220 | m128 st13 = load_m128_from_u64a(ft + reach13); |
221 | m128 st14 = load_m128_from_u64a(ft + reach14); |
222 | m128 st15 = load_m128_from_u64a(ft + reach15); |
223 | |
224 | st9 = lshiftbyte_m128(st9, 1); |
225 | st10 = lshiftbyte_m128(st10, 2); |
226 | st11 = lshiftbyte_m128(st11, 3); |
227 | st12 = lshiftbyte_m128(st12, 4); |
228 | st13 = lshiftbyte_m128(st13, 5); |
229 | st14 = lshiftbyte_m128(st14, 6); |
230 | st15 = lshiftbyte_m128(st15, 7); |
231 | |
232 | st8 = or128(st8, st9); |
233 | st10 = or128(st10, st11); |
234 | st12 = or128(st12, st13); |
235 | st14 = or128(st14, st15); |
236 | st8 = or128(st8, st10); |
237 | st12 = or128(st12, st14); |
238 | st8 = or128(st8, st12); |
239 | *s = or128(*s, st8); |
240 | |
241 | *conf8 = movq(*s); |
242 | *s = rshiftbyte_m128(*s, 8); |
243 | *conf8 ^= ~0ULL; |
244 | } |
245 | |
246 | static really_inline |
247 | void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, |
248 | UNUSED const u8 *end_ptr, u32 domain_mask_flipped, |
249 | const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { |
250 | assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); |
251 | u64a reach0 = andn(domain_mask_flipped, itPtr); |
252 | u64a reach2 = andn(domain_mask_flipped, itPtr + 2); |
253 | u64a reach4 = andn(domain_mask_flipped, itPtr + 4); |
254 | u64a reach6 = andn(domain_mask_flipped, itPtr + 6); |
255 | |
256 | m128 st0 = load_m128_from_u64a(ft + reach0); |
257 | m128 st2 = load_m128_from_u64a(ft + reach2); |
258 | m128 st4 = load_m128_from_u64a(ft + reach4); |
259 | m128 st6 = load_m128_from_u64a(ft + reach6); |
260 | |
261 | u64a reach8 = andn(domain_mask_flipped, itPtr + 8); |
262 | u64a reach10 = andn(domain_mask_flipped, itPtr + 10); |
263 | u64a reach12 = andn(domain_mask_flipped, itPtr + 12); |
264 | u64a reach14 = andn(domain_mask_flipped, itPtr + 14); |
265 | |
266 | m128 st8 = load_m128_from_u64a(ft + reach8); |
267 | m128 st10 = load_m128_from_u64a(ft + reach10); |
268 | m128 st12 = load_m128_from_u64a(ft + reach12); |
269 | m128 st14 = load_m128_from_u64a(ft + reach14); |
270 | |
271 | st2 = lshiftbyte_m128(st2, 2); |
272 | st4 = lshiftbyte_m128(st4, 4); |
273 | st6 = lshiftbyte_m128(st6, 6); |
274 | |
275 | *s = or128(*s, st0); |
276 | *s = or128(*s, st2); |
277 | *s = or128(*s, st4); |
278 | *s = or128(*s, st6); |
279 | |
280 | *conf0 = movq(*s); |
281 | *s = rshiftbyte_m128(*s, 8); |
282 | *conf0 ^= ~0ULL; |
283 | |
284 | st10 = lshiftbyte_m128(st10, 2); |
285 | st12 = lshiftbyte_m128(st12, 4); |
286 | st14 = lshiftbyte_m128(st14, 6); |
287 | |
288 | *s = or128(*s, st8); |
289 | *s = or128(*s, st10); |
290 | *s = or128(*s, st12); |
291 | *s = or128(*s, st14); |
292 | |
293 | *conf8 = movq(*s); |
294 | *s = rshiftbyte_m128(*s, 8); |
295 | *conf8 ^= ~0ULL; |
296 | } |
297 | |
298 | static really_inline |
299 | void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, |
300 | UNUSED const u8 *end_ptr, u32 domain_mask_flipped, |
301 | const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { |
302 | assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); |
303 | u64a reach0 = andn(domain_mask_flipped, itPtr); |
304 | u64a reach4 = andn(domain_mask_flipped, itPtr + 4); |
305 | u64a reach8 = andn(domain_mask_flipped, itPtr + 8); |
306 | u64a reach12 = andn(domain_mask_flipped, itPtr + 12); |
307 | |
308 | m128 st0 = load_m128_from_u64a(ft + reach0); |
309 | m128 st4 = load_m128_from_u64a(ft + reach4); |
310 | m128 st8 = load_m128_from_u64a(ft + reach8); |
311 | m128 st12 = load_m128_from_u64a(ft + reach12); |
312 | |
313 | st4 = lshiftbyte_m128(st4, 4); |
314 | st12 = lshiftbyte_m128(st12, 4); |
315 | |
316 | *s = or128(*s, st0); |
317 | *s = or128(*s, st4); |
318 | *conf0 = movq(*s); |
319 | *s = rshiftbyte_m128(*s, 8); |
320 | *conf0 ^= ~0ULL; |
321 | |
322 | *s = or128(*s, st8); |
323 | *s = or128(*s, st12); |
324 | *conf8 = movq(*s); |
325 | *s = rshiftbyte_m128(*s, 8); |
326 | *conf8 ^= ~0ULL; |
327 | } |
328 | |
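/* Run confirmation on a confirm word produced by a get_conf_* function: each
 * set bit encodes a (byte offset within the block, bucket) candidate whose
 * literals are verified against the actual data via the FDRConfirm structures
 * reachable from confBase. */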
329 | static really_inline |
330 | void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, |
331 | const u32 *confBase, const struct FDR_Runtime_Args *a, |
332 | const u8 *ptr, u32 *last_match_id, struct zone *z) { |
333 | const u8 bucket = 8; |
334 | |
335 | if (likely(!*conf)) { |
336 | return; |
337 | } |
338 | |
    /* ptr currently refers to a location in the zone's buffer; we also need a
     * pointer into the original (main) buffer for the final string compare.
341 | */ |
342 | const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); |
343 | |
344 | const u8 *confLoc = ptr; |
345 | |
346 | do { |
347 | u32 bit = findAndClearLSB_64(conf); |
348 | u32 byte = bit / bucket + offset; |
349 | u32 bitRem = bit % bucket; |
350 | u32 idx = bitRem; |
351 | u32 cf = confBase[idx]; |
352 | if (!cf) { |
353 | continue; |
354 | } |
355 | const struct FDRConfirm *fdrc = (const struct FDRConfirm *) |
356 | ((const u8 *)confBase + cf); |
357 | if (!(fdrc->groups & *control)) { |
358 | continue; |
359 | } |
360 | u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); |
361 | confWithBit(fdrc, a, ptr_main - a->buf + byte, control, |
362 | last_match_id, confVal, conf, bit); |
363 | } while (unlikely(!!*conf)); |
364 | } |
365 | |
366 | static really_inline |
367 | void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) { |
368 | #ifdef DEBUG |
369 | DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n" , zone_id, z->buf); |
370 | DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n" , |
371 | z->start, z->end, z->shift); |
372 | DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n" , |
373 | z->zone_pointer_adjust, z->floodPtr); |
374 | DEBUG_PRINTF("zone buf:" ); |
375 | for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) { |
376 | if (i % 8 == 0) { |
377 | printf("_" ); |
378 | } |
379 | if (z->buf[i]) { |
380 | printf("%02x" , z->buf[i]); |
381 | } else { |
382 | printf(".." ); |
383 | } |
384 | } |
385 | printf("\n" ); |
386 | #endif |
387 | }; |
388 | |
389 | /** |
 * \brief Updates attributes for the non-boundary (main) zone.
391 | */ |
392 | static really_inline |
393 | void createMainZone(const u8 *flood, const u8 *begin, const u8 *end, |
394 | struct zone *z) { |
395 | z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */ |
396 | z->start = begin; |
397 | z->end = end; |
398 | z->floodPtr = flood; |
399 | z->shift = 0; |
400 | } |
401 | |
402 | /** |
403 | * \brief Create zone for short cases (<= ITER_BYTES). |
404 | * |
405 | * For this case we need to copy everything into the zone's internal buffer. |
406 | * |
407 | * We need to ensure that we run over real data if it exists (in history or |
 * before zone begin). We also need to ensure that the 8 bytes before any data
 * being matched can be read (to perform a conf hash).
410 | * |
411 | * We also need to ensure that the data at z->end can be read. |
412 | * |
413 | * Hence, the zone consists of: |
414 | * 16 bytes of history, |
 *     1 - 24 bytes of data from the buffer (ending at end),
416 | * 1 byte of final padding |
417 | */ |
418 | static really_inline |
419 | void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin, |
420 | const u8 *end, struct zone *z) { |
    /* the floodPtr for boundary zones is set to the end of the zone buf (its
     * maximum) so that the flood checks are never triggered inside the zone. */
423 | z->floodPtr = z->buf + ZONE_TOTAL_SIZE; |
424 | |
425 | ptrdiff_t z_len = end - begin; |
426 | assert(z_len > 0); |
427 | assert(z_len <= ITER_BYTES); |
428 | |
429 | z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */ |
430 | |
431 | static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */ |
432 | |
433 | /* we are guaranteed to always have 16 initialised bytes at the end of |
434 | * the history buffer (they may be garbage coming from the stream state |
435 | * preceding hbuf, but bytes that don't correspond to actual history |
436 | * shouldn't affect computations). */ |
437 | *(m128 *)z->buf = loadu128(hend - sizeof(m128)); |
438 | |
439 | /* The amount of data we have to copy from main buffer. */ |
440 | size_t copy_len = MIN((size_t)(end - buf), |
441 | ITER_BYTES + sizeof(CONF_TYPE)); |
442 | |
443 | u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET; |
444 | switch (copy_len) { |
445 | case 1: |
446 | *zone_data = *(end - 1); |
447 | break; |
448 | case 2: |
449 | *(u16 *)zone_data = unaligned_load_u16(end - 2); |
450 | break; |
451 | case 3: |
452 | *(u16 *)zone_data = unaligned_load_u16(end - 3); |
453 | *(zone_data + 2) = *(end - 1); |
454 | break; |
455 | case 4: |
456 | *(u32 *)zone_data = unaligned_load_u32(end - 4); |
457 | break; |
458 | case 5: |
459 | case 6: |
460 | case 7: |
461 | /* perform copy with 2 overlapping 4-byte chunks from buf. */ |
462 | *(u32 *)zone_data = unaligned_load_u32(end - copy_len); |
463 | unaligned_store_u32(zone_data + copy_len - sizeof(u32), |
464 | unaligned_load_u32(end - sizeof(u32))); |
465 | break; |
466 | case 8: |
467 | *(u64a *)zone_data = unaligned_load_u64a(end - 8); |
468 | break; |
469 | case 9: |
470 | case 10: |
471 | case 11: |
472 | case 12: |
473 | case 13: |
474 | case 14: |
475 | case 15: |
476 | /* perform copy with 2 overlapping 8-byte chunks from buf. */ |
477 | *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); |
478 | unaligned_store_u64a(zone_data + copy_len - sizeof(u64a), |
479 | unaligned_load_u64a(end - sizeof(u64a))); |
480 | break; |
481 | case 16: |
482 | /* copy 16-bytes from buf. */ |
483 | *(m128 *)zone_data = loadu128(end - 16); |
484 | break; |
485 | default: |
486 | assert(copy_len <= sizeof(m128) + sizeof(u64a)); |
487 | |
488 | /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks. |
489 | */ |
490 | *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); |
491 | storeu128(zone_data + copy_len - sizeof(m128), |
492 | loadu128(end - sizeof(m128))); |
493 | break; |
494 | } |
495 | |
496 | /* set the start and end location of the zone buf |
497 | * to be scanned */ |
498 | u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len; |
499 | assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES); |
500 | |
    /* write the post-padding byte; this is required for domain > 8 due to
     * overhang */
503 | assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64); |
504 | *z_end = 0; |
505 | |
506 | z->end = z_end; |
507 | z->start = z_end - ITER_BYTES; |
508 | z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); |
509 | assert(z->start + z->shift == z_end - z_len); |
510 | } |
511 | |
512 | /** |
513 | * \brief Create a zone for the start region. |
514 | * |
515 | * This function requires that there is > ITER_BYTES of data in the buffer to |
516 | * scan. The start zone itself is always responsible for scanning exactly |
517 | * ITER_BYTES of data - there are no warmup/junk bytes scanned. |
518 | * |
519 | * This zone ensures that the byte at z->end can be read and corresponds to |
520 | * the next byte of data. |
521 | * |
522 | * 8 bytes of history data are provided before z->start to allow proper hash |
 * generation in streaming mode. If buf != begin, up to 8 bytes of data
 * prior to begin are also provided.
525 | * |
526 | * Although we are not interested in bare literals which start before begin |
527 | * if buf != begin, lookarounds associated with the literal may require |
528 | * the data prior to begin for hash purposes. |
529 | */ |
530 | static really_inline |
531 | void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin, |
532 | struct zone *z) { |
533 | assert(ITER_BYTES == sizeof(m128)); |
534 | assert(sizeof(CONF_TYPE) == 8); |
535 | static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE); |
536 | |
537 | const u8 *end = begin + ITER_BYTES; |
538 | |
539 | /* set floodPtr to the end of zone buf to avoid checks in start zone */ |
540 | z->floodPtr = z->buf + ZONE_TOTAL_SIZE; |
541 | |
542 | z->shift = 0; /* we are processing ITER_BYTES of real data */ |
543 | |
544 | /* we are guaranteed to always have 16 initialised bytes at the end of the |
545 | * history buffer (they may be garbage coming from the stream state |
546 | * preceding hbuf, but bytes that don't correspond to actual history |
547 | * shouldn't affect computations). However, for start zones, history is only |
548 | * required for conf hash purposes so we only need 8 bytes */ |
549 | unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a))); |
550 | |
551 | /* The amount of data we have to copy from main buffer. */ |
552 | size_t copy_len = MIN((size_t)(end - buf), |
553 | ITER_BYTES + sizeof(CONF_TYPE)); |
554 | assert(copy_len >= 16); |
555 | |
    /* copy the post-padding byte; this is required for domain > 8 due to
     * overhang. The start zone requires that there is data after it, so it
     * is safe to dereference end */
559 | z->buf[ZONE_START_BEGIN + copy_len] = *end; |
560 | |
561 | /* set the start and end location of the zone buf to be scanned */ |
562 | u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len; |
563 | z->end = z_end; |
564 | z->start = z_end - ITER_BYTES; |
565 | |
566 | /* copy the first 8 bytes of the valid region */ |
567 | unaligned_store_u64a(z->buf + ZONE_START_BEGIN, |
568 | unaligned_load_u64a(end - copy_len)); |
569 | |
570 | /* copy the last 16 bytes, may overlap with the previous 8 byte write */ |
571 | storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); |
572 | |
573 | z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); |
574 | |
575 | assert(ZONE_START_BEGIN + copy_len + 3 < 64); |
576 | } |
577 | |
578 | /** |
579 | * \brief Create a zone for the end region. |
580 | * |
 * This function requires that there is > ITER_BYTES of data in the buffer to
 * scan. The end zone is responsible for scanning the <= ITER_BYTES rump of
 * data plus an optional additional ITER_BYTES. The main zone cannot handle
 * the last 3 bytes of the buffer, so the end zone must take over a full
 * ITER_BYTES from the main zone when fewer than 3 bytes would otherwise be
 * left to scan; the main zone size is reduced by ITER_BYTES in this case.
587 | * |
588 | * This zone ensures that the byte at z->end can be read by filling it with a |
589 | * padding character. |
590 | * |
 * Up to 8 bytes of data prior to begin are also provided for the purposes of
 * generating hashes. History is not copied, as all locations which require
 * history for generating a hash are the responsibility of the start zone.
594 | */ |
595 | static really_inline |
596 | void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, |
597 | struct zone *z) { |
    /* the floodPtr for boundary zones is set to the end of the zone buf (its
     * maximum) so that the flood checks are never triggered inside the zone. */
600 | z->floodPtr = z->buf + ZONE_TOTAL_SIZE; |
601 | |
602 | ptrdiff_t z_len = end - begin; |
603 | assert(z_len > 0); |
604 | size_t iter_bytes_second = 0; |
605 | size_t z_len_first = z_len; |
606 | if (z_len > ITER_BYTES) { |
607 | z_len_first = z_len - ITER_BYTES; |
608 | iter_bytes_second = ITER_BYTES; |
609 | } |
610 | z->shift = ITER_BYTES - z_len_first; |
611 | |
612 | const u8 *end_first = end - iter_bytes_second; |
613 | /* The amount of data we have to copy from main buffer for the |
614 | * first iteration. */ |
615 | size_t copy_len_first = MIN((size_t)(end_first - buf), |
616 | ITER_BYTES + sizeof(CONF_TYPE)); |
617 | assert(copy_len_first >= 16); |
618 | |
619 | size_t total_copy_len = copy_len_first + iter_bytes_second; |
620 | assert(total_copy_len + 3 < 64); |
621 | |
    /* write the post-padding byte; this is required for domain > 8 due to
     * overhang */
624 | z->buf[total_copy_len] = 0; |
625 | |
626 | /* set the start and end location of the zone buf |
627 | * to be scanned */ |
628 | u8 *z_end = z->buf + total_copy_len; |
629 | z->end = z_end; |
630 | z->start = z_end - ITER_BYTES - iter_bytes_second; |
631 | assert(z->start + z->shift == z_end - z_len); |
632 | |
633 | u8 *z_end_first = z_end - iter_bytes_second; |
634 | /* copy the first 8 bytes of the valid region */ |
635 | unaligned_store_u64a(z->buf, |
636 | unaligned_load_u64a(end_first - copy_len_first)); |
637 | |
638 | /* copy the last 16 bytes, may overlap with the previous 8 byte write */ |
639 | storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); |
640 | if (iter_bytes_second) { |
641 | storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); |
642 | } |
643 | |
644 | z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); |
645 | } |
646 | |
647 | /** |
648 | * \brief Prepare zones. |
649 | * |
 * This function prepares the zones from the actual buffer plus some padding
 * bytes. The ITER_BYTES of real data in each zone are preceded by bytes from
 * the main buf and/or the history buf, and followed by padding bytes,
 * possibly taken from the main buf if available.
654 | */ |
655 | static really_inline |
656 | size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, |
657 | size_t start, const u8 *flood, struct zone *zoneArr) { |
658 | const u8 *ptr = buf + start; |
659 | size_t remaining = len - start; |
660 | |
661 | if (remaining <= ITER_BYTES) { |
662 | /* enough bytes to make only one zone */ |
663 | createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); |
664 | return 1; |
665 | } |
666 | |
667 | /* enough bytes to make more than one zone */ |
668 | |
669 | size_t numZone = 0; |
670 | createStartZone(buf, hend, ptr, &zoneArr[numZone++]); |
671 | ptr += ITER_BYTES; |
672 | |
673 | assert(ptr < buf + len); |
674 | |
    /* find the maximum buffer location that the main zone can scan
     * - must be a multiple of ITER_BYTES, and
     * - cannot contain the last 3 bytes (the FDR main loop reads up to 3
     *   bytes past the end of each ITER_BYTES block, which must not run
     *   past the end of the buffer)
     */
680 | const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES); |
681 | |
    /* create a main zone if at least one full multiple of ITER_BYTES is
     * available */
683 | if (main_end > ptr) { |
684 | createMainZone(flood, ptr, main_end, &zoneArr[numZone++]); |
685 | ptr = main_end; |
686 | } |
687 | /* create a zone with rest of the data from the main buffer */ |
688 | createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]); |
689 | return numZone; |
690 | } |
691 | |
692 | #define INVALID_MATCH_ID (~0U) |
693 | |
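/* Scan a single zone in ITER_BYTES blocks: check for floods, gather the
 * confirm words for the low and high 8 bytes of each block via get_conf_fn,
 * then run do_confirm_fdr() on any candidate bits. Returns HWLM_TERMINATED
 * from the enclosing function if the callback requests
 * HWLM_TERMINATE_MATCHING. */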
694 | #define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ |
695 | do { \ |
696 | const u8 *tryFloodDetect = zz->floodPtr; \ |
697 | const u8 *start_ptr = zz->start; \ |
698 | const u8 *end_ptr = zz->end; \ |
699 | \ |
700 | for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ |
701 | itPtr += ITER_BYTES) { \ |
702 | if (unlikely(itPtr > tryFloodDetect)) { \ |
703 | tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ |
704 | &floodBackoff, &control, \ |
705 | ITER_BYTES); \ |
706 | if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ |
707 | return HWLM_TERMINATED; \ |
708 | } \ |
709 | } \ |
710 | __builtin_prefetch(itPtr + ITER_BYTES); \ |
711 | u64a conf0; \ |
712 | u64a conf8; \ |
713 | get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ |
714 | ft, &conf0, &conf8, &s); \ |
715 | do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ |
716 | &last_match_id, zz); \ |
717 | do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ |
718 | &last_match_id, zz); \ |
719 | if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ |
720 | return HWLM_TERMINATED; \ |
721 | } \ |
722 | } /* end for loop */ \ |
723 | } while (0) \ |
724 | |
725 | static never_inline |
726 | hwlm_error_t fdr_engine_exec(const struct FDR *fdr, |
727 | const struct FDR_Runtime_Args *a, |
728 | hwlm_group_t control) { |
729 | assert(ISALIGNED_CL(fdr)); |
730 | |
731 | u32 floodBackoff = FLOOD_BACKOFF_START; |
732 | u32 last_match_id = INVALID_MATCH_ID; |
733 | u32 domain_mask_flipped = ~fdr->domainMask; |
734 | u8 stride = fdr->stride; |
735 | const u64a *ft = |
736 | (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); |
737 | assert(ISALIGNED_CL(ft)); |
738 | const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset); |
739 | assert(ISALIGNED_CL(confBase)); |
740 | struct zone zones[ZONE_MAX]; |
741 | assert(fdr->domain > 8 && fdr->domain < 16); |
742 | |
743 | size_t numZone = prepareZones(a->buf, a->len, |
744 | a->buf_history + a->len_history, |
745 | a->start_offset, a->firstFloodDetect, zones); |
746 | assert(numZone <= ZONE_MAX); |
747 | m128 state = getInitState(fdr, a->len_history, ft, &zones[0]); |
748 | |
749 | for (size_t curZone = 0; curZone < numZone; curZone++) { |
750 | struct zone *z = &zones[curZone]; |
751 | dumpZoneInfo(z, curZone); |
752 | |
753 | /* When a zone contains less data than is processed in an iteration |
754 | * of FDR_MAIN_LOOP(), we need to scan over some extra data. |
755 | * |
756 | * We have chosen to scan this extra data at the start of the |
757 | * iteration. The extra data is either data we have already scanned or |
         * garbage (if it is earlier than offset 0).
759 | * |
760 | * As a result we need to shift the incoming state back so that it will |
761 | * properly line up with the data being scanned. |
762 | * |
         * We also need to forbid reporting any matches in the data being
         * rescanned, as they have already been reported (or lie over garbage,
         * but later stages should also provide that safety guarantee).
766 | */ |
767 | |
768 | u8 shift = z->shift; |
769 | |
770 | state = variable_byte_shift_m128(state, shift); |
771 | |
772 | state = or128(state, load128(zone_or_mask[shift])); |
773 | |
774 | switch (stride) { |
775 | case 1: |
776 | FDR_MAIN_LOOP(z, state, get_conf_stride_1); |
777 | break; |
778 | case 2: |
779 | FDR_MAIN_LOOP(z, state, get_conf_stride_2); |
780 | break; |
781 | case 4: |
782 | FDR_MAIN_LOOP(z, state, get_conf_stride_4); |
783 | break; |
784 | default: |
785 | break; |
786 | } |
787 | } |
788 | |
789 | return HWLM_SUCCESS; |
790 | } |
791 | |
792 | #if defined(HAVE_AVX2) |
793 | #define ONLY_AVX2(func) func |
794 | #else |
795 | #define ONLY_AVX2(func) NULL |
796 | #endif |
797 | |
798 | typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, |
799 | const struct FDR_Runtime_Args *a, |
800 | hwlm_group_t control); |
801 | |
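/* Engine dispatch table, indexed by fdr->engineID: slot 0 is the FDR engine
 * itself, the remaining slots are Teddy variants (the fat Teddy entries are
 * only available when built with AVX2 support). */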
802 | static const FDRFUNCTYPE funcs[] = { |
803 | fdr_engine_exec, |
804 | NULL, /* old: fast teddy */ |
805 | NULL, /* old: fast teddy */ |
806 | ONLY_AVX2(fdr_exec_fat_teddy_msks1), |
807 | ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck), |
808 | ONLY_AVX2(fdr_exec_fat_teddy_msks2), |
809 | ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck), |
810 | ONLY_AVX2(fdr_exec_fat_teddy_msks3), |
811 | ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck), |
812 | ONLY_AVX2(fdr_exec_fat_teddy_msks4), |
813 | ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck), |
814 | fdr_exec_teddy_msks1, |
815 | fdr_exec_teddy_msks1_pck, |
816 | fdr_exec_teddy_msks2, |
817 | fdr_exec_teddy_msks2_pck, |
818 | fdr_exec_teddy_msks3, |
819 | fdr_exec_teddy_msks3_pck, |
820 | fdr_exec_teddy_msks4, |
821 | fdr_exec_teddy_msks4_pck, |
822 | }; |
823 | |
824 | #define FAKE_HISTORY_SIZE 16 |
825 | static const u8 fake_history[FAKE_HISTORY_SIZE]; |
826 | |
827 | hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, |
828 | size_t start, HWLMCallback cb, |
829 | struct hs_scratch *scratch, hwlm_group_t groups) { |
830 | // We guarantee (for safezone construction) that it is safe to read 16 |
831 | // bytes before the end of the history buffer. |
832 | const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE; |
833 | |
834 | const struct FDR_Runtime_Args a = { |
835 | buf, |
836 | len, |
837 | hbuf, |
838 | 0, |
839 | start, |
840 | cb, |
841 | scratch, |
842 | nextFloodDetect(buf, len, FLOOD_BACKOFF_START), |
843 | 0 |
844 | }; |
845 | if (unlikely(a.start_offset >= a.len)) { |
846 | return HWLM_SUCCESS; |
847 | } else { |
848 | assert(funcs[fdr->engineID]); |
849 | return funcs[fdr->engineID](fdr, &a, groups); |
850 | } |
851 | } |
852 | |
853 | hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, |
854 | size_t hlen, const u8 *buf, size_t len, |
855 | size_t start, HWLMCallback cb, |
856 | struct hs_scratch *scratch, |
857 | hwlm_group_t groups) { |
858 | struct FDR_Runtime_Args a = { |
859 | buf, |
860 | len, |
861 | hbuf, |
862 | hlen, |
863 | start, |
864 | cb, |
865 | scratch, |
866 | nextFloodDetect(buf, len, FLOOD_BACKOFF_START), |
867 | /* we are guaranteed to always have 16 initialised bytes at the end of |
868 | * the history buffer (they may be garbage). */ |
869 | hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0 |
870 | }; |
871 | |
872 | hwlm_error_t ret; |
873 | if (unlikely(a.start_offset >= a.len)) { |
874 | ret = HWLM_SUCCESS; |
875 | } else { |
876 | assert(funcs[fdr->engineID]); |
877 | ret = funcs[fdr->engineID](fdr, &a, groups); |
878 | } |
879 | |
880 | return ret; |
881 | } |
882 | |