1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "fdr.h"
30#include "fdr_confirm.h"
31#include "fdr_confirm_runtime.h"
32#include "fdr_internal.h"
33#include "fdr_loadval.h"
34#include "flood_runtime.h"
35#include "scratch.h"
36#include "teddy.h"
37#include "teddy_internal.h"
38#include "util/arch.h"
39#include "util/simd_utils.h"
40#include "util/uniform_ops.h"
41
42/** \brief number of bytes processed in each iteration */
43#define ITER_BYTES 16
44
45/** \brief total zone buffer size */
46#define ZONE_TOTAL_SIZE 64
47
48/** \brief maximum number of allowed zones */
49#define ZONE_MAX 3
50
51/** \brief zone information.
52 *
53 * Zone represents a region of data to scan in FDR.
54 *
55 * The incoming buffer is to split in multiple zones to ensure two properties:
56 * 1: that we can read 8? bytes behind to generate a hash safely
57 * 2: that we can read the 3 byte after the current byte (domain > 8)
58 */
59struct zone {
60 /** \brief copied buffer, used only when it is a boundary zone. */
61 u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
62
63 /** \brief shift amount for fdr state to avoid unwanted match. */
64 u8 shift;
65
66 /** \brief if boundary zone, start points into the zone buffer after the
67 * pre-padding. Otherwise, points to the main buffer, appropriately. */
68 const u8 *start;
69
70 /** \brief if boundary zone, end points to the end of zone. Otherwise,
71 * pointer to the main buffer, appropriately. */
72 const u8 *end;
73
74 /** \brief the amount to adjust to go from a pointer in the zones region
75 * (between start and end) to a pointer in the original data buffer. */
76 ptrdiff_t zone_pointer_adjust;
77
78 /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
79 * otherwise end of the zone buf. floodPtr always points inside the same
80 * buffer as the start pointe. */
81 const u8 *floodPtr;
82};
83
84static
85const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
86 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
87 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
88 { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
89 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
90 { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
91 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
92 { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
93 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
94 { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
95 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
96 { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
97 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
98 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
99 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
100 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
101 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
102 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
103 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
104 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
105 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
106 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
107 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
108 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
109 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
110 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
111 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
112 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
113 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
114 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
115 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
116 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
117 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
118 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
119 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
120};
121
122/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
123 * so we force its generation.
124 */
125static really_inline
126u64a andn(const u32 a, const u8 *b) {
127 u64a r;
128#if defined(HAVE_BMI) && !defined(NO_ASM)
129 __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
130#else
131 r = unaligned_load_u32(b) & ~a;
132#endif
133 return r;
134}
135
136/* generates an initial state mask based on the last byte-ish of history rather
137 * than being all accepting. If there is no history to consider, the state is
138 * generated based on the minimum length of each bucket in order to prevent
139 * confirms.
140 */
141static really_inline
142m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
143 const struct zone *z) {
144 m128 s;
145 if (len_history) {
146 /* +1: the zones ensure that we can read the byte at z->end */
147 u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
148 tmp &= fdr->domainMask;
149 s = load_m128_from_u64a(ft + tmp);
150 s = rshiftbyte_m128(s, 1);
151 } else {
152 s = fdr->start;
153 }
154 return s;
155}
156
157static really_inline
158void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
159 UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
160 const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
161 /* +1: the zones ensure that we can read the byte at z->end */
162 assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
163 u64a reach0 = andn(domain_mask_flipped, itPtr);
164 u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
165 u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
166 u64a reach3 = andn(domain_mask_flipped, itPtr + 3);
167
168 m128 st0 = load_m128_from_u64a(ft + reach0);
169 m128 st1 = load_m128_from_u64a(ft + reach1);
170 m128 st2 = load_m128_from_u64a(ft + reach2);
171 m128 st3 = load_m128_from_u64a(ft + reach3);
172
173 u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
174 u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
175 u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
176 u64a reach7 = andn(domain_mask_flipped, itPtr + 7);
177
178 m128 st4 = load_m128_from_u64a(ft + reach4);
179 m128 st5 = load_m128_from_u64a(ft + reach5);
180 m128 st6 = load_m128_from_u64a(ft + reach6);
181 m128 st7 = load_m128_from_u64a(ft + reach7);
182
183 st1 = lshiftbyte_m128(st1, 1);
184 st2 = lshiftbyte_m128(st2, 2);
185 st3 = lshiftbyte_m128(st3, 3);
186 st4 = lshiftbyte_m128(st4, 4);
187 st5 = lshiftbyte_m128(st5, 5);
188 st6 = lshiftbyte_m128(st6, 6);
189 st7 = lshiftbyte_m128(st7, 7);
190
191 st0 = or128(st0, st1);
192 st2 = or128(st2, st3);
193 st4 = or128(st4, st5);
194 st6 = or128(st6, st7);
195 st0 = or128(st0, st2);
196 st4 = or128(st4, st6);
197 st0 = or128(st0, st4);
198 *s = or128(*s, st0);
199
200 *conf0 = movq(*s);
201 *s = rshiftbyte_m128(*s, 8);
202 *conf0 ^= ~0ULL;
203
204 u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
205 u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
206 u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
207 u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
208
209 m128 st8 = load_m128_from_u64a(ft + reach8);
210 m128 st9 = load_m128_from_u64a(ft + reach9);
211 m128 st10 = load_m128_from_u64a(ft + reach10);
212 m128 st11 = load_m128_from_u64a(ft + reach11);
213
214 u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
215 u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
216 u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
217 u64a reach15 = andn(domain_mask_flipped, itPtr + 15);
218
219 m128 st12 = load_m128_from_u64a(ft + reach12);
220 m128 st13 = load_m128_from_u64a(ft + reach13);
221 m128 st14 = load_m128_from_u64a(ft + reach14);
222 m128 st15 = load_m128_from_u64a(ft + reach15);
223
224 st9 = lshiftbyte_m128(st9, 1);
225 st10 = lshiftbyte_m128(st10, 2);
226 st11 = lshiftbyte_m128(st11, 3);
227 st12 = lshiftbyte_m128(st12, 4);
228 st13 = lshiftbyte_m128(st13, 5);
229 st14 = lshiftbyte_m128(st14, 6);
230 st15 = lshiftbyte_m128(st15, 7);
231
232 st8 = or128(st8, st9);
233 st10 = or128(st10, st11);
234 st12 = or128(st12, st13);
235 st14 = or128(st14, st15);
236 st8 = or128(st8, st10);
237 st12 = or128(st12, st14);
238 st8 = or128(st8, st12);
239 *s = or128(*s, st8);
240
241 *conf8 = movq(*s);
242 *s = rshiftbyte_m128(*s, 8);
243 *conf8 ^= ~0ULL;
244}
245
246static really_inline
247void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
248 UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
249 const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
250 assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
251 u64a reach0 = andn(domain_mask_flipped, itPtr);
252 u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
253 u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
254 u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
255
256 m128 st0 = load_m128_from_u64a(ft + reach0);
257 m128 st2 = load_m128_from_u64a(ft + reach2);
258 m128 st4 = load_m128_from_u64a(ft + reach4);
259 m128 st6 = load_m128_from_u64a(ft + reach6);
260
261 u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
262 u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
263 u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
264 u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
265
266 m128 st8 = load_m128_from_u64a(ft + reach8);
267 m128 st10 = load_m128_from_u64a(ft + reach10);
268 m128 st12 = load_m128_from_u64a(ft + reach12);
269 m128 st14 = load_m128_from_u64a(ft + reach14);
270
271 st2 = lshiftbyte_m128(st2, 2);
272 st4 = lshiftbyte_m128(st4, 4);
273 st6 = lshiftbyte_m128(st6, 6);
274
275 *s = or128(*s, st0);
276 *s = or128(*s, st2);
277 *s = or128(*s, st4);
278 *s = or128(*s, st6);
279
280 *conf0 = movq(*s);
281 *s = rshiftbyte_m128(*s, 8);
282 *conf0 ^= ~0ULL;
283
284 st10 = lshiftbyte_m128(st10, 2);
285 st12 = lshiftbyte_m128(st12, 4);
286 st14 = lshiftbyte_m128(st14, 6);
287
288 *s = or128(*s, st8);
289 *s = or128(*s, st10);
290 *s = or128(*s, st12);
291 *s = or128(*s, st14);
292
293 *conf8 = movq(*s);
294 *s = rshiftbyte_m128(*s, 8);
295 *conf8 ^= ~0ULL;
296}
297
298static really_inline
299void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
300 UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
301 const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
302 assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
303 u64a reach0 = andn(domain_mask_flipped, itPtr);
304 u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
305 u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
306 u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
307
308 m128 st0 = load_m128_from_u64a(ft + reach0);
309 m128 st4 = load_m128_from_u64a(ft + reach4);
310 m128 st8 = load_m128_from_u64a(ft + reach8);
311 m128 st12 = load_m128_from_u64a(ft + reach12);
312
313 st4 = lshiftbyte_m128(st4, 4);
314 st12 = lshiftbyte_m128(st12, 4);
315
316 *s = or128(*s, st0);
317 *s = or128(*s, st4);
318 *conf0 = movq(*s);
319 *s = rshiftbyte_m128(*s, 8);
320 *conf0 ^= ~0ULL;
321
322 *s = or128(*s, st8);
323 *s = or128(*s, st12);
324 *conf8 = movq(*s);
325 *s = rshiftbyte_m128(*s, 8);
326 *conf8 ^= ~0ULL;
327}
328
329static really_inline
330void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
331 const u32 *confBase, const struct FDR_Runtime_Args *a,
332 const u8 *ptr, u32 *last_match_id, struct zone *z) {
333 const u8 bucket = 8;
334
335 if (likely(!*conf)) {
336 return;
337 }
338
339 /* ptr is currently referring to a location in the zone's buffer, we also
340 * need a pointer in the original, main buffer for the final string compare.
341 */
342 const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
343
344 const u8 *confLoc = ptr;
345
346 do {
347 u32 bit = findAndClearLSB_64(conf);
348 u32 byte = bit / bucket + offset;
349 u32 bitRem = bit % bucket;
350 u32 idx = bitRem;
351 u32 cf = confBase[idx];
352 if (!cf) {
353 continue;
354 }
355 const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
356 ((const u8 *)confBase + cf);
357 if (!(fdrc->groups & *control)) {
358 continue;
359 }
360 u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
361 confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
362 last_match_id, confVal, conf, bit);
363 } while (unlikely(!!*conf));
364}
365
366static really_inline
367void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
368#ifdef DEBUG
369 DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
370 DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
371 z->start, z->end, z->shift);
372 DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
373 z->zone_pointer_adjust, z->floodPtr);
374 DEBUG_PRINTF("zone buf:");
375 for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
376 if (i % 8 == 0) {
377 printf("_");
378 }
379 if (z->buf[i]) {
380 printf("%02x", z->buf[i]);
381 } else {
382 printf("..");
383 }
384 }
385 printf("\n");
386#endif
387};
388
389/**
390 * \brief Updates attributes for non-boundary region zone.
391 */
392static really_inline
393void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
394 struct zone *z) {
395 z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
396 z->start = begin;
397 z->end = end;
398 z->floodPtr = flood;
399 z->shift = 0;
400}
401
402/**
403 * \brief Create zone for short cases (<= ITER_BYTES).
404 *
405 * For this case we need to copy everything into the zone's internal buffer.
406 *
407 * We need to ensure that we run over real data if it exists (in history or
408 * before zone begin). We also need to ensure 8 bytes before any data being
409 * matched can be read (to perform a conf hash).
410 *
411 * We also need to ensure that the data at z->end can be read.
412 *
413 * Hence, the zone consists of:
414 * 16 bytes of history,
415 * 1 - 24 bytes of data form the buffer (ending at end),
416 * 1 byte of final padding
417 */
418static really_inline
419void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
420 const u8 *end, struct zone *z) {
421 /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid
422 * the checks in boundary zone. */
423 z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
424
425 ptrdiff_t z_len = end - begin;
426 assert(z_len > 0);
427 assert(z_len <= ITER_BYTES);
428
429 z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */
430
431 static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */
432
433 /* we are guaranteed to always have 16 initialised bytes at the end of
434 * the history buffer (they may be garbage coming from the stream state
435 * preceding hbuf, but bytes that don't correspond to actual history
436 * shouldn't affect computations). */
437 *(m128 *)z->buf = loadu128(hend - sizeof(m128));
438
439 /* The amount of data we have to copy from main buffer. */
440 size_t copy_len = MIN((size_t)(end - buf),
441 ITER_BYTES + sizeof(CONF_TYPE));
442
443 u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
444 switch (copy_len) {
445 case 1:
446 *zone_data = *(end - 1);
447 break;
448 case 2:
449 *(u16 *)zone_data = unaligned_load_u16(end - 2);
450 break;
451 case 3:
452 *(u16 *)zone_data = unaligned_load_u16(end - 3);
453 *(zone_data + 2) = *(end - 1);
454 break;
455 case 4:
456 *(u32 *)zone_data = unaligned_load_u32(end - 4);
457 break;
458 case 5:
459 case 6:
460 case 7:
461 /* perform copy with 2 overlapping 4-byte chunks from buf. */
462 *(u32 *)zone_data = unaligned_load_u32(end - copy_len);
463 unaligned_store_u32(zone_data + copy_len - sizeof(u32),
464 unaligned_load_u32(end - sizeof(u32)));
465 break;
466 case 8:
467 *(u64a *)zone_data = unaligned_load_u64a(end - 8);
468 break;
469 case 9:
470 case 10:
471 case 11:
472 case 12:
473 case 13:
474 case 14:
475 case 15:
476 /* perform copy with 2 overlapping 8-byte chunks from buf. */
477 *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
478 unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
479 unaligned_load_u64a(end - sizeof(u64a)));
480 break;
481 case 16:
482 /* copy 16-bytes from buf. */
483 *(m128 *)zone_data = loadu128(end - 16);
484 break;
485 default:
486 assert(copy_len <= sizeof(m128) + sizeof(u64a));
487
488 /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks.
489 */
490 *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
491 storeu128(zone_data + copy_len - sizeof(m128),
492 loadu128(end - sizeof(m128)));
493 break;
494 }
495
496 /* set the start and end location of the zone buf
497 * to be scanned */
498 u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
499 assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);
500
501 /* copy the post-padding byte; this is required for domain > 8 due to
502 * overhang */
503 assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64);
504 *z_end = 0;
505
506 z->end = z_end;
507 z->start = z_end - ITER_BYTES;
508 z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
509 assert(z->start + z->shift == z_end - z_len);
510}
511
512/**
513 * \brief Create a zone for the start region.
514 *
515 * This function requires that there is > ITER_BYTES of data in the buffer to
516 * scan. The start zone itself is always responsible for scanning exactly
517 * ITER_BYTES of data - there are no warmup/junk bytes scanned.
518 *
519 * This zone ensures that the byte at z->end can be read and corresponds to
520 * the next byte of data.
521 *
522 * 8 bytes of history data are provided before z->start to allow proper hash
523 * generation in streaming mode. If buf != begin, upto 8 bytes of data
524 * prior to begin is also provided.
525 *
526 * Although we are not interested in bare literals which start before begin
527 * if buf != begin, lookarounds associated with the literal may require
528 * the data prior to begin for hash purposes.
529 */
530static really_inline
531void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
532 struct zone *z) {
533 assert(ITER_BYTES == sizeof(m128));
534 assert(sizeof(CONF_TYPE) == 8);
535 static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);
536
537 const u8 *end = begin + ITER_BYTES;
538
539 /* set floodPtr to the end of zone buf to avoid checks in start zone */
540 z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
541
542 z->shift = 0; /* we are processing ITER_BYTES of real data */
543
544 /* we are guaranteed to always have 16 initialised bytes at the end of the
545 * history buffer (they may be garbage coming from the stream state
546 * preceding hbuf, but bytes that don't correspond to actual history
547 * shouldn't affect computations). However, for start zones, history is only
548 * required for conf hash purposes so we only need 8 bytes */
549 unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));
550
551 /* The amount of data we have to copy from main buffer. */
552 size_t copy_len = MIN((size_t)(end - buf),
553 ITER_BYTES + sizeof(CONF_TYPE));
554 assert(copy_len >= 16);
555
556 /* copy the post-padding byte; this is required for domain > 8 due to
557 * overhang. The start requires that there is data after the zone so it
558 * it safe to dereference end */
559 z->buf[ZONE_START_BEGIN + copy_len] = *end;
560
561 /* set the start and end location of the zone buf to be scanned */
562 u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
563 z->end = z_end;
564 z->start = z_end - ITER_BYTES;
565
566 /* copy the first 8 bytes of the valid region */
567 unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
568 unaligned_load_u64a(end - copy_len));
569
570 /* copy the last 16 bytes, may overlap with the previous 8 byte write */
571 storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
572
573 z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
574
575 assert(ZONE_START_BEGIN + copy_len + 3 < 64);
576}
577
578/**
579 * \brief Create a zone for the end region.
580 *
581 * This function requires that there is > ITER_BYTES of data in the buffer to
582 * scan. The end zone is responsible for a scanning the <= ITER_BYTES rump of
583 * data and optional ITER_BYTES. The main zone cannot handle the last 3 bytes
584 * of the buffer. The end zone is required to handle an optional full
585 * ITER_BYTES from main zone when there are less than 3 bytes to scan. The
586 * main zone size is reduced by ITER_BYTES in this case.
587 *
588 * This zone ensures that the byte at z->end can be read by filling it with a
589 * padding character.
590 *
591 * Upto 8 bytes of data prior to begin is also provided for the purposes of
592 * generating hashes. History is not copied, as all locations which require
593 * history for generating a hash are the responsiblity of the start zone.
594 */
595static really_inline
596void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
597 struct zone *z) {
598 /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid
599 * the checks in boundary zone. */
600 z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
601
602 ptrdiff_t z_len = end - begin;
603 assert(z_len > 0);
604 size_t iter_bytes_second = 0;
605 size_t z_len_first = z_len;
606 if (z_len > ITER_BYTES) {
607 z_len_first = z_len - ITER_BYTES;
608 iter_bytes_second = ITER_BYTES;
609 }
610 z->shift = ITER_BYTES - z_len_first;
611
612 const u8 *end_first = end - iter_bytes_second;
613 /* The amount of data we have to copy from main buffer for the
614 * first iteration. */
615 size_t copy_len_first = MIN((size_t)(end_first - buf),
616 ITER_BYTES + sizeof(CONF_TYPE));
617 assert(copy_len_first >= 16);
618
619 size_t total_copy_len = copy_len_first + iter_bytes_second;
620 assert(total_copy_len + 3 < 64);
621
622 /* copy the post-padding byte; this is required for domain > 8 due to
623 * overhang */
624 z->buf[total_copy_len] = 0;
625
626 /* set the start and end location of the zone buf
627 * to be scanned */
628 u8 *z_end = z->buf + total_copy_len;
629 z->end = z_end;
630 z->start = z_end - ITER_BYTES - iter_bytes_second;
631 assert(z->start + z->shift == z_end - z_len);
632
633 u8 *z_end_first = z_end - iter_bytes_second;
634 /* copy the first 8 bytes of the valid region */
635 unaligned_store_u64a(z->buf,
636 unaligned_load_u64a(end_first - copy_len_first));
637
638 /* copy the last 16 bytes, may overlap with the previous 8 byte write */
639 storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128)));
640 if (iter_bytes_second) {
641 storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
642 }
643
644 z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
645}
646
647/**
648 * \brief Prepare zones.
649 *
650 * This function prepares zones with actual buffer and some padded bytes.
651 * The actual ITER_BYTES bytes in zone is preceded by main buf and/or
652 * history buf and succeeded by padded bytes possibly from main buf,
653 * if available.
654 */
655static really_inline
656size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
657 size_t start, const u8 *flood, struct zone *zoneArr) {
658 const u8 *ptr = buf + start;
659 size_t remaining = len - start;
660
661 if (remaining <= ITER_BYTES) {
662 /* enough bytes to make only one zone */
663 createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
664 return 1;
665 }
666
667 /* enough bytes to make more than one zone */
668
669 size_t numZone = 0;
670 createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
671 ptr += ITER_BYTES;
672
673 assert(ptr < buf + len);
674
675 /* find maximum buffer location that the main zone can scan
676 * - must be a multiple of ITER_BYTES, and
677 * - cannot contain the last 3 bytes (due to 3 bytes read behind the
678 end of buffer in FDR main loop)
679 */
680 const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES);
681
682 /* create a zone if multiple of ITER_BYTES are found */
683 if (main_end > ptr) {
684 createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
685 ptr = main_end;
686 }
687 /* create a zone with rest of the data from the main buffer */
688 createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
689 return numZone;
690}
691
692#define INVALID_MATCH_ID (~0U)
693
694#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
695 do { \
696 const u8 *tryFloodDetect = zz->floodPtr; \
697 const u8 *start_ptr = zz->start; \
698 const u8 *end_ptr = zz->end; \
699 \
700 for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
701 itPtr += ITER_BYTES) { \
702 if (unlikely(itPtr > tryFloodDetect)) { \
703 tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
704 &floodBackoff, &control, \
705 ITER_BYTES); \
706 if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
707 return HWLM_TERMINATED; \
708 } \
709 } \
710 __builtin_prefetch(itPtr + ITER_BYTES); \
711 u64a conf0; \
712 u64a conf8; \
713 get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
714 ft, &conf0, &conf8, &s); \
715 do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
716 &last_match_id, zz); \
717 do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \
718 &last_match_id, zz); \
719 if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
720 return HWLM_TERMINATED; \
721 } \
722 } /* end for loop */ \
723 } while (0) \
724
725static never_inline
726hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
727 const struct FDR_Runtime_Args *a,
728 hwlm_group_t control) {
729 assert(ISALIGNED_CL(fdr));
730
731 u32 floodBackoff = FLOOD_BACKOFF_START;
732 u32 last_match_id = INVALID_MATCH_ID;
733 u32 domain_mask_flipped = ~fdr->domainMask;
734 u8 stride = fdr->stride;
735 const u64a *ft =
736 (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
737 assert(ISALIGNED_CL(ft));
738 const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset);
739 assert(ISALIGNED_CL(confBase));
740 struct zone zones[ZONE_MAX];
741 assert(fdr->domain > 8 && fdr->domain < 16);
742
743 size_t numZone = prepareZones(a->buf, a->len,
744 a->buf_history + a->len_history,
745 a->start_offset, a->firstFloodDetect, zones);
746 assert(numZone <= ZONE_MAX);
747 m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);
748
749 for (size_t curZone = 0; curZone < numZone; curZone++) {
750 struct zone *z = &zones[curZone];
751 dumpZoneInfo(z, curZone);
752
753 /* When a zone contains less data than is processed in an iteration
754 * of FDR_MAIN_LOOP(), we need to scan over some extra data.
755 *
756 * We have chosen to scan this extra data at the start of the
757 * iteration. The extra data is either data we have already scanned or
758 * garbage (if it is earlier than offset 0),
759 *
760 * As a result we need to shift the incoming state back so that it will
761 * properly line up with the data being scanned.
762 *
763 * We also need to forbid reporting any matches in the data being
764 * rescanned as they have already been reported (or are over garbage but
765 * later stages should also provide that safety guarantee).
766 */
767
768 u8 shift = z->shift;
769
770 state = variable_byte_shift_m128(state, shift);
771
772 state = or128(state, load128(zone_or_mask[shift]));
773
774 switch (stride) {
775 case 1:
776 FDR_MAIN_LOOP(z, state, get_conf_stride_1);
777 break;
778 case 2:
779 FDR_MAIN_LOOP(z, state, get_conf_stride_2);
780 break;
781 case 4:
782 FDR_MAIN_LOOP(z, state, get_conf_stride_4);
783 break;
784 default:
785 break;
786 }
787 }
788
789 return HWLM_SUCCESS;
790}
791
792#if defined(HAVE_AVX2)
793#define ONLY_AVX2(func) func
794#else
795#define ONLY_AVX2(func) NULL
796#endif
797
798typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr,
799 const struct FDR_Runtime_Args *a,
800 hwlm_group_t control);
801
802static const FDRFUNCTYPE funcs[] = {
803 fdr_engine_exec,
804 NULL, /* old: fast teddy */
805 NULL, /* old: fast teddy */
806 ONLY_AVX2(fdr_exec_fat_teddy_msks1),
807 ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck),
808 ONLY_AVX2(fdr_exec_fat_teddy_msks2),
809 ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck),
810 ONLY_AVX2(fdr_exec_fat_teddy_msks3),
811 ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck),
812 ONLY_AVX2(fdr_exec_fat_teddy_msks4),
813 ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck),
814 fdr_exec_teddy_msks1,
815 fdr_exec_teddy_msks1_pck,
816 fdr_exec_teddy_msks2,
817 fdr_exec_teddy_msks2_pck,
818 fdr_exec_teddy_msks3,
819 fdr_exec_teddy_msks3_pck,
820 fdr_exec_teddy_msks4,
821 fdr_exec_teddy_msks4_pck,
822};
823
824#define FAKE_HISTORY_SIZE 16
825static const u8 fake_history[FAKE_HISTORY_SIZE];
826
827hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
828 size_t start, HWLMCallback cb,
829 struct hs_scratch *scratch, hwlm_group_t groups) {
830 // We guarantee (for safezone construction) that it is safe to read 16
831 // bytes before the end of the history buffer.
832 const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;
833
834 const struct FDR_Runtime_Args a = {
835 buf,
836 len,
837 hbuf,
838 0,
839 start,
840 cb,
841 scratch,
842 nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
843 0
844 };
845 if (unlikely(a.start_offset >= a.len)) {
846 return HWLM_SUCCESS;
847 } else {
848 assert(funcs[fdr->engineID]);
849 return funcs[fdr->engineID](fdr, &a, groups);
850 }
851}
852
853hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
854 size_t hlen, const u8 *buf, size_t len,
855 size_t start, HWLMCallback cb,
856 struct hs_scratch *scratch,
857 hwlm_group_t groups) {
858 struct FDR_Runtime_Args a = {
859 buf,
860 len,
861 hbuf,
862 hlen,
863 start,
864 cb,
865 scratch,
866 nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
867 /* we are guaranteed to always have 16 initialised bytes at the end of
868 * the history buffer (they may be garbage). */
869 hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
870 };
871
872 hwlm_error_t ret;
873 if (unlikely(a.start_offset >= a.len)) {
874 ret = HWLM_SUCCESS;
875 } else {
876 assert(funcs[fdr->engineID]);
877 ret = funcs[fdr->engineID](fdr, &a, groups);
878 }
879
880 return ret;
881}
882