/*
 * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: AVX2 engine runtime.
 */

#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/arch.h"
#include "util/simd_utils.h"

#if defined(HAVE_AVX2)

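/* Partial-load mask table used by the cautious (vectored) loads. Each of the
 * 33 rows is 64 bytes: 32 leading 0xff bytes, then n zero bytes, then
 * (32 - n) trailing 0xff bytes, so a 32-byte load taken at an offset into
 * row n yields a mask whose zero region covers exactly the bytes that are
 * valid. The mask is OR'ed into the match result, forcing out-of-range
 * positions to "no match" (all ones). */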
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};

#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn)              \
do {                                                                        \
    if (unlikely(chunk != ones_u64a)) {                                     \
        chunk = ~chunk;                                                     \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
                &control, &last_match);                                     \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while(0)

#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn)              \
do {                                                                        \
    if (unlikely(chunk != ones_u32)) {                                      \
        chunk = ~chunk;                                                     \
        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
                &control, &last_match);                                     \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while(0)
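
/* A clear bit in a chunk means some bucket matched at that position. If the
 * chunk is not all-ones, it is inverted (so hits become set bits) and handed
 * to conf_fn for full confirmation; CHECK_HWLM_TERMINATE_MATCHING stops the
 * scan if the callback asks us to. */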
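/* Fat Teddy's shuffle masks (two m256 tables per mask: lo-nibble and
 * hi-nibble) are stored immediately after the Teddy header, rounded up to a
 * cache-line boundary. */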
static really_inline
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
    return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}

#if defined(HAVE_AVX512)

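/* The reinforcement mask tables live after the bucket masks: a "lo" table
 * followed by a "hi" table, each with N_CHARS + 1 64-bit entries (the extra
 * entry backs the 0x100 sentinel used when no previous byte is available). */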
static really_inline
const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) {
    return (const u64a *)((const u8 *)getMaskBase_fat(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
}

#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
do {                                                                        \
    if (unlikely(diff512(var, ones512()))) {                                \
        m512 swap = swap256in512(var);                                      \
        m512 r = interleave512lo(var, swap);                                \
        m128 r0 = extract128from512(r, 0);                                  \
        m128 r1 = extract128from512(r, 1);                                  \
        u64a part1 = movq(r0);                                              \
        u64a part2 = extract64from128(r0, 1);                               \
        u64a part5 = movq(r1);                                              \
        u64a part6 = extract64from128(r1, 1);                               \
        r = interleave512hi(var, swap);                                     \
        r0 = extract128from512(r, 0);                                       \
        r1 = extract128from512(r, 1);                                       \
        u64a part3 = movq(r0);                                              \
        u64a part4 = extract64from128(r0, 1);                               \
        u64a part7 = movq(r1);                                              \
        u64a part8 = extract64from128(r1, 1);                               \
        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn);          \
        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn);      \
        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn);      \
        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn);     \
        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn);     \
        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn);     \
        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn);     \
        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn);     \
    }                                                                       \
} while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
do {                                                                        \
    if (unlikely(diff512(var, ones512()))) {                                \
        m512 swap = swap256in512(var);                                      \
        m512 r = interleave512lo(var, swap);                                \
        m128 r0 = extract128from512(r, 0);                                  \
        m128 r1 = extract128from512(r, 1);                                  \
        u32 part1 = movd(r0);                                               \
        u32 part2 = extract32from128(r0, 1);                                \
        u32 part3 = extract32from128(r0, 2);                                \
        u32 part4 = extract32from128(r0, 3);                                \
        u32 part9 = movd(r1);                                               \
        u32 part10 = extract32from128(r1, 1);                               \
        u32 part11 = extract32from128(r1, 2);                               \
        u32 part12 = extract32from128(r1, 3);                               \
        r = interleave512hi(var, swap);                                     \
        r0 = extract128from512(r, 0);                                       \
        r1 = extract128from512(r, 1);                                       \
        u32 part5 = movd(r0);                                               \
        u32 part6 = extract32from128(r0, 1);                                \
        u32 part7 = extract32from128(r0, 2);                                \
        u32 part8 = extract32from128(r0, 3);                                \
        u32 part13 = movd(r1);                                              \
        u32 part14 = extract32from128(r1, 1);                               \
        u32 part15 = extract32from128(r1, 2);                               \
        u32 part16 = extract32from128(r1, 3);                               \
        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn);          \
        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn);     \
        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn);     \
        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn);     \
        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn);     \
        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn);    \
        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn);    \
    }                                                                       \
} while(0)
#endif
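/* Cautious load used near buffer boundaries: vectoredLoad256 loads up to 32
 * bytes without touching memory outside [lo, hi) and reports invalid byte
 * positions via a partial-load mask. Both the data and the mask are
 * broadcast to the two 256-bit halves to match fat Teddy's duplicated
 * layout. */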
static really_inline
m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset,
                       const u8 *lo, const u8 *hi,
                       const u8 *buf_history, size_t len_history,
                       const u32 nMasks) {
    m256 p_mask256;
    m512 ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi,
                                        buf_history, len_history, nMasks));
    *p_mask = set2x256(p_mask256);
    return ret;
}
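/* Split the input into low and high nibbles for the PSHUFB lookups. The
 * reinforced variant additionally builds r_msk from the byte in front of
 * each 16-byte lane: c_0 carries byte 31 of the previous block (initialised
 * to the 0x100 "no history" sentinel) and c_16 is byte 15 of the current
 * block. */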
#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val)                            \
    m512 lo = and512(val, *lo_mask);                                        \
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)

#define PREP_FAT_SHUF_MASK                                                  \
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr)));            \
    *c_16 = *(ptr + 15);                                                    \
    m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16],                      \
                           0ULL, r_msk_base_hi[*c_0],                       \
                           0ULL, r_msk_base_lo[*c_16],                      \
                           0ULL, r_msk_base_lo[*c_0]);                      \
    *c_0 = *(ptr + 31)
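/* Shift-or accumulation: each extra mask pair covers one more byte of the
 * literal, and its PSHUFB result is shifted by one more byte within each
 * 128-bit lane before being OR'ed into the running result. */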
#define FAT_SHIFT_OR_M1                                                     \
    or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))

#define FAT_SHIFT_OR_M2                                                     \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo),                \
                               pshufb_m512(dup_mask[3], hi)),               \
                         1), FAT_SHIFT_OR_M1)

#define FAT_SHIFT_OR_M3                                                     \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo),                \
                               pshufb_m512(dup_mask[5], hi)),               \
                         2), FAT_SHIFT_OR_M2)

#define FAT_SHIFT_OR_M4                                                     \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo),                \
                               pshufb_m512(dup_mask[7], hi)),               \
                         3), FAT_SHIFT_OR_M3)

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M1;
}

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M2;
}

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M3;
}

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M4;
}

static really_inline
m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M1, r_msk);
}

static really_inline
m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M2, r_msk);
}

static really_inline
m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M3, r_msk);
}

static really_inline
m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M4, r_msk);
}

#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n)                           \
    prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)

#define PREP_CONF_FAT_FN(ptr, n)                                            \
    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr,                       \
                             r_msk_base_lo, r_msk_base_hi, &c_0, &c_16)

/*
 * Fat Teddy uses 2 bytes to represent the result of each position, so the
 * mask for each nibble (for example, the lo nibble of the last byte) is
 * 16 x 2 bytes:
 * |----------------------------------|----------------------------------|
 *  16 bytes (buckets 0..7 per byte)    16 bytes (buckets 8..15 per byte)
 *                  A                                   B
 * At runtime, fat Teddy reads 16 bytes at a time and duplicates them to
 * 32 bytes:
 * |----------------------------------|----------------------------------|
 *  16 bytes input data (lo nibbles)    16 bytes duplicated data (lo nibbles)
 *                  X                                   X
 * then does pshufb_m256(AB, XX).
 *
 * In AVX512 reinforced fat Teddy, 32 bytes are read at a time and duplicated
 * to 64 bytes:
 * |----------------|----------------|----------------|----------------|
 *         X                Y                X                Y
 * In this case we need DUP_FAT_MASK to construct AABB:
 * |----------------|----------------|----------------|----------------|
 *         A                A                B                B
 * then do pshufb_m512(AABB, XYXY).
 */

#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a)

#define PREPARE_FAT_MASKS_1                                                 \
    dup_mask[0] = DUP_FAT_MASK(maskBase[0]);                                \
    dup_mask[1] = DUP_FAT_MASK(maskBase[1]);

#define PREPARE_FAT_MASKS_2                                                 \
    PREPARE_FAT_MASKS_1                                                     \
    dup_mask[2] = DUP_FAT_MASK(maskBase[2]);                                \
    dup_mask[3] = DUP_FAT_MASK(maskBase[3]);

#define PREPARE_FAT_MASKS_3                                                 \
    PREPARE_FAT_MASKS_2                                                     \
    dup_mask[4] = DUP_FAT_MASK(maskBase[4]);                                \
    dup_mask[5] = DUP_FAT_MASK(maskBase[5]);

#define PREPARE_FAT_MASKS_4                                                 \
    PREPARE_FAT_MASKS_3                                                     \
    dup_mask[6] = DUP_FAT_MASK(maskBase[6]);                                \
    dup_mask[7] = DUP_FAT_MASK(maskBase[7]);

#define PREPARE_FAT_MASKS(n)                                                \
    m512 lo_mask = set64x8(0xf);                                            \
    m512 dup_mask[n * 2];                                                   \
    PREPARE_FAT_MASKS_##n

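/* Scan driver for the reinforced (AVX512) fat Teddy: a cautious head load to
 * reach 32-byte alignment, optional single 32-byte blocks before and after
 * the main loop, a main loop of 64 bytes (two blocks) per iteration with
 * prefetching and flood detection, and a cautious tail load for anything
 * shorter than 32 bytes. */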
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 64;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m256 *maskBase = getMaskBase_fat(teddy);                          \
    PREPARE_FAT_MASKS(n_msk);                                               \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk);    \
    const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1);              \
    u32 c_0 = 0x100;                                                        \
    u32 c_16 = 0x100;                                                       \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 32;                                               \
        m512 p_mask;                                                        \
        m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset,       \
                                       a->buf, buf_end,                     \
                                       a->buf_history, a->len_history,      \
                                       n_msk);                              \
        m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk);         \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 32 <= buf_end) {                                              \
        m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk);                            \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                  \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk);                            \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
        m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk);                       \
        CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn);              \
    }                                                                       \
                                                                            \
    if (ptr + 32 <= buf_end) {                                              \
        m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk);                            \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
        ptr += 32;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 32 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m512 p_mask;                                                        \
        m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end,       \
                                       a->buf_history, a->len_history,      \
                                       n_msk);                              \
        m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk);         \
        r_0 = or512(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#else // HAVE_AVX512

#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
do {                                                                        \
    if (unlikely(diff256(var, ones256()))) {                                \
        m256 swap = swap128in256(var);                                      \
        m256 r = interleave256lo(var, swap);                                \
        u64a part1 = extractlow64from256(r);                                \
        u64a part2 = extract64from256(r, 1);                                \
        r = interleave256hi(var, swap);                                     \
        u64a part3 = extractlow64from256(r);                                \
        u64a part4 = extract64from256(r, 1);                                \
        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn);          \
        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn);      \
        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn);      \
        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn);     \
    }                                                                       \
} while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
do {                                                                        \
    if (unlikely(diff256(var, ones256()))) {                                \
        m256 swap = swap128in256(var);                                      \
        m256 r = interleave256lo(var, swap);                                \
        u32 part1 = extractlow32from256(r);                                 \
        u32 part2 = extract32from256(r, 1);                                 \
        u32 part3 = extract32from256(r, 2);                                 \
        u32 part4 = extract32from256(r, 3);                                 \
        r = interleave256hi(var, swap);                                     \
        u32 part5 = extractlow32from256(r);                                 \
        u32 part6 = extract32from256(r, 1);                                 \
        u32 part7 = extract32from256(r, 2);                                 \
        u32 part8 = extract32from256(r, 3);                                 \
        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn);          \
        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn);      \
        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn);     \
        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn);     \
        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn);     \
    }                                                                       \
} while(0)
#endif

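/* Cautious load used near buffer boundaries: vectoredLoad128 loads up to 16
 * bytes without touching memory outside [lo, hi) and reports invalid byte
 * positions via a partial-load mask; both data and mask are broadcast to the
 * two 128-bit halves. */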
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                       const u8 *lo, const u8 *hi,
                       const u8 *buf_history, size_t len_history,
                       const u32 nMasks) {
    m128 p_mask128;
    m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
                                        buf_history, len_history, nMasks));
    *p_mask = set2x128(p_mask128);
    return ret;
}
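/* prep_conf_fat_teddy_mN builds the per-position match result for N masks.
 * Each additional mask pair handles one more literal byte; its PSHUFB result
 * is aligned with vpalignr against the previous iteration's value (old_N) so
 * that literals straddling a 16-byte block boundary are still seen. */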
static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    return or256(pshufb_m256(maskBase[0 * 2], lo),
                 pshufb_m256(maskBase[0 * 2 + 1], hi));
}

static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    m256 r = prep_conf_fat_teddy_m1(maskBase, val);

    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
                       pshufb_m256(maskBase[1 * 2 + 1], hi));
    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
    *old_1 = res_1;
    return or256(r, res_shifted_1);
}

static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
                            m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);

    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
                       pshufb_m256(maskBase[2 * 2 + 1], hi));
    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
    *old_2 = res_2;
    return or256(r, res_shifted_2);
}

static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
                            m256 *old_3, m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);

    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
                       pshufb_m256(maskBase[3 * 2 + 1], hi));
    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
    *old_3 = res_3;
    return or256(r, res_shifted_3);
}

#define FDR_EXEC_FAT_TEDDY_RES_OLD_1                                        \
do {                                                                        \
} while(0)

#define FDR_EXEC_FAT_TEDDY_RES_OLD_2                                        \
    m256 res_old_1 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD_3                                        \
    m256 res_old_1 = zeroes256();                                           \
    m256 res_old_2 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD_4                                        \
    m256 res_old_1 = zeroes256();                                           \
    m256 res_old_2 = zeroes256();                                           \
    m256 res_old_3 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n

#define PREP_CONF_FAT_FN_1(mask_base, val)                                  \
    prep_conf_fat_teddy_m1(mask_base, val)

#define PREP_CONF_FAT_FN_2(mask_base, val)                                  \
    prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)

#define PREP_CONF_FAT_FN_3(mask_base, val)                                  \
    prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)

#define PREP_CONF_FAT_FN_4(mask_base, val)                                  \
    prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)

#define PREP_CONF_FAT_FN(mask_base, val, n)                                 \
    PREP_CONF_FAT_FN_##n(mask_base, val)

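/* Scan driver for the AVX2 fat Teddy: a cautious head load to reach 16-byte
 * alignment, optional single 16-byte blocks before and after the main loop,
 * a main loop of 32 bytes (two blocks) per iteration with prefetching and
 * flood detection, and a cautious tail load for anything shorter than
 * 16 bytes. */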
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
do {                                                                        \
    const u8 *buf_end = a->buf + a->len;                                    \
    const u8 *ptr = a->buf + a->start_offset;                               \
    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
    u32 last_match = ones_u32;                                              \
    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
    const size_t iterBytes = 32;                                            \
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
                 a->buf, a->len, a->start_offset);                          \
                                                                            \
    const m256 *maskBase = getMaskBase_fat(teddy);                          \
    const u32 *confBase = getConfBase(teddy);                               \
                                                                            \
    FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk);                                      \
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
    if (ptr < mainStart) {                                                  \
        ptr = mainStart - 16;                                               \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,       \
                                       a->buf, buf_end,                     \
                                       a->buf_history, a->len_history,      \
                                       n_msk);                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {                 \
        __builtin_prefetch(ptr + (iterBytes * 4));                          \
        CHECK_FLOOD;                                                        \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
        m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk);  \
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn);              \
    }                                                                       \
                                                                            \
    if (ptr + 16 <= buf_end) {                                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
        ptr += 16;                                                          \
    }                                                                       \
                                                                            \
    assert(ptr + 16 > buf_end);                                             \
    if (ptr < buf_end) {                                                    \
        m256 p_mask;                                                        \
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,       \
                                       a->buf_history, a->len_history,      \
                                       n_msk);                              \
        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
        r_0 = or256(r_0, p_mask);                                           \
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
    }                                                                       \
                                                                            \
    return HWLM_SUCCESS;                                                    \
} while(0)

#endif // HAVE_AVX512

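/* Exported entry points: one pair per mask count (1..4). In this file the
 * packed (_pck) variants use the same confirm path as the unpacked ones. */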
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

#endif // HAVE_AVX2