/*
 * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: AVX2 engine runtime.
 */

#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/arch.h"
#include "util/simd_utils.h"

#if defined(HAVE_AVX2)

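/*
 * Partial-load mask table for cautious 256-bit loads near buffer boundaries.
 * In the mask OR'd into the Teddy result, 0x00 keeps a lane that lies inside
 * the buffer while 0xff forces "no match" in a lane that falls outside it;
 * vectoredLoad256() selects the row/offset according to how much of the
 * 32-byte window is actually valid.
 */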
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};

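/*
 * Confirm one chunk of a Teddy result. All-ones means "no candidate in any
 * bucket"; anything else is inverted (so set bits mark candidates) and handed
 * to the confirm function, stopping early if matching has been terminated.
 */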
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
    do { \
        if (unlikely(chunk != ones_u64a)) { \
            chunk = ~chunk; \
            conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
                    &control, &last_match); \
            CHECK_HWLM_TERMINATE_MATCHING; \
        } \
    } while(0)

#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
    do { \
        if (unlikely(chunk != ones_u32)) { \
            chunk = ~chunk; \
            conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
                    &control, &last_match); \
            CHECK_HWLM_TERMINATE_MATCHING; \
        } \
    } while(0)

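/* The fat nibble masks are stored immediately after the Teddy header,
 * rounded up to a cache line boundary. */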
static really_inline
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
    return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}

#if defined(HAVE_AVX512)

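/* The reinforced mask tables (indexed by byte value, plus one extra entry)
 * follow the 2 * numMask nibble masks, again rounded up to a cache line. */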
static really_inline
const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) {
    return (const u64a *)((const u8 *)getMaskBase_fat(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
}

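/*
 * Confirm a 512-bit fat Teddy result. Interleaving the result with its
 * swapped 256-bit halves places the two result bytes for each input position
 * (buckets 0..7 and buckets 8..15) next to each other, so each 64-bit chunk
 * covers four positions (two positions per 32-bit chunk on 32-bit targets).
 */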
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
    do { \
        if (unlikely(diff512(var, ones512()))) { \
            m512 swap = swap256in512(var); \
            m512 r = interleave512lo(var, swap); \
            m128 r0 = extract128from512(r, 0); \
            m128 r1 = extract128from512(r, 1); \
            u64a part1 = movq(r0); \
            u64a part2 = extract64from128(r0, 1); \
            u64a part5 = movq(r1); \
            u64a part6 = extract64from128(r1, 1); \
            r = interleave512hi(var, swap); \
            r0 = extract128from512(r, 0); \
            r1 = extract128from512(r, 1); \
            u64a part3 = movq(r0); \
            u64a part4 = extract64from128(r0, 1); \
            u64a part7 = movq(r1); \
            u64a part8 = extract64from128(r1, 1); \
            CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn); \
        } \
    } while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
    do { \
        if (unlikely(diff512(var, ones512()))) { \
            m512 swap = swap256in512(var); \
            m512 r = interleave512lo(var, swap); \
            m128 r0 = extract128from512(r, 0); \
            m128 r1 = extract128from512(r, 1); \
            u32 part1 = movd(r0); \
            u32 part2 = extract32from128(r0, 1); \
            u32 part3 = extract32from128(r0, 2); \
            u32 part4 = extract32from128(r0, 3); \
            u32 part9 = movd(r1); \
            u32 part10 = extract32from128(r1, 1); \
            u32 part11 = extract32from128(r1, 2); \
            u32 part12 = extract32from128(r1, 3); \
            r = interleave512hi(var, swap); \
            r0 = extract128from512(r, 0); \
            r1 = extract128from512(r, 1); \
            u32 part5 = movd(r0); \
            u32 part6 = extract32from128(r0, 1); \
            u32 part7 = extract32from128(r0, 2); \
            u32 part8 = extract32from128(r0, 3); \
            u32 part13 = movd(r1); \
            u32 part14 = extract32from128(r1, 1); \
            u32 part15 = extract32from128(r1, 2); \
            u32 part16 = extract32from128(r1, 3); \
            CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn); \
        } \
    } while(0)
#endif

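/* Cautious load for the AVX512 fat path: load up to 32 bytes (masked at the
 * buffer boundaries) and broadcast both the data and its partial-load mask
 * into each 256-bit half of a 512-bit vector. */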
static really_inline
m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset,
                       const u8 *lo, const u8 *hi,
                       const u8 *buf_history, size_t len_history,
                       const u32 nMasks) {
    m256 p_mask256;
    m512 ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi,
                                        buf_history, len_history, nMasks));
    *p_mask = set2x256(p_mask256);
    return ret;
}

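/*
 * Split the (duplicated) input into lo/hi nibbles for the shuffle lookups.
 * The reinforced variant additionally loads 32 fresh bytes at ptr and builds
 * a reinforcement mask from the boundary byte values: c_16 is byte 15 of the
 * current block and c_0 is byte 31 of the previous one (the 0x100 initial
 * value selects the tables' extra entry for the first block).
 */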
#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val) \
    m512 lo = and512(val, *lo_mask); \
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)

#define PREP_FAT_SHUF_MASK \
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr))); \
    *c_16 = *(ptr + 15); \
    m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16], \
                           0ULL, r_msk_base_hi[*c_0], \
                           0ULL, r_msk_base_lo[*c_16], \
                           0ULL, r_msk_base_lo[*c_0]); \
    *c_0 = *(ptr + 31)

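/* Shift-or accumulation: each additional Teddy mask is looked up on the same
 * nibbles, shifted left by one more byte within each 128-bit lane, and OR'd
 * into the running result. */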
#define FAT_SHIFT_OR_M1 \
    or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))

#define FAT_SHIFT_OR_M2 \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \
                               pshufb_m512(dup_mask[3], hi)), \
                         1), FAT_SHIFT_OR_M1)

#define FAT_SHIFT_OR_M3 \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \
                               pshufb_m512(dup_mask[5], hi)), \
                         2), FAT_SHIFT_OR_M2)

#define FAT_SHIFT_OR_M4 \
    or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \
                               pshufb_m512(dup_mask[7], hi)), \
                         3), FAT_SHIFT_OR_M3)

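/* prep_conf_fat_teddy_no_reinforcement_m{1..4}: build the fat Teddy result
 * for an already-loaded (and possibly masked) value; used on the cautious
 * head and tail blocks, which skip reinforcement. */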
static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M1;
}

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M2;
}

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M3;
}

static really_inline
m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask,
                                             const m512 *dup_mask,
                                             const m512 val) {
    PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
    return FAT_SHIFT_OR_M4;
}

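/* prep_conf_fat_teddy_m{1..4}: load 32 bytes at ptr, build the fat Teddy
 * result and OR in the reinforcement mask; used on the aligned main path. */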
static really_inline
m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M1, r_msk);
}

static really_inline
m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M2, r_msk);
}

static really_inline
m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M3, r_msk);
}

static really_inline
m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
                            const u8 *ptr, const u64a *r_msk_base_lo,
                            const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
    PREP_FAT_SHUF_MASK;
    return or512(FAT_SHIFT_OR_M4, r_msk);
}

#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n) \
    prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)

#define PREP_CONF_FAT_FN(ptr, n) \
    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr, \
                             r_msk_base_lo, r_msk_base_hi, &c_0, &c_16)

/*
 * In FAT Teddy, each input position needs 2 bytes to hold its result, so the
 * FAT Teddy mask for each nibble (for example, the lo nibble of the last
 * byte) is 16x2 bytes wide:
 * |----------------------------------|----------------------------------|
 *  16 bytes (buckets 0..7 per byte)    16 bytes (buckets 8..15 per byte)
 *                  A                                    B
 * At runtime, FAT Teddy reads 16 bytes at a time and duplicates them into
 * 32 bytes:
 * |----------------------------------|----------------------------------|
 *  16 bytes of input data (lo nibbles) 16 bytes duplicated (lo nibbles)
 *                  X                                    X
 * and then performs pshufb_m256(AB, XX).
 *
 * Reinforced FAT Teddy on AVX512 instead reads 32 bytes at a time and
 * duplicates them into 64 bytes:
 * |----------------|----------------|----------------|----------------|
 *         X                Y                X                Y
 * In this case DUP_FAT_MASK is needed to construct the AABB layout:
 * |----------------|----------------|----------------|----------------|
 *         A                A                B                B
 * so that pshufb_m512(AABB, XYXY) can be used.
 */

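/* Turn a 256-bit fat mask with lanes A|B into the A|A|B|B layout described
 * above: broadcast the swapped mask (B|A) into both halves, then merge the
 * original back in under the 64-bit element mask 0xC3. */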
#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a)

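/* Pre-duplicate all 2 * n_msk nibble masks into AABB form, once per call. */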
#define PREPARE_FAT_MASKS_1 \
    dup_mask[0] = DUP_FAT_MASK(maskBase[0]); \
    dup_mask[1] = DUP_FAT_MASK(maskBase[1]);

#define PREPARE_FAT_MASKS_2 \
    PREPARE_FAT_MASKS_1 \
    dup_mask[2] = DUP_FAT_MASK(maskBase[2]); \
    dup_mask[3] = DUP_FAT_MASK(maskBase[3]);

#define PREPARE_FAT_MASKS_3 \
    PREPARE_FAT_MASKS_2 \
    dup_mask[4] = DUP_FAT_MASK(maskBase[4]); \
    dup_mask[5] = DUP_FAT_MASK(maskBase[5]);

#define PREPARE_FAT_MASKS_4 \
    PREPARE_FAT_MASKS_3 \
    dup_mask[6] = DUP_FAT_MASK(maskBase[6]); \
    dup_mask[7] = DUP_FAT_MASK(maskBase[7]);

#define PREPARE_FAT_MASKS(n) \
    m512 lo_mask = set64x8(0xf); \
    m512 dup_mask[n * 2]; \
    PREPARE_FAT_MASKS_##n

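/*
 * Main driver for the AVX512 fat path: a cautious (vectored) head block, an
 * aligned main loop handling 64 bytes per iteration with flood control, then
 * a cautious tail block. Instantiated once per mask count below.
 */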
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
    do { \
        const u8 *buf_end = a->buf + a->len; \
        const u8 *ptr = a->buf + a->start_offset; \
        u32 floodBackoff = FLOOD_BACKOFF_START; \
        const u8 *tryFloodDetect = a->firstFloodDetect; \
        u32 last_match = ones_u32; \
        const struct Teddy *teddy = (const struct Teddy *)fdr; \
        const size_t iterBytes = 64; \
        DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
                     a->buf, a->len, a->start_offset); \
        \
        const m256 *maskBase = getMaskBase_fat(teddy); \
        PREPARE_FAT_MASKS(n_msk); \
        const u32 *confBase = getConfBase(teddy); \
        \
        const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk); \
        const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1); \
        u32 c_0 = 0x100; \
        u32 c_16 = 0x100; \
        const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \
        DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
        if (ptr < mainStart) { \
            ptr = mainStart - 32; \
            m512 p_mask; \
            m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset, \
                                           a->buf, buf_end, \
                                           a->buf_history, a->len_history, n_msk); \
            m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \
            r_0 = or512(r_0, p_mask); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
            ptr += 32; \
        } \
        \
        if (ptr + 32 <= buf_end) { \
            m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
            ptr += 32; \
        } \
        \
        for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
            __builtin_prefetch(ptr + (iterBytes * 4)); \
            CHECK_FLOOD; \
            m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
            m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk); \
            CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn); \
        } \
        \
        if (ptr + 32 <= buf_end) { \
            m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
            ptr += 32; \
        } \
        \
        assert(ptr + 32 > buf_end); \
        if (ptr < buf_end) { \
            m512 p_mask; \
            m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end, \
                                           a->buf_history, a->len_history, n_msk); \
            m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \
            r_0 = or512(r_0, p_mask); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
        } \
        \
        return HWLM_SUCCESS; \
    } while(0)

#else // HAVE_AVX512

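/*
 * Confirm a 256-bit fat Teddy result (AVX2-only build). Interleaving with
 * the swapped 128-bit halves pairs up the two result bytes for each input
 * position before the chunks are extracted and confirmed.
 */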
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
    do { \
        if (unlikely(diff256(var, ones256()))) { \
            m256 swap = swap128in256(var); \
            m256 r = interleave256lo(var, swap); \
            u64a part1 = extractlow64from256(r); \
            u64a part2 = extract64from256(r, 1); \
            r = interleave256hi(var, swap); \
            u64a part3 = extractlow64from256(r); \
            u64a part4 = extract64from256(r, 1); \
            CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
            CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
        } \
    } while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
    do { \
        if (unlikely(diff256(var, ones256()))) { \
            m256 swap = swap128in256(var); \
            m256 r = interleave256lo(var, swap); \
            u32 part1 = extractlow32from256(r); \
            u32 part2 = extract32from256(r, 1); \
            u32 part3 = extract32from256(r, 2); \
            u32 part4 = extract32from256(r, 3); \
            r = interleave256hi(var, swap); \
            u32 part5 = extractlow32from256(r); \
            u32 part6 = extract32from256(r, 1); \
            u32 part7 = extract32from256(r, 2); \
            u32 part8 = extract32from256(r, 3); \
            CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
            CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
        } \
    } while(0)
#endif

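/* Cautious load for the AVX2 fat path: load up to 16 bytes (masked at the
 * buffer boundaries) and broadcast the data and its partial-load mask into
 * both 128-bit halves of a 256-bit vector. */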
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                       const u8 *lo, const u8 *hi,
                       const u8 *buf_history, size_t len_history,
                       const u32 nMasks) {
    m128 p_mask128;
    m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
                                        buf_history, len_history, nMasks));
    *p_mask = set2x128(p_mask128);
    return ret;
}

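/* prep_conf_fat_teddy_m{1..4}: shuffle the lo/hi nibbles of the duplicated
 * input against each pair of fat masks; results for masks beyond the first
 * are aligned against the previous block's result (old_N) before OR-ing. */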
static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    return or256(pshufb_m256(maskBase[0 * 2], lo),
                 pshufb_m256(maskBase[0 * 2 + 1], hi));
}

static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    m256 r = prep_conf_fat_teddy_m1(maskBase, val);

    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
                       pshufb_m256(maskBase[1 * 2 + 1], hi));
    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
    *old_1 = res_1;
    return or256(r, res_shifted_1);
}

static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
                            m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);

    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
                       pshufb_m256(maskBase[2 * 2 + 1], hi));
    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
    *old_2 = res_2;
    return or256(r, res_shifted_2);
}

static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
                            m256 *old_3, m256 val) {
    m256 mask = set32x8(0xf);
    m256 lo = and256(val, mask);
    m256 hi = and256(rshift64_m256(val, 4), mask);
    m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);

    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
                       pshufb_m256(maskBase[3 * 2 + 1], hi));
    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
    *old_3 = res_3;
    return or256(r, res_shifted_3);
}

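/* Declare the old_N carry state required for n_msk masks (none for M1). */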
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
    do { \
    } while(0)

#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \
    m256 res_old_1 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \
    m256 res_old_1 = zeroes256(); \
    m256 res_old_2 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \
    m256 res_old_1 = zeroes256(); \
    m256 res_old_2 = zeroes256(); \
    m256 res_old_3 = zeroes256();

#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n

#define PREP_CONF_FAT_FN_1(mask_base, val) \
    prep_conf_fat_teddy_m1(mask_base, val)

#define PREP_CONF_FAT_FN_2(mask_base, val) \
    prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)

#define PREP_CONF_FAT_FN_3(mask_base, val) \
    prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)

#define PREP_CONF_FAT_FN_4(mask_base, val) \
    prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)

#define PREP_CONF_FAT_FN(mask_base, val, n) \
    PREP_CONF_FAT_FN_##n(mask_base, val)

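/*
 * Main driver for the AVX2 fat path: a cautious (vectored) head block, an
 * aligned main loop handling 32 bytes per iteration with flood control, then
 * a cautious tail block. Instantiated once per mask count below.
 */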
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
    do { \
        const u8 *buf_end = a->buf + a->len; \
        const u8 *ptr = a->buf + a->start_offset; \
        u32 floodBackoff = FLOOD_BACKOFF_START; \
        const u8 *tryFloodDetect = a->firstFloodDetect; \
        u32 last_match = ones_u32; \
        const struct Teddy *teddy = (const struct Teddy *)fdr; \
        const size_t iterBytes = 32; \
        DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
                     a->buf, a->len, a->start_offset); \
        \
        const m256 *maskBase = getMaskBase_fat(teddy); \
        const u32 *confBase = getConfBase(teddy); \
        \
        FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \
        const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
        DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
        if (ptr < mainStart) { \
            ptr = mainStart - 16; \
            m256 p_mask; \
            m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \
                                           a->buf, buf_end, \
                                           a->buf_history, a->len_history, \
                                           n_msk); \
            m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
            r_0 = or256(r_0, p_mask); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
            ptr += 16; \
        } \
        \
        if (ptr + 16 <= buf_end) { \
            m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
            ptr += 16; \
        } \
        \
        for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
            __builtin_prefetch(ptr + (iterBytes * 4)); \
            CHECK_FLOOD; \
            m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
            m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \
            CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \
        } \
        \
        if (ptr + 16 <= buf_end) { \
            m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
            ptr += 16; \
        } \
        \
        assert(ptr + 16 > buf_end); \
        if (ptr < buf_end) { \
            m256 p_mask; \
            m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \
                                           a->buf_history, a->len_history, \
                                           n_msk); \
            m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
            r_0 = or256(r_0, p_mask); \
            CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
        } \
        \
        return HWLM_SUCCESS; \
    } while(0)

#endif // HAVE_AVX512

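/* Exported entry points: the fat Teddy driver is expanded once per mask
 * count (1-4), in plain and _pck variants. */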
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}

#endif // HAVE_AVX2