/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* noodle scan parts for SSE */

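/*
 * The SINGLE_ZSCAN()/DOUBLE_ZSCAN() macros, struct cb_info and the HWLM_*
 * return codes used below are expected to be provided by the file that
 * #includes this one (noodle_engine.c in Hyperscan); they walk the set bits
 * of the match mask 'z' and report hits through the callback info.
 */

/*
 * Build a vector with the search character splatted into all 16 byte lanes,
 * clearing the case bit first when matching case-insensitively.
 */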
static really_inline m128 getMask(u8 c, bool noCase) {
    u8 k = caseClear8(c, noCase);
    return set16x8(k);
}

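/* Mask that clears the ASCII case bit (0x20) in every byte lane, used to
 * fold lower case onto upper case for caseless matching. */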
static really_inline m128 getCaseMask(void) {
    return set16x8(0xdf);
}

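/*
 * Scan a short region (at most 16 bytes) for a single-byte literal. The
 * bytes are copied into a zeroed vector so a full 16-byte compare can be
 * used, then the result is masked down to the valid length.
 */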
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
                             size_t len, bool noCase, m128 caseMask, m128 mask1,
                             const struct cb_info *cbi, size_t start,
                             size_t end) {
    const u8 *d = buf + start;
    size_t l = end - start;
    DEBUG_PRINTF("l %zu\n", l);
    assert(l <= 16);
    if (!l) {
        return HWLM_SUCCESS;
    }
    m128 v = zeroes128();
    // we don't have a clever way of doing this move yet
    memcpy(&v, d, l);
    if (noCase) {
        v = and128(v, caseMask);
    }

    // mask out where we can't match
    u32 mask = (0xFFFF >> (16 - l));

    u32 z = mask & movemask128(eq128(mask1, v));

    SINGLE_ZSCAN();

    return HWLM_SUCCESS;
}

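/*
 * Scan for a single-byte literal with one unaligned 16-byte load starting at
 * 'offset'; matches outside the [start, end) window are masked off.
 */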
static really_inline
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
                                 size_t len, size_t offset, bool noCase,
                                 m128 caseMask, m128 mask1,
                                 const struct cb_info *cbi, size_t start,
                                 size_t end) {
    const u8 *d = buf + offset;
    DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
    const size_t l = end - start;

    m128 v = loadu128(d);

    if (noCase) {
        v = and128(v, caseMask);
    }

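    // Only bits for bytes inside [start, end) may report a match: build an
    // l-bit window shifted up to the start position within the loaded block.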
    u32 buf_off = start - offset;
    u32 mask = ((1 << l) - 1) << buf_off;

    u32 z = mask & movemask128(eq128(mask1, v));

    DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);

    z &= mask;

    SINGLE_ZSCAN();

    return HWLM_SUCCESS;
}

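/*
 * Scan a short region (at most 16 bytes) for a two-byte literal. As in
 * scanSingleShort, the bytes are copied into a zeroed vector and the match
 * mask is clipped to the valid length.
 */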
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
                             size_t len, bool noCase, m128 caseMask, m128 mask1,
                             m128 mask2, const struct cb_info *cbi,
                             size_t start, size_t end) {
    const u8 *d = buf + start;
    size_t l = end - start;
    if (!l) {
        return HWLM_SUCCESS;
    }
    assert(l <= 16); // v below is only 16 bytes wide

    DEBUG_PRINTF("d %zu\n", d - buf);
    m128 v = zeroes128();
    memcpy(&v, d, l);
    if (noCase) {
        v = and128(v, caseMask);
    }

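    // Shift the first-byte match bits up by one byte so that a set bit in z
    // means both bytes of the literal matched at consecutive positions.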
    u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
                               eq128(mask2, v)));

    // mask out where we can't match
    u32 mask = (0xFFFF >> (16 - l));
    z &= mask;

    DOUBLE_ZSCAN();

    return HWLM_SUCCESS;
}

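/*
 * Scan for a two-byte literal with one unaligned 16-byte load, masking the
 * result down to the [start, end) window as in scanSingleUnaligned.
 */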
static really_inline
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
                                 size_t len, size_t offset, bool noCase,
                                 m128 caseMask, m128 mask1, m128 mask2,
                                 const struct cb_info *cbi, size_t start,
                                 size_t end) {
    const u8 *d = buf + offset;
    DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
    size_t l = end - start;

    m128 v = loadu128(d);

    if (noCase) {
        v = and128(v, caseMask);
    }

    u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
                               eq128(mask2, v)));

    // mask out where we can't match
    u32 buf_off = start - offset;
    u32 mask = ((1 << l) - 1) << buf_off;
    DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
    z &= mask;

    DOUBLE_ZSCAN();

    return HWLM_SUCCESS;
}

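/*
 * Main single-byte loop over whole 16-byte blocks. load128 performs aligned
 * loads, so the callers are expected to hand this function aligned offsets
 * and to cover the buffer edges with the short/unaligned variants above.
 */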
static really_inline
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
                            size_t len, bool noCase, m128 caseMask, m128 mask1,
                            const struct cb_info *cbi, size_t start,
                            size_t end) {
    const u8 *d = buf + start, *e = buf + end;
    assert(d < e);

    for (; d < e; d += 16) {
        m128 v = noCase ? and128(load128(d), caseMask) : load128(d);

        u32 z = movemask128(eq128(mask1, v));

        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 128);

        SINGLE_ZSCAN();
    }
    return HWLM_SUCCESS;
}

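/*
 * Main two-byte loop over whole 16-byte blocks; literals that straddle a
 * block boundary are handled by carrying state between iterations.
 */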
static really_inline
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
                            size_t len, bool noCase, m128 caseMask, m128 mask1,
                            m128 mask2, const struct cb_info *cbi, size_t start,
                            size_t end) {
    const u8 *d = buf + start, *e = buf + end;
    assert(d < e);
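    // lastz1 carries the previous block's first-byte match vector; the
    // palignr below pulls in its top byte so a pair split across two blocks
    // is still detected.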
    m128 lastz1 = zeroes128();

    for (; d < e; d += 16) {
        m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
        m128 z1 = eq128(mask1, v);
        m128 z2 = eq128(mask2, v);
        u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2));
        lastz1 = z1;

        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 128);
        DEBUG_PRINTF("z 0x%08x\n", z);
        DOUBLE_ZSCAN();
    }
    return HWLM_SUCCESS;
}