1 | /* |
2 | * Copyright (c) 2015-2016, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Vermicelli: Intel SSE implementation. |
31 | * |
32 | * (users should include vermicelli.h) |
33 | */ |
34 | |
/* Byte width of one vector; also the alignment boundary the aligned search
 * loops assume. */
#define VERM_BOUNDARY 16
/* Vector type used by this (SSE) implementation. */
#define VERM_TYPE m128
/* Helper that broadcasts a single byte across a VERM_TYPE vector. */
#define VERM_SET_FN set16x8
38 | |
39 | static really_inline |
40 | const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, |
41 | char negate) { |
42 | assert((size_t)buf % 16 == 0); |
43 | for (; buf + 31 < buf_end; buf += 32) { |
44 | m128 data = load128(buf); |
45 | u32 z1 = movemask128(eq128(chars, data)); |
46 | m128 data2 = load128(buf + 16); |
47 | u32 z2 = movemask128(eq128(chars, data2)); |
48 | u32 z = z1 | (z2 << 16); |
49 | if (negate) { |
50 | z = ~z; |
51 | } |
52 | if (unlikely(z)) { |
53 | u32 pos = ctz32(z); |
54 | return buf + pos; |
55 | } |
56 | } |
57 | for (; buf + 15 < buf_end; buf += 16) { |
58 | m128 data = load128(buf); |
59 | u32 z = movemask128(eq128(chars, data)); |
60 | if (negate) { |
61 | z = ~z & 0xffff; |
62 | } |
63 | if (unlikely(z)) { |
64 | u32 pos = ctz32(z); |
65 | return buf + pos; |
66 | } |
67 | } |
68 | return NULL; |
69 | } |
70 | |
71 | static really_inline |
72 | const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, |
73 | const u8 *buf_end, char negate) { |
74 | assert((size_t)buf % 16 == 0); |
75 | m128 casemask = set16x8(CASE_CLEAR); |
76 | |
77 | for (; buf + 31 < buf_end; buf += 32) { |
78 | m128 data = load128(buf); |
79 | u32 z1 = movemask128(eq128(chars, and128(casemask, data))); |
80 | m128 data2 = load128(buf + 16); |
81 | u32 z2 = movemask128(eq128(chars, and128(casemask, data2))); |
82 | u32 z = z1 | (z2 << 16); |
83 | if (negate) { |
84 | z = ~z; |
85 | } |
86 | if (unlikely(z)) { |
87 | u32 pos = ctz32(z); |
88 | return buf + pos; |
89 | } |
90 | } |
91 | |
92 | for (; buf + 15 < buf_end; buf += 16) { |
93 | m128 data = load128(buf); |
94 | u32 z = movemask128(eq128(chars, and128(casemask, data))); |
95 | if (negate) { |
96 | z = ~z & 0xffff; |
97 | } |
98 | if (unlikely(z)) { |
99 | u32 pos = ctz32(z); |
100 | return buf + pos; |
101 | } |
102 | } |
103 | return NULL; |
104 | } |
105 | |
106 | // returns NULL if not found |
107 | static really_inline |
108 | const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { |
109 | m128 data = loadu128(buf); // unaligned |
110 | u32 z = movemask128(eq128(chars, data)); |
111 | if (negate) { |
112 | z = ~z & 0xffff; |
113 | } |
114 | if (unlikely(z)) { |
115 | return buf + ctz32(z); |
116 | } |
117 | return NULL; |
118 | } |
119 | |
120 | // returns NULL if not found |
121 | static really_inline |
122 | const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { |
123 | m128 casemask = set16x8(CASE_CLEAR); |
124 | m128 data = loadu128(buf); // unaligned |
125 | u32 z = movemask128(eq128(chars, and128(casemask, data))); |
126 | if (negate) { |
127 | z = ~z & 0xffff; |
128 | } |
129 | if (unlikely(z)) { |
130 | return buf + ctz32(z); |
131 | } |
132 | return NULL; |
133 | } |
134 | |
135 | static really_inline |
136 | const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, |
137 | const u8 *buf, const u8 *buf_end) { |
138 | for (; buf + 16 < buf_end; buf += 16) { |
139 | m128 data = load128(buf); |
140 | u32 z = movemask128(and128(eq128(chars1, data), |
141 | rshiftbyte_m128(eq128(chars2, data), 1))); |
142 | if (buf[15] == c1 && buf[16] == c2) { |
143 | z |= (1 << 15); |
144 | } |
145 | if (unlikely(z)) { |
146 | u32 pos = ctz32(z); |
147 | return buf + pos; |
148 | } |
149 | } |
150 | |
151 | return NULL; |
152 | } |
153 | |
/* Caseless double-vermicelli: find the first position in [buf, buf_end) where
 * the case-cleared pair matches (c1, c2). chars1/chars2 presumably hold the
 * (already case-cleared) c1/c2 broadcast across a vector — mirrored by the
 * scalar boundary check below. Requires buf 16-byte aligned. Returns a
 * pointer to the first byte of the pair, or NULL. */
static really_inline
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
                                   const u8 *buf, const u8 *buf_end) {
    assert((size_t)buf % 16 == 0);
    m128 casemask = set16x8(CASE_CLEAR);

    for (; buf + 16 < buf_end; buf += 16) {
        m128 data = load128(buf);
        m128 v = and128(casemask, data); /* clear the case bit of every byte */
        /* shift second-char matches down one byte so a pair match is a set
         * bit at the pair's first byte */
        u32 z = movemask128(and128(eq128(chars1, v),
                            rshiftbyte_m128(eq128(chars2, v), 1)));
        /* the shifted compare cannot see a pair straddling this vector's last
         * byte; test scalar-wise (buf[16] is in bounds: buf + 16 < buf_end) */
        if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
            z |= (1 << 15);
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }

    return NULL;
}
176 | |
/* Masked double-vermicelli: find the first position in [buf, buf_end) where
 * (buf[i] & m1) == c1 and (buf[i+1] & m2) == c2. chars1/chars2 and
 * mask1/mask2 presumably hold c1/c2 and m1/m2 broadcast across vectors —
 * mirrored by the scalar boundary check below. Requires buf 16-byte aligned.
 * Returns a pointer to the first byte of the pair, or NULL. */
static really_inline
const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2,
                                   m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1,
                                   u8 m2, const u8 *buf, const u8 *buf_end) {
    assert((size_t)buf % 16 == 0);

    for (; buf + 16 < buf_end; buf += 16) {
        m128 data = load128(buf);
        m128 v1 = eq128(chars1, and128(data, mask1));
        m128 v2 = eq128(chars2, and128(data, mask2));
        /* shift second-char matches down one byte so a pair match is a set
         * bit at the pair's first byte */
        u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));

        /* the shifted compare cannot see a pair straddling this vector's last
         * byte; test scalar-wise (buf[16] is in bounds: buf + 16 < buf_end) */
        if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
            z |= (1 << 15);
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }

    return NULL;
}
200 | |
201 | // returns NULL if not found |
202 | static really_inline |
203 | const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { |
204 | m128 data = loadu128(buf); // unaligned |
205 | u32 z = movemask128(and128(eq128(chars1, data), |
206 | rshiftbyte_m128(eq128(chars2, data), 1))); |
207 | |
208 | /* no fixup of the boundary required - the aligned run will pick it up */ |
209 | if (unlikely(z)) { |
210 | u32 pos = ctz32(z); |
211 | return buf + pos; |
212 | } |
213 | return NULL; |
214 | } |
215 | |
216 | // returns NULL if not found |
217 | static really_inline |
218 | const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { |
219 | /* due to laziness, nonalphas and nocase having interesting behaviour */ |
220 | m128 casemask = set16x8(CASE_CLEAR); |
221 | m128 data = loadu128(buf); // unaligned |
222 | m128 v = and128(casemask, data); |
223 | u32 z = movemask128(and128(eq128(chars1, v), |
224 | rshiftbyte_m128(eq128(chars2, v), 1))); |
225 | |
226 | /* no fixup of the boundary required - the aligned run will pick it up */ |
227 | if (unlikely(z)) { |
228 | u32 pos = ctz32(z); |
229 | return buf + pos; |
230 | } |
231 | return NULL; |
232 | } |
233 | |
234 | // returns NULL if not found |
235 | static really_inline |
236 | const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, |
237 | m128 mask1, m128 mask2, const u8 *buf) { |
238 | m128 data = loadu128(buf); // unaligned |
239 | m128 v1 = eq128(chars1, and128(data, mask1)); |
240 | m128 v2 = eq128(chars2, and128(data, mask2)); |
241 | u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); |
242 | |
243 | /* no fixup of the boundary required - the aligned run will pick it up */ |
244 | if (unlikely(z)) { |
245 | u32 pos = ctz32(z); |
246 | return buf + pos; |
247 | } |
248 | return NULL; |
249 | } |
250 | |
/* Given a non-zero match mask 'z' for the 16 bytes ending at buf_end (bit i
 * corresponds to byte buf_end - 16 + i), return a pointer to the byte of the
 * highest set bit, i.e. the rightmost match. 31 - clz32(z) is the index of
 * the most significant set bit of the 32-bit value. */
static really_inline
const u8 *lastMatchOffset(const u8 *buf_end, u32 z) {
    assert(z);
    return buf_end - 16 + 31 - clz32(z);
}
256 | |
257 | static really_inline |
258 | const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, |
259 | char negate) { |
260 | assert((size_t)buf_end % 16 == 0); |
261 | for (; buf + 15 < buf_end; buf_end -= 16) { |
262 | m128 data = load128(buf_end - 16); |
263 | u32 z = movemask128(eq128(chars, data)); |
264 | if (negate) { |
265 | z = ~z & 0xffff; |
266 | } |
267 | if (unlikely(z)) { |
268 | return lastMatchOffset(buf_end, z); |
269 | } |
270 | } |
271 | return NULL; |
272 | } |
273 | |
274 | static really_inline |
275 | const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, |
276 | const u8 *buf_end, char negate) { |
277 | assert((size_t)buf_end % 16 == 0); |
278 | m128 casemask = set16x8(CASE_CLEAR); |
279 | |
280 | for (; buf + 15 < buf_end; buf_end -= 16) { |
281 | m128 data = load128(buf_end - 16); |
282 | u32 z = movemask128(eq128(chars, and128(casemask, data))); |
283 | if (negate) { |
284 | z = ~z & 0xffff; |
285 | } |
286 | if (unlikely(z)) { |
287 | return lastMatchOffset(buf_end, z); |
288 | } |
289 | } |
290 | return NULL; |
291 | } |
292 | |
293 | // returns NULL if not found |
294 | static really_inline |
295 | const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { |
296 | m128 data = loadu128(buf); // unaligned |
297 | u32 z = movemask128(eq128(chars, data)); |
298 | if (negate) { |
299 | z = ~z & 0xffff; |
300 | } |
301 | if (unlikely(z)) { |
302 | return lastMatchOffset(buf + 16, z); |
303 | } |
304 | return NULL; |
305 | } |
306 | |
307 | // returns NULL if not found |
308 | static really_inline |
309 | const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { |
310 | m128 casemask = set16x8(CASE_CLEAR); |
311 | m128 data = loadu128(buf); // unaligned |
312 | u32 z = movemask128(eq128(chars, and128(casemask, data))); |
313 | if (negate) { |
314 | z = ~z & 0xffff; |
315 | } |
316 | if (unlikely(z)) { |
317 | return lastMatchOffset(buf + 16, z); |
318 | } |
319 | return NULL; |
320 | } |
321 | |
/* Reverse double-vermicelli: scan [buf, buf_end) backwards for the LAST
 * occurrence of the pair (c1, c2). chars1/chars2 presumably hold c1/c2
 * broadcast across vectors (mirrored by the scalar check below). Requires
 * buf_end 16-byte aligned. Because the chars1 match vector is shifted UP a
 * byte, match bits land on the pair's SECOND byte, so on a hit the returned
 * pointer addresses the second byte of the pair. If no pair is found,
 * returns the decremented buf_end (the boundary the scan reached) rather
 * than NULL — NOTE(review): callers appear to rely on this sentinel;
 * confirm against call sites. */
static really_inline
const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
                              const u8 *buf, const u8 *buf_end) {
    assert((size_t)buf_end % 16 == 0);

    for (; buf + 16 < buf_end; buf_end -= 16) {
        m128 data = load128(buf_end - 16);
        /* shift first-char matches up one byte so a pair match is a set bit
         * at the pair's second byte */
        u32 z = movemask128(and128(eq128(chars2, data),
                                   lshiftbyte_m128(eq128(chars1, data), 1)));
        /* the shift cannot see a pair straddling this vector's first byte;
         * test scalar-wise (buf_end[-17] is in bounds: buf + 16 < buf_end) */
        if (buf_end[-17] == c1 && buf_end[-16] == c2) {
            z |= 1;
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return buf_end;
}
340 | |
/* Caseless reverse double-vermicelli: scan [buf, buf_end) backwards for the
 * LAST case-cleared occurrence of the pair (c1, c2). Requires buf_end
 * 16-byte aligned. Match bits land on the pair's SECOND byte (the chars1
 * matches are shifted up a byte), so a hit returns a pointer to the pair's
 * second byte. If no pair is found, returns the decremented buf_end (the
 * boundary the scan reached), not NULL. */
static really_inline
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
                                    const u8 *buf, const u8 *buf_end) {
    assert((size_t)buf_end % 16 == 0);
    m128 casemask = set16x8(CASE_CLEAR);

    for (; buf + 16 < buf_end; buf_end -= 16) {
        m128 data = load128(buf_end - 16);
        m128 v = and128(casemask, data); /* clear the case bit of every byte */
        u32 z = movemask128(and128(eq128(chars2, v),
                                   lshiftbyte_m128(eq128(chars1, v), 1)));
        /* the shift cannot see a pair straddling this vector's first byte;
         * test scalar-wise (buf_end[-17] is in bounds: buf + 16 < buf_end) */
        if ((buf_end[-17] & CASE_CLEAR) == c1
            && (buf_end[-16] & CASE_CLEAR) == c2) {
            z |= 1;
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return buf_end;
}
362 | |
363 | // returns NULL if not found |
364 | static really_inline |
365 | const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { |
366 | m128 data = loadu128(buf); |
367 | u32 z = movemask128(and128(eq128(chars2, data), |
368 | lshiftbyte_m128(eq128(chars1, data), 1))); |
369 | |
370 | /* no fixup of the boundary required - the aligned run will pick it up */ |
371 | if (unlikely(z)) { |
372 | return lastMatchOffset(buf + 16, z); |
373 | } |
374 | |
375 | return NULL; |
376 | } |
377 | |
378 | // returns NULL if not found |
379 | static really_inline |
380 | const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { |
381 | /* due to laziness, nonalphas and nocase having interesting behaviour */ |
382 | m128 casemask = set16x8(CASE_CLEAR); |
383 | m128 data = loadu128(buf); |
384 | m128 v = and128(casemask, data); |
385 | u32 z = movemask128(and128(eq128(chars2, v), |
386 | lshiftbyte_m128(eq128(chars1, v), 1))); |
387 | /* no fixup of the boundary required - the aligned run will pick it up */ |
388 | if (unlikely(z)) { |
389 | return lastMatchOffset(buf + 16, z); |
390 | } |
391 | |
392 | return NULL; |
393 | } |
394 | |