1/*
2 * Copyright (c) 2015-2016, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Vermicelli: Intel SSE implementation.
31 *
32 * (users should include vermicelli.h)
33 */
34
/* Engine parameters for the SSE (128-bit) vermicelli implementation;
 * consumed by the generic code in vermicelli.h. */
#define VERM_BOUNDARY 16        /* bytes handled per aligned vector */
#define VERM_TYPE m128          /* vector type used by this implementation */
#define VERM_SET_FN set16x8     /* splat a byte across the vector */
38
39static really_inline
40const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
41 char negate) {
42 assert((size_t)buf % 16 == 0);
43 for (; buf + 31 < buf_end; buf += 32) {
44 m128 data = load128(buf);
45 u32 z1 = movemask128(eq128(chars, data));
46 m128 data2 = load128(buf + 16);
47 u32 z2 = movemask128(eq128(chars, data2));
48 u32 z = z1 | (z2 << 16);
49 if (negate) {
50 z = ~z;
51 }
52 if (unlikely(z)) {
53 u32 pos = ctz32(z);
54 return buf + pos;
55 }
56 }
57 for (; buf + 15 < buf_end; buf += 16) {
58 m128 data = load128(buf);
59 u32 z = movemask128(eq128(chars, data));
60 if (negate) {
61 z = ~z & 0xffff;
62 }
63 if (unlikely(z)) {
64 u32 pos = ctz32(z);
65 return buf + pos;
66 }
67 }
68 return NULL;
69}
70
71static really_inline
72const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
73 const u8 *buf_end, char negate) {
74 assert((size_t)buf % 16 == 0);
75 m128 casemask = set16x8(CASE_CLEAR);
76
77 for (; buf + 31 < buf_end; buf += 32) {
78 m128 data = load128(buf);
79 u32 z1 = movemask128(eq128(chars, and128(casemask, data)));
80 m128 data2 = load128(buf + 16);
81 u32 z2 = movemask128(eq128(chars, and128(casemask, data2)));
82 u32 z = z1 | (z2 << 16);
83 if (negate) {
84 z = ~z;
85 }
86 if (unlikely(z)) {
87 u32 pos = ctz32(z);
88 return buf + pos;
89 }
90 }
91
92 for (; buf + 15 < buf_end; buf += 16) {
93 m128 data = load128(buf);
94 u32 z = movemask128(eq128(chars, and128(casemask, data)));
95 if (negate) {
96 z = ~z & 0xffff;
97 }
98 if (unlikely(z)) {
99 u32 pos = ctz32(z);
100 return buf + pos;
101 }
102 }
103 return NULL;
104}
105
106// returns NULL if not found
107static really_inline
108const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
109 m128 data = loadu128(buf); // unaligned
110 u32 z = movemask128(eq128(chars, data));
111 if (negate) {
112 z = ~z & 0xffff;
113 }
114 if (unlikely(z)) {
115 return buf + ctz32(z);
116 }
117 return NULL;
118}
119
120// returns NULL if not found
121static really_inline
122const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
123 m128 casemask = set16x8(CASE_CLEAR);
124 m128 data = loadu128(buf); // unaligned
125 u32 z = movemask128(eq128(chars, and128(casemask, data)));
126 if (negate) {
127 z = ~z & 0xffff;
128 }
129 if (unlikely(z)) {
130 return buf + ctz32(z);
131 }
132 return NULL;
133}
134
135static really_inline
136const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
137 const u8 *buf, const u8 *buf_end) {
138 for (; buf + 16 < buf_end; buf += 16) {
139 m128 data = load128(buf);
140 u32 z = movemask128(and128(eq128(chars1, data),
141 rshiftbyte_m128(eq128(chars2, data), 1)));
142 if (buf[15] == c1 && buf[16] == c2) {
143 z |= (1 << 15);
144 }
145 if (unlikely(z)) {
146 u32 pos = ctz32(z);
147 return buf + pos;
148 }
149 }
150
151 return NULL;
152}
153
154static really_inline
155const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
156 const u8 *buf, const u8 *buf_end) {
157 assert((size_t)buf % 16 == 0);
158 m128 casemask = set16x8(CASE_CLEAR);
159
160 for (; buf + 16 < buf_end; buf += 16) {
161 m128 data = load128(buf);
162 m128 v = and128(casemask, data);
163 u32 z = movemask128(and128(eq128(chars1, v),
164 rshiftbyte_m128(eq128(chars2, v), 1)));
165 if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
166 z |= (1 << 15);
167 }
168 if (unlikely(z)) {
169 u32 pos = ctz32(z);
170 return buf + pos;
171 }
172 }
173
174 return NULL;
175}
176
177static really_inline
178const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2,
179 m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1,
180 u8 m2, const u8 *buf, const u8 *buf_end) {
181 assert((size_t)buf % 16 == 0);
182
183 for (; buf + 16 < buf_end; buf += 16) {
184 m128 data = load128(buf);
185 m128 v1 = eq128(chars1, and128(data, mask1));
186 m128 v2 = eq128(chars2, and128(data, mask2));
187 u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
188
189 if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
190 z |= (1 << 15);
191 }
192 if (unlikely(z)) {
193 u32 pos = ctz32(z);
194 return buf + pos;
195 }
196 }
197
198 return NULL;
199}
200
201// returns NULL if not found
202static really_inline
203const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
204 m128 data = loadu128(buf); // unaligned
205 u32 z = movemask128(and128(eq128(chars1, data),
206 rshiftbyte_m128(eq128(chars2, data), 1)));
207
208 /* no fixup of the boundary required - the aligned run will pick it up */
209 if (unlikely(z)) {
210 u32 pos = ctz32(z);
211 return buf + pos;
212 }
213 return NULL;
214}
215
216// returns NULL if not found
217static really_inline
218const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
219 /* due to laziness, nonalphas and nocase having interesting behaviour */
220 m128 casemask = set16x8(CASE_CLEAR);
221 m128 data = loadu128(buf); // unaligned
222 m128 v = and128(casemask, data);
223 u32 z = movemask128(and128(eq128(chars1, v),
224 rshiftbyte_m128(eq128(chars2, v), 1)));
225
226 /* no fixup of the boundary required - the aligned run will pick it up */
227 if (unlikely(z)) {
228 u32 pos = ctz32(z);
229 return buf + pos;
230 }
231 return NULL;
232}
233
234// returns NULL if not found
235static really_inline
236const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
237 m128 mask1, m128 mask2, const u8 *buf) {
238 m128 data = loadu128(buf); // unaligned
239 m128 v1 = eq128(chars1, and128(data, mask1));
240 m128 v2 = eq128(chars2, and128(data, mask2));
241 u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
242
243 /* no fixup of the boundary required - the aligned run will pick it up */
244 if (unlikely(z)) {
245 u32 pos = ctz32(z);
246 return buf + pos;
247 }
248 return NULL;
249}
250
251static really_inline
252const u8 *lastMatchOffset(const u8 *buf_end, u32 z) {
253 assert(z);
254 return buf_end - 16 + 31 - clz32(z);
255}
256
257static really_inline
258const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
259 char negate) {
260 assert((size_t)buf_end % 16 == 0);
261 for (; buf + 15 < buf_end; buf_end -= 16) {
262 m128 data = load128(buf_end - 16);
263 u32 z = movemask128(eq128(chars, data));
264 if (negate) {
265 z = ~z & 0xffff;
266 }
267 if (unlikely(z)) {
268 return lastMatchOffset(buf_end, z);
269 }
270 }
271 return NULL;
272}
273
274static really_inline
275const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
276 const u8 *buf_end, char negate) {
277 assert((size_t)buf_end % 16 == 0);
278 m128 casemask = set16x8(CASE_CLEAR);
279
280 for (; buf + 15 < buf_end; buf_end -= 16) {
281 m128 data = load128(buf_end - 16);
282 u32 z = movemask128(eq128(chars, and128(casemask, data)));
283 if (negate) {
284 z = ~z & 0xffff;
285 }
286 if (unlikely(z)) {
287 return lastMatchOffset(buf_end, z);
288 }
289 }
290 return NULL;
291}
292
293// returns NULL if not found
294static really_inline
295const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
296 m128 data = loadu128(buf); // unaligned
297 u32 z = movemask128(eq128(chars, data));
298 if (negate) {
299 z = ~z & 0xffff;
300 }
301 if (unlikely(z)) {
302 return lastMatchOffset(buf + 16, z);
303 }
304 return NULL;
305}
306
307// returns NULL if not found
308static really_inline
309const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
310 m128 casemask = set16x8(CASE_CLEAR);
311 m128 data = loadu128(buf); // unaligned
312 u32 z = movemask128(eq128(chars, and128(casemask, data)));
313 if (negate) {
314 z = ~z & 0xffff;
315 }
316 if (unlikely(z)) {
317 return lastMatchOffset(buf + 16, z);
318 }
319 return NULL;
320}
321
322static really_inline
323const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
324 const u8 *buf, const u8 *buf_end) {
325 assert((size_t)buf_end % 16 == 0);
326
327 for (; buf + 16 < buf_end; buf_end -= 16) {
328 m128 data = load128(buf_end - 16);
329 u32 z = movemask128(and128(eq128(chars2, data),
330 lshiftbyte_m128(eq128(chars1, data), 1)));
331 if (buf_end[-17] == c1 && buf_end[-16] == c2) {
332 z |= 1;
333 }
334 if (unlikely(z)) {
335 return lastMatchOffset(buf_end, z);
336 }
337 }
338 return buf_end;
339}
340
341static really_inline
342const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
343 const u8 *buf, const u8 *buf_end) {
344 assert((size_t)buf_end % 16 == 0);
345 m128 casemask = set16x8(CASE_CLEAR);
346
347 for (; buf + 16 < buf_end; buf_end -= 16) {
348 m128 data = load128(buf_end - 16);
349 m128 v = and128(casemask, data);
350 u32 z = movemask128(and128(eq128(chars2, v),
351 lshiftbyte_m128(eq128(chars1, v), 1)));
352 if ((buf_end[-17] & CASE_CLEAR) == c1
353 && (buf_end[-16] & CASE_CLEAR) == c2) {
354 z |= 1;
355 }
356 if (unlikely(z)) {
357 return lastMatchOffset(buf_end, z);
358 }
359 }
360 return buf_end;
361}
362
363// returns NULL if not found
364static really_inline
365const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
366 m128 data = loadu128(buf);
367 u32 z = movemask128(and128(eq128(chars2, data),
368 lshiftbyte_m128(eq128(chars1, data), 1)));
369
370 /* no fixup of the boundary required - the aligned run will pick it up */
371 if (unlikely(z)) {
372 return lastMatchOffset(buf + 16, z);
373 }
374
375 return NULL;
376}
377
378// returns NULL if not found
379static really_inline
380const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
381 /* due to laziness, nonalphas and nocase having interesting behaviour */
382 m128 casemask = set16x8(CASE_CLEAR);
383 m128 data = loadu128(buf);
384 m128 v = and128(casemask, data);
385 u32 z = movemask128(and128(eq128(chars2, v),
386 lshiftbyte_m128(eq128(chars1, v), 1)));
387 /* no fixup of the boundary required - the aligned run will pick it up */
388 if (unlikely(z)) {
389 return lastMatchOffset(buf + 16, z);
390 }
391
392 return NULL;
393}
394