1 | /* |
2 | * Copyright (c) 2015-2016, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Vermicelli: single-byte and double-byte acceleration. |
31 | */ |
32 | |
33 | #ifndef VERMICELLI_H |
34 | #define VERMICELLI_H |
35 | |
36 | #include "util/bitutils.h" |
37 | #include "util/simd_utils.h" |
38 | #include "util/unaligned.h" |
39 | |
40 | #include "vermicelli_sse.h" |
41 | |
42 | static really_inline |
43 | const u8 *vermicelliExec(char c, char nocase, const u8 *buf, |
44 | const u8 *buf_end) { |
45 | DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n" , |
46 | nocase ? "nocase " : "" , c, (size_t)(buf_end - buf)); |
47 | assert(buf < buf_end); |
48 | |
49 | // Handle small scans. |
50 | if (buf_end - buf < VERM_BOUNDARY) { |
51 | for (; buf < buf_end; buf++) { |
52 | char cur = (char)*buf; |
53 | if (nocase) { |
54 | cur &= CASE_CLEAR; |
55 | } |
56 | if (cur == c) { |
57 | break; |
58 | } |
59 | } |
60 | return buf; |
61 | } |
62 | |
63 | VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ |
64 | uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; |
65 | if (min) { |
66 | // Input isn't aligned, so we need to run one iteration with an |
67 | // unaligned load, then skip buf forward to the next aligned address. |
68 | // There's some small overlap here, but we don't mind scanning it twice |
69 | // if we can do it quickly, do we? |
70 | const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) |
71 | : vermUnalign(chars, buf, 0); |
72 | if (ptr) { |
73 | return ptr; |
74 | } |
75 | |
76 | buf += VERM_BOUNDARY - min; |
77 | assert(buf < buf_end); |
78 | } |
79 | |
80 | // Aligned loops from here on in |
81 | const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) |
82 | : vermSearchAligned(chars, buf, buf_end - 1, 0); |
83 | if (ptr) { |
84 | return ptr; |
85 | } |
86 | |
87 | // Tidy up the mess at the end |
88 | ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) |
89 | : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); |
90 | return ptr ? ptr : buf_end; |
91 | } |
92 | |
93 | /* like vermicelliExec except returns the address of the first character which |
94 | * is not c */ |
95 | static really_inline |
96 | const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, |
97 | const u8 *buf_end) { |
98 | DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n" , |
99 | nocase ? "nocase " : "" , c, (size_t)(buf_end - buf)); |
100 | assert(buf < buf_end); |
101 | |
102 | // Handle small scans. |
103 | if (buf_end - buf < VERM_BOUNDARY) { |
104 | for (; buf < buf_end; buf++) { |
105 | char cur = (char)*buf; |
106 | if (nocase) { |
107 | cur &= CASE_CLEAR; |
108 | } |
109 | if (cur != c) { |
110 | break; |
111 | } |
112 | } |
113 | return buf; |
114 | } |
115 | |
116 | VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ |
117 | size_t min = (size_t)buf % VERM_BOUNDARY; |
118 | if (min) { |
119 | // Input isn't aligned, so we need to run one iteration with an |
120 | // unaligned load, then skip buf forward to the next aligned address. |
121 | // There's some small overlap here, but we don't mind scanning it twice |
122 | // if we can do it quickly, do we? |
123 | const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) |
124 | : vermUnalign(chars, buf, 1); |
125 | if (ptr) { |
126 | return ptr; |
127 | } |
128 | |
129 | buf += VERM_BOUNDARY - min; |
130 | assert(buf < buf_end); |
131 | } |
132 | |
133 | // Aligned loops from here on in |
134 | const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) |
135 | : vermSearchAligned(chars, buf, buf_end - 1, 1); |
136 | if (ptr) { |
137 | return ptr; |
138 | } |
139 | |
140 | // Tidy up the mess at the end |
141 | ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) |
142 | : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); |
143 | return ptr ? ptr : buf_end; |
144 | } |
145 | |
146 | static really_inline |
147 | const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, |
148 | const u8 *buf_end) { |
149 | DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n" , |
150 | nocase ? "nocase " : "" , c1, c2, (size_t)(buf_end - buf)); |
151 | assert(buf < buf_end); |
152 | assert((buf_end - buf) >= VERM_BOUNDARY); |
153 | |
154 | uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; |
155 | VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ |
156 | VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ |
157 | |
158 | if (min) { |
159 | // Input isn't aligned, so we need to run one iteration with an |
160 | // unaligned load, then skip buf forward to the next aligned address. |
161 | // There's some small overlap here, but we don't mind scanning it twice |
162 | // if we can do it quickly, do we? |
163 | const u8 *ptr = nocase |
164 | ? dvermPreconditionNocase(chars1, chars2, buf) |
165 | : dvermPrecondition(chars1, chars2, buf); |
166 | if (ptr) { |
167 | return ptr; |
168 | } |
169 | |
170 | buf += VERM_BOUNDARY - min; |
171 | assert(buf < buf_end); |
172 | } |
173 | |
174 | // Aligned loops from here on in |
175 | const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, |
176 | buf, buf_end) |
177 | : dvermSearchAligned(chars1, chars2, c1, c2, buf, |
178 | buf_end); |
179 | if (ptr) { |
180 | return ptr; |
181 | } |
182 | |
183 | // Tidy up the mess at the end |
184 | ptr = nocase ? dvermPreconditionNocase(chars1, chars2, |
185 | buf_end - VERM_BOUNDARY) |
186 | : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); |
187 | |
188 | if (ptr) { |
189 | return ptr; |
190 | } |
191 | |
192 | /* check for partial match at end */ |
193 | u8 mask = nocase ? CASE_CLEAR : 0xff; |
194 | if ((buf_end[-1] & mask) == (u8)c1) { |
195 | DEBUG_PRINTF("partial!!!\n" ); |
196 | return buf_end - 1; |
197 | } |
198 | |
199 | return buf_end; |
200 | } |
201 | |
202 | static really_inline |
203 | const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, |
204 | const u8 *buf, const u8 *buf_end) { |
205 | DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " |
206 | "over %zu bytes\n" , c1, m1, c2, m2, (size_t)(buf_end - buf)); |
207 | assert(buf < buf_end); |
208 | assert((buf_end - buf) >= VERM_BOUNDARY); |
209 | |
210 | uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; |
211 | VERM_TYPE chars1 = VERM_SET_FN(c1); |
212 | VERM_TYPE chars2 = VERM_SET_FN(c2); |
213 | VERM_TYPE mask1 = VERM_SET_FN(m1); |
214 | VERM_TYPE mask2 = VERM_SET_FN(m2); |
215 | |
216 | if (min) { |
217 | // Input isn't aligned, so we need to run one iteration with an |
218 | // unaligned load, then skip buf forward to the next aligned address. |
219 | // There's some small overlap here, but we don't mind scanning it twice |
220 | // if we can do it quickly, do we? |
221 | const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf); |
222 | if (p) { |
223 | return p; |
224 | } |
225 | |
226 | buf += VERM_BOUNDARY - min; |
227 | assert(buf < buf_end); |
228 | } |
229 | |
230 | // Aligned loops from here on in |
231 | const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1, |
232 | c2, m1, m2, buf, buf_end); |
233 | if (ptr) { |
234 | return ptr; |
235 | } |
236 | |
237 | // Tidy up the mess at the end |
238 | ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2, |
239 | buf_end - VERM_BOUNDARY); |
240 | |
241 | if (ptr) { |
242 | return ptr; |
243 | } |
244 | |
245 | /* check for partial match at end */ |
246 | if ((buf_end[-1] & m1) == (u8)c1) { |
247 | return buf_end - 1; |
248 | } |
249 | |
250 | return buf_end; |
251 | } |
252 | |
253 | // Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if |
254 | // character not found. |
255 | static really_inline |
256 | const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, |
257 | const u8 *buf_end) { |
258 | DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n" , |
259 | nocase ? "nocase " : "" , c, (size_t)(buf_end - buf)); |
260 | assert(buf < buf_end); |
261 | |
262 | // Handle small scans. |
263 | if (buf_end - buf < VERM_BOUNDARY) { |
264 | for (buf_end--; buf_end >= buf; buf_end--) { |
265 | char cur = (char)*buf_end; |
266 | if (nocase) { |
267 | cur &= CASE_CLEAR; |
268 | } |
269 | if (cur == c) { |
270 | break; |
271 | } |
272 | } |
273 | return buf_end; |
274 | } |
275 | |
276 | VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ |
277 | size_t min = (size_t)buf_end % VERM_BOUNDARY; |
278 | |
279 | if (min) { |
280 | // Input isn't aligned, so we need to run one iteration with an |
281 | // unaligned load, then skip buf backward to the next aligned address. |
282 | // There's some small overlap here, but we don't mind scanning it twice |
283 | // if we can do it quickly, do we? |
284 | if (nocase) { |
285 | const u8 *ptr = |
286 | rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0); |
287 | if (ptr) { |
288 | return ptr; |
289 | } |
290 | } else { |
291 | const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 0); |
292 | if (ptr) { |
293 | return ptr; |
294 | } |
295 | } |
296 | |
297 | buf_end -= min; |
298 | if (buf >= buf_end) { |
299 | return buf_end; |
300 | } |
301 | } |
302 | |
303 | // Aligned loops from here on in. |
304 | const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 0) |
305 | : rvermSearchAligned(chars, buf, buf_end, 0); |
306 | if (ptr) { |
307 | return ptr; |
308 | } |
309 | |
310 | // Tidy up the mess at the end, return buf - 1 if not found. |
311 | ptr = nocase ? rvermUnalignNocase(chars, buf, 0) |
312 | : rvermUnalign(chars, buf, 0); |
313 | return ptr ? ptr : buf - 1; |
314 | } |
315 | |
316 | /* like rvermicelliExec except returns the address of the last character which |
317 | * is not c */ |
318 | static really_inline |
319 | const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, |
320 | const u8 *buf_end) { |
321 | DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n" , |
322 | nocase ? "nocase " : "" , c, (size_t)(buf_end - buf)); |
323 | assert(buf < buf_end); |
324 | |
325 | // Handle small scans. |
326 | if (buf_end - buf < VERM_BOUNDARY) { |
327 | for (buf_end--; buf_end >= buf; buf_end--) { |
328 | char cur = (char)*buf_end; |
329 | if (nocase) { |
330 | cur &= CASE_CLEAR; |
331 | } |
332 | if (cur != c) { |
333 | break; |
334 | } |
335 | } |
336 | return buf_end; |
337 | } |
338 | |
339 | VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ |
340 | size_t min = (size_t)buf_end % VERM_BOUNDARY; |
341 | |
342 | if (min) { |
343 | // Input isn't aligned, so we need to run one iteration with an |
344 | // unaligned load, then skip buf backward to the next aligned address. |
345 | // There's some small overlap here, but we don't mind scanning it twice |
346 | // if we can do it quickly, do we? |
347 | if (nocase) { |
348 | const u8 *ptr = |
349 | rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1); |
350 | if (ptr) { |
351 | return ptr; |
352 | } |
353 | } else { |
354 | const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 1); |
355 | if (ptr) { |
356 | return ptr; |
357 | } |
358 | } |
359 | |
360 | buf_end -= min; |
361 | if (buf >= buf_end) { |
362 | return buf_end; |
363 | } |
364 | } |
365 | |
366 | // Aligned loops from here on in. |
367 | const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) |
368 | : rvermSearchAligned(chars, buf, buf_end, 1); |
369 | if (ptr) { |
370 | return ptr; |
371 | } |
372 | |
373 | // Tidy up the mess at the end, return buf - 1 if not found. |
374 | ptr = nocase ? rvermUnalignNocase(chars, buf, 1) |
375 | : rvermUnalign(chars, buf, 1); |
376 | return ptr ? ptr : buf - 1; |
377 | } |
378 | |
379 | /* returns highest offset of c2 (NOTE: not c1) */ |
380 | static really_inline |
381 | const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, |
382 | const u8 *buf_end) { |
383 | DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n" , |
384 | nocase ? "nocase " : "" , c1, c2, (size_t)(buf_end - buf)); |
385 | assert(buf < buf_end); |
386 | assert((buf_end - buf) >= VERM_BOUNDARY); |
387 | |
388 | size_t min = (size_t)buf_end % VERM_BOUNDARY; |
389 | VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ |
390 | VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ |
391 | |
392 | if (min) { |
393 | // input not aligned, so we need to run one iteration with an unaligned |
394 | // load, then skip buf forward to the next aligned address. There's |
395 | // some small overlap here, but we don't mind scanning it twice if we |
396 | // can do it quickly, do we? |
397 | const u8 *ptr; |
398 | if (nocase) { |
399 | ptr = rdvermPreconditionNocase(chars1, chars2, |
400 | buf_end - VERM_BOUNDARY); |
401 | } else { |
402 | ptr = rdvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); |
403 | } |
404 | |
405 | if (ptr) { |
406 | return ptr; |
407 | } |
408 | |
409 | buf_end -= min; |
410 | if (buf >= buf_end) { |
411 | return buf_end; |
412 | } |
413 | } |
414 | |
415 | // Aligned loops from here on in |
416 | if (nocase) { |
417 | return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); |
418 | } else { |
419 | return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); |
420 | } |
421 | } |
422 | |
423 | #endif /* VERMICELLI_H */ |
424 | |