1 | /* |
2 | * Copyright (c) 2015-2018, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #ifndef HS_COMPILE_H_ |
30 | #define HS_COMPILE_H_ |
31 | |
32 | /** |
33 | * @file |
34 | * @brief The Hyperscan compiler API definition. |
35 | * |
36 | * Hyperscan is a high speed regular expression engine. |
37 | * |
38 | * This header contains functions for compiling regular expressions into |
39 | * Hyperscan databases that can be used by the Hyperscan runtime. |
40 | */ |
41 | |
42 | #include "hs_common.h" |
43 | |
44 | #ifdef __cplusplus |
45 | extern "C" |
46 | { |
47 | #endif |
48 | |
49 | /** |
50 | * A type containing error details, returned on failure by the compile calls |
51 | * (@ref hs_compile(), @ref hs_compile_multi(), @ref hs_compile_ext_multi()) |
52 | * and by @ref hs_expression_info() and @ref hs_expression_ext_info(). The |
53 | * caller may inspect the returned values to determine the cause of failure. |
54 | * |
55 | * Common errors generated during the compile process include: |
56 | * |
57 | * - *Invalid parameter* |
58 | * |
59 | * An invalid argument was specified in the compile call. |
60 | * |
61 | * - *Unrecognised flag* |
62 | * |
63 | * An unrecognised value was passed in the flags argument. |
64 | * |
65 | * - *Pattern matches empty buffer* |
66 | * |
67 | * By default, Hyperscan only supports patterns that will *always* |
68 | * consume at least one byte of input. Patterns that do not have this |
69 | * property (such as `/(abc)?/`) will produce this error unless |
70 | * the @ref HS_FLAG_ALLOWEMPTY flag is supplied. Note that such |
71 | * patterns will produce a match for *every* byte when scanned. |
72 | * |
73 | * - *Embedded anchors not supported* |
74 | * |
75 | * Hyperscan only supports the use of anchor meta-characters (such as |
76 | * `^` and `$`) in patterns where they could *only* match |
77 | * at the start or end of a buffer. A pattern containing an embedded |
78 | * anchor, such as `/abc^def/`, can never match, as there is no |
79 | * way for `abc` to precede the start of the data stream. |
80 | * |
81 | * - *Bounded repeat is too large* |
82 | * |
83 | * The pattern contains a repeated construct with very large finite |
84 | * bounds. |
85 | * |
86 | * - *Unsupported component type* |
87 | * |
88 | * An unsupported PCRE construct was used in the pattern. |
89 | * |
90 | * - *Unable to generate bytecode* |
91 | * |
92 | * This error indicates that Hyperscan was unable to compile a pattern |
93 | * that is syntactically valid. The most common cause is a pattern that is |
94 | * very long and complex or contains a large repeated subpattern. |
95 | * |
96 | * - *Unable to allocate memory* |
97 | * |
98 | * The library was unable to allocate temporary storage needed during |
99 | * compilation. |
100 | * |
101 | * - *Allocator returned misaligned memory* |
102 | * |
103 | * The memory allocator (either malloc() or the allocator set with @ref |
104 | * hs_set_allocator()) did not correctly return memory suitably aligned |
105 | * for the largest representable data type on this platform. |
106 | * |
107 | * - *Internal error* |
108 | * |
109 | * An unexpected error occurred: if this error is reported, please contact |
110 | * the Hyperscan team with a description of the situation. |
111 | */ |
112 | typedef struct hs_compile_error { |
113 | /** |
114 | * A human-readable message describing the error. |
115 | */ |
116 | char *message; |
117 | |
118 | /** |
119 | * The zero-based index of the expression that caused the error (if this |
120 | * can be determined). If the error is not specific to an expression, then |
121 | * this value will be less than zero. |
122 | */ |
123 | int expression; |
124 | } hs_compile_error_t; |
125 | |
126 | /** |
127 | * A type containing information on the target platform which may optionally be |
128 | * provided to the compile calls (@ref hs_compile(), @ref hs_compile_multi(), |
129 | * @ref hs_compile_ext_multi()). |
130 | * |
131 | * An hs_platform_info structure may be populated for the current platform by |
132 | * using the @ref hs_populate_platform() call. |
133 | */ |
134 | typedef struct hs_platform_info { |
135 | /** |
136 | * Information about the target platform which may be used to guide the |
137 | * optimisation process of the compile. |
138 | * |
139 | * Use of this field does not limit the processors that the resulting |
140 | * database can run on, but may impact the performance of the resulting |
141 | * database. |
142 | */ |
143 | unsigned int tune; |
144 | |
145 | /** |
146 | * Relevant CPU features available on the target platform. |
147 | * |
148 | * This value may be produced by combining HS_CPU_FEATURES_* flags (such as |
149 | * @ref HS_CPU_FEATURES_AVX2). Multiple CPU features may be ORed together |
150 | * to produce the value. |
151 | */ |
152 | unsigned long long cpu_features; |
153 | |
154 | /** |
155 | * Reserved for future use. |
156 | */ |
157 | unsigned long long reserved1; |
158 | |
159 | /** |
160 | * Reserved for future use. |
161 | */ |
162 | unsigned long long reserved2; |
163 | } hs_platform_info_t; |
164 | |
165 | /** |
166 | * A type containing information related to an expression that is returned by |
167 | * @ref hs_expression_info() or @ref hs_expression_ext_info(). |
168 | */ |
169 | typedef struct hs_expr_info { |
170 | /** |
171 | * The minimum length in bytes of a match for the pattern. |
172 | * |
173 | * Note: in some cases when using advanced features to suppress matches |
174 | * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this |
175 | * may represent a conservative lower bound for the true minimum length of |
176 | * a match. |
177 | */ |
178 | unsigned int min_width; |
179 | |
180 | /** |
181 | * The maximum length in bytes of a match for the pattern. If the pattern |
182 | * has an unbounded maximum length, this will be set to the maximum value |
183 | * of an unsigned int (UINT_MAX). |
184 | * |
185 | * Note: in some cases when using advanced features to suppress matches |
186 | * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this |
187 | * may represent a conservative upper bound for the true maximum length of |
188 | * a match. |
189 | */ |
190 | unsigned int max_width; |
191 | |
192 | /** |
193 | * Whether this expression can produce matches that are not returned in |
194 | * order, such as those produced by assertions. Zero if false, non-zero if |
195 | * true. |
196 | */ |
197 | char unordered_matches; |
198 | |
199 | /** |
200 | * Whether this expression can produce matches at end of data (EOD). In |
201 | * streaming mode, EOD matches are raised during @ref hs_close_stream(), |
202 | * since it is only when @ref hs_close_stream() is called that the EOD |
203 | * location is known. Zero if false, non-zero if true. |
204 | * |
205 | * Note: trailing `\b` word boundary assertions may also result in EOD |
206 | * matches as end-of-data can act as a word boundary. |
207 | */ |
208 | char matches_at_eod; |
209 | |
210 | /** |
211 | * Whether this expression can *only* produce matches at end of data (EOD). |
212 | * In streaming mode, all matches for this expression are raised during |
213 | * @ref hs_close_stream(). Zero if false, non-zero if true. |
214 | */ |
215 | char matches_only_at_eod; |
216 | } hs_expr_info_t; |
217 | |
218 | /** |
219 | * A structure containing additional parameters related to an expression, |
220 | * passed in at build time to @ref hs_compile_ext_multi() or @ref |
221 | * hs_expression_ext_info(). |
222 | * |
223 | * These parameters allow the set of matches produced by a pattern to be |
224 | * constrained at compile time, rather than relying on the application to |
225 | * process unwanted matches at runtime. |
226 | */ |
227 | typedef struct hs_expr_ext { |
228 | /** |
229 | * Flags governing which parts of this structure are to be used by the |
230 | * compiler. See @ref HS_EXT_FLAG. |
231 | */ |
232 | unsigned long long flags; |
233 | |
234 | /** |
235 | * The minimum end offset in the data stream at which this expression |
236 | * should match successfully. To use this parameter, set the |
237 | * @ref HS_EXT_FLAG_MIN_OFFSET flag in the hs_expr_ext::flags field. |
238 | */ |
239 | unsigned long long min_offset; |
240 | |
241 | /** |
242 | * The maximum end offset in the data stream at which this expression |
243 | * should match successfully. To use this parameter, set the |
244 | * @ref HS_EXT_FLAG_MAX_OFFSET flag in the hs_expr_ext::flags field. |
245 | */ |
246 | unsigned long long max_offset; |
247 | |
248 | /** |
249 | * The minimum match length (from start to end) required to successfully |
250 | * match this expression. To use this parameter, set the |
251 | * @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field. |
252 | */ |
253 | unsigned long long min_length; |
254 | |
255 | /** |
256 | * Allow patterns to approximately match within this edit distance. To use |
257 | * this parameter, set the @ref HS_EXT_FLAG_EDIT_DISTANCE flag in the |
258 | * hs_expr_ext::flags field. |
259 | */ |
260 | unsigned edit_distance; |
261 | |
262 | /** |
263 | * Allow patterns to approximately match within this Hamming distance. To |
264 | * use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the |
265 | * hs_expr_ext::flags field. |
266 | */ |
267 | unsigned hamming_distance; |
268 | } hs_expr_ext_t; |
269 | |
270 | /** |
271 | * @defgroup HS_EXT_FLAG hs_expr_ext_t flags |
272 | * |
273 | * Each flag is a distinct bit, set in @ref hs_expr_ext_t::flags to indicate |
274 | * which fields of that structure are used. |
275 | * |
276 | * @{ |
277 | */ |
278 | |
279 | /** Flag indicating that the hs_expr_ext::min_offset field is used. */ |
280 | #define HS_EXT_FLAG_MIN_OFFSET 1ULL |
281 | |
282 | /** Flag indicating that the hs_expr_ext::max_offset field is used. */ |
283 | #define HS_EXT_FLAG_MAX_OFFSET 2ULL |
284 | |
285 | /** Flag indicating that the hs_expr_ext::min_length field is used. */ |
286 | #define HS_EXT_FLAG_MIN_LENGTH 4ULL |
287 | |
288 | /** Flag indicating that the hs_expr_ext::edit_distance field is used. */ |
289 | #define HS_EXT_FLAG_EDIT_DISTANCE 8ULL |
290 | |
291 | /** Flag indicating that the hs_expr_ext::hamming_distance field is used. */ |
292 | #define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL |
293 | |
294 | /** @} */ |
295 | |
296 | /** |
297 | * The basic regular expression compiler. |
298 | * |
299 | * This is the function call with which an expression is compiled into a |
300 | * Hyperscan database which can be passed to the runtime functions (such as |
301 | * @ref hs_scan(), @ref hs_open_stream(), etc.). |
302 | * |
303 | * @param expression |
304 | * The NUL-terminated expression to parse. Note that this string must |
305 | * represent ONLY the pattern to be matched, with no delimiters or flags; |
306 | * any global flags should be specified with the @p flags argument. For |
307 | * example, the expression `/abc?def/i` should be compiled by providing |
308 | * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @p |
309 | * flags. |
310 | * |
311 | * @param flags |
312 | * Flags which modify the behaviour of the expression. Multiple flags may |
313 | * be used by ORing them together. Valid values are: |
314 | * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. |
315 | * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. |
316 | * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. |
317 | * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the |
318 | * expression per stream. |
319 | * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an |
320 | * empty string, such as `.*`. |
321 | * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. |
322 | * - HS_FLAG_UCP - Use Unicode properties for character classes. |
323 | * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. |
324 | * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset |
325 | * when a match is found. |
326 | * |
327 | * @param mode |
328 | * Compiler mode flags that affect the database as a whole. One of @ref |
329 | * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be |
330 | * supplied, to select between the generation of a streaming, block or |
331 | * vectored database. In addition, other flags (beginning with HS_MODE_) |
332 | * may be supplied to enable specific features. See @ref HS_MODE_FLAG for |
333 | * more details. |
334 | * |
335 | * @param platform |
336 | * If not NULL, the platform structure is used to determine the target |
337 | * platform for the database. If NULL, a database suitable for running |
338 | * on the current host platform is produced. |
339 | * |
340 | * @param db |
341 | * On success, a pointer to the generated database will be returned in |
342 | * this parameter, or NULL on failure. The caller is responsible for |
343 | * deallocating the buffer using the @ref hs_free_database() function. |
344 | * |
345 | * @param error |
346 | * If the compile fails, a pointer to a @ref hs_compile_error_t will be |
347 | * returned, providing details of the error condition. The caller is |
348 | * responsible for deallocating the buffer using the @ref |
349 | * hs_free_compile_error() function. |
350 | * |
351 | * @return |
352 | * @ref HS_SUCCESS is returned on successful compilation; @ref |
353 | * HS_COMPILER_ERROR on failure, with details provided in the @p error |
354 | * parameter. |
355 | */ |
356 | hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags, |
357 | unsigned int mode, |
358 | const hs_platform_info_t *platform, |
359 | hs_database_t **db, hs_compile_error_t **error); |
360 | |
361 | /** |
362 | * The multiple regular expression compiler. |
363 | * |
364 | * This is the function call with which a set of expressions is compiled into a |
365 | * database which can be passed to the runtime functions (such as @ref |
366 | * hs_scan(), @ref hs_open_stream(), etc.). Each expression can be labelled with |
367 | * a unique integer which is passed into the match callback to identify the |
368 | * pattern that has matched. |
369 | * |
370 | * @param expressions |
371 | * Array of NUL-terminated expressions to compile. Note that (as for @ref |
372 | * hs_compile()) these strings must contain only the pattern to be |
373 | * matched, with no delimiters or flags. For example, the expression |
374 | * `/abc?def/i` should be compiled by providing `abc?def` as the first |
375 | * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the |
376 | * first value in the @p flags array. |
377 | * |
378 | * @param flags |
379 | * Array of flags which modify the behaviour of each expression. Multiple |
380 | * flags may be used by ORing them together. Specifying the NULL pointer |
381 | * in place of an array will set the flags value for all patterns to zero. |
382 | * Valid values are: |
383 | * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. |
384 | * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. |
385 | * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. |
386 | * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns |
387 | * with this match id per stream. |
388 | * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an |
389 | * empty string, such as `.*`. |
390 | * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. |
391 | * - HS_FLAG_UCP - Use Unicode properties for character classes. |
392 | * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. |
393 | * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset |
394 | * when a match is found. |
395 | * |
396 | * @param ids |
397 | * An array of integers specifying the ID number to be associated with the |
398 | * corresponding pattern in the expressions array. Specifying the NULL |
399 | * pointer in place of an array will set the ID value for all patterns to |
400 | * zero. |
401 | * |
402 | * @param elements |
403 | * The number of elements in the input arrays. |
404 | * |
405 | * @param mode |
406 | * Compiler mode flags that affect the database as a whole. One of @ref |
407 | * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be |
408 | * supplied, to select between the generation of a streaming, block or |
409 | * vectored database. In addition, other flags (beginning with HS_MODE_) |
410 | * may be supplied to enable specific features. See @ref HS_MODE_FLAG for |
411 | * more details. |
412 | * |
413 | * @param platform |
414 | * If not NULL, the platform structure is used to determine the target |
415 | * platform for the database. If NULL, a database suitable for running |
416 | * on the current host platform is produced. |
417 | * |
418 | * @param db |
419 | * On success, a pointer to the generated database will be returned in |
420 | * this parameter, or NULL on failure. The caller is responsible for |
421 | * deallocating the buffer using the @ref hs_free_database() function. |
422 | * |
423 | * @param error |
424 | * If the compile fails, a pointer to a @ref hs_compile_error_t will be |
425 | * returned, providing details of the error condition. The caller is |
426 | * responsible for deallocating the buffer using the @ref |
427 | * hs_free_compile_error() function. |
428 | * |
429 | * @return |
430 | * @ref HS_SUCCESS is returned on successful compilation; @ref |
431 | * HS_COMPILER_ERROR on failure, with details provided in the @p error |
432 | * parameter. |
433 | * |
434 | */ |
435 | hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, |
436 | const unsigned int *flags, |
437 | const unsigned int *ids, |
438 | unsigned int elements, unsigned int mode, |
439 | const hs_platform_info_t *platform, |
440 | hs_database_t **db, |
441 | hs_compile_error_t **error); |
442 | |
443 | /** |
444 | * The multiple regular expression compiler with extended parameter support. |
445 | * |
446 | * This function call compiles a group of expressions into a database in the |
447 | * same way as @ref hs_compile_multi(), but allows additional parameters to be |
448 | * specified via an @ref hs_expr_ext_t structure per expression. |
449 | * |
450 | * @param expressions |
451 | * Array of NUL-terminated expressions to compile. Note that (as for @ref |
452 | * hs_compile()) these strings must contain only the pattern to be |
453 | * matched, with no delimiters or flags. For example, the expression |
454 | * `/abc?def/i` should be compiled by providing `abc?def` as the first |
455 | * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the |
456 | * first value in the @p flags array. |
457 | * |
458 | * @param flags |
459 | * Array of flags which modify the behaviour of each expression. Multiple |
460 | * flags may be used by ORing them together. Specifying the NULL pointer |
461 | * in place of an array will set the flags value for all patterns to zero. |
462 | * Valid values are: |
463 | * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. |
464 | * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. |
465 | * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. |
466 | * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns |
467 | * with this match id per stream. |
468 | * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an |
469 | * empty string, such as `.*`. |
470 | * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. |
471 | * - HS_FLAG_UCP - Use Unicode properties for character classes. |
472 | * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. |
473 | * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset |
474 | * when a match is found. |
475 | * |
476 | * @param ids |
477 | * An array of integers specifying the ID number to be associated with the |
478 | * corresponding pattern in the expressions array. Specifying the NULL |
479 | * pointer in place of an array will set the ID value for all patterns to |
480 | * zero. |
481 | * |
482 | * @param ext |
483 | * An array of pointers to filled @ref hs_expr_ext_t structures that |
484 | * define extended behaviour for each pattern. NULL may be specified if no |
485 | * extended behaviour is needed for an individual pattern, or in place of |
486 | * the whole array if it is not needed for any expressions. Memory used by |
487 | * these structures must be both allocated and freed by the caller. |
488 | * |
489 | * @param elements |
490 | * The number of elements in the input arrays. |
491 | * |
492 | * @param mode |
493 | * Compiler mode flags that affect the database as a whole. One of @ref |
494 | * HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be |
495 | * supplied, to select between the generation of a streaming, block or |
496 | * vectored database. In addition, other flags (beginning with HS_MODE_) |
497 | * may be supplied to enable specific features. See @ref HS_MODE_FLAG for |
498 | * more details. |
499 | * |
500 | * @param platform |
501 | * If not NULL, the platform structure is used to determine the target |
502 | * platform for the database. If NULL, a database suitable for running |
503 | * on the current host platform is produced. |
504 | * |
505 | * @param db |
506 | * On success, a pointer to the generated database will be returned in |
507 | * this parameter, or NULL on failure. The caller is responsible for |
508 | * deallocating the buffer using the @ref hs_free_database() function. |
509 | * |
510 | * @param error |
511 | * If the compile fails, a pointer to a @ref hs_compile_error_t will be |
512 | * returned, providing details of the error condition. The caller is |
513 | * responsible for deallocating the buffer using the @ref |
514 | * hs_free_compile_error() function. |
515 | * |
516 | * @return |
517 | * @ref HS_SUCCESS is returned on successful compilation; @ref |
518 | * HS_COMPILER_ERROR on failure, with details provided in the @p error |
519 | * parameter. |
520 | * |
521 | */ |
522 | hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, |
523 | const unsigned int *flags, |
524 | const unsigned int *ids, |
525 | const hs_expr_ext_t *const *ext, |
526 | unsigned int elements, unsigned int mode, |
527 | const hs_platform_info_t *platform, |
528 | hs_database_t **db, hs_compile_error_t **error); |
529 | |
530 | /** |
531 | * Free an error structure generated by @ref hs_compile(), @ref |
532 | * hs_compile_multi() or @ref hs_compile_ext_multi(). |
533 | * |
534 | * @param error |
535 | * The @ref hs_compile_error_t to be freed, as returned by a compile or |
536 | * expression-info call. NULL may also be safely provided. |
537 | * |
538 | * @return |
539 | * @ref HS_SUCCESS on success, other values on failure. |
540 | */ |
541 | hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); |
542 | |
543 | /** |
544 | * Utility function providing information about a regular expression. The |
545 | * information provided in @ref hs_expr_info_t includes the minimum and maximum |
546 | * width of a pattern match. |
547 | * |
548 | * Note: successful analysis of an expression with this function does not imply |
549 | * that compilation of the same expression (via @ref hs_compile(), @ref |
550 | * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This |
551 | * function may return @ref HS_SUCCESS for regular expressions that Hyperscan |
552 | * cannot compile. |
553 | * |
554 | * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref |
555 | * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect |
556 | * the properties returned in the @ref hs_expr_info_t structure, they will not |
557 | * affect the outcome of this function. |
558 | * |
559 | * @param expression |
560 | * The NUL-terminated expression to parse. Note that this string must |
561 | * represent ONLY the pattern to be matched, with no delimiters or flags; |
562 | * any global flags should be specified with the @p flags argument. For |
563 | * example, the expression `/abc?def/i` should be analysed by providing |
564 | * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @p |
565 | * flags. |
566 | * |
567 | * @param flags |
568 | * Flags which modify the behaviour of the expression. Multiple flags may |
569 | * be used by ORing them together. Valid values are: |
570 | * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. |
571 | * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. |
572 | * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. |
573 | * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the |
574 | * expression per stream. |
575 | * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an |
576 | * empty string, such as `.*`. |
577 | * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. |
578 | * - HS_FLAG_UCP - Use Unicode properties for character classes. |
579 | * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. |
580 | * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset |
581 | * when a match is found. |
582 | * |
583 | * @param info |
584 | * On success, a pointer to the pattern information will be returned in |
585 | * this parameter, or NULL on failure. This structure is allocated using |
586 | * the allocator supplied in @ref hs_set_allocator() (or malloc() if no |
587 | * allocator was set) and should be freed by the caller. |
588 | * |
589 | * @param error |
590 | * If the call fails, a pointer to a @ref hs_compile_error_t will be |
591 | * returned, providing details of the error condition. The caller is |
592 | * responsible for deallocating the buffer using the @ref |
593 | * hs_free_compile_error() function. |
594 | * |
595 | * @return |
596 | * @ref HS_SUCCESS is returned on successful analysis; @ref |
597 | * HS_COMPILER_ERROR on failure, with details provided in the @p error |
598 | * parameter. |
599 | */ |
600 | hs_error_t HS_CDECL hs_expression_info(const char *expression, |
601 | unsigned int flags, |
602 | hs_expr_info_t **info, |
603 | hs_compile_error_t **error); |
604 | |
605 | /** |
606 | * Utility function providing information about a regular expression, with |
607 | * extended parameter support. The information provided in @ref hs_expr_info_t |
608 | * includes the minimum and maximum width of a pattern match. |
609 | * |
610 | * Note: successful analysis of an expression with this function does not imply |
611 | * that compilation of the same expression (via @ref hs_compile(), @ref |
612 | * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This |
613 | * function may return @ref HS_SUCCESS for regular expressions that Hyperscan |
614 | * cannot compile. |
615 | * |
616 | * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref |
617 | * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect |
618 | * the properties returned in the @ref hs_expr_info_t structure, they will not |
619 | * affect the outcome of this function. |
620 | * |
621 | * @param expression |
622 | * The NUL-terminated expression to parse. Note that this string must |
623 | * represent ONLY the pattern to be matched, with no delimiters or flags; |
624 | * any global flags should be specified with the @p flags argument. For |
625 | * example, the expression `/abc?def/i` should be analysed by providing |
626 | * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @p |
627 | * flags. |
628 | * |
629 | * @param flags |
630 | * Flags which modify the behaviour of the expression. Multiple flags may |
631 | * be used by ORing them together. Valid values are: |
632 | * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. |
633 | * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. |
634 | * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. |
635 | * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the |
636 | * expression per stream. |
637 | * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an |
638 | * empty string, such as `.*`. |
639 | * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. |
640 | * - HS_FLAG_UCP - Use Unicode properties for character classes. |
641 | * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. |
642 | * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset |
643 | * when a match is found. |
644 | * |
645 | * @param ext |
646 | * A pointer to a filled @ref hs_expr_ext_t structure that defines |
647 | * extended behaviour for this pattern. NULL may be specified if no |
648 | * extended parameters are needed. |
649 | * |
650 | * @param info |
651 | * On success, a pointer to the pattern information will be returned in |
652 | * this parameter, or NULL on failure. This structure is allocated using |
653 | * the allocator supplied in @ref hs_set_allocator() (or malloc() if no |
654 | * allocator was set) and should be freed by the caller. |
655 | * |
656 | * @param error |
657 | * If the call fails, a pointer to a @ref hs_compile_error_t will be |
658 | * returned, providing details of the error condition. The caller is |
659 | * responsible for deallocating the buffer using the @ref |
660 | * hs_free_compile_error() function. |
661 | * |
662 | * @return |
663 | * @ref HS_SUCCESS is returned on successful analysis; @ref |
664 | * HS_COMPILER_ERROR on failure, with details provided in the @p error |
665 | * parameter. |
666 | */ |
667 | hs_error_t HS_CDECL hs_expression_ext_info(const char *expression, |
668 | unsigned int flags, |
669 | const hs_expr_ext_t *ext, |
670 | hs_expr_info_t **info, |
671 | hs_compile_error_t **error); |
672 | |
673 | /** |
674 | * Populates the platform information based on the current host. |
675 | * |
676 | * @param platform |
677 | * On success, the pointed-to structure is populated based on the current |
678 | * host. |
679 | * |
680 | * @return |
681 | * @ref HS_SUCCESS on success, other values on failure. |
682 | */ |
683 | hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); |
684 | |
685 | /** |
686 | * @defgroup HS_PATTERN_FLAG Pattern flags |
687 | * |
688 | * @{ |
689 | */ |
690 | |
691 | /** |
692 | * Compile flag: Set case-insensitive matching. |
693 | * |
694 | * This flag sets the expression to be matched case-insensitively by default. |
695 | * The expression may still use inline PCRE tokens (notably `(?i)` and |
696 | * `(?-i)`) to switch case-insensitive matching on and off within the pattern. |
697 | */ |
698 | #define HS_FLAG_CASELESS 1 |
699 | |
700 | /** |
701 | * Compile flag: Matching a `.` will not exclude newlines. |
702 | * |
703 | * This flag sets any instances of the `.` token to match newline characters as |
704 | * well as all other characters. The PCRE specification states that the `.` |
705 | * token does not match newline characters by default, so without this flag the |
706 | * `.` token will not cross line boundaries. |
707 | */ |
708 | #define HS_FLAG_DOTALL 2 |
709 | |
710 | /** |
711 | * Compile flag: Set multi-line anchoring. |
712 | * |
713 | * This flag instructs the expression to make the `^` and `$` tokens match |
714 | * newline characters as well as the start and end of the stream. If this flag |
715 | * is not specified, the `^` token will only ever match at the start of a |
716 | * stream, and the `$` token will only ever match at the end of a stream within |
717 | * the guidelines of the PCRE specification. |
718 | */ |
719 | #define HS_FLAG_MULTILINE 4 |
720 | |
721 | /** |
722 | * Compile flag: Set single-match only mode. |
723 | * |
724 | * This flag sets the expression's match ID to match at most once. In streaming |
725 | * mode, this means that the expression will return only a single match over |
726 | * the lifetime of the stream, rather than reporting every match as per |
727 | * standard Hyperscan semantics. In block mode or vectored mode, only the first |
728 | * match for each invocation of @ref hs_scan() or @ref hs_scan_vector() will be |
729 | * returned. |
730 | * |
731 | * If multiple expressions in the database share the same match ID, then they |
732 | * either must all specify @ref HS_FLAG_SINGLEMATCH or none of them specify |
733 | * @ref HS_FLAG_SINGLEMATCH. If a group of expressions sharing a match ID |
734 | * specify the flag, then at most one match with the match ID will be generated |
735 | * per stream. |
736 | * |
737 | * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST |
738 | * is not currently supported. |
739 | */ |
740 | #define HS_FLAG_SINGLEMATCH 8 |
741 | |
742 | /** |
743 | * Compile flag: Allow expressions that can match against empty buffers. |
744 | * |
745 | * This flag instructs the compiler to allow expressions that can match against |
746 | * empty buffers, such as `.?`, `.*`, `(a|)`. Since Hyperscan can return every |
747 | * possible match for an expression, such expressions generally execute very |
748 | * slowly; the default behaviour is to return an error when an attempt to |
749 | * compile one is made. Using this flag will force the compiler to allow such |
750 | * an expression. |
751 | */ |
752 | #define HS_FLAG_ALLOWEMPTY 16 |
753 | |
754 | /** |
755 | * Compile flag: Enable UTF-8 mode for this expression. |
756 | * |
757 | * This flag instructs Hyperscan to treat the pattern as a sequence of UTF-8 |
758 | * characters. The results of scanning invalid UTF-8 sequences with a Hyperscan |
759 | * library that has been compiled with one or more patterns using this flag are |
760 | * undefined. |
761 | */ |
762 | #define HS_FLAG_UTF8 32 |
763 | |
764 | /** |
765 | * Compile flag: Enable Unicode property support for this expression. |
766 | * |
767 | * This flag instructs Hyperscan to use Unicode properties, rather than the |
768 | * default ASCII interpretations, for character mnemonics like `\w` and `\s` as |
769 | * well as the POSIX character classes. It is only meaningful in conjunction |
770 | * with @ref HS_FLAG_UTF8. |
771 | */ |
772 | #define HS_FLAG_UCP 64 |
773 | |
774 | /** |
775 | * Compile flag: Enable prefiltering mode for this expression. |
776 | * |
777 | * This flag instructs Hyperscan to compile an "approximate" version of this |
778 | * pattern for use in a prefiltering application, even if Hyperscan does not |
779 | * support the pattern in normal operation. |
780 | * |
781 | * The set of matches returned when this flag is used is guaranteed to be a |
782 | * superset of the matches specified by the non-prefiltering expression. |
783 | * |
784 | * If the pattern contains pattern constructs not supported by Hyperscan (such |
785 | * as zero-width assertions, back-references or conditional references) these |
786 | * constructs will be replaced internally with broader constructs that may |
787 | * match more often. |
788 | * |
789 | * Furthermore, in prefiltering mode Hyperscan may simplify a pattern that |
790 | * would otherwise return a "Pattern too large" error at compile time, or for |
791 | * performance reasons (subject to the matching guarantee above). |
792 | * |
793 | * It is generally expected that the application will subsequently confirm |
794 | * prefilter matches with another regular expression matcher that can provide |
795 | * exact matches for the pattern. |
796 | * |
797 | * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST |
798 | * is not currently supported. |
799 | */ |
800 | #define HS_FLAG_PREFILTER 128 |
801 | |
802 | /** |
803 | * Compile flag: Enable leftmost start of match reporting. |
804 | * |
805 | * This flag instructs Hyperscan to report the leftmost possible start of match |
806 | * offset when a match is reported for this expression. (By default, no start |
807 | * of match is returned.) |
808 | * |
809 | * Enabling this behaviour may reduce performance and increase stream state |
810 | * requirements in streaming mode. |
811 | */ |
812 | #define HS_FLAG_SOM_LEFTMOST 256 |
813 | |
814 | /** |
815 | * Compile flag: Logical combination. |
816 | * |
817 | * This flag instructs Hyperscan to parse this expression as logical |
818 | * combination syntax. |
819 | * Logical constraints consist of operands, operators and parentheses. |
820 | * The operands are expression indices, and operators can be |
821 | * '!'(NOT), '&'(AND) or '|'(OR). |
822 | * For example: |
823 | * (101&102&103)|(104&!105) |
824 | * ((301|302)&303)&(304|305) |
825 | */ |
826 | #define HS_FLAG_COMBINATION 512 |
827 | |
828 | /** |
829 | * Compile flag: Don't do any match reporting. |
830 | * |
831 | * This flag instructs Hyperscan to ignore match reporting for this expression. |
832 | * It is designed to be used on the sub-expressions in logical combinations. |
833 | */ |
834 | #define HS_FLAG_QUIET 1024 |
835 | |
836 | /** @} */ |
837 | |
838 | /** |
839 | * @defgroup HS_CPU_FEATURES_FLAG CPU feature support flags |
840 | * |
841 | * @{ |
842 | */ |
843 | |
844 | /** |
845 | * CPU features flag - Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) |
846 | * |
847 | * Setting this flag indicates that the target platform supports AVX2 |
848 | * instructions. Only bit 2 of the value is set. |
849 | */ |
850 | #define HS_CPU_FEATURES_AVX2 (1ULL << 2) |
851 | |
852 | /** |
853 | * CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512) |
854 | * |
855 | * Indicates that the target platform supports AVX512 instructions, specifically |
856 | * AVX-512BW. Using AVX512 implies the use of AVX2. Only bit 3 is set. |
857 | */ |
858 | #define HS_CPU_FEATURES_AVX512 (1ULL << 3) |
859 | |
860 | /** @} */ |
861 | |
862 | /** |
863 | * @defgroup HS_TUNE_FLAG Tuning flags |
864 | * |
865 | * @{ |
866 | */ |
867 | |
868 | /** |
869 | * Tuning Parameter - Generic |
870 | * |
871 | * This indicates that the compiled database should not be tuned for any |
872 | * particular target platform. (Tuning values are identifiers, not bit masks.) |
873 | */ |
874 | #define HS_TUNE_FAMILY_GENERIC 0 |
875 | |
876 | /** |
877 | * Tuning Parameter - Intel(R) microarchitecture code name Sandy Bridge |
878 | * |
879 | * This indicates that the compiled database should be tuned for the |
880 | * Sandy Bridge microarchitecture. |
881 | */ |
882 | #define HS_TUNE_FAMILY_SNB 1 |
883 | |
884 | /** |
885 | * Tuning Parameter - Intel(R) microarchitecture code name Ivy Bridge |
886 | * |
887 | * This indicates that the compiled database should be tuned for the |
888 | * Ivy Bridge microarchitecture. |
889 | */ |
890 | #define HS_TUNE_FAMILY_IVB 2 |
891 | |
892 | /** |
893 | * Tuning Parameter - Intel(R) microarchitecture code name Haswell |
894 | * |
895 | * This indicates that the compiled database should be tuned for the |
896 | * Haswell microarchitecture. |
897 | */ |
898 | #define HS_TUNE_FAMILY_HSW 3 |
899 | |
900 | /** |
901 | * Tuning Parameter - Intel(R) microarchitecture code name Silvermont |
902 | * |
903 | * This indicates that the compiled database should be tuned for the |
904 | * Silvermont microarchitecture. |
905 | */ |
906 | #define HS_TUNE_FAMILY_SLM 4 |
907 | |
908 | /** |
909 | * Tuning Parameter - Intel(R) microarchitecture code name Broadwell |
910 | * |
911 | * This indicates that the compiled database should be tuned for the |
912 | * Broadwell microarchitecture. |
913 | */ |
914 | #define HS_TUNE_FAMILY_BDW 5 |
915 | |
916 | /** |
917 | * Tuning Parameter - Intel(R) microarchitecture code name Skylake |
918 | * |
919 | * This indicates that the compiled database should be tuned for the |
920 | * Skylake microarchitecture. |
921 | */ |
922 | #define HS_TUNE_FAMILY_SKL 6 |
923 | |
924 | /** |
925 | * Tuning Parameter - Intel(R) microarchitecture code name Skylake Server |
926 | * |
927 | * This indicates that the compiled database should be tuned for the |
928 | * Skylake Server microarchitecture. |
929 | */ |
930 | #define HS_TUNE_FAMILY_SKX 7 |
931 | |
932 | /** |
933 | * Tuning Parameter - Intel(R) microarchitecture code name Goldmont |
934 | * |
935 | * This indicates that the compiled database should be tuned for the |
936 | * Goldmont microarchitecture. |
937 | */ |
938 | #define HS_TUNE_FAMILY_GLM 8 |
939 | |
940 | /** @} */ |
941 | |
942 | /** |
943 | * @defgroup HS_MODE_FLAG Compile mode flags |
944 | * |
945 | * The mode flags are used as values for the mode parameter of the various |
946 | * compile calls (@ref hs_compile(), @ref hs_compile_multi() and @ref |
947 | * hs_compile_ext_multi()). |
948 | * |
949 | * A mode value can be built by ORing these flag values together; the only |
950 | * required flag is one of @ref HS_MODE_BLOCK, @ref HS_MODE_STREAM or @ref |
951 | * HS_MODE_VECTORED. Other flags may be added to enable support for additional |
952 | * features. |
953 | * |
954 | * @{ |
955 | */ |
956 | |
957 | /** |
958 | * Compiler mode flag: Block scan (non-streaming) database. |
959 | */ |
960 | #define HS_MODE_BLOCK 1 |
961 | |
962 | /** |
963 | * Compiler mode flag: Alias for @ref HS_MODE_BLOCK. |
964 | */ |
965 | #define HS_MODE_NOSTREAM 1 |
966 | |
967 | /** |
968 | * Compiler mode flag: Streaming database. |
969 | */ |
970 | #define HS_MODE_STREAM 2 |
971 | |
972 | /** |
973 | * Compiler mode flag: Vectored scanning database. |
974 | */ |
975 | #define HS_MODE_VECTORED 4 |
976 | |
977 | /** |
978 | * Compiler mode flag: Use full precision to track start of match offsets in |
979 | * stream state. |
980 | * |
981 | * This mode will use the most stream state per pattern, but will always return |
982 | * an accurate start of match offset regardless of how far back in the past it |
983 | * was found. |
984 | * |
985 | * One of the SOM_HORIZON modes must be selected to use the @ref |
986 | * HS_FLAG_SOM_LEFTMOST expression flag. |
987 | */ |
988 | #define HS_MODE_SOM_HORIZON_LARGE (1U << 24) |
989 | |
990 | /** |
991 | * Compiler mode flag: Use medium precision to track start of match offsets in |
992 | * stream state. |
993 | * |
994 | * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and |
995 | * will limit start of match accuracy to offsets within 2^32 bytes (4 GiB) of |
996 | * the end of match offset reported. |
997 | * |
998 | * One of the SOM_HORIZON modes must be selected to use the @ref |
999 | * HS_FLAG_SOM_LEFTMOST expression flag. |
1000 | */ |
1001 | #define HS_MODE_SOM_HORIZON_MEDIUM (1U << 25) |
1002 | |
1003 | /** |
1004 | * Compiler mode flag: Use limited precision to track start of match offsets in |
1005 | * stream state. |
1006 | * |
1007 | * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and |
1008 | * will limit start of match accuracy to offsets within 2^16 bytes (64 KiB) of |
1009 | * the end of match offset reported. |
1010 | * |
1011 | * One of the SOM_HORIZON modes must be selected to use the @ref |
1012 | * HS_FLAG_SOM_LEFTMOST expression flag. |
1013 | */ |
1014 | #define HS_MODE_SOM_HORIZON_SMALL (1U << 26) |
1015 | |
1016 | /** @} */ |
1017 | |
1018 | #ifdef __cplusplus |
1019 | } /* extern "C" */ |
1020 | #endif |
1021 | |
1022 | #endif /* HS_COMPILE_H_ */ |
1023 | |