1/*
2 * Copyright (c) 2015-2018, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#ifndef HS_COMPILE_H_
30#define HS_COMPILE_H_
31
/**
 * @file
 * @brief The Hyperscan compiler API definition.
 *
 * Hyperscan is a high speed regular expression engine.
 *
 * This header contains functions for compiling regular expressions into
 * Hyperscan databases that can be used by the Hyperscan runtime.
 */
41
42#include "hs_common.h"
43
44#ifdef __cplusplus
45extern "C"
46{
47#endif
48
/**
 * A type containing error details that is returned by the compile calls (@ref
 * hs_compile(), @ref hs_compile_multi() and @ref hs_compile_ext_multi()) on
 * failure. The caller may inspect the values returned in this type to
 * determine the cause of failure.
 *
 * Common errors generated during the compile process include:
 *
 *    - *Invalid parameter*
 *
 *      An invalid argument was specified in the compile call.
 *
 *    - *Unrecognised flag*
 *
 *      An unrecognised value was passed in the flags argument.
 *
 *    - *Pattern matches empty buffer*
 *
 *      By default, Hyperscan only supports patterns that will *always*
 *      consume at least one byte of input. Patterns that do not have this
 *      property (such as `/(abc)?/`) will produce this error unless
 *      the @ref HS_FLAG_ALLOWEMPTY flag is supplied. Note that such
 *      patterns will produce a match for *every* byte when scanned.
 *
 *    - *Embedded anchors not supported*
 *
 *      Hyperscan only supports the use of anchor meta-characters (such as
 *      `^` and `$`) in patterns where they could *only* match
 *      at the start or end of a buffer. A pattern containing an embedded
 *      anchor, such as `/abc^def/`, can never match, as there is no
 *      way for `abc` to precede the start of the data stream.
 *
 *    - *Bounded repeat is too large*
 *
 *      The pattern contains a repeated construct with very large finite
 *      bounds.
 *
 *    - *Unsupported component type*
 *
 *      An unsupported PCRE construct was used in the pattern.
 *
 *    - *Unable to generate bytecode*
 *
 *      This error indicates that Hyperscan was unable to compile a pattern
 *      that is syntactically valid. The most common cause is a pattern that is
 *      very long and complex or contains a large repeated subpattern.
 *
 *    - *Unable to allocate memory*
 *
 *      The library was unable to allocate temporary storage used during
 *      compilation time.
 *
 *    - *Allocator returned misaligned memory*
 *
 *      The memory allocator (either malloc() or the allocator set with @ref
 *      hs_set_allocator()) did not correctly return memory suitably aligned
 *      for the largest representable data type on this platform.
 *
 *    - *Internal error*
 *
 *      An unexpected error occurred: if this error is reported, please contact
 *      the Hyperscan team with a description of the situation.
 */
typedef struct hs_compile_error {
    /**
     * A human-readable error message describing the error.
     */
    char *message;

    /**
     * The zero-based number of the expression that caused the error (if this
     * can be determined). If the error is not specific to an expression, then
     * this value will be less than zero.
     */
    int expression;
} hs_compile_error_t;
125
/**
 * A type containing information on the target platform which may optionally be
 * provided to the compile calls (@ref hs_compile(), @ref hs_compile_multi(),
 * @ref hs_compile_ext_multi()).
 *
 * A hs_platform_info structure may be populated for the current platform by
 * using the @ref hs_populate_platform() call.
 */
typedef struct hs_platform_info {
    /**
     * Information about the target platform which may be used to guide the
     * optimisation process of the compile.
     *
     * Use of this field does not limit the processors that the resulting
     * database can run on, but may impact the performance of the resulting
     * database.
     */
    unsigned int tune;

    /**
     * Relevant CPU features available on the target platform.
     *
     * This value may be produced by combining HS_CPU_FEATURES_* flags (such as
     * @ref HS_CPU_FEATURES_AVX2). Multiple CPU features may be ORed together
     * to produce the value.
     */
    unsigned long long cpu_features;

    /**
     * Reserved for future use.
     */
    unsigned long long reserved1;

    /**
     * Reserved for future use.
     */
    unsigned long long reserved2;
} hs_platform_info_t;
164
/**
 * A type containing information related to an expression that is returned by
 * @ref hs_expression_info() or @ref hs_expression_ext_info().
 */
typedef struct hs_expr_info {
    /**
     * The minimum length in bytes of a match for the pattern.
     *
     * Note: in some cases when using advanced features to suppress matches
     * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this
     * may represent a conservative lower bound for the true minimum length of
     * a match.
     */
    unsigned int min_width;

    /**
     * The maximum length in bytes of a match for the pattern. If the pattern
     * has an unbounded maximum length, this will be set to the maximum value
     * of an unsigned int (UINT_MAX).
     *
     * Note: in some cases when using advanced features to suppress matches
     * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this
     * may represent a conservative upper bound for the true maximum length of
     * a match.
     */
    unsigned int max_width;

    /**
     * Whether this expression can produce matches that are not returned in
     * order, such as those produced by assertions. Zero if false, non-zero if
     * true.
     */
    char unordered_matches;

    /**
     * Whether this expression can produce matches at end of data (EOD). In
     * streaming mode, EOD matches are raised during @ref hs_close_stream(),
     * since it is only when @ref hs_close_stream() is called that the EOD
     * location is known. Zero if false, non-zero if true.
     *
     * Note: trailing `\b` word boundary assertions may also result in EOD
     * matches as end-of-data can act as a word boundary.
     */
    char matches_at_eod;

    /**
     * Whether this expression can *only* produce matches at end of data (EOD).
     * In streaming mode, all matches for this expression are raised during
     * @ref hs_close_stream(). Zero if false, non-zero if true.
     */
    char matches_only_at_eod;
} hs_expr_info_t;
217
/**
 * A structure containing additional parameters related to an expression,
 * passed in at build time to @ref hs_compile_ext_multi() or @ref
 * hs_expression_ext_info().
 *
 * These parameters allow the set of matches produced by a pattern to be
 * constrained at compile time, rather than relying on the application to
 * process unwanted matches at runtime.
 */
typedef struct hs_expr_ext {
    /**
     * Flags governing which parts of this structure are to be used by the
     * compiler. See @ref HS_EXT_FLAG.
     */
    unsigned long long flags;

    /**
     * The minimum end offset in the data stream at which this expression
     * should match successfully. To use this parameter, set the
     * @ref HS_EXT_FLAG_MIN_OFFSET flag in the hs_expr_ext::flags field.
     */
    unsigned long long min_offset;

    /**
     * The maximum end offset in the data stream at which this expression
     * should match successfully. To use this parameter, set the
     * @ref HS_EXT_FLAG_MAX_OFFSET flag in the hs_expr_ext::flags field.
     */
    unsigned long long max_offset;

    /**
     * The minimum match length (from start to end) required to successfully
     * match this expression. To use this parameter, set the
     * @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field.
     */
    unsigned long long min_length;

    /**
     * Allow patterns to approximately match within this edit distance. To use
     * this parameter, set the @ref HS_EXT_FLAG_EDIT_DISTANCE flag in the
     * hs_expr_ext::flags field.
     */
    unsigned int edit_distance;

    /**
     * Allow patterns to approximately match within this Hamming distance. To
     * use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the
     * hs_expr_ext::flags field.
     */
    unsigned int hamming_distance;
} hs_expr_ext_t;
269
/**
 * @defgroup HS_EXT_FLAG hs_expr_ext_t flags
 *
 * These flags are used in @ref hs_expr_ext_t::flags to indicate which fields
 * are used. Each flag occupies a distinct bit, so several may be ORed
 * together.
 *
 * @{
 */

/** Flag indicating that the hs_expr_ext::min_offset field is used. */
#define HS_EXT_FLAG_MIN_OFFSET 1ULL

/** Flag indicating that the hs_expr_ext::max_offset field is used. */
#define HS_EXT_FLAG_MAX_OFFSET 2ULL

/** Flag indicating that the hs_expr_ext::min_length field is used. */
#define HS_EXT_FLAG_MIN_LENGTH 4ULL

/** Flag indicating that the hs_expr_ext::edit_distance field is used. */
#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL

/** Flag indicating that the hs_expr_ext::hamming_distance field is used. */
#define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL

/** @} */
295
296/**
297 * The basic regular expression compiler.
298 *
299 * This is the function call with which an expression is compiled into a
300 * Hyperscan database which can be passed to the runtime functions (such as
301 * @ref hs_scan(), @ref hs_open_stream(), etc.)
302 *
303 * @param expression
304 * The NULL-terminated expression to parse. Note that this string must
305 * represent ONLY the pattern to be matched, with no delimiters or flags;
306 * any global flags should be specified with the @p flags argument. For
307 * example, the expression `/abc?def/i` should be compiled by providing
308 * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a
309 * flags.
310 *
311 * @param flags
312 * Flags which modify the behaviour of the expression. Multiple flags may
313 * be used by ORing them together. Valid values are:
314 * - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
315 * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
316 * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
317 * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the
318 * expression per stream.
319 * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
320 * empty string, such as `.*`.
321 * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
322 * - HS_FLAG_UCP - Use Unicode properties for character classes.
323 * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
324 * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
325 * when a match is found.
326 *
327 * @param mode
328 * Compiler mode flags that affect the database as a whole. One of @ref
329 * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
330 * supplied, to select between the generation of a streaming, block or
331 * vectored database. In addition, other flags (beginning with HS_MODE_)
332 * may be supplied to enable specific features. See @ref HS_MODE_FLAG for
333 * more details.
334 *
335 * @param platform
336 * If not NULL, the platform structure is used to determine the target
337 * platform for the database. If NULL, a database suitable for running
338 * on the current host platform is produced.
339 *
340 * @param db
341 * On success, a pointer to the generated database will be returned in
342 * this parameter, or NULL on failure. The caller is responsible for
343 * deallocating the buffer using the @ref hs_free_database() function.
344 *
345 * @param error
346 * If the compile fails, a pointer to a @ref hs_compile_error_t will be
347 * returned, providing details of the error condition. The caller is
348 * responsible for deallocating the buffer using the @ref
349 * hs_free_compile_error() function.
350 *
351 * @return
352 * @ref HS_SUCCESS is returned on successful compilation; @ref
353 * HS_COMPILER_ERROR on failure, with details provided in the error
354 * parameter.
355 */
356hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags,
357 unsigned int mode,
358 const hs_platform_info_t *platform,
359 hs_database_t **db, hs_compile_error_t **error);
360
361/**
362 * The multiple regular expression compiler.
363 *
364 * This is the function call with which a set of expressions is compiled into a
365 * database which can be passed to the runtime functions (such as @ref
366 * hs_scan(), @ref hs_open_stream(), etc.) Each expression can be labelled with
367 * a unique integer which is passed into the match callback to identify the
368 * pattern that has matched.
369 *
370 * @param expressions
371 * Array of NULL-terminated expressions to compile. Note that (as for @ref
372 * hs_compile()) these strings must contain only the pattern to be
373 * matched, with no delimiters or flags. For example, the expression
374 * `/abc?def/i` should be compiled by providing `abc?def` as the first
375 * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the
376 * first value in the @p flags array.
377 *
378 * @param flags
379 * Array of flags which modify the behaviour of each expression. Multiple
380 * flags may be used by ORing them together. Specifying the NULL pointer
381 * in place of an array will set the flags value for all patterns to zero.
382 * Valid values are:
383 * - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
384 * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
385 * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
386 * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns
387 * with this match id per stream.
388 * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
389 * empty string, such as `.*`.
390 * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
391 * - HS_FLAG_UCP - Use Unicode properties for character classes.
392 * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
393 * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
394 * when a match is found.
395 *
396 * @param ids
397 * An array of integers specifying the ID number to be associated with the
398 * corresponding pattern in the expressions array. Specifying the NULL
399 * pointer in place of an array will set the ID value for all patterns to
400 * zero.
401 *
402 * @param elements
403 * The number of elements in the input arrays.
404 *
405 * @param mode
406 * Compiler mode flags that affect the database as a whole. One of @ref
407 * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
408 * supplied, to select between the generation of a streaming, block or
409 * vectored database. In addition, other flags (beginning with HS_MODE_)
410 * may be supplied to enable specific features. See @ref HS_MODE_FLAG for
411 * more details.
412 *
413 * @param platform
414 * If not NULL, the platform structure is used to determine the target
415 * platform for the database. If NULL, a database suitable for running
416 * on the current host platform is produced.
417 *
418 * @param db
419 * On success, a pointer to the generated database will be returned in
420 * this parameter, or NULL on failure. The caller is responsible for
421 * deallocating the buffer using the @ref hs_free_database() function.
422 *
423 * @param error
424 * If the compile fails, a pointer to a @ref hs_compile_error_t will be
425 * returned, providing details of the error condition. The caller is
426 * responsible for deallocating the buffer using the @ref
427 * hs_free_compile_error() function.
428 *
429 * @return
430 * @ref HS_SUCCESS is returned on successful compilation; @ref
431 * HS_COMPILER_ERROR on failure, with details provided in the @p error
432 * parameter.
433 *
434 */
435hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions,
436 const unsigned int *flags,
437 const unsigned int *ids,
438 unsigned int elements, unsigned int mode,
439 const hs_platform_info_t *platform,
440 hs_database_t **db,
441 hs_compile_error_t **error);
442
443/**
444 * The multiple regular expression compiler with extended parameter support.
445 *
446 * This function call compiles a group of expressions into a database in the
447 * same way as @ref hs_compile_multi(), but allows additional parameters to be
448 * specified via an @ref hs_expr_ext_t structure per expression.
449 *
450 * @param expressions
451 * Array of NULL-terminated expressions to compile. Note that (as for @ref
452 * hs_compile()) these strings must contain only the pattern to be
453 * matched, with no delimiters or flags. For example, the expression
454 * `/abc?def/i` should be compiled by providing `abc?def` as the first
455 * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the
456 * first value in the @p flags array.
457 *
458 * @param flags
459 * Array of flags which modify the behaviour of each expression. Multiple
460 * flags may be used by ORing them together. Specifying the NULL pointer
461 * in place of an array will set the flags value for all patterns to zero.
462 * Valid values are:
463 * - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
464 * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
465 * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
466 * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns
467 * with this match id per stream.
468 * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
469 * empty string, such as `.*`.
470 * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
471 * - HS_FLAG_UCP - Use Unicode properties for character classes.
472 * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
473 * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
474 * when a match is found.
475 *
476 * @param ids
477 * An array of integers specifying the ID number to be associated with the
478 * corresponding pattern in the expressions array. Specifying the NULL
479 * pointer in place of an array will set the ID value for all patterns to
480 * zero.
481 *
482 * @param ext
483 * An array of pointers to filled @ref hs_expr_ext_t structures that
484 * define extended behaviour for each pattern. NULL may be specified if no
485 * extended behaviour is needed for an individual pattern, or in place of
486 * the whole array if it is not needed for any expressions. Memory used by
487 * these structures must be both allocated and freed by the caller.
488 *
489 * @param elements
490 * The number of elements in the input arrays.
491 *
492 * @param mode
493 * Compiler mode flags that affect the database as a whole. One of @ref
494 * HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
495 * supplied, to select between the generation of a streaming, block or
496 * vectored database. In addition, other flags (beginning with HS_MODE_)
497 * may be supplied to enable specific features. See @ref HS_MODE_FLAG for
498 * more details.
499 *
500 * @param platform
501 * If not NULL, the platform structure is used to determine the target
502 * platform for the database. If NULL, a database suitable for running
503 * on the current host platform is produced.
504 *
505 * @param db
506 * On success, a pointer to the generated database will be returned in
507 * this parameter, or NULL on failure. The caller is responsible for
508 * deallocating the buffer using the @ref hs_free_database() function.
509 *
510 * @param error
511 * If the compile fails, a pointer to a @ref hs_compile_error_t will be
512 * returned, providing details of the error condition. The caller is
513 * responsible for deallocating the buffer using the @ref
514 * hs_free_compile_error() function.
515 *
516 * @return
517 * @ref HS_SUCCESS is returned on successful compilation; @ref
518 * HS_COMPILER_ERROR on failure, with details provided in the @p error
519 * parameter.
520 *
521 */
522hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions,
523 const unsigned int *flags,
524 const unsigned int *ids,
525 const hs_expr_ext_t *const *ext,
526 unsigned int elements, unsigned int mode,
527 const hs_platform_info_t *platform,
528 hs_database_t **db, hs_compile_error_t **error);
529
530/**
531 * Free an error structure generated by @ref hs_compile(), @ref
532 * hs_compile_multi() or @ref hs_compile_ext_multi().
533 *
534 * @param error
535 * The @ref hs_compile_error_t to be freed. NULL may also be safely
536 * provided.
537 *
538 * @return
539 * @ref HS_SUCCESS on success, other values on failure.
540 */
541hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error);
542
543/**
544 * Utility function providing information about a regular expression. The
545 * information provided in @ref hs_expr_info_t includes the minimum and maximum
546 * width of a pattern match.
547 *
548 * Note: successful analysis of an expression with this function does not imply
549 * that compilation of the same expression (via @ref hs_compile(), @ref
550 * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This
551 * function may return @ref HS_SUCCESS for regular expressions that Hyperscan
552 * cannot compile.
553 *
554 * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref
555 * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect
556 * the properties returned in the @ref hs_expr_info_t structure, they will not
557 * affect the outcome of this function.
558 *
559 * @param expression
560 * The NULL-terminated expression to parse. Note that this string must
561 * represent ONLY the pattern to be matched, with no delimiters or flags;
562 * any global flags should be specified with the @p flags argument. For
563 * example, the expression `/abc?def/i` should be compiled by providing
564 * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a
565 * flags.
566 *
567 * @param flags
568 * Flags which modify the behaviour of the expression. Multiple flags may
569 * be used by ORing them together. Valid values are:
570 * - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
571 * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
572 * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
573 * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
574 * expression per stream.
575 * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
576 * empty string, such as `.*`.
577 * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
578 * - HS_FLAG_UCP - Use Unicode properties for character classes.
579 * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
580 * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
581 * when a match is found.
582 *
583 * @param info
584 * On success, a pointer to the pattern information will be returned in
585 * this parameter, or NULL on failure. This structure is allocated using
586 * the allocator supplied in @ref hs_set_allocator() (or malloc() if no
587 * allocator was set) and should be freed by the caller.
588 *
589 * @param error
590 * If the call fails, a pointer to a @ref hs_compile_error_t will be
591 * returned, providing details of the error condition. The caller is
592 * responsible for deallocating the buffer using the @ref
593 * hs_free_compile_error() function.
594 *
595 * @return
596 * @ref HS_SUCCESS is returned on successful compilation; @ref
597 * HS_COMPILER_ERROR on failure, with details provided in the error
598 * parameter.
599 */
600hs_error_t HS_CDECL hs_expression_info(const char *expression,
601 unsigned int flags,
602 hs_expr_info_t **info,
603 hs_compile_error_t **error);
604
605/**
606 * Utility function providing information about a regular expression, with
607 * extended parameter support. The information provided in @ref hs_expr_info_t
608 * includes the minimum and maximum width of a pattern match.
609 *
610 * Note: successful analysis of an expression with this function does not imply
611 * that compilation of the same expression (via @ref hs_compile(), @ref
612 * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This
613 * function may return @ref HS_SUCCESS for regular expressions that Hyperscan
614 * cannot compile.
615 *
616 * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref
617 * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect
618 * the properties returned in the @ref hs_expr_info_t structure, they will not
619 * affect the outcome of this function.
620 *
621 * @param expression
622 * The NULL-terminated expression to parse. Note that this string must
623 * represent ONLY the pattern to be matched, with no delimiters or flags;
624 * any global flags should be specified with the @p flags argument. For
625 * example, the expression `/abc?def/i` should be compiled by providing
626 * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a
627 * flags.
628 *
629 * @param flags
630 * Flags which modify the behaviour of the expression. Multiple flags may
631 * be used by ORing them together. Valid values are:
632 * - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
633 * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
634 * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
635 * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
636 * expression per stream.
637 * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
638 * empty string, such as `.*`.
639 * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
640 * - HS_FLAG_UCP - Use Unicode properties for character classes.
641 * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
642 * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
643 * when a match is found.
644 *
645 * @param ext
646 * A pointer to a filled @ref hs_expr_ext_t structure that defines
647 * extended behaviour for this pattern. NULL may be specified if no
648 * extended parameters are needed.
649 *
650 * @param info
651 * On success, a pointer to the pattern information will be returned in
652 * this parameter, or NULL on failure. This structure is allocated using
653 * the allocator supplied in @ref hs_set_allocator() (or malloc() if no
654 * allocator was set) and should be freed by the caller.
655 *
656 * @param error
657 * If the call fails, a pointer to a @ref hs_compile_error_t will be
658 * returned, providing details of the error condition. The caller is
659 * responsible for deallocating the buffer using the @ref
660 * hs_free_compile_error() function.
661 *
662 * @return
663 * @ref HS_SUCCESS is returned on successful compilation; @ref
664 * HS_COMPILER_ERROR on failure, with details provided in the error
665 * parameter.
666 */
667hs_error_t HS_CDECL hs_expression_ext_info(const char *expression,
668 unsigned int flags,
669 const hs_expr_ext_t *ext,
670 hs_expr_info_t **info,
671 hs_compile_error_t **error);
672
673/**
674 * Populates the platform information based on the current host.
675 *
676 * @param platform
677 * On success, the pointed to structure is populated based on the current
678 * host.
679 *
680 * @return
681 * @ref HS_SUCCESS on success, other values on failure.
682 */
683hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
684
/**
 * @defgroup HS_PATTERN_FLAG Pattern flags
 *
 * @{
 */
690
/**
 * Compile flag: Set case-insensitive matching.
 *
 * This flag sets the expression to be matched case-insensitively by default.
 * The expression may still use PCRE tokens (notably `(?i)` and
 * `(?-i)`) to switch case-insensitive matching on and off.
 */
#define HS_FLAG_CASELESS 1
699
/**
 * Compile flag: Matching a `.` will not exclude newlines.
 *
 * This flag sets any instances of the `.` token to match newline characters as
 * well as all other characters. The PCRE specification states that the `.`
 * token does not match newline characters by default, so without this flag the
 * `.` token will not cross line boundaries.
 */
#define HS_FLAG_DOTALL 2
709
/**
 * Compile flag: Set multi-line anchoring.
 *
 * This flag instructs the expression to make the `^` and `$` tokens match
 * newline characters as well as the start and end of the stream. If this flag
 * is not specified, the `^` token will only ever match at the start of a
 * stream, and the `$` token will only ever match at the end of a stream within
 * the guidelines of the PCRE specification.
 */
#define HS_FLAG_MULTILINE 4
720
/**
 * Compile flag: Set single-match only mode.
 *
 * This flag sets the expression's match ID to match at most once. In streaming
 * mode, this means that the expression will return only a single match over
 * the lifetime of the stream, rather than reporting every match as per
 * standard Hyperscan semantics. In block mode or vectored mode, only the first
 * match for each invocation of @ref hs_scan() or @ref hs_scan_vector() will be
 * returned.
 *
 * If multiple expressions in the database share the same match ID, then
 * either all of them must specify @ref HS_FLAG_SINGLEMATCH or none of them
 * may specify it. If a group of expressions sharing a match ID
 * specify the flag, then at most one match with the match ID will be generated
 * per stream.
 *
 * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST
 * is not currently supported.
 */
#define HS_FLAG_SINGLEMATCH 8
741
/**
 * Compile flag: Allow expressions that can match against empty buffers.
 *
 * This flag instructs the compiler to allow expressions that can match against
 * empty buffers, such as `.?`, `.*`, `(a|)`. Since Hyperscan can return every
 * possible match for an expression, such expressions generally execute very
 * slowly; the default behaviour is to return an error when an attempt to
 * compile one is made. Using this flag will force the compiler to allow such
 * an expression.
 */
#define HS_FLAG_ALLOWEMPTY 16
753
/**
 * Compile flag: Enable UTF-8 mode for this expression.
 *
 * This flag instructs Hyperscan to treat the pattern as a sequence of UTF-8
 * characters. The results of scanning invalid UTF-8 sequences with a Hyperscan
 * database that has been compiled with one or more patterns using this flag
 * are undefined.
 */
#define HS_FLAG_UTF8 32
763
/**
 * Compile flag: Enable Unicode property support for this expression.
 *
 * This flag instructs Hyperscan to use Unicode properties, rather than the
 * default ASCII interpretations, for character mnemonics like `\w` and `\s` as
 * well as the POSIX character classes. It is only meaningful in conjunction
 * with @ref HS_FLAG_UTF8.
 */
#define HS_FLAG_UCP 64
773
/**
 * Compile flag: Enable prefiltering mode for this expression.
 *
 * This flag instructs Hyperscan to compile an "approximate" version of this
 * pattern for use in a prefiltering application, even if Hyperscan does not
 * support the pattern in normal operation.
 *
 * The set of matches returned when this flag is used is guaranteed to be a
 * superset of the matches specified by the non-prefiltering expression.
 *
 * If the pattern contains constructs not supported by Hyperscan (such as
 * zero-width assertions, back-references or conditional references) these
 * constructs will be replaced internally with broader constructs that may
 * match more often.
 *
 * Furthermore, in prefiltering mode Hyperscan may simplify a pattern that
 * would otherwise return a "Pattern too large" error at compile time, or for
 * performance reasons (subject to the matching guarantee above).
 *
 * It is generally expected that the application will subsequently confirm
 * prefilter matches with another regular expression matcher that can provide
 * exact matches for the pattern.
 *
 * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST
 * is not currently supported.
 */
#define HS_FLAG_PREFILTER 128
801
/**
 * Compile flag: Enable leftmost start of match reporting.
 *
 * This flag instructs Hyperscan to report the leftmost possible start of match
 * offset when a match is reported for this expression. (By default, no start
 * of match is returned.)
 *
 * Enabling this behaviour may reduce performance and increase stream state
 * requirements in streaming mode.
 *
 * One of the SOM_HORIZON mode flags (@ref HS_MODE_SOM_HORIZON_LARGE,
 * @ref HS_MODE_SOM_HORIZON_MEDIUM or @ref HS_MODE_SOM_HORIZON_SMALL) must be
 * selected in order to use this flag.
 *
 * Note: The use of this flag in combination with @ref HS_FLAG_SINGLEMATCH or
 * @ref HS_FLAG_PREFILTER is not currently supported.
 */
#define HS_FLAG_SOM_LEFTMOST 256
813
/**
 * Compile flag: Logical combination.
 *
 * This flag instructs Hyperscan to parse this expression as logical
 * combination syntax.
 * Logical constraints consist of operands, operators and parentheses.
 * The operands are expression indices, and operators can be
 * '!'(NOT), '&'(AND) or '|'(OR).
 * For example:
 * - `(101&102&103)|(104&!105)`
 * - `((301|302)&303)&(304|305)`
 */
#define HS_FLAG_COMBINATION 512
827
/**
 * Compile flag: Don't do any match reporting.
 *
 * This flag instructs Hyperscan to ignore match reporting for this expression.
 * It is designed to be used on the sub-expressions in logical combinations
 * (see @ref HS_FLAG_COMBINATION).
 */
#define HS_FLAG_QUIET 1024
835
/** @} */
837
/**
 * @defgroup HS_CPU_FEATURES_FLAG CPU feature support flags
 *
 * @{
 */
843
/**
 * CPU features flag - Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
 *
 * Setting this flag indicates that the target platform supports AVX2
 * instructions.
 */
#define HS_CPU_FEATURES_AVX2 (1ULL << 2)
851
/**
 * CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512)
 *
 * Setting this flag indicates that the target platform supports AVX512
 * instructions, specifically AVX-512BW. Using AVX512 implies the use of AVX2.
 */
#define HS_CPU_FEATURES_AVX512 (1ULL << 3)
859
/** @} */
861
/**
 * @defgroup HS_TUNE_FLAG Tuning flags
 *
 * @{
 */
867
/**
 * Tuning Parameter - Generic
 *
 * This indicates that the compiled database should not be tuned for any
 * particular target platform.
 */
#define HS_TUNE_FAMILY_GENERIC 0
875
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Sandy Bridge
 *
 * This indicates that the compiled database should be tuned for the
 * Sandy Bridge microarchitecture.
 */
#define HS_TUNE_FAMILY_SNB 1
883
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Ivy Bridge
 *
 * This indicates that the compiled database should be tuned for the
 * Ivy Bridge microarchitecture.
 */
#define HS_TUNE_FAMILY_IVB 2
891
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Haswell
 *
 * This indicates that the compiled database should be tuned for the
 * Haswell microarchitecture.
 */
#define HS_TUNE_FAMILY_HSW 3
899
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Silvermont
 *
 * This indicates that the compiled database should be tuned for the
 * Silvermont microarchitecture.
 */
#define HS_TUNE_FAMILY_SLM 4
907
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Broadwell
 *
 * This indicates that the compiled database should be tuned for the
 * Broadwell microarchitecture.
 */
#define HS_TUNE_FAMILY_BDW 5
915
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Skylake
 *
 * This indicates that the compiled database should be tuned for the
 * Skylake microarchitecture.
 */
#define HS_TUNE_FAMILY_SKL 6
923
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Skylake Server
 *
 * This indicates that the compiled database should be tuned for the
 * Skylake Server microarchitecture.
 */
#define HS_TUNE_FAMILY_SKX 7
931
/**
 * Tuning Parameter - Intel(R) microarchitecture code name Goldmont
 *
 * This indicates that the compiled database should be tuned for the
 * Goldmont microarchitecture.
 */
#define HS_TUNE_FAMILY_GLM 8
939
/** @} */
941
/**
 * @defgroup HS_MODE_FLAG Compile mode flags
 *
 * The mode flags are used as values for the mode parameter of the various
 * compile calls (@ref hs_compile(), @ref hs_compile_multi() and @ref
 * hs_compile_ext_multi()).
 *
 * A mode value can be built by ORing these flag values together; the only
 * required flag is one of @ref HS_MODE_BLOCK, @ref HS_MODE_STREAM or @ref
 * HS_MODE_VECTORED. Other flags may be added to enable support for additional
 * features.
 *
 * @{
 */
956
/**
 * Compiler mode flag: Block scan (non-streaming) database.
 */
#define HS_MODE_BLOCK 1
961
/**
 * Compiler mode flag: Alias for @ref HS_MODE_BLOCK (same value).
 */
#define HS_MODE_NOSTREAM 1
966
/**
 * Compiler mode flag: Streaming database.
 */
#define HS_MODE_STREAM 2
971
/**
 * Compiler mode flag: Vectored scanning database.
 */
#define HS_MODE_VECTORED 4
976
/**
 * Compiler mode flag: use full precision to track start of match offsets in
 * stream state.
 *
 * This mode will use the most stream state per pattern, but will always return
 * an accurate start of match offset regardless of how far back in the past it
 * was found.
 *
 * One of the SOM_HORIZON modes must be selected to use the @ref
 * HS_FLAG_SOM_LEFTMOST expression flag.
 */
#define HS_MODE_SOM_HORIZON_LARGE (1U << 24)
989
/**
 * Compiler mode flag: use medium precision to track start of match offsets in
 * stream state.
 *
 * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and
 * will limit start of match accuracy to offsets within 2^32 bytes of the
 * end of match offset reported.
 *
 * One of the SOM_HORIZON modes must be selected to use the @ref
 * HS_FLAG_SOM_LEFTMOST expression flag.
 */
#define HS_MODE_SOM_HORIZON_MEDIUM (1U << 25)
1002
/**
 * Compiler mode flag: use limited precision to track start of match offsets in
 * stream state.
 *
 * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and
 * will limit start of match accuracy to offsets within 2^16 bytes of the
 * end of match offset reported.
 *
 * One of the SOM_HORIZON modes must be selected to use the @ref
 * HS_FLAG_SOM_LEFTMOST expression flag.
 */
#define HS_MODE_SOM_HORIZON_SMALL (1U << 26)
1015
/** @} */
1017
1018#ifdef __cplusplus
1019} /* extern "C" */
1020#endif
1021
1022#endif /* HS_COMPILE_H_ */
1023