1 | /* |
2 | * Copyright (c) 2015-2018, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Compiler front-end interface. |
31 | */ |
32 | #include "allocator.h" |
33 | #include "asserts.h" |
34 | #include "compiler.h" |
35 | #include "crc32.h" |
36 | #include "database.h" |
37 | #include "grey.h" |
38 | #include "hs_internal.h" |
39 | #include "hs_runtime.h" |
40 | #include "ue2common.h" |
41 | #include "nfagraph/ng_builder.h" |
42 | #include "nfagraph/ng_dump.h" |
43 | #include "nfagraph/ng.h" |
44 | #include "nfagraph/ng_util.h" |
45 | #include "parser/buildstate.h" |
46 | #include "parser/dump.h" |
47 | #include "parser/Component.h" |
48 | #include "parser/logical_combination.h" |
49 | #include "parser/parse_error.h" |
50 | #include "parser/Parser.h" // for flags |
51 | #include "parser/position.h" |
52 | #include "parser/position_dump.h" |
53 | #include "parser/position_info.h" |
54 | #include "parser/prefilter.h" |
55 | #include "parser/shortcut_literal.h" |
56 | #include "parser/unsupported.h" |
57 | #include "parser/utf8_validate.h" |
58 | #include "rose/rose_build.h" |
59 | #include "som/slot_manager_dump.h" |
60 | #include "util/bytecode_ptr.h" |
61 | #include "util/compile_error.h" |
62 | #include "util/target_info.h" |
63 | #include "util/verify_types.h" |
64 | |
65 | #include <algorithm> |
66 | #include <cassert> |
67 | #include <cstdlib> |
68 | #include <cstring> |
69 | #include <fstream> |
70 | #include <memory> |
71 | #include <sstream> |
72 | |
73 | using namespace std; |
74 | |
75 | namespace ue2 { |
76 | |
77 | static |
78 | void validateExt(const hs_expr_ext &ext) { |
79 | static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET | |
80 | HS_EXT_FLAG_MAX_OFFSET | |
81 | HS_EXT_FLAG_MIN_LENGTH | |
82 | HS_EXT_FLAG_EDIT_DISTANCE | |
83 | HS_EXT_FLAG_HAMMING_DISTANCE; |
84 | if (ext.flags & ~ALL_EXT_FLAGS) { |
85 | throw CompileError("Invalid hs_expr_ext flag set." ); |
86 | } |
87 | |
88 | if ((ext.flags & HS_EXT_FLAG_MIN_OFFSET) && |
89 | (ext.flags & HS_EXT_FLAG_MAX_OFFSET) && |
90 | (ext.min_offset > ext.max_offset)) { |
91 | throw CompileError("In hs_expr_ext, min_offset must be less than or " |
92 | "equal to max_offset." ); |
93 | } |
94 | |
95 | if ((ext.flags & HS_EXT_FLAG_MIN_LENGTH) && |
96 | (ext.flags & HS_EXT_FLAG_MAX_OFFSET) && |
97 | (ext.min_length > ext.max_offset)) { |
98 | throw CompileError("In hs_expr_ext, min_length must be less than or " |
99 | "equal to max_offset." ); |
100 | } |
101 | |
102 | if ((ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) && |
103 | (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE)) { |
104 | throw CompileError("In hs_expr_ext, cannot have both edit distance and " |
105 | "Hamming distance." ); |
106 | } |
107 | |
108 | } |
109 | |
110 | ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, |
111 | unsigned flags, ReportID report, |
112 | const hs_expr_ext *ext) |
113 | : expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH, |
114 | false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET, |
115 | 0, 0, 0, flags & HS_FLAG_QUIET) { |
116 | // We disallow SOM + Quiet. |
117 | if ((flags & HS_FLAG_QUIET) && (flags & HS_FLAG_SOM_LEFTMOST)) { |
118 | throw CompileError("HS_FLAG_QUIET is not supported in " |
119 | "combination with HS_FLAG_SOM_LEFTMOST." ); |
120 | } |
121 | flags &= ~HS_FLAG_QUIET; |
122 | ParseMode mode(flags); |
123 | |
124 | component = parse(expression, mode); |
125 | |
126 | expr.utf8 = mode.utf8; /* utf8 may be set by parse() */ |
127 | |
128 | const size_t len = strlen(expression); |
129 | if (expr.utf8 && !isValidUtf8(expression, len)) { |
130 | throw ParseError("Expression is not valid UTF-8." ); |
131 | } |
132 | |
133 | if (!component) { |
134 | assert(0); // parse() should have thrown a ParseError. |
135 | throw ParseError("Parse error." ); |
136 | } |
137 | |
138 | if (flags & ~HS_FLAG_ALL) { |
139 | DEBUG_PRINTF("Unrecognised flag, flags=%u.\n" , flags); |
140 | throw CompileError("Unrecognised flag." ); |
141 | } |
142 | |
143 | // FIXME: we disallow highlander + SOM, see UE-1850. |
144 | if ((flags & HS_FLAG_SINGLEMATCH) && (flags & HS_FLAG_SOM_LEFTMOST)) { |
145 | throw CompileError("HS_FLAG_SINGLEMATCH is not supported in " |
146 | "combination with HS_FLAG_SOM_LEFTMOST." ); |
147 | } |
148 | |
149 | // FIXME: we disallow prefilter + SOM, see UE-1899. |
150 | if ((flags & HS_FLAG_PREFILTER) && (flags & HS_FLAG_SOM_LEFTMOST)) { |
151 | throw CompileError("HS_FLAG_PREFILTER is not supported in " |
152 | "combination with HS_FLAG_SOM_LEFTMOST." ); |
153 | } |
154 | |
155 | // Set SOM type. |
156 | if (flags & HS_FLAG_SOM_LEFTMOST) { |
157 | expr.som = SOM_LEFT; |
158 | } |
159 | |
160 | // Set extended parameters, if we have them. |
161 | if (ext) { |
162 | // Ensure that the given parameters make sense. |
163 | validateExt(*ext); |
164 | |
165 | if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) { |
166 | expr.min_offset = ext->min_offset; |
167 | } |
168 | if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) { |
169 | expr.max_offset = ext->max_offset; |
170 | } |
171 | if (ext->flags & HS_EXT_FLAG_MIN_LENGTH) { |
172 | expr.min_length = ext->min_length; |
173 | } |
174 | if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) { |
175 | expr.edit_distance = ext->edit_distance; |
176 | } |
177 | if (ext->flags & HS_EXT_FLAG_HAMMING_DISTANCE) { |
178 | expr.hamm_distance = ext->hamming_distance; |
179 | } |
180 | } |
181 | |
182 | // These are validated in validateExt, so an error will already have been |
183 | // thrown if these conditions don't hold. |
184 | assert(expr.max_offset >= expr.min_offset); |
185 | assert(expr.max_offset >= expr.min_length); |
186 | |
187 | // Since prefiltering and SOM aren't supported together, we must squash any |
188 | // min_length constraint as well. |
189 | if (flags & HS_FLAG_PREFILTER && expr.min_length) { |
190 | DEBUG_PRINTF("prefiltering mode: squashing min_length constraint\n" ); |
191 | expr.min_length = 0; |
192 | } |
193 | } |
194 | |
195 | #if defined(DUMP_SUPPORT) || defined(DEBUG) |
196 | /** |
197 | * \brief Dumps the parse tree to screen in debug mode and to disk in dump |
198 | * mode. |
199 | */ |
200 | void dumpExpression(UNUSED const ParsedExpression &pe, |
201 | UNUSED const char *stage, UNUSED const Grey &grey) { |
202 | #if defined(DEBUG) |
203 | DEBUG_PRINTF("===== Rule ID: %u (expression index: %u) =====\n" , |
204 | pe.expr.report, pe.expr.index); |
205 | ostringstream debug_tree; |
206 | dumpTree(debug_tree, pe.component.get()); |
207 | printf("%s\n" , debug_tree.str().c_str()); |
208 | #endif // DEBUG |
209 | |
210 | #if defined(DUMP_SUPPORT) |
211 | if (grey.dumpFlags & Grey::DUMP_PARSE) { |
212 | stringstream ss; |
213 | ss << grey.dumpPath << "Expr_" << pe.expr.index << "_componenttree_" |
214 | << stage << ".txt" ; |
215 | ofstream out(ss.str().c_str()); |
216 | out << "Component Tree for " << pe.expr.report << endl; |
217 | dumpTree(out, pe.component.get()); |
218 | if (pe.expr.utf8) { |
219 | out << "UTF8 mode" << endl; |
220 | } |
221 | } |
222 | #endif // DEBUG |
223 | } |
224 | #endif |
225 | |
226 | /** \brief Run Component tree optimisations on \a expr. */ |
227 | static |
228 | void optimise(ParsedExpression &pe) { |
229 | if (pe.expr.min_length || pe.expr.som) { |
230 | return; |
231 | } |
232 | |
233 | DEBUG_PRINTF("optimising\n" ); |
234 | pe.component->optimise(true /* root is connected to sds */); |
235 | } |
236 | |
237 | void addExpression(NG &ng, unsigned index, const char *expression, |
238 | unsigned flags, const hs_expr_ext *ext, ReportID id) { |
239 | assert(expression); |
240 | const CompileContext &cc = ng.cc; |
241 | DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%s'\n" , index, id, flags, |
242 | expression); |
243 | |
244 | if (flags & HS_FLAG_COMBINATION) { |
245 | if (flags & ~(HS_FLAG_COMBINATION | HS_FLAG_QUIET | |
246 | HS_FLAG_SINGLEMATCH)) { |
247 | throw CompileError("only HS_FLAG_QUIET and HS_FLAG_SINGLEMATCH " |
248 | "are supported in combination " |
249 | "with HS_FLAG_COMBINATION." ); |
250 | } |
251 | if (flags & HS_FLAG_QUIET) { |
252 | DEBUG_PRINTF("skip QUIET logical combination expression %u\n" , id); |
253 | } else { |
254 | u32 ekey = INVALID_EKEY; |
255 | u64a min_offset = 0; |
256 | u64a max_offset = MAX_OFFSET; |
257 | if (flags & HS_FLAG_SINGLEMATCH) { |
258 | ekey = ng.rm.getExhaustibleKey(id); |
259 | } |
260 | if (ext) { |
261 | validateExt(*ext); |
262 | if (ext->flags & ~(HS_EXT_FLAG_MIN_OFFSET | |
263 | HS_EXT_FLAG_MAX_OFFSET)) { |
264 | throw CompileError("only HS_EXT_FLAG_MIN_OFFSET and " |
265 | "HS_EXT_FLAG_MAX_OFFSET extra flags " |
266 | "are supported in combination " |
267 | "with HS_FLAG_COMBINATION." ); |
268 | } |
269 | if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) { |
270 | min_offset = ext->min_offset; |
271 | } |
272 | if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) { |
273 | max_offset = ext->max_offset; |
274 | } |
275 | } |
276 | ng.rm.pl.parseLogicalCombination(id, expression, ekey, min_offset, |
277 | max_offset); |
278 | DEBUG_PRINTF("parsed logical combination expression %u\n" , id); |
279 | } |
280 | return; |
281 | } |
282 | |
283 | // Ensure that our pattern isn't too long (in characters). |
284 | if (strlen(expression) > cc.grey.limitPatternLength) { |
285 | throw CompileError("Pattern length exceeds limit." ); |
286 | } |
287 | |
288 | // Do per-expression processing: errors here will result in an exception |
289 | // being thrown up to our caller |
290 | ParsedExpression pe(index, expression, flags, id, ext); |
291 | dumpExpression(pe, "orig" , cc.grey); |
292 | |
293 | // Apply prefiltering transformations if desired. |
294 | if (pe.expr.prefilter) { |
295 | prefilterTree(pe.component, ParseMode(flags)); |
296 | dumpExpression(pe, "prefiltered" , cc.grey); |
297 | } |
298 | |
299 | // Expressions containing zero-width assertions and other extended pcre |
300 | // types aren't supported yet. This call will throw a ParseError exception |
301 | // if the component tree contains such a construct. |
302 | checkUnsupported(*pe.component); |
303 | |
304 | pe.component->checkEmbeddedStartAnchor(true); |
305 | pe.component->checkEmbeddedEndAnchor(true); |
306 | |
307 | if (cc.grey.optimiseComponentTree) { |
308 | optimise(pe); |
309 | dumpExpression(pe, "opt" , cc.grey); |
310 | } |
311 | |
312 | DEBUG_PRINTF("component=%p, nfaId=%u, reportId=%u\n" , |
313 | pe.component.get(), pe.expr.index, pe.expr.report); |
314 | |
315 | // You can only use the SOM flags if you've also specified an SOM |
316 | // precision mode. |
317 | if (pe.expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) { |
318 | throw CompileError("To use a SOM expression flag in streaming mode, " |
319 | "an SOM precision mode (e.g. " |
320 | "HS_MODE_SOM_HORIZON_LARGE) must be specified." ); |
321 | } |
322 | |
323 | // If this expression is a literal, we can feed it directly to Rose rather |
324 | // than building the NFA graph. |
325 | if (shortcutLiteral(ng, pe)) { |
326 | DEBUG_PRINTF("took literal short cut\n" ); |
327 | return; |
328 | } |
329 | |
330 | auto built_expr = buildGraph(ng.rm, cc, pe); |
331 | if (!built_expr.g) { |
332 | DEBUG_PRINTF("NFA build failed on ID %u, but no exception was " |
333 | "thrown.\n" , pe.expr.report); |
334 | throw CompileError("Internal error." ); |
335 | } |
336 | |
337 | if (!pe.expr.allow_vacuous && matches_everywhere(*built_expr.g)) { |
338 | throw CompileError("Pattern matches empty buffer; use " |
339 | "HS_FLAG_ALLOWEMPTY to enable support." ); |
340 | } |
341 | |
342 | if (!ng.addGraph(built_expr.expr, std::move(built_expr.g))) { |
343 | DEBUG_PRINTF("NFA addGraph failed on ID %u.\n" , pe.expr.report); |
344 | throw CompileError("Error compiling expression." ); |
345 | } |
346 | } |
347 | |
348 | static |
349 | bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) { |
350 | const u32 minWidth = |
351 | ng.minWidth.is_finite() ? verify_u32(ng.minWidth) : ROSE_BOUND_INF; |
352 | auto rose = ng.rose->buildRose(minWidth); |
353 | |
354 | if (!rose) { |
355 | DEBUG_PRINTF("error building rose\n" ); |
356 | assert(0); |
357 | return nullptr; |
358 | } |
359 | |
360 | dumpReportManager(ng.rm, ng.cc.grey); |
361 | dumpSomSlotManager(ng.ssm, ng.cc.grey); |
362 | dumpSmallWrite(rose.get(), ng.cc.grey); |
363 | |
364 | return rose; |
365 | } |
366 | |
367 | platform_t target_to_platform(const target_t &target_info) { |
368 | platform_t p; |
369 | p = 0; |
370 | |
371 | if (!target_info.has_avx2()) { |
372 | p |= HS_PLATFORM_NOAVX2; |
373 | } |
374 | if (!target_info.has_avx512()) { |
375 | p |= HS_PLATFORM_NOAVX512; |
376 | } |
377 | return p; |
378 | } |
379 | |
380 | /** \brief Encapsulate the given bytecode (RoseEngine) in a newly-allocated |
381 | * \ref hs_database, ensuring that it is padded correctly to give cacheline |
382 | * alignment. */ |
383 | static |
384 | hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) { |
385 | size_t db_len = sizeof(struct hs_database) + len; |
386 | DEBUG_PRINTF("db size %zu\n" , db_len); |
387 | DEBUG_PRINTF("db platform %llx\n" , platform); |
388 | |
389 | struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len); |
390 | if (hs_check_alloc(db) != HS_SUCCESS) { |
391 | hs_database_free(db); |
392 | return nullptr; |
393 | } |
394 | |
395 | // So that none of our database is uninitialized |
396 | memset(db, 0, db_len); |
397 | |
398 | // we need to align things manually |
399 | size_t shift = (uintptr_t)db->bytes & 0x3f; |
400 | DEBUG_PRINTF("shift is %zu\n" , shift); |
401 | |
402 | db->bytecode = offsetof(struct hs_database, bytes) - shift; |
403 | char *bytecode = (char *)db + db->bytecode; |
404 | assert(ISALIGNED_CL(bytecode)); |
405 | |
406 | db->magic = HS_DB_MAGIC; |
407 | db->version = HS_DB_VERSION; |
408 | db->length = len; |
409 | db->platform = platform; |
410 | |
411 | // Copy bytecode |
412 | memcpy(bytecode, in_bytecode, len); |
413 | |
414 | db->crc32 = Crc32c_ComputeBuf(0, bytecode, db->length); |
415 | return db; |
416 | } |
417 | |
418 | |
419 | struct hs_database *build(NG &ng, unsigned int *length) { |
420 | assert(length); |
421 | |
422 | auto rose = generateRoseEngine(ng); |
423 | if (!rose) { |
424 | throw CompileError("Unable to generate bytecode." ); |
425 | } |
426 | *length = rose.size(); |
427 | if (!*length) { |
428 | DEBUG_PRINTF("RoseEngine has zero length\n" ); |
429 | assert(0); |
430 | throw CompileError("Internal error." ); |
431 | } |
432 | |
433 | const char *bytecode = (const char *)(rose.get()); |
434 | const platform_t p = target_to_platform(ng.cc.target_info); |
435 | struct hs_database *db = dbCreate(bytecode, *length, p); |
436 | if (!db) { |
437 | throw CompileError("Could not allocate memory for bytecode." ); |
438 | } |
439 | |
440 | return db; |
441 | } |
442 | |
443 | static |
444 | void stripFromPositions(vector<PositionInfo> &v, Position pos) { |
445 | auto removed = remove(v.begin(), v.end(), PositionInfo(pos)); |
446 | v.erase(removed, v.end()); |
447 | } |
448 | |
449 | static |
450 | void connectInitialStates(GlushkovBuildState &bs, |
451 | const ParsedExpression &expr) { |
452 | vector<PositionInfo> initials = expr.component->first(); |
453 | const NFABuilder &builder = bs.getBuilder(); |
454 | const Position startState = builder.getStart(); |
455 | const Position startDotStarState = builder.getStartDotStar(); |
456 | |
457 | DEBUG_PRINTF("wiring initials = %s\n" , |
458 | dumpPositions(initials.begin(), initials.end()).c_str()); |
459 | |
460 | vector<PositionInfo> starts = {startState, startDotStarState}; |
461 | |
462 | // strip start and startDs, which can be present due to boundaries |
463 | stripFromPositions(initials, startState); |
464 | stripFromPositions(initials, startDotStarState); |
465 | |
466 | // replace epsilons with accepts |
467 | for (const auto &s : initials) { |
468 | if (s.pos != GlushkovBuildState::POS_EPSILON) { |
469 | continue; |
470 | } |
471 | |
472 | assert(starts.size() == 2); /* start, startds */ |
473 | vector<PositionInfo> starts_temp = starts; |
474 | starts_temp[0].flags = s.flags; |
475 | starts_temp[1].flags = s.flags; |
476 | bs.connectAccepts(starts_temp); |
477 | } |
478 | |
479 | if (!initials.empty()) { |
480 | bs.connectRegions(starts, initials); |
481 | } |
482 | } |
483 | |
484 | static |
485 | void connectFinalStates(GlushkovBuildState &bs, const ParsedExpression &expr) { |
486 | vector<PositionInfo> finals = expr.component->last(); |
487 | |
488 | DEBUG_PRINTF("wiring finals = %s\n" , |
489 | dumpPositions(finals.begin(), finals.end()).c_str()); |
490 | |
491 | bs.connectAccepts(finals); |
492 | } |
493 | |
494 | #ifndef NDEBUG |
495 | static |
496 | bool isSupported(const Component &c) { |
497 | try { |
498 | checkUnsupported(c); |
499 | return true; |
500 | } |
501 | catch (ParseError &) { |
502 | return false; |
503 | } |
504 | } |
505 | #endif |
506 | |
507 | BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc, |
508 | const ParsedExpression &pe) { |
509 | assert(isSupported(*pe.component)); |
510 | |
511 | const auto builder = makeNFABuilder(rm, cc, pe); |
512 | assert(builder); |
513 | |
514 | // Set up START and ACCEPT states; retrieve the special states |
515 | const auto bs = makeGlushkovBuildState(*builder, pe.expr.prefilter); |
516 | |
517 | // Map position IDs to characters/components |
518 | pe.component->notePositions(*bs); |
519 | |
520 | // Wire the start dotstar state to the firsts |
521 | connectInitialStates(*bs, pe); |
522 | |
523 | DEBUG_PRINTF("wire up body of expr\n" ); |
524 | // Build the rest of the FOLLOW set |
525 | vector<PositionInfo> initials = {builder->getStartDotStar(), |
526 | builder->getStart()}; |
527 | pe.component->buildFollowSet(*bs, initials); |
528 | |
529 | // Wire the lasts to the accept state |
530 | connectFinalStates(*bs, pe); |
531 | |
532 | // Create our edges |
533 | bs->buildEdges(); |
534 | |
535 | BuiltExpression built_expr = builder->getGraph(); |
536 | assert(built_expr.g); |
537 | |
538 | dumpDotWrapper(*built_expr.g, built_expr.expr, "00_before_asserts" , |
539 | cc.grey); |
540 | removeAssertVertices(rm, *built_expr.g, built_expr.expr); |
541 | |
542 | return built_expr; |
543 | } |
544 | |
545 | } // namespace ue2 |
546 | |