1/*
2 * Copyright (c) 2015-2018, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Compiler front-end interface.
31 */
32#include "allocator.h"
33#include "asserts.h"
34#include "compiler.h"
35#include "crc32.h"
36#include "database.h"
37#include "grey.h"
38#include "hs_internal.h"
39#include "hs_runtime.h"
40#include "ue2common.h"
41#include "nfagraph/ng_builder.h"
42#include "nfagraph/ng_dump.h"
43#include "nfagraph/ng.h"
44#include "nfagraph/ng_util.h"
45#include "parser/buildstate.h"
46#include "parser/dump.h"
47#include "parser/Component.h"
48#include "parser/logical_combination.h"
49#include "parser/parse_error.h"
50#include "parser/Parser.h" // for flags
51#include "parser/position.h"
52#include "parser/position_dump.h"
53#include "parser/position_info.h"
54#include "parser/prefilter.h"
55#include "parser/shortcut_literal.h"
56#include "parser/unsupported.h"
57#include "parser/utf8_validate.h"
58#include "rose/rose_build.h"
59#include "som/slot_manager_dump.h"
60#include "util/bytecode_ptr.h"
61#include "util/compile_error.h"
62#include "util/target_info.h"
63#include "util/verify_types.h"
64
65#include <algorithm>
66#include <cassert>
67#include <cstdlib>
68#include <cstring>
69#include <fstream>
70#include <memory>
71#include <sstream>
72
73using namespace std;
74
75namespace ue2 {
76
77static
78void validateExt(const hs_expr_ext &ext) {
79 static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET |
80 HS_EXT_FLAG_MAX_OFFSET |
81 HS_EXT_FLAG_MIN_LENGTH |
82 HS_EXT_FLAG_EDIT_DISTANCE |
83 HS_EXT_FLAG_HAMMING_DISTANCE;
84 if (ext.flags & ~ALL_EXT_FLAGS) {
85 throw CompileError("Invalid hs_expr_ext flag set.");
86 }
87
88 if ((ext.flags & HS_EXT_FLAG_MIN_OFFSET) &&
89 (ext.flags & HS_EXT_FLAG_MAX_OFFSET) &&
90 (ext.min_offset > ext.max_offset)) {
91 throw CompileError("In hs_expr_ext, min_offset must be less than or "
92 "equal to max_offset.");
93 }
94
95 if ((ext.flags & HS_EXT_FLAG_MIN_LENGTH) &&
96 (ext.flags & HS_EXT_FLAG_MAX_OFFSET) &&
97 (ext.min_length > ext.max_offset)) {
98 throw CompileError("In hs_expr_ext, min_length must be less than or "
99 "equal to max_offset.");
100 }
101
102 if ((ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) &&
103 (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE)) {
104 throw CompileError("In hs_expr_ext, cannot have both edit distance and "
105 "Hamming distance.");
106 }
107
108}
109
110ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
111 unsigned flags, ReportID report,
112 const hs_expr_ext *ext)
113 : expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH,
114 false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET,
115 0, 0, 0, flags & HS_FLAG_QUIET) {
116 // We disallow SOM + Quiet.
117 if ((flags & HS_FLAG_QUIET) && (flags & HS_FLAG_SOM_LEFTMOST)) {
118 throw CompileError("HS_FLAG_QUIET is not supported in "
119 "combination with HS_FLAG_SOM_LEFTMOST.");
120 }
121 flags &= ~HS_FLAG_QUIET;
122 ParseMode mode(flags);
123
124 component = parse(expression, mode);
125
126 expr.utf8 = mode.utf8; /* utf8 may be set by parse() */
127
128 const size_t len = strlen(expression);
129 if (expr.utf8 && !isValidUtf8(expression, len)) {
130 throw ParseError("Expression is not valid UTF-8.");
131 }
132
133 if (!component) {
134 assert(0); // parse() should have thrown a ParseError.
135 throw ParseError("Parse error.");
136 }
137
138 if (flags & ~HS_FLAG_ALL) {
139 DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags);
140 throw CompileError("Unrecognised flag.");
141 }
142
143 // FIXME: we disallow highlander + SOM, see UE-1850.
144 if ((flags & HS_FLAG_SINGLEMATCH) && (flags & HS_FLAG_SOM_LEFTMOST)) {
145 throw CompileError("HS_FLAG_SINGLEMATCH is not supported in "
146 "combination with HS_FLAG_SOM_LEFTMOST.");
147 }
148
149 // FIXME: we disallow prefilter + SOM, see UE-1899.
150 if ((flags & HS_FLAG_PREFILTER) && (flags & HS_FLAG_SOM_LEFTMOST)) {
151 throw CompileError("HS_FLAG_PREFILTER is not supported in "
152 "combination with HS_FLAG_SOM_LEFTMOST.");
153 }
154
155 // Set SOM type.
156 if (flags & HS_FLAG_SOM_LEFTMOST) {
157 expr.som = SOM_LEFT;
158 }
159
160 // Set extended parameters, if we have them.
161 if (ext) {
162 // Ensure that the given parameters make sense.
163 validateExt(*ext);
164
165 if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) {
166 expr.min_offset = ext->min_offset;
167 }
168 if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) {
169 expr.max_offset = ext->max_offset;
170 }
171 if (ext->flags & HS_EXT_FLAG_MIN_LENGTH) {
172 expr.min_length = ext->min_length;
173 }
174 if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) {
175 expr.edit_distance = ext->edit_distance;
176 }
177 if (ext->flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
178 expr.hamm_distance = ext->hamming_distance;
179 }
180 }
181
182 // These are validated in validateExt, so an error will already have been
183 // thrown if these conditions don't hold.
184 assert(expr.max_offset >= expr.min_offset);
185 assert(expr.max_offset >= expr.min_length);
186
187 // Since prefiltering and SOM aren't supported together, we must squash any
188 // min_length constraint as well.
189 if (flags & HS_FLAG_PREFILTER && expr.min_length) {
190 DEBUG_PRINTF("prefiltering mode: squashing min_length constraint\n");
191 expr.min_length = 0;
192 }
193}
194
195#if defined(DUMP_SUPPORT) || defined(DEBUG)
196/**
197 * \brief Dumps the parse tree to screen in debug mode and to disk in dump
198 * mode.
199 */
200void dumpExpression(UNUSED const ParsedExpression &pe,
201 UNUSED const char *stage, UNUSED const Grey &grey) {
202#if defined(DEBUG)
203 DEBUG_PRINTF("===== Rule ID: %u (expression index: %u) =====\n",
204 pe.expr.report, pe.expr.index);
205 ostringstream debug_tree;
206 dumpTree(debug_tree, pe.component.get());
207 printf("%s\n", debug_tree.str().c_str());
208#endif // DEBUG
209
210#if defined(DUMP_SUPPORT)
211 if (grey.dumpFlags & Grey::DUMP_PARSE) {
212 stringstream ss;
213 ss << grey.dumpPath << "Expr_" << pe.expr.index << "_componenttree_"
214 << stage << ".txt";
215 ofstream out(ss.str().c_str());
216 out << "Component Tree for " << pe.expr.report << endl;
217 dumpTree(out, pe.component.get());
218 if (pe.expr.utf8) {
219 out << "UTF8 mode" << endl;
220 }
221 }
222#endif // DEBUG
223}
224#endif
225
226/** \brief Run Component tree optimisations on \a expr. */
227static
228void optimise(ParsedExpression &pe) {
229 if (pe.expr.min_length || pe.expr.som) {
230 return;
231 }
232
233 DEBUG_PRINTF("optimising\n");
234 pe.component->optimise(true /* root is connected to sds */);
235}
236
237void addExpression(NG &ng, unsigned index, const char *expression,
238 unsigned flags, const hs_expr_ext *ext, ReportID id) {
239 assert(expression);
240 const CompileContext &cc = ng.cc;
241 DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%s'\n", index, id, flags,
242 expression);
243
244 if (flags & HS_FLAG_COMBINATION) {
245 if (flags & ~(HS_FLAG_COMBINATION | HS_FLAG_QUIET |
246 HS_FLAG_SINGLEMATCH)) {
247 throw CompileError("only HS_FLAG_QUIET and HS_FLAG_SINGLEMATCH "
248 "are supported in combination "
249 "with HS_FLAG_COMBINATION.");
250 }
251 if (flags & HS_FLAG_QUIET) {
252 DEBUG_PRINTF("skip QUIET logical combination expression %u\n", id);
253 } else {
254 u32 ekey = INVALID_EKEY;
255 u64a min_offset = 0;
256 u64a max_offset = MAX_OFFSET;
257 if (flags & HS_FLAG_SINGLEMATCH) {
258 ekey = ng.rm.getExhaustibleKey(id);
259 }
260 if (ext) {
261 validateExt(*ext);
262 if (ext->flags & ~(HS_EXT_FLAG_MIN_OFFSET |
263 HS_EXT_FLAG_MAX_OFFSET)) {
264 throw CompileError("only HS_EXT_FLAG_MIN_OFFSET and "
265 "HS_EXT_FLAG_MAX_OFFSET extra flags "
266 "are supported in combination "
267 "with HS_FLAG_COMBINATION.");
268 }
269 if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) {
270 min_offset = ext->min_offset;
271 }
272 if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) {
273 max_offset = ext->max_offset;
274 }
275 }
276 ng.rm.pl.parseLogicalCombination(id, expression, ekey, min_offset,
277 max_offset);
278 DEBUG_PRINTF("parsed logical combination expression %u\n", id);
279 }
280 return;
281 }
282
283 // Ensure that our pattern isn't too long (in characters).
284 if (strlen(expression) > cc.grey.limitPatternLength) {
285 throw CompileError("Pattern length exceeds limit.");
286 }
287
288 // Do per-expression processing: errors here will result in an exception
289 // being thrown up to our caller
290 ParsedExpression pe(index, expression, flags, id, ext);
291 dumpExpression(pe, "orig", cc.grey);
292
293 // Apply prefiltering transformations if desired.
294 if (pe.expr.prefilter) {
295 prefilterTree(pe.component, ParseMode(flags));
296 dumpExpression(pe, "prefiltered", cc.grey);
297 }
298
299 // Expressions containing zero-width assertions and other extended pcre
300 // types aren't supported yet. This call will throw a ParseError exception
301 // if the component tree contains such a construct.
302 checkUnsupported(*pe.component);
303
304 pe.component->checkEmbeddedStartAnchor(true);
305 pe.component->checkEmbeddedEndAnchor(true);
306
307 if (cc.grey.optimiseComponentTree) {
308 optimise(pe);
309 dumpExpression(pe, "opt", cc.grey);
310 }
311
312 DEBUG_PRINTF("component=%p, nfaId=%u, reportId=%u\n",
313 pe.component.get(), pe.expr.index, pe.expr.report);
314
315 // You can only use the SOM flags if you've also specified an SOM
316 // precision mode.
317 if (pe.expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) {
318 throw CompileError("To use a SOM expression flag in streaming mode, "
319 "an SOM precision mode (e.g. "
320 "HS_MODE_SOM_HORIZON_LARGE) must be specified.");
321 }
322
323 // If this expression is a literal, we can feed it directly to Rose rather
324 // than building the NFA graph.
325 if (shortcutLiteral(ng, pe)) {
326 DEBUG_PRINTF("took literal short cut\n");
327 return;
328 }
329
330 auto built_expr = buildGraph(ng.rm, cc, pe);
331 if (!built_expr.g) {
332 DEBUG_PRINTF("NFA build failed on ID %u, but no exception was "
333 "thrown.\n", pe.expr.report);
334 throw CompileError("Internal error.");
335 }
336
337 if (!pe.expr.allow_vacuous && matches_everywhere(*built_expr.g)) {
338 throw CompileError("Pattern matches empty buffer; use "
339 "HS_FLAG_ALLOWEMPTY to enable support.");
340 }
341
342 if (!ng.addGraph(built_expr.expr, std::move(built_expr.g))) {
343 DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", pe.expr.report);
344 throw CompileError("Error compiling expression.");
345 }
346}
347
348static
349bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
350 const u32 minWidth =
351 ng.minWidth.is_finite() ? verify_u32(ng.minWidth) : ROSE_BOUND_INF;
352 auto rose = ng.rose->buildRose(minWidth);
353
354 if (!rose) {
355 DEBUG_PRINTF("error building rose\n");
356 assert(0);
357 return nullptr;
358 }
359
360 dumpReportManager(ng.rm, ng.cc.grey);
361 dumpSomSlotManager(ng.ssm, ng.cc.grey);
362 dumpSmallWrite(rose.get(), ng.cc.grey);
363
364 return rose;
365}
366
367platform_t target_to_platform(const target_t &target_info) {
368 platform_t p;
369 p = 0;
370
371 if (!target_info.has_avx2()) {
372 p |= HS_PLATFORM_NOAVX2;
373 }
374 if (!target_info.has_avx512()) {
375 p |= HS_PLATFORM_NOAVX512;
376 }
377 return p;
378}
379
380/** \brief Encapsulate the given bytecode (RoseEngine) in a newly-allocated
381 * \ref hs_database, ensuring that it is padded correctly to give cacheline
382 * alignment. */
383static
384hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
385 size_t db_len = sizeof(struct hs_database) + len;
386 DEBUG_PRINTF("db size %zu\n", db_len);
387 DEBUG_PRINTF("db platform %llx\n", platform);
388
389 struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len);
390 if (hs_check_alloc(db) != HS_SUCCESS) {
391 hs_database_free(db);
392 return nullptr;
393 }
394
395 // So that none of our database is uninitialized
396 memset(db, 0, db_len);
397
398 // we need to align things manually
399 size_t shift = (uintptr_t)db->bytes & 0x3f;
400 DEBUG_PRINTF("shift is %zu\n", shift);
401
402 db->bytecode = offsetof(struct hs_database, bytes) - shift;
403 char *bytecode = (char *)db + db->bytecode;
404 assert(ISALIGNED_CL(bytecode));
405
406 db->magic = HS_DB_MAGIC;
407 db->version = HS_DB_VERSION;
408 db->length = len;
409 db->platform = platform;
410
411 // Copy bytecode
412 memcpy(bytecode, in_bytecode, len);
413
414 db->crc32 = Crc32c_ComputeBuf(0, bytecode, db->length);
415 return db;
416}
417
418
419struct hs_database *build(NG &ng, unsigned int *length) {
420 assert(length);
421
422 auto rose = generateRoseEngine(ng);
423 if (!rose) {
424 throw CompileError("Unable to generate bytecode.");
425 }
426 *length = rose.size();
427 if (!*length) {
428 DEBUG_PRINTF("RoseEngine has zero length\n");
429 assert(0);
430 throw CompileError("Internal error.");
431 }
432
433 const char *bytecode = (const char *)(rose.get());
434 const platform_t p = target_to_platform(ng.cc.target_info);
435 struct hs_database *db = dbCreate(bytecode, *length, p);
436 if (!db) {
437 throw CompileError("Could not allocate memory for bytecode.");
438 }
439
440 return db;
441}
442
443static
444void stripFromPositions(vector<PositionInfo> &v, Position pos) {
445 auto removed = remove(v.begin(), v.end(), PositionInfo(pos));
446 v.erase(removed, v.end());
447}
448
449static
450void connectInitialStates(GlushkovBuildState &bs,
451 const ParsedExpression &expr) {
452 vector<PositionInfo> initials = expr.component->first();
453 const NFABuilder &builder = bs.getBuilder();
454 const Position startState = builder.getStart();
455 const Position startDotStarState = builder.getStartDotStar();
456
457 DEBUG_PRINTF("wiring initials = %s\n",
458 dumpPositions(initials.begin(), initials.end()).c_str());
459
460 vector<PositionInfo> starts = {startState, startDotStarState};
461
462 // strip start and startDs, which can be present due to boundaries
463 stripFromPositions(initials, startState);
464 stripFromPositions(initials, startDotStarState);
465
466 // replace epsilons with accepts
467 for (const auto &s : initials) {
468 if (s.pos != GlushkovBuildState::POS_EPSILON) {
469 continue;
470 }
471
472 assert(starts.size() == 2); /* start, startds */
473 vector<PositionInfo> starts_temp = starts;
474 starts_temp[0].flags = s.flags;
475 starts_temp[1].flags = s.flags;
476 bs.connectAccepts(starts_temp);
477 }
478
479 if (!initials.empty()) {
480 bs.connectRegions(starts, initials);
481 }
482}
483
484static
485void connectFinalStates(GlushkovBuildState &bs, const ParsedExpression &expr) {
486 vector<PositionInfo> finals = expr.component->last();
487
488 DEBUG_PRINTF("wiring finals = %s\n",
489 dumpPositions(finals.begin(), finals.end()).c_str());
490
491 bs.connectAccepts(finals);
492}
493
494#ifndef NDEBUG
495static
496bool isSupported(const Component &c) {
497 try {
498 checkUnsupported(c);
499 return true;
500 }
501 catch (ParseError &) {
502 return false;
503 }
504}
505#endif
506
507BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc,
508 const ParsedExpression &pe) {
509 assert(isSupported(*pe.component));
510
511 const auto builder = makeNFABuilder(rm, cc, pe);
512 assert(builder);
513
514 // Set up START and ACCEPT states; retrieve the special states
515 const auto bs = makeGlushkovBuildState(*builder, pe.expr.prefilter);
516
517 // Map position IDs to characters/components
518 pe.component->notePositions(*bs);
519
520 // Wire the start dotstar state to the firsts
521 connectInitialStates(*bs, pe);
522
523 DEBUG_PRINTF("wire up body of expr\n");
524 // Build the rest of the FOLLOW set
525 vector<PositionInfo> initials = {builder->getStartDotStar(),
526 builder->getStart()};
527 pe.component->buildFollowSet(*bs, initials);
528
529 // Wire the lasts to the accept state
530 connectFinalStates(*bs, pe);
531
532 // Create our edges
533 bs->buildEdges();
534
535 BuiltExpression built_expr = builder->getGraph();
536 assert(built_expr.g);
537
538 dumpDotWrapper(*built_expr.g, built_expr.expr, "00_before_asserts",
539 cc.grey);
540 removeAssertVertices(rm, *built_expr.g, built_expr.expr);
541
542 return built_expr;
543}
544
545} // namespace ue2
546