1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Glushkov construction. |
31 | */ |
32 | #include "buildstate.h" |
33 | #include "position.h" |
34 | #include "position_dump.h" |
35 | #include "position_info.h" |
36 | #include "parse_error.h" |
37 | #include "hs_internal.h" |
38 | #include "ue2common.h" |
39 | #include "nfagraph/ng_builder.h" |
40 | #include "util/charreach.h" |
41 | #include "util/container.h" |
42 | #include "util/flat_containers.h" |
43 | #include "util/hash.h" |
44 | #include "util/make_unique.h" |
45 | #include "util/unordered.h" |
46 | |
47 | #include <algorithm> |
48 | #include <iterator> |
49 | #include <limits> |
50 | #include <map> |
51 | #include <utility> |
52 | |
53 | #if defined(DEBUG) || defined(DUMP_SUPPORT) |
54 | #include <ostream> |
55 | #include <sstream> |
56 | #endif |
57 | |
58 | using namespace std; |
59 | |
60 | namespace ue2 { |
61 | |
62 | /** \brief Represents an uninitialized state. */ |
63 | const Position GlushkovBuildState::POS_UNINITIALIZED = |
64 | numeric_limits<Position>::max(); |
65 | |
66 | /** \brief Represents an epsilon transition in the firsts of a component. */ |
67 | const Position GlushkovBuildState::POS_EPSILON = |
68 | numeric_limits<Position>::max() - 1; |
69 | |
70 | GlushkovBuildState::~GlushkovBuildState() { } |
71 | |
72 | namespace /* anonymous */ { |
73 | |
74 | class CheckPositionFlags { |
75 | public: |
76 | explicit CheckPositionFlags(int fl) : flags(fl) {} |
77 | bool operator()(const PositionInfo &p) const { |
78 | return (p.flags & flags) == flags; |
79 | } |
80 | private: |
81 | int flags; |
82 | }; |
83 | |
84 | class CheckUnflaggedEpsilon { |
85 | public: |
86 | bool operator()(const PositionInfo &p) const { |
87 | return p.pos == GlushkovBuildState::POS_EPSILON && p.flags == 0; |
88 | } |
89 | }; |
90 | |
91 | /** \brief Concrete impl of the GlushkovBuildState interface. */ |
92 | class GlushkovBuildStateImpl : public GlushkovBuildState { |
93 | public: |
94 | GlushkovBuildStateImpl(NFABuilder &b, bool prefilter); |
95 | |
96 | /** \brief Returns a reference to the NFABuilder being used. */ |
97 | NFABuilder &getBuilder() override { return builder; } |
98 | |
99 | /** \brief Returns a const reference to the NFABuilder being used. */ |
100 | const NFABuilder &getBuilder() const override { return builder; } |
101 | |
102 | /** \brief Wire up the lasts of one component to the firsts of another. */ |
103 | void connectRegions(const vector<PositionInfo> &lasts, |
104 | const vector<PositionInfo> &firsts) override; |
105 | |
106 | /** \brief Wire the lasts of the main sequence to accepts. */ |
107 | void connectAccepts(const vector<PositionInfo> &lasts) override; |
108 | |
109 | /** \brief Wire up a single last to a list of firsts. */ |
110 | void connectSuccessors(const PositionInfo &last, |
111 | vector<PositionInfo> firsts); |
112 | |
113 | /** Wire up a pair of positions. */ |
114 | void addSuccessor(Position from, Position to) override; |
115 | |
116 | /** \brief Clone the vertex properties and edges of all vertices between |
117 | * two positions. */ |
118 | void cloneFollowSet(Position from, Position to, unsigned offset) override; |
119 | |
120 | /** \brief Build the prioritised list of edges out of our successor map. */ |
121 | void buildEdges() override; |
122 | |
123 | /** Construct an edge, called internally by \ref buildEdges. */ |
124 | void buildEdge(Position from, const PositionInfo &to); |
125 | |
126 | Position startState; |
127 | Position startDotstarState; |
128 | Position acceptState; |
129 | Position acceptEodState; |
130 | Position acceptNlEodState; |
131 | Position acceptNlState; |
132 | |
133 | NFABuilder &builder; //!< \brief builder for the NFAGraph |
134 | |
135 | bool doPrefilter; //!< \brief we're building a prefiltering pattern |
136 | |
137 | /** \brief Map storing successors for each position. */ |
138 | map<Position, flat_set<PositionInfo>> successors; |
139 | }; |
140 | |
141 | } // namespace |
142 | |
143 | GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b, |
144 | bool prefilter) : |
145 | startState(b.getStart()), |
146 | startDotstarState(b.getStartDotStar()), |
147 | acceptState(b.getAccept()), |
148 | acceptEodState(b.getAcceptEOD()), |
149 | acceptNlEodState(POS_UNINITIALIZED), |
150 | acceptNlState(POS_UNINITIALIZED), |
151 | builder(b), |
152 | doPrefilter(prefilter) |
153 | { |
154 | // Our special nodes need special relationships. |
155 | vector<PositionInfo> lasts, firsts; |
156 | |
157 | // start->startDs and startDs self-loop. |
158 | lasts.push_back(startState); |
159 | lasts.push_back(startDotstarState); |
160 | firsts.push_back(startDotstarState); |
161 | connectRegions(lasts, firsts); |
162 | |
163 | // accept to acceptEod edges already wired |
164 | |
165 | // XXX: a small hack to support vacuous NFAs: give start and startDs an |
166 | // initial report ID. |
167 | builder.setNodeReportID(startState, 0); |
168 | builder.setNodeReportID(startDotstarState, 0); |
169 | } |
170 | |
171 | static |
172 | void checkEmbeddedEndAnchor(const PositionInfo &from, |
173 | const vector<PositionInfo> &firsts) { |
174 | if (!(from.flags & POS_FLAG_ONLY_ENDS)) { |
175 | return; |
176 | } |
177 | |
178 | for (const auto &first : firsts) { |
179 | if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) { |
180 | /* can make it through the parse tree */ |
181 | throw ParseError("Embedded end anchors not supported." ); |
182 | } |
183 | } |
184 | } |
185 | |
186 | // Wire up the lasts of one component to the firsts of another |
187 | void |
188 | GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts, |
189 | const vector<PositionInfo> &firsts) { |
190 | for (const auto &last : lasts) { |
191 | checkEmbeddedEndAnchor(last, firsts); |
192 | connectSuccessors(last, firsts); |
193 | } |
194 | } |
195 | |
196 | static |
197 | void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from, |
198 | vector<PositionInfo> &tolist) { |
199 | if (from.pos == bs.startDotstarState) { |
200 | // If we're connecting from start-dotstar, remove all caret flavoured |
201 | // positions. |
202 | CheckPositionFlags check(POS_FLAG_NOFLOAT); |
203 | tolist.erase(remove_if(tolist.begin(), tolist.end(), check), |
204 | tolist.end()); |
205 | if (from.flags & POS_FLAG_NOFLOAT) { |
206 | tolist.clear(); |
207 | } |
208 | } else if (from.pos == bs.startState) { |
209 | // If we're connecting from start, we should remove any epsilons that |
210 | // aren't caret flavoured. |
211 | CheckUnflaggedEpsilon check; |
212 | tolist.erase(remove_if(tolist.begin(), tolist.end(), check), |
213 | tolist.end()); |
214 | CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT); |
215 | tolist.erase(remove_if(tolist.begin(), tolist.end(), check2), |
216 | tolist.end()); |
217 | } |
218 | |
219 | if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) { |
220 | // If we have a (mildly boneheaded) pattern like /^$/m, we're right up |
221 | // against the edge of what we can do without true assertion support. |
222 | // Here we have an evil hack to prevent us plugging the \n generated by |
223 | // the caret right into acceptEod (which is in the firsts of the |
224 | // dollar). |
225 | /* This is due to the 'interesting quirk' that multiline ^ does not |
226 | * not match a newline at the end of buffer. */ |
227 | DEBUG_PRINTF("multiline start - no eod\n" ); |
228 | tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState), |
229 | tolist.end()); |
230 | } |
231 | } |
232 | |
233 | static |
234 | Position makeNewlineAssertPos(GlushkovBuildState &bs) { |
235 | NFABuilder &builder = bs.getBuilder(); |
236 | Position newline = builder.makePositions(1); |
237 | builder.addCharReach(newline, CharReach('\n')); |
238 | builder.setAssertFlag(newline, POS_FLAG_FIDDLE_ACCEPT); |
239 | builder.setNodeReportID(newline, -1); |
240 | return newline; |
241 | } |
242 | |
243 | static |
244 | void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from, |
245 | vector<PositionInfo> *tolist) { |
246 | NFABuilder &builder = bs.getBuilder(); |
247 | u32 flags = from.flags; |
248 | |
249 | bool require_eod = flags & POS_FLAG_WIRE_EOD; |
250 | bool require_nl_eod = flags & POS_FLAG_WIRE_NL_EOD |
251 | && !(flags & POS_FLAG_NO_NL_EOD); |
252 | bool require_nl_accept = (flags & POS_FLAG_WIRE_NL_ACCEPT) |
253 | && !(flags & POS_FLAG_NO_NL_ACCEPT); |
254 | |
255 | bool require_accept = !(flags & POS_FLAG_ONLY_ENDS); |
256 | |
257 | if (require_eod) { |
258 | tolist->push_back(bs.acceptEodState); |
259 | } |
260 | |
261 | if (require_nl_accept) { |
262 | if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) { |
263 | Position newline = makeNewlineAssertPos(bs); |
264 | bs.addSuccessor(newline, builder.getAccept()); |
265 | bs.acceptNlState = newline; |
266 | } |
267 | tolist->push_back(bs.acceptNlState); |
268 | } |
269 | |
270 | if (require_nl_eod) { |
271 | if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) { |
272 | Position newline = makeNewlineAssertPos(bs); |
273 | bs.addSuccessor(newline, builder.getAcceptEOD()); |
274 | bs.acceptNlEodState = newline; |
275 | } |
276 | tolist->push_back(bs.acceptNlEodState); |
277 | } |
278 | |
279 | if (require_accept) { |
280 | tolist->push_back(bs.acceptState); |
281 | } |
282 | } |
283 | |
284 | void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) { |
285 | for (const auto &last : lasts) { |
286 | vector<PositionInfo> accepts; |
287 | generateAccepts(*this, last, &accepts); |
288 | connectSuccessors(last, accepts); |
289 | } |
290 | } |
291 | |
292 | #if defined(DEBUG) || defined(DUMP_SUPPORT) |
293 | |
294 | static UNUSED |
295 | string dumpCaptures(const PositionInfo &p) { |
296 | ostringstream oss; |
297 | |
298 | if (p.flags & POS_FLAG_NOFLOAT) { |
299 | oss << "<nofloat>" ; |
300 | } |
301 | if (p.flags & POS_FLAG_MUST_FLOAT) { |
302 | oss << "<must_float>" ; |
303 | } |
304 | if (p.flags & POS_FLAG_FIDDLE_ACCEPT) { |
305 | oss << "<fiddle_accept>" ; |
306 | } |
307 | if (p.flags & POS_FLAG_ONLY_ENDS) { |
308 | oss << "<only_ends>" ; |
309 | } |
310 | if (p.flags & POS_FLAG_NO_NL_EOD) { |
311 | oss << "<no_nl_eod>" ; |
312 | } |
313 | if (p.flags & POS_FLAG_NO_NL_ACCEPT) { |
314 | oss << "<no_nl_acc>" ; |
315 | } |
316 | |
317 | return oss.str(); |
318 | } |
319 | |
320 | #endif // DEBUG || DUMP_SUPPORT |
321 | |
322 | void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from, |
323 | vector<PositionInfo> tolist) { |
324 | /* note: tolist maybe modified for our own internal use -> not a reference */ |
325 | assert(from.pos != POS_EPSILON); |
326 | assert(from.pos != POS_UNINITIALIZED); |
327 | assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED) |
328 | == tolist.end()); |
329 | |
330 | DEBUG_PRINTF("FROM = %u%s TO = %s\n" , from.pos, dumpCaptures(from).c_str(), |
331 | dumpPositions(tolist.begin(), tolist.end()).c_str()); |
332 | |
333 | /* prevent creation of edges with invalid assertions */ |
334 | filterEdges(*this, from, tolist); |
335 | |
336 | if (from.flags & POS_FLAG_FIDDLE_ACCEPT) { |
337 | auto accept = find(tolist.begin(), tolist.end(), acceptState); |
338 | if (accept != tolist.end()) { |
339 | DEBUG_PRINTF("accept through -1 offset-adjusting dot\n" ); |
340 | Position fakedot = builder.makePositions(1); |
341 | builder.addCharReach(fakedot, CharReach(0x00, 0xff)); |
342 | builder.setNodeReportID(fakedot, -1); |
343 | addSuccessor(fakedot, acceptState); |
344 | *accept = fakedot; |
345 | } else { |
346 | // We might lead to accept via an assertion vertex, so we add the |
347 | // offset adj to this vertex itself. Used for cases like /^\B/m, |
348 | // which should match only at 0 for '\n'. |
349 | builder.setNodeReportID(from.pos, -1); |
350 | } |
351 | |
352 | assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end()); |
353 | } |
354 | |
355 | auto &succ = successors[from.pos]; |
356 | |
357 | DEBUG_PRINTF("connect %u -> %s\n" , from.pos, |
358 | dumpPositions(tolist.begin(), tolist.end()).c_str()); |
359 | DEBUG_PRINTF("%u curr succ: %s\n" , from.pos, |
360 | dumpPositions(begin(succ), end(succ)).c_str()); |
361 | |
362 | for (const auto &to : tolist) { |
363 | if (to.pos != POS_EPSILON) { |
364 | succ.insert(to); |
365 | } |
366 | } |
367 | |
368 | DEBUG_PRINTF("%u succ: %s\n" , from.pos, |
369 | dumpPositions(begin(succ), end(succ)).c_str()); |
370 | } |
371 | |
372 | void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) { |
373 | DEBUG_PRINTF("connect %u -> %u\n" , from, to); |
374 | assert(from != POS_EPSILON && from != POS_UNINITIALIZED); |
375 | assert(to != POS_EPSILON && to != POS_UNINITIALIZED); |
376 | |
377 | auto &succ = successors[from]; |
378 | succ.insert(to); |
379 | |
380 | DEBUG_PRINTF("%u succ: %s\n" , from, |
381 | dumpPositions(begin(succ), end(succ)).c_str()); |
382 | } |
383 | |
384 | void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last, |
385 | unsigned offset) { |
386 | assert(first <= last); |
387 | |
388 | // Clone vertex properties (reachability, etc) |
389 | builder.cloneRegion(first, last, offset); |
390 | |
391 | /* Clone the successors of all the positions between first and last |
392 | * inclusive, producing a new set of positions starting at (first + |
393 | * offset). */ |
394 | for (Position i = first; i <= last; i++) { |
395 | // This should be a new position. |
396 | assert(successors[i + offset].empty()); |
397 | |
398 | for (const PositionInfo &to : successors[i]) { |
399 | if (to.pos >= first && to.pos <= last) { |
400 | PositionInfo clone(to); |
401 | clone.pos += offset; |
402 | DEBUG_PRINTF("clone: %u -> %u\n" , i + offset, clone.pos); |
403 | successors[i + offset].insert(clone); |
404 | } else { |
405 | // There shouldn't be any stray edges leading out of this |
406 | // region! |
407 | assert(0); |
408 | } |
409 | } |
410 | } |
411 | } |
412 | |
413 | void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) { |
414 | // Guard against embedded anchors |
415 | if (to == startState) { |
416 | /* can make it through the parse tree */ |
417 | throw ParseError("Embedded start anchors not supported." ); |
418 | } |
419 | |
420 | assert(to.pos != POS_UNINITIALIZED); |
421 | assert(to.pos != POS_EPSILON); |
422 | |
423 | if (builder.hasEdge(from, to.pos)) { |
424 | return; |
425 | } |
426 | |
427 | builder.addEdge(from, to.pos); |
428 | } |
429 | |
430 | void GlushkovBuildStateImpl::buildEdges() { |
431 | // Create all the edges and track which vertices are asserts which need to |
432 | // be removed later. |
433 | for (const auto &m : successors) { |
434 | const Position from = m.first; |
435 | for (const auto &to : m.second) { |
436 | buildEdge(from, to); |
437 | } |
438 | } |
439 | } |
440 | |
441 | // Construct a usable GlushkovBuildState for the outside world. |
442 | unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b, |
443 | bool prefilter) { |
444 | return ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter); |
445 | } |
446 | |
447 | // free functions for utility use |
448 | |
449 | /** \brief Eliminate lower-priority duplicate PositionInfo entries. |
450 | * |
451 | * Scans through a list of positions and retains only the highest priority |
452 | * version of a given (position, flags) entry. */ |
453 | void cleanupPositions(vector<PositionInfo> &a) { |
454 | ue2_unordered_set<pair<Position, int>> seen; |
455 | |
456 | vector<PositionInfo> out; |
457 | out.reserve(a.size()); // output should be close to input in size. |
458 | |
459 | for (const auto &p : a) { |
460 | if (seen.emplace(p.pos, p.flags).second) { |
461 | out.push_back(p); // first encounter |
462 | } |
463 | } |
464 | |
465 | DEBUG_PRINTF("in %zu; out %zu\n" , a.size(), out.size()); |
466 | a.swap(out); |
467 | } |
468 | |
469 | static |
470 | vector<PositionInfo>::iterator |
471 | replaceElemWithSequence(vector<PositionInfo> &dest, |
472 | vector<PositionInfo>::iterator &victim, |
473 | const vector<PositionInfo> &replacement) { |
474 | auto past = dest.erase(victim); |
475 | size_t d = distance(dest.begin(), past) + replacement.size(); |
476 | dest.insert(past, replacement.begin(), replacement.end()); |
477 | /* recalc past as iterator may have been invalidated */ |
478 | return dest.begin() + d; |
479 | } |
480 | |
481 | /** \brief Replace all epsilons with the given positions. |
482 | * |
483 | * Replace epsilons in a firsts list with another given firsts list. Note: the |
484 | * firsts lists must come from disjoint sets of components. If no epsilons are |
485 | * in the first firsts list the source is appended to the end. |
486 | */ |
487 | void replaceEpsilons(vector<PositionInfo> &target, |
488 | const vector<PositionInfo> &source) { |
489 | auto found = |
490 | find(target.begin(), target.end(), GlushkovBuildState::POS_EPSILON); |
491 | |
492 | if (found == target.end()) { |
493 | // no epsilons to replace, push on to the end |
494 | target.insert(target.end(), source.begin(), source.end()); |
495 | return; |
496 | } |
497 | |
498 | while (found != target.end()) { |
499 | checkEmbeddedEndAnchor(*found, source); |
500 | |
501 | // replace this epsilon with a copy of source with the same flags |
502 | vector<PositionInfo> newsource(source); |
503 | for (auto &pos : newsource) { |
504 | pos.flags |= found->flags; |
505 | } |
506 | |
507 | found = replaceElemWithSequence(target, found, newsource); |
508 | // find the next epsilon |
509 | found = find(found, target.end(), GlushkovBuildState::POS_EPSILON); |
510 | } |
511 | |
512 | cleanupPositions(target); |
513 | } |
514 | |
515 | #ifdef DUMP_SUPPORT |
516 | |
517 | void dump(ostream &os, const PositionInfo &p) { |
518 | if (p.pos == GlushkovBuildState::POS_EPSILON) { |
519 | os << "epsilon" ; |
520 | } else { |
521 | os << p.pos; |
522 | } |
523 | |
524 | os << dumpCaptures(p); |
525 | } |
526 | |
527 | #endif // DUMP_SUPPORT |
528 | |
529 | } // namespace ue2 |
530 | |