1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Glushkov construction.
31 */
32#include "buildstate.h"
33#include "position.h"
34#include "position_dump.h"
35#include "position_info.h"
36#include "parse_error.h"
37#include "hs_internal.h"
38#include "ue2common.h"
39#include "nfagraph/ng_builder.h"
40#include "util/charreach.h"
41#include "util/container.h"
42#include "util/flat_containers.h"
43#include "util/hash.h"
44#include "util/make_unique.h"
45#include "util/unordered.h"
46
47#include <algorithm>
48#include <iterator>
49#include <limits>
50#include <map>
51#include <utility>
52
53#if defined(DEBUG) || defined(DUMP_SUPPORT)
54#include <ostream>
55#include <sstream>
56#endif
57
58using namespace std;
59
60namespace ue2 {
61
62/** \brief Represents an uninitialized state. */
63const Position GlushkovBuildState::POS_UNINITIALIZED =
64 numeric_limits<Position>::max();
65
66/** \brief Represents an epsilon transition in the firsts of a component. */
67const Position GlushkovBuildState::POS_EPSILON =
68 numeric_limits<Position>::max() - 1;
69
70GlushkovBuildState::~GlushkovBuildState() { }
71
72namespace /* anonymous */ {
73
74class CheckPositionFlags {
75public:
76 explicit CheckPositionFlags(int fl) : flags(fl) {}
77 bool operator()(const PositionInfo &p) const {
78 return (p.flags & flags) == flags;
79 }
80private:
81 int flags;
82};
83
84class CheckUnflaggedEpsilon {
85public:
86 bool operator()(const PositionInfo &p) const {
87 return p.pos == GlushkovBuildState::POS_EPSILON && p.flags == 0;
88 }
89};
90
91/** \brief Concrete impl of the GlushkovBuildState interface. */
92class GlushkovBuildStateImpl : public GlushkovBuildState {
93public:
94 GlushkovBuildStateImpl(NFABuilder &b, bool prefilter);
95
96 /** \brief Returns a reference to the NFABuilder being used. */
97 NFABuilder &getBuilder() override { return builder; }
98
99 /** \brief Returns a const reference to the NFABuilder being used. */
100 const NFABuilder &getBuilder() const override { return builder; }
101
102 /** \brief Wire up the lasts of one component to the firsts of another. */
103 void connectRegions(const vector<PositionInfo> &lasts,
104 const vector<PositionInfo> &firsts) override;
105
106 /** \brief Wire the lasts of the main sequence to accepts. */
107 void connectAccepts(const vector<PositionInfo> &lasts) override;
108
109 /** \brief Wire up a single last to a list of firsts. */
110 void connectSuccessors(const PositionInfo &last,
111 vector<PositionInfo> firsts);
112
113 /** Wire up a pair of positions. */
114 void addSuccessor(Position from, Position to) override;
115
116 /** \brief Clone the vertex properties and edges of all vertices between
117 * two positions. */
118 void cloneFollowSet(Position from, Position to, unsigned offset) override;
119
120 /** \brief Build the prioritised list of edges out of our successor map. */
121 void buildEdges() override;
122
123 /** Construct an edge, called internally by \ref buildEdges. */
124 void buildEdge(Position from, const PositionInfo &to);
125
126 Position startState;
127 Position startDotstarState;
128 Position acceptState;
129 Position acceptEodState;
130 Position acceptNlEodState;
131 Position acceptNlState;
132
133 NFABuilder &builder; //!< \brief builder for the NFAGraph
134
135 bool doPrefilter; //!< \brief we're building a prefiltering pattern
136
137 /** \brief Map storing successors for each position. */
138 map<Position, flat_set<PositionInfo>> successors;
139};
140
141} // namespace
142
143GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b,
144 bool prefilter) :
145 startState(b.getStart()),
146 startDotstarState(b.getStartDotStar()),
147 acceptState(b.getAccept()),
148 acceptEodState(b.getAcceptEOD()),
149 acceptNlEodState(POS_UNINITIALIZED),
150 acceptNlState(POS_UNINITIALIZED),
151 builder(b),
152 doPrefilter(prefilter)
153{
154 // Our special nodes need special relationships.
155 vector<PositionInfo> lasts, firsts;
156
157 // start->startDs and startDs self-loop.
158 lasts.push_back(startState);
159 lasts.push_back(startDotstarState);
160 firsts.push_back(startDotstarState);
161 connectRegions(lasts, firsts);
162
163 // accept to acceptEod edges already wired
164
165 // XXX: a small hack to support vacuous NFAs: give start and startDs an
166 // initial report ID.
167 builder.setNodeReportID(startState, 0);
168 builder.setNodeReportID(startDotstarState, 0);
169}
170
171static
172void checkEmbeddedEndAnchor(const PositionInfo &from,
173 const vector<PositionInfo> &firsts) {
174 if (!(from.flags & POS_FLAG_ONLY_ENDS)) {
175 return;
176 }
177
178 for (const auto &first : firsts) {
179 if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) {
180 /* can make it through the parse tree */
181 throw ParseError("Embedded end anchors not supported.");
182 }
183 }
184}
185
186// Wire up the lasts of one component to the firsts of another
187void
188GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts,
189 const vector<PositionInfo> &firsts) {
190 for (const auto &last : lasts) {
191 checkEmbeddedEndAnchor(last, firsts);
192 connectSuccessors(last, firsts);
193 }
194}
195
196static
197void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from,
198 vector<PositionInfo> &tolist) {
199 if (from.pos == bs.startDotstarState) {
200 // If we're connecting from start-dotstar, remove all caret flavoured
201 // positions.
202 CheckPositionFlags check(POS_FLAG_NOFLOAT);
203 tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
204 tolist.end());
205 if (from.flags & POS_FLAG_NOFLOAT) {
206 tolist.clear();
207 }
208 } else if (from.pos == bs.startState) {
209 // If we're connecting from start, we should remove any epsilons that
210 // aren't caret flavoured.
211 CheckUnflaggedEpsilon check;
212 tolist.erase(remove_if(tolist.begin(), tolist.end(), check),
213 tolist.end());
214 CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT);
215 tolist.erase(remove_if(tolist.begin(), tolist.end(), check2),
216 tolist.end());
217 }
218
219 if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) {
220 // If we have a (mildly boneheaded) pattern like /^$/m, we're right up
221 // against the edge of what we can do without true assertion support.
222 // Here we have an evil hack to prevent us plugging the \n generated by
223 // the caret right into acceptEod (which is in the firsts of the
224 // dollar).
225 /* This is due to the 'interesting quirk' that multiline ^ does not
226 * not match a newline at the end of buffer. */
227 DEBUG_PRINTF("multiline start - no eod\n");
228 tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState),
229 tolist.end());
230 }
231}
232
233static
234Position makeNewlineAssertPos(GlushkovBuildState &bs) {
235 NFABuilder &builder = bs.getBuilder();
236 Position newline = builder.makePositions(1);
237 builder.addCharReach(newline, CharReach('\n'));
238 builder.setAssertFlag(newline, POS_FLAG_FIDDLE_ACCEPT);
239 builder.setNodeReportID(newline, -1);
240 return newline;
241}
242
243static
244void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from,
245 vector<PositionInfo> *tolist) {
246 NFABuilder &builder = bs.getBuilder();
247 u32 flags = from.flags;
248
249 bool require_eod = flags & POS_FLAG_WIRE_EOD;
250 bool require_nl_eod = flags & POS_FLAG_WIRE_NL_EOD
251 && !(flags & POS_FLAG_NO_NL_EOD);
252 bool require_nl_accept = (flags & POS_FLAG_WIRE_NL_ACCEPT)
253 && !(flags & POS_FLAG_NO_NL_ACCEPT);
254
255 bool require_accept = !(flags & POS_FLAG_ONLY_ENDS);
256
257 if (require_eod) {
258 tolist->push_back(bs.acceptEodState);
259 }
260
261 if (require_nl_accept) {
262 if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) {
263 Position newline = makeNewlineAssertPos(bs);
264 bs.addSuccessor(newline, builder.getAccept());
265 bs.acceptNlState = newline;
266 }
267 tolist->push_back(bs.acceptNlState);
268 }
269
270 if (require_nl_eod) {
271 if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) {
272 Position newline = makeNewlineAssertPos(bs);
273 bs.addSuccessor(newline, builder.getAcceptEOD());
274 bs.acceptNlEodState = newline;
275 }
276 tolist->push_back(bs.acceptNlEodState);
277 }
278
279 if (require_accept) {
280 tolist->push_back(bs.acceptState);
281 }
282}
283
284void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) {
285 for (const auto &last : lasts) {
286 vector<PositionInfo> accepts;
287 generateAccepts(*this, last, &accepts);
288 connectSuccessors(last, accepts);
289 }
290}
291
292#if defined(DEBUG) || defined(DUMP_SUPPORT)
293
294static UNUSED
295string dumpCaptures(const PositionInfo &p) {
296 ostringstream oss;
297
298 if (p.flags & POS_FLAG_NOFLOAT) {
299 oss << "<nofloat>";
300 }
301 if (p.flags & POS_FLAG_MUST_FLOAT) {
302 oss << "<must_float>";
303 }
304 if (p.flags & POS_FLAG_FIDDLE_ACCEPT) {
305 oss << "<fiddle_accept>";
306 }
307 if (p.flags & POS_FLAG_ONLY_ENDS) {
308 oss << "<only_ends>";
309 }
310 if (p.flags & POS_FLAG_NO_NL_EOD) {
311 oss << "<no_nl_eod>";
312 }
313 if (p.flags & POS_FLAG_NO_NL_ACCEPT) {
314 oss << "<no_nl_acc>";
315 }
316
317 return oss.str();
318}
319
320#endif // DEBUG || DUMP_SUPPORT
321
322void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from,
323 vector<PositionInfo> tolist) {
324 /* note: tolist maybe modified for our own internal use -> not a reference */
325 assert(from.pos != POS_EPSILON);
326 assert(from.pos != POS_UNINITIALIZED);
327 assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED)
328 == tolist.end());
329
330 DEBUG_PRINTF("FROM = %u%s TO = %s\n", from.pos, dumpCaptures(from).c_str(),
331 dumpPositions(tolist.begin(), tolist.end()).c_str());
332
333 /* prevent creation of edges with invalid assertions */
334 filterEdges(*this, from, tolist);
335
336 if (from.flags & POS_FLAG_FIDDLE_ACCEPT) {
337 auto accept = find(tolist.begin(), tolist.end(), acceptState);
338 if (accept != tolist.end()) {
339 DEBUG_PRINTF("accept through -1 offset-adjusting dot\n");
340 Position fakedot = builder.makePositions(1);
341 builder.addCharReach(fakedot, CharReach(0x00, 0xff));
342 builder.setNodeReportID(fakedot, -1);
343 addSuccessor(fakedot, acceptState);
344 *accept = fakedot;
345 } else {
346 // We might lead to accept via an assertion vertex, so we add the
347 // offset adj to this vertex itself. Used for cases like /^\B/m,
348 // which should match only at 0 for '\n'.
349 builder.setNodeReportID(from.pos, -1);
350 }
351
352 assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end());
353 }
354
355 auto &succ = successors[from.pos];
356
357 DEBUG_PRINTF("connect %u -> %s\n", from.pos,
358 dumpPositions(tolist.begin(), tolist.end()).c_str());
359 DEBUG_PRINTF("%u curr succ: %s\n", from.pos,
360 dumpPositions(begin(succ), end(succ)).c_str());
361
362 for (const auto &to : tolist) {
363 if (to.pos != POS_EPSILON) {
364 succ.insert(to);
365 }
366 }
367
368 DEBUG_PRINTF("%u succ: %s\n", from.pos,
369 dumpPositions(begin(succ), end(succ)).c_str());
370}
371
372void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) {
373 DEBUG_PRINTF("connect %u -> %u\n", from, to);
374 assert(from != POS_EPSILON && from != POS_UNINITIALIZED);
375 assert(to != POS_EPSILON && to != POS_UNINITIALIZED);
376
377 auto &succ = successors[from];
378 succ.insert(to);
379
380 DEBUG_PRINTF("%u succ: %s\n", from,
381 dumpPositions(begin(succ), end(succ)).c_str());
382}
383
384void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last,
385 unsigned offset) {
386 assert(first <= last);
387
388 // Clone vertex properties (reachability, etc)
389 builder.cloneRegion(first, last, offset);
390
391 /* Clone the successors of all the positions between first and last
392 * inclusive, producing a new set of positions starting at (first +
393 * offset). */
394 for (Position i = first; i <= last; i++) {
395 // This should be a new position.
396 assert(successors[i + offset].empty());
397
398 for (const PositionInfo &to : successors[i]) {
399 if (to.pos >= first && to.pos <= last) {
400 PositionInfo clone(to);
401 clone.pos += offset;
402 DEBUG_PRINTF("clone: %u -> %u\n", i + offset, clone.pos);
403 successors[i + offset].insert(clone);
404 } else {
405 // There shouldn't be any stray edges leading out of this
406 // region!
407 assert(0);
408 }
409 }
410 }
411}
412
413void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) {
414 // Guard against embedded anchors
415 if (to == startState) {
416 /* can make it through the parse tree */
417 throw ParseError("Embedded start anchors not supported.");
418 }
419
420 assert(to.pos != POS_UNINITIALIZED);
421 assert(to.pos != POS_EPSILON);
422
423 if (builder.hasEdge(from, to.pos)) {
424 return;
425 }
426
427 builder.addEdge(from, to.pos);
428}
429
430void GlushkovBuildStateImpl::buildEdges() {
431 // Create all the edges and track which vertices are asserts which need to
432 // be removed later.
433 for (const auto &m : successors) {
434 const Position from = m.first;
435 for (const auto &to : m.second) {
436 buildEdge(from, to);
437 }
438 }
439}
440
441// Construct a usable GlushkovBuildState for the outside world.
442unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b,
443 bool prefilter) {
444 return ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter);
445}
446
447// free functions for utility use
448
449/** \brief Eliminate lower-priority duplicate PositionInfo entries.
450 *
451 * Scans through a list of positions and retains only the highest priority
452 * version of a given (position, flags) entry. */
453void cleanupPositions(vector<PositionInfo> &a) {
454 ue2_unordered_set<pair<Position, int>> seen;
455
456 vector<PositionInfo> out;
457 out.reserve(a.size()); // output should be close to input in size.
458
459 for (const auto &p : a) {
460 if (seen.emplace(p.pos, p.flags).second) {
461 out.push_back(p); // first encounter
462 }
463 }
464
465 DEBUG_PRINTF("in %zu; out %zu\n", a.size(), out.size());
466 a.swap(out);
467}
468
469static
470vector<PositionInfo>::iterator
471replaceElemWithSequence(vector<PositionInfo> &dest,
472 vector<PositionInfo>::iterator &victim,
473 const vector<PositionInfo> &replacement) {
474 auto past = dest.erase(victim);
475 size_t d = distance(dest.begin(), past) + replacement.size();
476 dest.insert(past, replacement.begin(), replacement.end());
477 /* recalc past as iterator may have been invalidated */
478 return dest.begin() + d;
479}
480
481/** \brief Replace all epsilons with the given positions.
482 *
483 * Replace epsilons in a firsts list with another given firsts list. Note: the
484 * firsts lists must come from disjoint sets of components. If no epsilons are
485 * in the first firsts list the source is appended to the end.
486 */
487void replaceEpsilons(vector<PositionInfo> &target,
488 const vector<PositionInfo> &source) {
489 auto found =
490 find(target.begin(), target.end(), GlushkovBuildState::POS_EPSILON);
491
492 if (found == target.end()) {
493 // no epsilons to replace, push on to the end
494 target.insert(target.end(), source.begin(), source.end());
495 return;
496 }
497
498 while (found != target.end()) {
499 checkEmbeddedEndAnchor(*found, source);
500
501 // replace this epsilon with a copy of source with the same flags
502 vector<PositionInfo> newsource(source);
503 for (auto &pos : newsource) {
504 pos.flags |= found->flags;
505 }
506
507 found = replaceElemWithSequence(target, found, newsource);
508 // find the next epsilon
509 found = find(found, target.end(), GlushkovBuildState::POS_EPSILON);
510 }
511
512 cleanupPositions(target);
513}
514
515#ifdef DUMP_SUPPORT
516
517void dump(ostream &os, const PositionInfo &p) {
518 if (p.pos == GlushkovBuildState::POS_EPSILON) {
519 os << "epsilon";
520 } else {
521 os << p.pos;
522 }
523
524 os << dumpCaptures(p);
525}
526
527#endif // DUMP_SUPPORT
528
529} // namespace ue2
530