1 | /* |
2 | * Copyright (c) 2015, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Repeats ('*', '+', '?', '{M,N}', etc) |
31 | */ |
32 | |
33 | |
34 | #include "ComponentRepeat.h" |
35 | |
36 | #include "buildstate.h" |
37 | #include "nfagraph/ng_builder.h" |
38 | #include "parse_error.h" |
39 | #include "Parser.h" |
40 | #include "position.h" |
41 | #include "position_dump.h" |
42 | #include "position_info.h" |
43 | #include "ue2common.h" |
44 | #include "util/make_unique.h" |
45 | |
46 | #include <algorithm> |
47 | #include <cassert> |
48 | |
49 | using namespace std; |
50 | |
51 | namespace ue2 { |
52 | |
53 | /** \brief Hard limit on the maximum repeat for bounded repeats. */ |
54 | static constexpr u32 MAX_REPEAT = 32767; |
55 | |
56 | /** \brief If expanding a repeat would lead to this many positions being |
57 | * generated, we fail the pattern. */ |
58 | static constexpr u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge |
59 | |
60 | /* no edge priorities means that if our subcomponent can be empty, our min |
61 | * extent is effectively zero. */ |
62 | ComponentRepeat::ComponentRepeat(unique_ptr<Component> sub_comp_in, u32 min, |
63 | u32 max, enum RepeatType t) |
64 | : type(t), sub_comp(move(sub_comp_in)), m_min(min), m_max(max), |
65 | posFirst(GlushkovBuildState::POS_UNINITIALIZED), |
66 | posLast(GlushkovBuildState::POS_UNINITIALIZED) { |
67 | assert(sub_comp); |
68 | assert(max > 0); |
69 | assert(m_min <= m_max); |
70 | |
71 | if (m_min > MAX_REPEAT) { |
72 | throw ParseError("Bounded repeat is too large." ); |
73 | } |
74 | if (m_max != NoLimit && m_max > MAX_REPEAT) { |
75 | throw ParseError("Bounded repeat is too large." ); |
76 | } |
77 | } |
78 | |
79 | ComponentRepeat::~ComponentRepeat() {} |
80 | |
81 | ComponentRepeat *ComponentRepeat::clone() const { |
82 | return new ComponentRepeat(*this); |
83 | } |
84 | |
85 | ComponentRepeat::ComponentRepeat(const ComponentRepeat &other) |
86 | : Component(other), |
87 | type(other.type), sub_comp(unique_ptr<Component>(other.sub_comp->clone())), |
88 | m_min(other.m_min), m_max(other.m_max), |
89 | m_firsts(other.m_firsts), m_lasts(other.m_lasts), |
90 | posFirst(other.posFirst), posLast(other.posLast) {} |
91 | |
92 | bool ComponentRepeat::empty() const { |
93 | return m_min == 0 || sub_comp->empty(); |
94 | } |
95 | |
96 | bool ComponentRepeat::repeatable() const { |
97 | return false; |
98 | } |
99 | |
100 | static |
101 | void addBase(Position base, vector<PositionInfo> &firsts, |
102 | vector<PositionInfo> &lasts) { |
103 | for (auto &e : firsts) { |
104 | if (e.pos != GlushkovBuildState::POS_EPSILON) { |
105 | e.pos += base; |
106 | } |
107 | } |
108 | for (auto &e : lasts) { |
109 | e.pos += base; |
110 | } |
111 | } |
112 | |
113 | static |
114 | void checkPositions(vector<PositionInfo> &v, const GlushkovBuildState &bs) { |
115 | const NFABuilder& builder = bs.getBuilder(); |
116 | for (const auto &e : v) { |
117 | if (builder.isSpecialState(e.pos)) { |
118 | throw ParseError("Embedded anchors not supported." ); |
119 | } |
120 | } |
121 | } |
122 | |
123 | void ComponentRepeat::notePositions(GlushkovBuildState &bs) { |
124 | assert(m_max > 0); |
125 | assert(m_max == NoLimit || m_max < MAX_REPEAT); |
126 | |
127 | /* Note: We can construct smaller subgraphs if we're not maintaining edge |
128 | * priorities. */ |
129 | |
130 | // We create one copy only through a recursive call to notePositions(), |
131 | // first() and last(). Then we clone its positions and store the |
132 | // appropriate firsts and lasts values for the copies. |
133 | posFirst = bs.getBuilder().numVertices(); |
134 | sub_comp->notePositions(bs); |
135 | |
136 | u32 copies = m_max < NoLimit ? m_max : MAX(m_min, 1); |
137 | DEBUG_PRINTF("building %u copies of repeated region\n" , copies); |
138 | m_firsts.clear(); |
139 | m_lasts.clear(); |
140 | m_firsts.resize(copies); |
141 | m_lasts.resize(copies); |
142 | |
143 | m_firsts[0] = sub_comp->first(); |
144 | m_lasts[0] = sub_comp->last(); |
145 | |
146 | postSubNotePositionHook(); |
147 | |
148 | posLast = bs.getBuilder().numVertices() - 1; |
149 | u32 vcount = posLast + 1 - posFirst; |
150 | |
151 | // If we're making more than one copy, then our firsts and lasts must only |
152 | // contain vertices inside [posFirst, posLast]: anything else means we have |
153 | // an embedded anchor or otherwise weird situation. |
154 | if (copies > 1) { |
155 | checkPositions(m_firsts[0], bs); |
156 | checkPositions(m_lasts[0], bs); |
157 | } |
158 | |
159 | // Avoid enormous expansions |
160 | if (vcount * copies > MAX_POSITIONS_EXPANDED) { |
161 | throw ParseError("Bounded repeat is too large." ); |
162 | } |
163 | |
164 | // Add positions for the rest of the copies |
165 | size_t copyPositions = vcount * (copies - 1); |
166 | bs.getBuilder().makePositions(copyPositions); |
167 | |
168 | // Calculate our firsts and lasts for the copies |
169 | for (u32 i = 1; i < copies; ++i) { |
170 | m_firsts[i] = m_firsts[0]; |
171 | m_lasts[i] = m_lasts[0]; |
172 | u32 base = i * vcount; |
173 | addBase(base, m_firsts[i], m_lasts[i]); |
174 | } |
175 | |
176 | recordPosBounds(posFirst, bs.getBuilder().numVertices()); |
177 | |
178 | // Each optional repeat has an epsilon at the end of its firsts list. |
179 | for (u32 i = m_min; i < m_firsts.size(); i++) { |
180 | m_firsts[i].push_back(GlushkovBuildState::POS_EPSILON); |
181 | } |
182 | |
183 | } |
184 | |
185 | vector<PositionInfo> ComponentRepeat::first() const { |
186 | if (!m_max) { |
187 | return {}; |
188 | } |
189 | |
190 | assert(!m_firsts.empty()); // notePositions should already have run |
191 | const vector<PositionInfo> &firsts = m_firsts.front(); |
192 | DEBUG_PRINTF("firsts = %s\n" , |
193 | dumpPositions(begin(firsts), end(firsts)).c_str()); |
194 | return firsts; |
195 | } |
196 | |
197 | void ComponentRepeat::buildFollowSet(GlushkovBuildState &bs, |
198 | const vector<PositionInfo> &lastPos) { |
199 | if (!m_max) { |
200 | return; |
201 | } |
202 | DEBUG_PRINTF("enter\n" ); |
203 | |
204 | // Wire up the first (the "real") entry |
205 | |
206 | DEBUG_PRINTF("initial repeat\n" ); |
207 | sub_comp->buildFollowSet(bs, lastPos); |
208 | |
209 | // Clone the subgraph we just added N times, where N is the minimum extent |
210 | // of the graph minus one, wiring them up in a linear sequence |
211 | |
212 | u32 copies = m_firsts.size(); |
213 | DEBUG_PRINTF("cloning %u copies of repeat\n" , copies - 1); |
214 | for (u32 rep = 1; rep < copies; rep++) { |
215 | u32 offset = (posLast + 1 - posFirst) * rep; |
216 | if (offset > 0) { |
217 | bs.cloneFollowSet(posFirst, posLast, offset); |
218 | } |
219 | } |
220 | |
221 | wireRepeats(bs); |
222 | |
223 | DEBUG_PRINTF("leave\n" ); |
224 | } |
225 | |
226 | void ComponentRepeat::optimise(bool connected_to_sds) { |
227 | DEBUG_PRINTF("opt %d\n" , (int)connected_to_sds); |
228 | if (!connected_to_sds) { |
229 | return; |
230 | } |
231 | |
232 | DEBUG_PRINTF("setting m_max to %u\n" , m_min); |
233 | m_max = m_min; |
234 | } |
235 | |
236 | bool ComponentRepeat::vacuous_everywhere() const { |
237 | return !m_min || sub_comp->vacuous_everywhere(); |
238 | } |
239 | |
240 | bool ComponentRepeat::checkEmbeddedStartAnchor(bool at_start) const { |
241 | at_start = sub_comp->checkEmbeddedStartAnchor(at_start); |
242 | |
243 | if (m_max > 1) { |
244 | at_start = sub_comp->checkEmbeddedStartAnchor(at_start); |
245 | } |
246 | |
247 | return at_start; |
248 | } |
249 | |
250 | bool ComponentRepeat::checkEmbeddedEndAnchor(bool at_end) const { |
251 | at_end = sub_comp->checkEmbeddedEndAnchor(at_end); |
252 | |
253 | if (m_max > 1) { |
254 | at_end = sub_comp->checkEmbeddedEndAnchor(at_end); |
255 | } |
256 | |
257 | return at_end; |
258 | } |
259 | |
260 | Component *ComponentRepeat::accept(ComponentVisitor &v) { |
261 | Component *c = v.visit(this); |
262 | if (c != this) { |
263 | v.post(this); |
264 | return c; |
265 | } |
266 | |
267 | c = sub_comp->accept(v); |
268 | if (c != sub_comp.get()) { |
269 | sub_comp.reset(c); |
270 | } |
271 | |
272 | v.post(this); |
273 | return !sub_comp ? nullptr : this; |
274 | } |
275 | |
276 | void ComponentRepeat::accept(ConstComponentVisitor &v) const { |
277 | v.pre(*this); |
278 | sub_comp->accept(v); |
279 | v.post(*this); |
280 | } |
281 | |
282 | vector<PositionInfo> ComponentRepeat::last() const { |
283 | vector<PositionInfo> lasts; |
284 | if (!m_max) { |
285 | return lasts; |
286 | } |
287 | |
288 | assert(!m_firsts.empty()); // notePositions should already have run |
289 | assert(!m_lasts.empty()); |
290 | |
291 | const auto &l = m_min ? m_lasts[m_min - 1] : m_lasts[0]; |
292 | lasts.insert(lasts.end(), l.begin(), l.end()); |
293 | |
294 | if (!m_min || m_min != m_lasts.size()) { |
295 | lasts.insert(lasts.end(), m_lasts.back().begin(), m_lasts.back().end()); |
296 | } |
297 | |
298 | DEBUG_PRINTF("lasts = %s\n" , |
299 | dumpPositions(lasts.begin(), lasts.end()).c_str()); |
300 | return lasts; |
301 | } |
302 | |
303 | void ComponentRepeat::wireRepeats(GlushkovBuildState &bs) { |
304 | /* note: m_lasts[0] already valid */ |
305 | u32 copies = m_firsts.size(); |
306 | const bool isEmpty = sub_comp->empty(); |
307 | const vector<PositionInfo> &optLasts = |
308 | m_min ? m_lasts[m_min - 1] : m_lasts[0]; |
309 | |
310 | if (!copies) { |
311 | goto inf_check; |
312 | } |
313 | |
314 | DEBUG_PRINTF("wiring up %u mand repeats\n" , m_min); |
315 | for (u32 rep = 1; rep < m_min; rep++) { |
316 | bs.connectRegions(m_lasts[rep - 1], m_firsts[rep]); |
317 | |
318 | if (isEmpty) { |
319 | m_lasts[rep].insert(m_lasts[rep].end(), m_lasts[rep - 1].begin(), |
320 | m_lasts[rep - 1].end()); |
321 | } |
322 | } |
323 | |
324 | DEBUG_PRINTF("wiring up %d optional repeats\n" , copies - m_min); |
325 | for (u32 rep = MAX(m_min, 1); rep < copies; rep++) { |
326 | vector<PositionInfo> lasts = m_lasts[rep - 1]; |
327 | if (rep != m_min) { |
328 | lasts.insert(lasts.end(), optLasts.begin(), optLasts.end()); |
329 | sort(lasts.begin(), lasts.end()); |
330 | lasts.erase(unique(lasts.begin(), lasts.end()), lasts.end()); |
331 | } |
332 | bs.connectRegions(lasts, m_firsts[rep]); |
333 | } |
334 | |
335 | inf_check: |
336 | // If we have no max bound, we need a self-loop as well. |
337 | if (m_max == NoLimit) { |
338 | DEBUG_PRINTF("final repeat self-loop\n" ); |
339 | bs.connectRegions(m_lasts.back(), m_firsts.back()); |
340 | } |
341 | } |
342 | |
343 | static |
344 | bool hasPositionFlags(const Component &c) { |
345 | for (const auto &e : c.first()) { |
346 | if (e.flags) { |
347 | return true; |
348 | } |
349 | } |
350 | return false; |
351 | } |
352 | |
353 | void ComponentRepeat::postSubNotePositionHook() { |
354 | // UE-444 optimization: we can REWRITE m_min under various circumstances, |
355 | // so that we create smaller NFA graphs. Note that this is _not_ possible |
356 | // if our subcomponent contains a flagged position, e.g. nofloat. |
357 | if (!hasPositionFlags(*sub_comp) && sub_comp->empty()) { |
358 | m_min = 0; |
359 | } |
360 | } |
361 | |
362 | unique_ptr<ComponentRepeat> makeComponentRepeat(unique_ptr<Component> sub_comp, |
363 | u32 min, u32 max, |
364 | ComponentRepeat::RepeatType t) { |
365 | return ue2::make_unique<ComponentRepeat>(move(sub_comp), min, max, t); |
366 | } |
367 | |
368 | } // namespace ue2 |
369 | |