1/*
2 * Copyright (c) 2015, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Repeats ('*', '+', '?', '{M,N}', etc)
31 */
32
33
34#include "ComponentRepeat.h"
35
36#include "buildstate.h"
37#include "nfagraph/ng_builder.h"
38#include "parse_error.h"
39#include "Parser.h"
40#include "position.h"
41#include "position_dump.h"
42#include "position_info.h"
43#include "ue2common.h"
44#include "util/make_unique.h"
45
46#include <algorithm>
47#include <cassert>
48
49using namespace std;
50
51namespace ue2 {
52
53/** \brief Hard limit on the maximum repeat for bounded repeats. */
54static constexpr u32 MAX_REPEAT = 32767;
55
56/** \brief If expanding a repeat would lead to this many positions being
57 * generated, we fail the pattern. */
58static constexpr u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge
59
60/* no edge priorities means that if our subcomponent can be empty, our min
61 * extent is effectively zero. */
62ComponentRepeat::ComponentRepeat(unique_ptr<Component> sub_comp_in, u32 min,
63 u32 max, enum RepeatType t)
64 : type(t), sub_comp(move(sub_comp_in)), m_min(min), m_max(max),
65 posFirst(GlushkovBuildState::POS_UNINITIALIZED),
66 posLast(GlushkovBuildState::POS_UNINITIALIZED) {
67 assert(sub_comp);
68 assert(max > 0);
69 assert(m_min <= m_max);
70
71 if (m_min > MAX_REPEAT) {
72 throw ParseError("Bounded repeat is too large.");
73 }
74 if (m_max != NoLimit && m_max > MAX_REPEAT) {
75 throw ParseError("Bounded repeat is too large.");
76 }
77}
78
79ComponentRepeat::~ComponentRepeat() {}
80
81ComponentRepeat *ComponentRepeat::clone() const {
82 return new ComponentRepeat(*this);
83}
84
85ComponentRepeat::ComponentRepeat(const ComponentRepeat &other)
86 : Component(other),
87 type(other.type), sub_comp(unique_ptr<Component>(other.sub_comp->clone())),
88 m_min(other.m_min), m_max(other.m_max),
89 m_firsts(other.m_firsts), m_lasts(other.m_lasts),
90 posFirst(other.posFirst), posLast(other.posLast) {}
91
92bool ComponentRepeat::empty() const {
93 return m_min == 0 || sub_comp->empty();
94}
95
96bool ComponentRepeat::repeatable() const {
97 return false;
98}
99
100static
101void addBase(Position base, vector<PositionInfo> &firsts,
102 vector<PositionInfo> &lasts) {
103 for (auto &e : firsts) {
104 if (e.pos != GlushkovBuildState::POS_EPSILON) {
105 e.pos += base;
106 }
107 }
108 for (auto &e : lasts) {
109 e.pos += base;
110 }
111}
112
113static
114void checkPositions(vector<PositionInfo> &v, const GlushkovBuildState &bs) {
115 const NFABuilder& builder = bs.getBuilder();
116 for (const auto &e : v) {
117 if (builder.isSpecialState(e.pos)) {
118 throw ParseError("Embedded anchors not supported.");
119 }
120 }
121}
122
123void ComponentRepeat::notePositions(GlushkovBuildState &bs) {
124 assert(m_max > 0);
125 assert(m_max == NoLimit || m_max < MAX_REPEAT);
126
127 /* Note: We can construct smaller subgraphs if we're not maintaining edge
128 * priorities. */
129
130 // We create one copy only through a recursive call to notePositions(),
131 // first() and last(). Then we clone its positions and store the
132 // appropriate firsts and lasts values for the copies.
133 posFirst = bs.getBuilder().numVertices();
134 sub_comp->notePositions(bs);
135
136 u32 copies = m_max < NoLimit ? m_max : MAX(m_min, 1);
137 DEBUG_PRINTF("building %u copies of repeated region\n", copies);
138 m_firsts.clear();
139 m_lasts.clear();
140 m_firsts.resize(copies);
141 m_lasts.resize(copies);
142
143 m_firsts[0] = sub_comp->first();
144 m_lasts[0] = sub_comp->last();
145
146 postSubNotePositionHook();
147
148 posLast = bs.getBuilder().numVertices() - 1;
149 u32 vcount = posLast + 1 - posFirst;
150
151 // If we're making more than one copy, then our firsts and lasts must only
152 // contain vertices inside [posFirst, posLast]: anything else means we have
153 // an embedded anchor or otherwise weird situation.
154 if (copies > 1) {
155 checkPositions(m_firsts[0], bs);
156 checkPositions(m_lasts[0], bs);
157 }
158
159 // Avoid enormous expansions
160 if (vcount * copies > MAX_POSITIONS_EXPANDED) {
161 throw ParseError("Bounded repeat is too large.");
162 }
163
164 // Add positions for the rest of the copies
165 size_t copyPositions = vcount * (copies - 1);
166 bs.getBuilder().makePositions(copyPositions);
167
168 // Calculate our firsts and lasts for the copies
169 for (u32 i = 1; i < copies; ++i) {
170 m_firsts[i] = m_firsts[0];
171 m_lasts[i] = m_lasts[0];
172 u32 base = i * vcount;
173 addBase(base, m_firsts[i], m_lasts[i]);
174 }
175
176 recordPosBounds(posFirst, bs.getBuilder().numVertices());
177
178 // Each optional repeat has an epsilon at the end of its firsts list.
179 for (u32 i = m_min; i < m_firsts.size(); i++) {
180 m_firsts[i].push_back(GlushkovBuildState::POS_EPSILON);
181 }
182
183}
184
185vector<PositionInfo> ComponentRepeat::first() const {
186 if (!m_max) {
187 return {};
188 }
189
190 assert(!m_firsts.empty()); // notePositions should already have run
191 const vector<PositionInfo> &firsts = m_firsts.front();
192 DEBUG_PRINTF("firsts = %s\n",
193 dumpPositions(begin(firsts), end(firsts)).c_str());
194 return firsts;
195}
196
197void ComponentRepeat::buildFollowSet(GlushkovBuildState &bs,
198 const vector<PositionInfo> &lastPos) {
199 if (!m_max) {
200 return;
201 }
202 DEBUG_PRINTF("enter\n");
203
204 // Wire up the first (the "real") entry
205
206 DEBUG_PRINTF("initial repeat\n");
207 sub_comp->buildFollowSet(bs, lastPos);
208
209 // Clone the subgraph we just added N times, where N is the minimum extent
210 // of the graph minus one, wiring them up in a linear sequence
211
212 u32 copies = m_firsts.size();
213 DEBUG_PRINTF("cloning %u copies of repeat\n", copies - 1);
214 for (u32 rep = 1; rep < copies; rep++) {
215 u32 offset = (posLast + 1 - posFirst) * rep;
216 if (offset > 0) {
217 bs.cloneFollowSet(posFirst, posLast, offset);
218 }
219 }
220
221 wireRepeats(bs);
222
223 DEBUG_PRINTF("leave\n");
224}
225
226void ComponentRepeat::optimise(bool connected_to_sds) {
227 DEBUG_PRINTF("opt %d\n", (int)connected_to_sds);
228 if (!connected_to_sds) {
229 return;
230 }
231
232 DEBUG_PRINTF("setting m_max to %u\n", m_min);
233 m_max = m_min;
234}
235
236bool ComponentRepeat::vacuous_everywhere() const {
237 return !m_min || sub_comp->vacuous_everywhere();
238}
239
240bool ComponentRepeat::checkEmbeddedStartAnchor(bool at_start) const {
241 at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
242
243 if (m_max > 1) {
244 at_start = sub_comp->checkEmbeddedStartAnchor(at_start);
245 }
246
247 return at_start;
248}
249
250bool ComponentRepeat::checkEmbeddedEndAnchor(bool at_end) const {
251 at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
252
253 if (m_max > 1) {
254 at_end = sub_comp->checkEmbeddedEndAnchor(at_end);
255 }
256
257 return at_end;
258}
259
260Component *ComponentRepeat::accept(ComponentVisitor &v) {
261 Component *c = v.visit(this);
262 if (c != this) {
263 v.post(this);
264 return c;
265 }
266
267 c = sub_comp->accept(v);
268 if (c != sub_comp.get()) {
269 sub_comp.reset(c);
270 }
271
272 v.post(this);
273 return !sub_comp ? nullptr : this;
274}
275
276void ComponentRepeat::accept(ConstComponentVisitor &v) const {
277 v.pre(*this);
278 sub_comp->accept(v);
279 v.post(*this);
280}
281
282vector<PositionInfo> ComponentRepeat::last() const {
283 vector<PositionInfo> lasts;
284 if (!m_max) {
285 return lasts;
286 }
287
288 assert(!m_firsts.empty()); // notePositions should already have run
289 assert(!m_lasts.empty());
290
291 const auto &l = m_min ? m_lasts[m_min - 1] : m_lasts[0];
292 lasts.insert(lasts.end(), l.begin(), l.end());
293
294 if (!m_min || m_min != m_lasts.size()) {
295 lasts.insert(lasts.end(), m_lasts.back().begin(), m_lasts.back().end());
296 }
297
298 DEBUG_PRINTF("lasts = %s\n",
299 dumpPositions(lasts.begin(), lasts.end()).c_str());
300 return lasts;
301}
302
303void ComponentRepeat::wireRepeats(GlushkovBuildState &bs) {
304 /* note: m_lasts[0] already valid */
305 u32 copies = m_firsts.size();
306 const bool isEmpty = sub_comp->empty();
307 const vector<PositionInfo> &optLasts =
308 m_min ? m_lasts[m_min - 1] : m_lasts[0];
309
310 if (!copies) {
311 goto inf_check;
312 }
313
314 DEBUG_PRINTF("wiring up %u mand repeats\n", m_min);
315 for (u32 rep = 1; rep < m_min; rep++) {
316 bs.connectRegions(m_lasts[rep - 1], m_firsts[rep]);
317
318 if (isEmpty) {
319 m_lasts[rep].insert(m_lasts[rep].end(), m_lasts[rep - 1].begin(),
320 m_lasts[rep - 1].end());
321 }
322 }
323
324 DEBUG_PRINTF("wiring up %d optional repeats\n", copies - m_min);
325 for (u32 rep = MAX(m_min, 1); rep < copies; rep++) {
326 vector<PositionInfo> lasts = m_lasts[rep - 1];
327 if (rep != m_min) {
328 lasts.insert(lasts.end(), optLasts.begin(), optLasts.end());
329 sort(lasts.begin(), lasts.end());
330 lasts.erase(unique(lasts.begin(), lasts.end()), lasts.end());
331 }
332 bs.connectRegions(lasts, m_firsts[rep]);
333 }
334
335inf_check:
336 // If we have no max bound, we need a self-loop as well.
337 if (m_max == NoLimit) {
338 DEBUG_PRINTF("final repeat self-loop\n");
339 bs.connectRegions(m_lasts.back(), m_firsts.back());
340 }
341}
342
343static
344bool hasPositionFlags(const Component &c) {
345 for (const auto &e : c.first()) {
346 if (e.flags) {
347 return true;
348 }
349 }
350 return false;
351}
352
353void ComponentRepeat::postSubNotePositionHook() {
354 // UE-444 optimization: we can REWRITE m_min under various circumstances,
355 // so that we create smaller NFA graphs. Note that this is _not_ possible
356 // if our subcomponent contains a flagged position, e.g. nofloat.
357 if (!hasPositionFlags(*sub_comp) && sub_comp->empty()) {
358 m_min = 0;
359 }
360}
361
362unique_ptr<ComponentRepeat> makeComponentRepeat(unique_ptr<Component> sub_comp,
363 u32 min, u32 max,
364 ComponentRepeat::RepeatType t) {
365 return ue2::make_unique<ComponentRepeat>(move(sub_comp), min, max, t);
366}
367
368} // namespace ue2
369