1/*
2 * Copyright (c) 2015-2016, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Prefiltering component tree transformation.
31 */
32#include "ComponentAssertion.h"
33#include "ComponentAtomicGroup.h"
34#include "ComponentBackReference.h"
35#include "ComponentBoundary.h"
36#include "ComponentClass.h"
37#include "ComponentCondReference.h"
38#include "ComponentRepeat.h"
39#include "ComponentSequence.h"
40#include "ComponentVisitor.h"
41#include "ComponentWordBoundary.h"
42#include "ConstComponentVisitor.h"
43#include "Parser.h"
44#include "prefilter.h"
45
46#include <algorithm>
47#include <stack>
48
49using namespace std;
50
51namespace ue2 {
52
53/** \brief Max number of positions a referent can have to be considered safe to
54 * replace a reference in prefiltering mode. */
55static const size_t MAX_REFERENT_POSITIONS = 1;
56
57/** \brief Constructs a \ref ComponentClass that matches a dot (any
58 * byte/codepoint, depending on whether UTF-8). */
59static
60unique_ptr<ComponentClass> makeDotClass(const ParseMode &mode_in) {
61 ParseMode mode(mode_in);
62 mode.dotall = true;
63 return generateComponent(CLASS_ANY, false, mode);
64}
65
66namespace {
67
68/**
69 * \brief Visitor used to determine if a given referent component is safe to
70 * replace its reference in prefiltering mode. Throws
71 * SafeReferentVisitor::Unsafe to terminate early on unsafe cases. */
72class SafeReferentVisitor : public DefaultConstComponentVisitor {
73public:
74 struct Unsafe {};
75
76 SafeReferentVisitor() : numPositions(0) {}
77
78 bool is_safe() const {
79 DEBUG_PRINTF("numPositions = %zu\n", numPositions);
80 return numPositions <= MAX_REFERENT_POSITIONS;
81 }
82
83 using DefaultConstComponentVisitor::pre;
84 using DefaultConstComponentVisitor::post;
85
86 void pre(const AsciiComponentClass &) override {
87 numPositions++;
88 }
89
90 void pre(const UTF8ComponentClass &) override {
91 // FIXME: we should be able to tell precisely how many positions this
92 // class will use. Right now, use the worst case.
93 numPositions += 4;
94 }
95
96 void pre(const ComponentBoundary &) override {
97 numPositions++;
98 }
99
100 void pre(const ComponentByte &) override {
101 numPositions++;
102 }
103
104 void pre(const ComponentEUS &) override {
105 numPositions++;
106 }
107
108 void pre(const ComponentRepeat &) override {
109 // Record the number of positions used before we visit the contents of
110 // the repeat.
111 countStack.push(numPositions);
112 }
113
114 void post(const ComponentRepeat &c) override {
115 assert(!countStack.empty());
116 size_t before = countStack.top();
117 countStack.pop();
118 assert(before <= numPositions);
119
120 std::pair<u32, u32> bounds = c.getBounds();
121 size_t subPositions = numPositions - before;
122 size_t copies = bounds.second < ComponentRepeat::NoLimit
123 ? bounds.second
124 : max(bounds.first, 1U);
125 numPositions = before + (subPositions * copies);
126 }
127
128 void pre(const ComponentWordBoundary &) override {
129 // not quite accurate, as these are expanded out in assert
130 // resolution...
131 numPositions++;
132 }
133
134 void pre(const ComponentBackReference &) override {
135 throw Unsafe();
136 }
137
138 void pre(const ComponentCondReference &) override {
139 throw Unsafe();
140 }
141
142private:
143 size_t numPositions;
144
145 // For temporary use
146 std::stack<size_t> countStack;
147};
148
149static
150bool isSafeReferent(const Component &c) {
151 try {
152 SafeReferentVisitor vis;
153 c.accept(vis);
154 return vis.is_safe();
155 }
156 catch (const SafeReferentVisitor::Unsafe &) {
157 return false;
158 }
159}
160
161/**
162 * \brief Visitor to find the \ref ComponentSequence with a given reference ID
163 * or name: if found, the visitor will throw a const ptr to it.
164 */
165class FindSequenceVisitor : public DefaultConstComponentVisitor {
166public:
167 explicit FindSequenceVisitor(unsigned ref_id) : id(ref_id) {}
168 explicit FindSequenceVisitor(const std::string &s) : name(s) {}
169
170 using DefaultConstComponentVisitor::pre;
171
172 void pre(const ComponentSequence &c) override {
173 if (!name.empty()) {
174 if (c.getCaptureName() == name) {
175 throw &c;
176 }
177 } else if (c.getCaptureIndex() == id) {
178 throw &c;
179 }
180 }
181private:
182 const std::string name;
183 const unsigned id = 0;
184};
185
186static
187const ComponentSequence *findCapturingGroup(const Component *root,
188 FindSequenceVisitor &vis) {
189 try {
190 root->accept(vis);
191 DEBUG_PRINTF("group not found\n");
192 return nullptr;
193 } catch (const ComponentSequence *seq) {
194 return seq;
195 }
196}
197
198} // namespace
199
200/**
201 * \brief Visitor to apply prefilter reductions, swapping components for which
202 * we don't have real implementations with implementable ones. Any such
203 * replacement should produce a superset of the matches that would be produced
204 * by the original.
205 */
206class PrefilterVisitor : public DefaultComponentVisitor {
207public:
208 PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {}
209 ~PrefilterVisitor() override;
210
211 using DefaultComponentVisitor::visit;
212
213 /** \brief Calls the visitor (recursively) on a new replacement component
214 * we've just created. Takes care of freeing it if the sequence is itself
215 * replaced. */
216 template<class T>
217 Component *visit_replacement(T *r) {
218 Component *c = r->accept(*this);
219 if (c != r) {
220 delete r;
221 }
222 return c;
223 }
224
225 Component *visit(ComponentBackReference *c) override {
226 assert(c);
227
228 // If the referent is simple (represents a single position), then we
229 // replace the back-reference with a copy of it.
230 const ComponentSequence *ref = nullptr;
231 const std::string &ref_name = c->getRefName();
232 const unsigned ref_id = c->getRefID();
233 if (!ref_name.empty()) {
234 FindSequenceVisitor vis(ref_name);
235 ref = findCapturingGroup(root, vis);
236 } else if (ref_id > 0) {
237 FindSequenceVisitor vis(ref_id);
238 ref = findCapturingGroup(root, vis);
239 }
240
241 if (ref && isSafeReferent(*ref)) {
242 DEBUG_PRINTF("found safe ref %p\n", ref);
243 ComponentSequence *seq = ref->clone();
244 // Remove labels from cloned sequence.
245 seq->setCaptureName("");
246 seq->setCaptureIndex(ComponentSequence::NOT_CAPTURED);
247
248 return visit_replacement(seq);
249 }
250
251 // Replace with ".*".
252 auto rep = makeComponentRepeat(makeDotClass(mode), 0,
253 ComponentRepeat::NoLimit,
254 ComponentRepeat::REPEAT_GREEDY);
255 return rep.release(); // FIXME: owning raw ptr
256 }
257
258 Component *visit(UNUSED ComponentAssertion *c) override {
259 assert(c);
260 // Replace with an empty sequence.
261 return new ComponentSequence();
262 }
263
264 Component *visit(ComponentRepeat *c) override {
265 assert(c);
266 // Possessive repeats become greedy.
267 if (c->type == ComponentRepeat::REPEAT_POSSESSIVE) {
268 c->type = ComponentRepeat::REPEAT_GREEDY;
269 }
270 return c;
271 }
272
273 Component *visit(ComponentAtomicGroup *c) override {
274 assert(c);
275 // Replace with a plain sequence containing the atomic group's
276 // children.
277 ComponentSequence *seq = new ComponentSequence();
278 const auto &children = c->getChildren();
279 for (const auto &child : children) {
280 assert(child);
281 seq->addComponent(unique_ptr<Component>(child->clone()));
282 }
283
284 return visit_replacement(seq);
285 }
286
287 Component *visit(UNUSED ComponentEUS *c) override {
288 assert(c);
289 // Replace with ".+".
290 auto rep = makeComponentRepeat(makeDotClass(mode), 1,
291 ComponentRepeat::NoLimit,
292 ComponentRepeat::REPEAT_GREEDY);
293 return rep.release(); // FIXME: owning raw ptr
294 }
295
296 Component *visit(ComponentWordBoundary *c) override {
297 assert(c);
298
299 // TODO: Right now, we do not have correct code for resolving these
300 // when prefiltering is on, UCP is on, and UTF-8 is *off*. For now, we
301 // just replace with an empty sequence (as that will return a superset
302 // of matches).
303 if (mode.ucp && !mode.utf8) {
304 return new ComponentSequence();
305 }
306
307 // All other cases can be prefiltered.
308 c->setPrefilter(true);
309 return c;
310 }
311
312 Component *visit(ComponentCondReference *c) override {
313 assert(c);
314 // Replace with a plain sequence containing the conditional reference's
315 // children.
316 ComponentSequence *seq = new ComponentSequence();
317 const auto &children = c->getChildren();
318
319 // Empty children is accepted by PCRE as a "do nothing" case.
320 if (children.empty()) {
321 return seq;
322 }
323
324 for (const auto &child : children) {
325 assert(child);
326 seq->addComponent(unique_ptr<Component>(child->clone()));
327 }
328
329 // If the conditional reference had just a YES branch, we want this to
330 // be an alternation with an empty sequence (the NO branch).
331 if (!c->hasBothBranches) {
332 seq->addAlternation();
333 seq->finalize();
334 }
335
336 return visit_replacement(seq);
337 }
338
339private:
340 Component *root;
341 const ParseMode &mode;
342};
343
344PrefilterVisitor::~PrefilterVisitor() {}
345
346void prefilterTree(unique_ptr<Component> &root, const ParseMode &mode) {
347 assert(root);
348 PrefilterVisitor vis(root.get(), mode);
349
350 Component *c = root->accept(vis);
351 if (c != root.get()) {
352 root.reset(c);
353 }
354}
355
356} // namespace ue2
357