1 | /* |
2 | * Copyright (c) 2015-2016, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Prefiltering component tree transformation. |
31 | */ |
32 | #include "ComponentAssertion.h" |
33 | #include "ComponentAtomicGroup.h" |
34 | #include "ComponentBackReference.h" |
35 | #include "ComponentBoundary.h" |
36 | #include "ComponentClass.h" |
37 | #include "ComponentCondReference.h" |
38 | #include "ComponentRepeat.h" |
39 | #include "ComponentSequence.h" |
40 | #include "ComponentVisitor.h" |
41 | #include "ComponentWordBoundary.h" |
42 | #include "ConstComponentVisitor.h" |
43 | #include "Parser.h" |
44 | #include "prefilter.h" |
45 | |
46 | #include <algorithm> |
47 | #include <stack> |
48 | |
49 | using namespace std; |
50 | |
51 | namespace ue2 { |
52 | |
53 | /** \brief Max number of positions a referent can have to be considered safe to |
54 | * replace a reference in prefiltering mode. */ |
55 | static const size_t MAX_REFERENT_POSITIONS = 1; |
56 | |
57 | /** \brief Constructs a \ref ComponentClass that matches a dot (any |
58 | * byte/codepoint, depending on whether UTF-8). */ |
59 | static |
60 | unique_ptr<ComponentClass> makeDotClass(const ParseMode &mode_in) { |
61 | ParseMode mode(mode_in); |
62 | mode.dotall = true; |
63 | return generateComponent(CLASS_ANY, false, mode); |
64 | } |
65 | |
66 | namespace { |
67 | |
68 | /** |
69 | * \brief Visitor used to determine if a given referent component is safe to |
70 | * replace its reference in prefiltering mode. Throws |
71 | * SafeReferentVisitor::Unsafe to terminate early on unsafe cases. */ |
72 | class SafeReferentVisitor : public DefaultConstComponentVisitor { |
73 | public: |
74 | struct Unsafe {}; |
75 | |
76 | SafeReferentVisitor() : numPositions(0) {} |
77 | |
78 | bool is_safe() const { |
79 | DEBUG_PRINTF("numPositions = %zu\n" , numPositions); |
80 | return numPositions <= MAX_REFERENT_POSITIONS; |
81 | } |
82 | |
83 | using DefaultConstComponentVisitor::pre; |
84 | using DefaultConstComponentVisitor::post; |
85 | |
86 | void pre(const AsciiComponentClass &) override { |
87 | numPositions++; |
88 | } |
89 | |
90 | void pre(const UTF8ComponentClass &) override { |
91 | // FIXME: we should be able to tell precisely how many positions this |
92 | // class will use. Right now, use the worst case. |
93 | numPositions += 4; |
94 | } |
95 | |
96 | void pre(const ComponentBoundary &) override { |
97 | numPositions++; |
98 | } |
99 | |
100 | void pre(const ComponentByte &) override { |
101 | numPositions++; |
102 | } |
103 | |
104 | void pre(const ComponentEUS &) override { |
105 | numPositions++; |
106 | } |
107 | |
108 | void pre(const ComponentRepeat &) override { |
109 | // Record the number of positions used before we visit the contents of |
110 | // the repeat. |
111 | countStack.push(numPositions); |
112 | } |
113 | |
114 | void post(const ComponentRepeat &c) override { |
115 | assert(!countStack.empty()); |
116 | size_t before = countStack.top(); |
117 | countStack.pop(); |
118 | assert(before <= numPositions); |
119 | |
120 | std::pair<u32, u32> bounds = c.getBounds(); |
121 | size_t subPositions = numPositions - before; |
122 | size_t copies = bounds.second < ComponentRepeat::NoLimit |
123 | ? bounds.second |
124 | : max(bounds.first, 1U); |
125 | numPositions = before + (subPositions * copies); |
126 | } |
127 | |
128 | void pre(const ComponentWordBoundary &) override { |
129 | // not quite accurate, as these are expanded out in assert |
130 | // resolution... |
131 | numPositions++; |
132 | } |
133 | |
134 | void pre(const ComponentBackReference &) override { |
135 | throw Unsafe(); |
136 | } |
137 | |
138 | void pre(const ComponentCondReference &) override { |
139 | throw Unsafe(); |
140 | } |
141 | |
142 | private: |
143 | size_t numPositions; |
144 | |
145 | // For temporary use |
146 | std::stack<size_t> countStack; |
147 | }; |
148 | |
149 | static |
150 | bool isSafeReferent(const Component &c) { |
151 | try { |
152 | SafeReferentVisitor vis; |
153 | c.accept(vis); |
154 | return vis.is_safe(); |
155 | } |
156 | catch (const SafeReferentVisitor::Unsafe &) { |
157 | return false; |
158 | } |
159 | } |
160 | |
161 | /** |
162 | * \brief Visitor to find the \ref ComponentSequence with a given reference ID |
163 | * or name: if found, the visitor will throw a const ptr to it. |
164 | */ |
165 | class FindSequenceVisitor : public DefaultConstComponentVisitor { |
166 | public: |
167 | explicit FindSequenceVisitor(unsigned ref_id) : id(ref_id) {} |
168 | explicit FindSequenceVisitor(const std::string &s) : name(s) {} |
169 | |
170 | using DefaultConstComponentVisitor::pre; |
171 | |
172 | void pre(const ComponentSequence &c) override { |
173 | if (!name.empty()) { |
174 | if (c.getCaptureName() == name) { |
175 | throw &c; |
176 | } |
177 | } else if (c.getCaptureIndex() == id) { |
178 | throw &c; |
179 | } |
180 | } |
181 | private: |
182 | const std::string name; |
183 | const unsigned id = 0; |
184 | }; |
185 | |
186 | static |
187 | const ComponentSequence *findCapturingGroup(const Component *root, |
188 | FindSequenceVisitor &vis) { |
189 | try { |
190 | root->accept(vis); |
191 | DEBUG_PRINTF("group not found\n" ); |
192 | return nullptr; |
193 | } catch (const ComponentSequence *seq) { |
194 | return seq; |
195 | } |
196 | } |
197 | |
198 | } // namespace |
199 | |
200 | /** |
201 | * \brief Visitor to apply prefilter reductions, swapping components for which |
202 | * we don't have real implementations with implementable ones. Any such |
203 | * replacement should produce a superset of the matches that would be produced |
204 | * by the original. |
205 | */ |
206 | class PrefilterVisitor : public DefaultComponentVisitor { |
207 | public: |
208 | PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {} |
209 | ~PrefilterVisitor() override; |
210 | |
211 | using DefaultComponentVisitor::visit; |
212 | |
213 | /** \brief Calls the visitor (recursively) on a new replacement component |
214 | * we've just created. Takes care of freeing it if the sequence is itself |
215 | * replaced. */ |
216 | template<class T> |
217 | Component *visit_replacement(T *r) { |
218 | Component *c = r->accept(*this); |
219 | if (c != r) { |
220 | delete r; |
221 | } |
222 | return c; |
223 | } |
224 | |
225 | Component *visit(ComponentBackReference *c) override { |
226 | assert(c); |
227 | |
228 | // If the referent is simple (represents a single position), then we |
229 | // replace the back-reference with a copy of it. |
230 | const ComponentSequence *ref = nullptr; |
231 | const std::string &ref_name = c->getRefName(); |
232 | const unsigned ref_id = c->getRefID(); |
233 | if (!ref_name.empty()) { |
234 | FindSequenceVisitor vis(ref_name); |
235 | ref = findCapturingGroup(root, vis); |
236 | } else if (ref_id > 0) { |
237 | FindSequenceVisitor vis(ref_id); |
238 | ref = findCapturingGroup(root, vis); |
239 | } |
240 | |
241 | if (ref && isSafeReferent(*ref)) { |
242 | DEBUG_PRINTF("found safe ref %p\n" , ref); |
243 | ComponentSequence *seq = ref->clone(); |
244 | // Remove labels from cloned sequence. |
245 | seq->setCaptureName("" ); |
246 | seq->setCaptureIndex(ComponentSequence::NOT_CAPTURED); |
247 | |
248 | return visit_replacement(seq); |
249 | } |
250 | |
251 | // Replace with ".*". |
252 | auto rep = makeComponentRepeat(makeDotClass(mode), 0, |
253 | ComponentRepeat::NoLimit, |
254 | ComponentRepeat::REPEAT_GREEDY); |
255 | return rep.release(); // FIXME: owning raw ptr |
256 | } |
257 | |
258 | Component *visit(UNUSED ComponentAssertion *c) override { |
259 | assert(c); |
260 | // Replace with an empty sequence. |
261 | return new ComponentSequence(); |
262 | } |
263 | |
264 | Component *visit(ComponentRepeat *c) override { |
265 | assert(c); |
266 | // Possessive repeats become greedy. |
267 | if (c->type == ComponentRepeat::REPEAT_POSSESSIVE) { |
268 | c->type = ComponentRepeat::REPEAT_GREEDY; |
269 | } |
270 | return c; |
271 | } |
272 | |
273 | Component *visit(ComponentAtomicGroup *c) override { |
274 | assert(c); |
275 | // Replace with a plain sequence containing the atomic group's |
276 | // children. |
277 | ComponentSequence *seq = new ComponentSequence(); |
278 | const auto &children = c->getChildren(); |
279 | for (const auto &child : children) { |
280 | assert(child); |
281 | seq->addComponent(unique_ptr<Component>(child->clone())); |
282 | } |
283 | |
284 | return visit_replacement(seq); |
285 | } |
286 | |
287 | Component *visit(UNUSED ComponentEUS *c) override { |
288 | assert(c); |
289 | // Replace with ".+". |
290 | auto rep = makeComponentRepeat(makeDotClass(mode), 1, |
291 | ComponentRepeat::NoLimit, |
292 | ComponentRepeat::REPEAT_GREEDY); |
293 | return rep.release(); // FIXME: owning raw ptr |
294 | } |
295 | |
296 | Component *visit(ComponentWordBoundary *c) override { |
297 | assert(c); |
298 | |
299 | // TODO: Right now, we do not have correct code for resolving these |
300 | // when prefiltering is on, UCP is on, and UTF-8 is *off*. For now, we |
301 | // just replace with an empty sequence (as that will return a superset |
302 | // of matches). |
303 | if (mode.ucp && !mode.utf8) { |
304 | return new ComponentSequence(); |
305 | } |
306 | |
307 | // All other cases can be prefiltered. |
308 | c->setPrefilter(true); |
309 | return c; |
310 | } |
311 | |
312 | Component *visit(ComponentCondReference *c) override { |
313 | assert(c); |
314 | // Replace with a plain sequence containing the conditional reference's |
315 | // children. |
316 | ComponentSequence *seq = new ComponentSequence(); |
317 | const auto &children = c->getChildren(); |
318 | |
319 | // Empty children is accepted by PCRE as a "do nothing" case. |
320 | if (children.empty()) { |
321 | return seq; |
322 | } |
323 | |
324 | for (const auto &child : children) { |
325 | assert(child); |
326 | seq->addComponent(unique_ptr<Component>(child->clone())); |
327 | } |
328 | |
329 | // If the conditional reference had just a YES branch, we want this to |
330 | // be an alternation with an empty sequence (the NO branch). |
331 | if (!c->hasBothBranches) { |
332 | seq->addAlternation(); |
333 | seq->finalize(); |
334 | } |
335 | |
336 | return visit_replacement(seq); |
337 | } |
338 | |
339 | private: |
340 | Component *root; |
341 | const ParseMode &mode; |
342 | }; |
343 | |
344 | PrefilterVisitor::~PrefilterVisitor() {} |
345 | |
346 | void prefilterTree(unique_ptr<Component> &root, const ParseMode &mode) { |
347 | assert(root); |
348 | PrefilterVisitor vis(root.get(), mode); |
349 | |
350 | Component *c = root->accept(vis); |
351 | if (c != root.get()) { |
352 | root.reset(c); |
353 | } |
354 | } |
355 | |
356 | } // namespace ue2 |
357 | |