| 1 | /* |
| 2 | * Copyright (c) 2015-2016, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief Prefiltering component tree transformation. |
| 31 | */ |
| 32 | #include "ComponentAssertion.h" |
| 33 | #include "ComponentAtomicGroup.h" |
| 34 | #include "ComponentBackReference.h" |
| 35 | #include "ComponentBoundary.h" |
| 36 | #include "ComponentClass.h" |
| 37 | #include "ComponentCondReference.h" |
| 38 | #include "ComponentRepeat.h" |
| 39 | #include "ComponentSequence.h" |
| 40 | #include "ComponentVisitor.h" |
| 41 | #include "ComponentWordBoundary.h" |
| 42 | #include "ConstComponentVisitor.h" |
| 43 | #include "Parser.h" |
| 44 | #include "prefilter.h" |
| 45 | |
| 46 | #include <algorithm> |
| 47 | #include <stack> |
| 48 | |
| 49 | using namespace std; |
| 50 | |
| 51 | namespace ue2 { |
| 52 | |
| 53 | /** \brief Max number of positions a referent can have to be considered safe to |
| 54 | * replace a reference in prefiltering mode. */ |
| 55 | static const size_t MAX_REFERENT_POSITIONS = 1; |
| 56 | |
| 57 | /** \brief Constructs a \ref ComponentClass that matches a dot (any |
| 58 | * byte/codepoint, depending on whether UTF-8). */ |
| 59 | static |
| 60 | unique_ptr<ComponentClass> makeDotClass(const ParseMode &mode_in) { |
| 61 | ParseMode mode(mode_in); |
| 62 | mode.dotall = true; |
| 63 | return generateComponent(CLASS_ANY, false, mode); |
| 64 | } |
| 65 | |
| 66 | namespace { |
| 67 | |
| 68 | /** |
| 69 | * \brief Visitor used to determine if a given referent component is safe to |
| 70 | * replace its reference in prefiltering mode. Throws |
| 71 | * SafeReferentVisitor::Unsafe to terminate early on unsafe cases. */ |
| 72 | class SafeReferentVisitor : public DefaultConstComponentVisitor { |
| 73 | public: |
| 74 | struct Unsafe {}; |
| 75 | |
| 76 | SafeReferentVisitor() : numPositions(0) {} |
| 77 | |
| 78 | bool is_safe() const { |
| 79 | DEBUG_PRINTF("numPositions = %zu\n" , numPositions); |
| 80 | return numPositions <= MAX_REFERENT_POSITIONS; |
| 81 | } |
| 82 | |
| 83 | using DefaultConstComponentVisitor::pre; |
| 84 | using DefaultConstComponentVisitor::post; |
| 85 | |
| 86 | void pre(const AsciiComponentClass &) override { |
| 87 | numPositions++; |
| 88 | } |
| 89 | |
| 90 | void pre(const UTF8ComponentClass &) override { |
| 91 | // FIXME: we should be able to tell precisely how many positions this |
| 92 | // class will use. Right now, use the worst case. |
| 93 | numPositions += 4; |
| 94 | } |
| 95 | |
| 96 | void pre(const ComponentBoundary &) override { |
| 97 | numPositions++; |
| 98 | } |
| 99 | |
| 100 | void pre(const ComponentByte &) override { |
| 101 | numPositions++; |
| 102 | } |
| 103 | |
| 104 | void pre(const ComponentEUS &) override { |
| 105 | numPositions++; |
| 106 | } |
| 107 | |
| 108 | void pre(const ComponentRepeat &) override { |
| 109 | // Record the number of positions used before we visit the contents of |
| 110 | // the repeat. |
| 111 | countStack.push(numPositions); |
| 112 | } |
| 113 | |
| 114 | void post(const ComponentRepeat &c) override { |
| 115 | assert(!countStack.empty()); |
| 116 | size_t before = countStack.top(); |
| 117 | countStack.pop(); |
| 118 | assert(before <= numPositions); |
| 119 | |
| 120 | std::pair<u32, u32> bounds = c.getBounds(); |
| 121 | size_t subPositions = numPositions - before; |
| 122 | size_t copies = bounds.second < ComponentRepeat::NoLimit |
| 123 | ? bounds.second |
| 124 | : max(bounds.first, 1U); |
| 125 | numPositions = before + (subPositions * copies); |
| 126 | } |
| 127 | |
| 128 | void pre(const ComponentWordBoundary &) override { |
| 129 | // not quite accurate, as these are expanded out in assert |
| 130 | // resolution... |
| 131 | numPositions++; |
| 132 | } |
| 133 | |
| 134 | void pre(const ComponentBackReference &) override { |
| 135 | throw Unsafe(); |
| 136 | } |
| 137 | |
| 138 | void pre(const ComponentCondReference &) override { |
| 139 | throw Unsafe(); |
| 140 | } |
| 141 | |
| 142 | private: |
| 143 | size_t numPositions; |
| 144 | |
| 145 | // For temporary use |
| 146 | std::stack<size_t> countStack; |
| 147 | }; |
| 148 | |
| 149 | static |
| 150 | bool isSafeReferent(const Component &c) { |
| 151 | try { |
| 152 | SafeReferentVisitor vis; |
| 153 | c.accept(vis); |
| 154 | return vis.is_safe(); |
| 155 | } |
| 156 | catch (const SafeReferentVisitor::Unsafe &) { |
| 157 | return false; |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | /** |
| 162 | * \brief Visitor to find the \ref ComponentSequence with a given reference ID |
| 163 | * or name: if found, the visitor will throw a const ptr to it. |
| 164 | */ |
| 165 | class FindSequenceVisitor : public DefaultConstComponentVisitor { |
| 166 | public: |
| 167 | explicit FindSequenceVisitor(unsigned ref_id) : id(ref_id) {} |
| 168 | explicit FindSequenceVisitor(const std::string &s) : name(s) {} |
| 169 | |
| 170 | using DefaultConstComponentVisitor::pre; |
| 171 | |
| 172 | void pre(const ComponentSequence &c) override { |
| 173 | if (!name.empty()) { |
| 174 | if (c.getCaptureName() == name) { |
| 175 | throw &c; |
| 176 | } |
| 177 | } else if (c.getCaptureIndex() == id) { |
| 178 | throw &c; |
| 179 | } |
| 180 | } |
| 181 | private: |
| 182 | const std::string name; |
| 183 | const unsigned id = 0; |
| 184 | }; |
| 185 | |
| 186 | static |
| 187 | const ComponentSequence *findCapturingGroup(const Component *root, |
| 188 | FindSequenceVisitor &vis) { |
| 189 | try { |
| 190 | root->accept(vis); |
| 191 | DEBUG_PRINTF("group not found\n" ); |
| 192 | return nullptr; |
| 193 | } catch (const ComponentSequence *seq) { |
| 194 | return seq; |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | } // namespace |
| 199 | |
| 200 | /** |
| 201 | * \brief Visitor to apply prefilter reductions, swapping components for which |
| 202 | * we don't have real implementations with implementable ones. Any such |
| 203 | * replacement should produce a superset of the matches that would be produced |
| 204 | * by the original. |
| 205 | */ |
| 206 | class PrefilterVisitor : public DefaultComponentVisitor { |
| 207 | public: |
| 208 | PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {} |
| 209 | ~PrefilterVisitor() override; |
| 210 | |
| 211 | using DefaultComponentVisitor::visit; |
| 212 | |
| 213 | /** \brief Calls the visitor (recursively) on a new replacement component |
| 214 | * we've just created. Takes care of freeing it if the sequence is itself |
| 215 | * replaced. */ |
| 216 | template<class T> |
| 217 | Component *visit_replacement(T *r) { |
| 218 | Component *c = r->accept(*this); |
| 219 | if (c != r) { |
| 220 | delete r; |
| 221 | } |
| 222 | return c; |
| 223 | } |
| 224 | |
| 225 | Component *visit(ComponentBackReference *c) override { |
| 226 | assert(c); |
| 227 | |
| 228 | // If the referent is simple (represents a single position), then we |
| 229 | // replace the back-reference with a copy of it. |
| 230 | const ComponentSequence *ref = nullptr; |
| 231 | const std::string &ref_name = c->getRefName(); |
| 232 | const unsigned ref_id = c->getRefID(); |
| 233 | if (!ref_name.empty()) { |
| 234 | FindSequenceVisitor vis(ref_name); |
| 235 | ref = findCapturingGroup(root, vis); |
| 236 | } else if (ref_id > 0) { |
| 237 | FindSequenceVisitor vis(ref_id); |
| 238 | ref = findCapturingGroup(root, vis); |
| 239 | } |
| 240 | |
| 241 | if (ref && isSafeReferent(*ref)) { |
| 242 | DEBUG_PRINTF("found safe ref %p\n" , ref); |
| 243 | ComponentSequence *seq = ref->clone(); |
| 244 | // Remove labels from cloned sequence. |
| 245 | seq->setCaptureName("" ); |
| 246 | seq->setCaptureIndex(ComponentSequence::NOT_CAPTURED); |
| 247 | |
| 248 | return visit_replacement(seq); |
| 249 | } |
| 250 | |
| 251 | // Replace with ".*". |
| 252 | auto rep = makeComponentRepeat(makeDotClass(mode), 0, |
| 253 | ComponentRepeat::NoLimit, |
| 254 | ComponentRepeat::REPEAT_GREEDY); |
| 255 | return rep.release(); // FIXME: owning raw ptr |
| 256 | } |
| 257 | |
| 258 | Component *visit(UNUSED ComponentAssertion *c) override { |
| 259 | assert(c); |
| 260 | // Replace with an empty sequence. |
| 261 | return new ComponentSequence(); |
| 262 | } |
| 263 | |
| 264 | Component *visit(ComponentRepeat *c) override { |
| 265 | assert(c); |
| 266 | // Possessive repeats become greedy. |
| 267 | if (c->type == ComponentRepeat::REPEAT_POSSESSIVE) { |
| 268 | c->type = ComponentRepeat::REPEAT_GREEDY; |
| 269 | } |
| 270 | return c; |
| 271 | } |
| 272 | |
| 273 | Component *visit(ComponentAtomicGroup *c) override { |
| 274 | assert(c); |
| 275 | // Replace with a plain sequence containing the atomic group's |
| 276 | // children. |
| 277 | ComponentSequence *seq = new ComponentSequence(); |
| 278 | const auto &children = c->getChildren(); |
| 279 | for (const auto &child : children) { |
| 280 | assert(child); |
| 281 | seq->addComponent(unique_ptr<Component>(child->clone())); |
| 282 | } |
| 283 | |
| 284 | return visit_replacement(seq); |
| 285 | } |
| 286 | |
| 287 | Component *visit(UNUSED ComponentEUS *c) override { |
| 288 | assert(c); |
| 289 | // Replace with ".+". |
| 290 | auto rep = makeComponentRepeat(makeDotClass(mode), 1, |
| 291 | ComponentRepeat::NoLimit, |
| 292 | ComponentRepeat::REPEAT_GREEDY); |
| 293 | return rep.release(); // FIXME: owning raw ptr |
| 294 | } |
| 295 | |
| 296 | Component *visit(ComponentWordBoundary *c) override { |
| 297 | assert(c); |
| 298 | |
| 299 | // TODO: Right now, we do not have correct code for resolving these |
| 300 | // when prefiltering is on, UCP is on, and UTF-8 is *off*. For now, we |
| 301 | // just replace with an empty sequence (as that will return a superset |
| 302 | // of matches). |
| 303 | if (mode.ucp && !mode.utf8) { |
| 304 | return new ComponentSequence(); |
| 305 | } |
| 306 | |
| 307 | // All other cases can be prefiltered. |
| 308 | c->setPrefilter(true); |
| 309 | return c; |
| 310 | } |
| 311 | |
| 312 | Component *visit(ComponentCondReference *c) override { |
| 313 | assert(c); |
| 314 | // Replace with a plain sequence containing the conditional reference's |
| 315 | // children. |
| 316 | ComponentSequence *seq = new ComponentSequence(); |
| 317 | const auto &children = c->getChildren(); |
| 318 | |
| 319 | // Empty children is accepted by PCRE as a "do nothing" case. |
| 320 | if (children.empty()) { |
| 321 | return seq; |
| 322 | } |
| 323 | |
| 324 | for (const auto &child : children) { |
| 325 | assert(child); |
| 326 | seq->addComponent(unique_ptr<Component>(child->clone())); |
| 327 | } |
| 328 | |
| 329 | // If the conditional reference had just a YES branch, we want this to |
| 330 | // be an alternation with an empty sequence (the NO branch). |
| 331 | if (!c->hasBothBranches) { |
| 332 | seq->addAlternation(); |
| 333 | seq->finalize(); |
| 334 | } |
| 335 | |
| 336 | return visit_replacement(seq); |
| 337 | } |
| 338 | |
| 339 | private: |
| 340 | Component *root; |
| 341 | const ParseMode &mode; |
| 342 | }; |
| 343 | |
| 344 | PrefilterVisitor::~PrefilterVisitor() {} |
| 345 | |
| 346 | void prefilterTree(unique_ptr<Component> &root, const ParseMode &mode) { |
| 347 | assert(root); |
| 348 | PrefilterVisitor vis(root.get(), mode); |
| 349 | |
| 350 | Component *c = root->accept(vis); |
| 351 | if (c != root.get()) { |
| 352 | root.reset(c); |
| 353 | } |
| 354 | } |
| 355 | |
| 356 | } // namespace ue2 |
| 357 | |