1 | /* |
2 | * Copyright (c) 2015-2019, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Shortcut literal pass: directly add literal components to Rose. |
31 | */ |
32 | #include "AsciiComponentClass.h" |
33 | #include "Utf8ComponentClass.h" |
34 | #include "ComponentAssertion.h" |
35 | #include "ComponentAtomicGroup.h" |
36 | #include "ComponentBackReference.h" |
37 | #include "ComponentBoundary.h" |
38 | #include "ComponentClass.h" |
39 | #include "ComponentCondReference.h" |
40 | #include "ComponentRepeat.h" |
41 | #include "ComponentSequence.h" |
42 | #include "ComponentVisitor.h" |
43 | #include "ComponentWordBoundary.h" |
44 | #include "ConstComponentVisitor.h" |
45 | #include "parse_error.h" |
46 | #include "shortcut_literal.h" |
47 | #include "grey.h" |
48 | #include "nfagraph/ng.h" |
49 | #include "compiler/compiler.h" |
50 | #include "util/ue2string.h" |
51 | #include "ue2common.h" |
52 | |
53 | #include <stack> |
54 | |
55 | using namespace std; |
56 | |
57 | namespace ue2 { |
58 | |
59 | /** |
60 | * \brief Visitor that constructs a ue2_literal from a component tree. |
61 | * |
62 | * If a component that can't be part of a literal is encountered, this visitor |
63 | * will throw ConstructLiteralVisitor::NotLiteral. |
64 | */ |
65 | class ConstructLiteralVisitor : public ConstComponentVisitor { |
66 | public: |
67 | ~ConstructLiteralVisitor() override; |
68 | |
69 | /** \brief Thrown if this component does not represent a literal. */ |
70 | struct NotLiteral {}; |
71 | |
72 | void pre(const AsciiComponentClass &c) override { |
73 | const CharReach &cr = c.cr; |
74 | const size_t width = cr.count(); |
75 | if (width == 1) { |
76 | lit.push_back(cr.find_first(), false); |
77 | } else if (width == 2 && cr.isCaselessChar()) { |
78 | lit.push_back(cr.find_first(), true); |
79 | } else { |
80 | throw NotLiteral(); |
81 | } |
82 | } |
83 | |
84 | void pre(const ComponentRepeat &c) override { |
85 | if (c.m_min == 0 || c.m_min != c.m_max) { |
86 | throw NotLiteral(); |
87 | } |
88 | |
89 | if (c.m_max < ComponentRepeat::NoLimit && c.m_max > 32767) { |
90 | throw ParseError("Bounded repeat is too large." ); |
91 | } |
92 | |
93 | // Store the current length of the literal; in this repeat's post() |
94 | // call we will append N-1 more copies of [index..end]. |
95 | repeat_stack.push(lit.length()); |
96 | } |
97 | |
98 | void post(const ComponentRepeat &c) override { |
99 | // Add N-1 copies of the string between the entry to the repeat and the |
100 | // current end of the literal. |
101 | assert(!repeat_stack.empty()); |
102 | const ue2_literal suffix = lit.substr(repeat_stack.top()); |
103 | repeat_stack.pop(); |
104 | |
105 | for (unsigned i = 1; i < c.m_min; i++) { |
106 | lit += suffix; |
107 | } |
108 | } |
109 | |
110 | void pre(const ComponentSequence &) override { |
111 | // Pass through. |
112 | } |
113 | |
114 | void pre(const ComponentAlternation &) override { throw NotLiteral(); } |
115 | void pre(const ComponentAssertion &) override { throw NotLiteral(); } |
116 | void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); } |
117 | void pre(const ComponentBackReference &) override { throw NotLiteral(); } |
118 | void pre(const ComponentBoundary &) override { throw NotLiteral(); } |
119 | void pre(const ComponentByte &) override { throw NotLiteral(); } |
120 | void pre(const ComponentCondReference &) override { throw NotLiteral(); } |
121 | void pre(const ComponentEmpty &) override { throw NotLiteral(); } |
122 | void pre(const ComponentEUS &) override { throw NotLiteral(); } |
123 | void pre(const ComponentWordBoundary &) override { throw NotLiteral(); } |
124 | void pre(const UTF8ComponentClass &) override { throw NotLiteral(); } |
125 | |
126 | void during(const AsciiComponentClass &) override {} |
127 | void during(const ComponentAlternation &) override {} |
128 | void during(const ComponentAssertion &) override {} |
129 | void during(const ComponentAtomicGroup &) override {} |
130 | void during(const ComponentBackReference &) override {} |
131 | void during(const ComponentBoundary &) override {} |
132 | void during(const ComponentByte &) override {} |
133 | void during(const ComponentCondReference &) override {} |
134 | void during(const ComponentEmpty &) override {} |
135 | void during(const ComponentEUS &) override {} |
136 | void during(const ComponentRepeat &) override {} |
137 | void during(const ComponentSequence &) override {} |
138 | void during(const ComponentWordBoundary &) override {} |
139 | void during(const UTF8ComponentClass &) override {} |
140 | |
141 | void post(const AsciiComponentClass &) override {} |
142 | void post(const ComponentAlternation &) override {} |
143 | void post(const ComponentAssertion &) override {} |
144 | void post(const ComponentAtomicGroup &) override {} |
145 | void post(const ComponentBackReference &) override {} |
146 | void post(const ComponentBoundary &) override {} |
147 | void post(const ComponentByte &) override {} |
148 | void post(const ComponentCondReference &) override {} |
149 | void post(const ComponentEmpty &) override {} |
150 | void post(const ComponentEUS &) override {} |
151 | void post(const ComponentSequence &) override {} |
152 | void post(const ComponentWordBoundary &) override {} |
153 | void post(const UTF8ComponentClass &) override {} |
154 | |
155 | ue2_literal lit; |
156 | stack<size_t> repeat_stack; //!< index of entry to repeat. |
157 | }; |
158 | |
159 | ConstructLiteralVisitor::~ConstructLiteralVisitor() {} |
160 | |
161 | /** \brief True if the literal expression \a expr could be added to Rose. */ |
162 | bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { |
163 | assert(pe.component); |
164 | |
165 | if (!ng.cc.grey.allowLiteral) { |
166 | return false; |
167 | } |
168 | |
169 | const auto &expr = pe.expr; |
170 | |
171 | // XXX: don't shortcut literals with extended params (yet) |
172 | if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length || |
173 | expr.edit_distance || expr.hamm_distance) { |
174 | DEBUG_PRINTF("extended params not allowed\n" ); |
175 | return false; |
176 | } |
177 | |
178 | ConstructLiteralVisitor vis; |
179 | try { |
180 | assert(pe.component); |
181 | pe.component->accept(vis); |
182 | assert(vis.repeat_stack.empty()); |
183 | } catch (const ConstructLiteralVisitor::NotLiteral&) { |
184 | DEBUG_PRINTF("not a literal\n" ); |
185 | return false; |
186 | } |
187 | |
188 | vis.lit.set_pure(); |
189 | const ue2_literal &lit = vis.lit; |
190 | |
191 | if (lit.empty()) { |
192 | DEBUG_PRINTF("empty literal\n" ); |
193 | return false; |
194 | } |
195 | |
196 | if (expr.highlander && lit.length() <= 1) { |
197 | DEBUG_PRINTF("not shortcutting SEP literal\n" ); |
198 | return false; |
199 | } |
200 | |
201 | DEBUG_PRINTF("constructed literal %s\n" , dumpString(lit).c_str()); |
202 | return ng.addLiteral(lit, expr.index, expr.report, expr.highlander, |
203 | expr.som, expr.quiet); |
204 | } |
205 | |
206 | } // namespace ue2 |
207 | |