1/*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Shortcut literal pass: directly add literal components to Rose.
31 */
32#include "AsciiComponentClass.h"
33#include "Utf8ComponentClass.h"
34#include "ComponentAssertion.h"
35#include "ComponentAtomicGroup.h"
36#include "ComponentBackReference.h"
37#include "ComponentBoundary.h"
38#include "ComponentClass.h"
39#include "ComponentCondReference.h"
40#include "ComponentRepeat.h"
41#include "ComponentSequence.h"
42#include "ComponentVisitor.h"
43#include "ComponentWordBoundary.h"
44#include "ConstComponentVisitor.h"
45#include "parse_error.h"
46#include "shortcut_literal.h"
47#include "grey.h"
48#include "nfagraph/ng.h"
49#include "compiler/compiler.h"
50#include "util/ue2string.h"
51#include "ue2common.h"
52
53#include <stack>
54
55using namespace std;
56
57namespace ue2 {
58
59/**
60 * \brief Visitor that constructs a ue2_literal from a component tree.
61 *
62 * If a component that can't be part of a literal is encountered, this visitor
63 * will throw ConstructLiteralVisitor::NotLiteral.
64 */
65class ConstructLiteralVisitor : public ConstComponentVisitor {
66public:
67 ~ConstructLiteralVisitor() override;
68
69 /** \brief Thrown if this component does not represent a literal. */
70 struct NotLiteral {};
71
72 void pre(const AsciiComponentClass &c) override {
73 const CharReach &cr = c.cr;
74 const size_t width = cr.count();
75 if (width == 1) {
76 lit.push_back(cr.find_first(), false);
77 } else if (width == 2 && cr.isCaselessChar()) {
78 lit.push_back(cr.find_first(), true);
79 } else {
80 throw NotLiteral();
81 }
82 }
83
84 void pre(const ComponentRepeat &c) override {
85 if (c.m_min == 0 || c.m_min != c.m_max) {
86 throw NotLiteral();
87 }
88
89 if (c.m_max < ComponentRepeat::NoLimit && c.m_max > 32767) {
90 throw ParseError("Bounded repeat is too large.");
91 }
92
93 // Store the current length of the literal; in this repeat's post()
94 // call we will append N-1 more copies of [index..end].
95 repeat_stack.push(lit.length());
96 }
97
98 void post(const ComponentRepeat &c) override {
99 // Add N-1 copies of the string between the entry to the repeat and the
100 // current end of the literal.
101 assert(!repeat_stack.empty());
102 const ue2_literal suffix = lit.substr(repeat_stack.top());
103 repeat_stack.pop();
104
105 for (unsigned i = 1; i < c.m_min; i++) {
106 lit += suffix;
107 }
108 }
109
110 void pre(const ComponentSequence &) override {
111 // Pass through.
112 }
113
114 void pre(const ComponentAlternation &) override { throw NotLiteral(); }
115 void pre(const ComponentAssertion &) override { throw NotLiteral(); }
116 void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); }
117 void pre(const ComponentBackReference &) override { throw NotLiteral(); }
118 void pre(const ComponentBoundary &) override { throw NotLiteral(); }
119 void pre(const ComponentByte &) override { throw NotLiteral(); }
120 void pre(const ComponentCondReference &) override { throw NotLiteral(); }
121 void pre(const ComponentEmpty &) override { throw NotLiteral(); }
122 void pre(const ComponentEUS &) override { throw NotLiteral(); }
123 void pre(const ComponentWordBoundary &) override { throw NotLiteral(); }
124 void pre(const UTF8ComponentClass &) override { throw NotLiteral(); }
125
126 void during(const AsciiComponentClass &) override {}
127 void during(const ComponentAlternation &) override {}
128 void during(const ComponentAssertion &) override {}
129 void during(const ComponentAtomicGroup &) override {}
130 void during(const ComponentBackReference &) override {}
131 void during(const ComponentBoundary &) override {}
132 void during(const ComponentByte &) override {}
133 void during(const ComponentCondReference &) override {}
134 void during(const ComponentEmpty &) override {}
135 void during(const ComponentEUS &) override {}
136 void during(const ComponentRepeat &) override {}
137 void during(const ComponentSequence &) override {}
138 void during(const ComponentWordBoundary &) override {}
139 void during(const UTF8ComponentClass &) override {}
140
141 void post(const AsciiComponentClass &) override {}
142 void post(const ComponentAlternation &) override {}
143 void post(const ComponentAssertion &) override {}
144 void post(const ComponentAtomicGroup &) override {}
145 void post(const ComponentBackReference &) override {}
146 void post(const ComponentBoundary &) override {}
147 void post(const ComponentByte &) override {}
148 void post(const ComponentCondReference &) override {}
149 void post(const ComponentEmpty &) override {}
150 void post(const ComponentEUS &) override {}
151 void post(const ComponentSequence &) override {}
152 void post(const ComponentWordBoundary &) override {}
153 void post(const UTF8ComponentClass &) override {}
154
155 ue2_literal lit;
156 stack<size_t> repeat_stack; //!< index of entry to repeat.
157};
158
159ConstructLiteralVisitor::~ConstructLiteralVisitor() {}
160
161/** \brief True if the literal expression \a expr could be added to Rose. */
162bool shortcutLiteral(NG &ng, const ParsedExpression &pe) {
163 assert(pe.component);
164
165 if (!ng.cc.grey.allowLiteral) {
166 return false;
167 }
168
169 const auto &expr = pe.expr;
170
171 // XXX: don't shortcut literals with extended params (yet)
172 if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length ||
173 expr.edit_distance || expr.hamm_distance) {
174 DEBUG_PRINTF("extended params not allowed\n");
175 return false;
176 }
177
178 ConstructLiteralVisitor vis;
179 try {
180 assert(pe.component);
181 pe.component->accept(vis);
182 assert(vis.repeat_stack.empty());
183 } catch (const ConstructLiteralVisitor::NotLiteral&) {
184 DEBUG_PRINTF("not a literal\n");
185 return false;
186 }
187
188 vis.lit.set_pure();
189 const ue2_literal &lit = vis.lit;
190
191 if (lit.empty()) {
192 DEBUG_PRINTF("empty literal\n");
193 return false;
194 }
195
196 if (expr.highlander && lit.length() <= 1) {
197 DEBUG_PRINTF("not shortcutting SEP literal\n");
198 return false;
199 }
200
201 DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str());
202 return ng.addLiteral(lit, expr.index, expr.report, expr.highlander,
203 expr.som, expr.quiet);
204}
205
206} // namespace ue2
207