1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Rose construction from NGHolder for cases representing small literal
31 * sets.
32 */
33#include "ng_small_literal_set.h"
34
35#include "grey.h"
36#include "ng_holder.h"
37#include "ng_util.h"
38#include "rose/rose_build.h"
39#include "util/compare.h"
40#include "util/compile_context.h"
41#include "util/container.h"
42#include "util/graph_range.h"
43#include "util/order_check.h"
44#include "util/ue2string.h"
45#include "ue2common.h"
46
47#include <map>
48#include <set>
49#include <vector>
50#include <boost/range/adaptor/map.hpp>
51
52using namespace std;
53using boost::adaptors::map_keys;
54
55namespace ue2 {
56
57/** \brief The maximum number of literals to accept per pattern. */
58static const size_t MAX_LITERAL_SET_SIZE = 30;
59
60/**
61 * \brief The maximum number of literals to accept per pattern where at least
62 * one is weak (has period < MIN_STRONG_PERIOD).
63 */
64static const size_t MAX_WEAK_LITERAL_SET_SIZE = 20;
65
66/**
67 * \brief The minimum string period to consider a literal "strong" (and not
68 * apply the weak size limit).
69 */
70static const size_t MIN_STRONG_PERIOD = 3;
71
72namespace {
73
74struct sls_literal {
75 bool anchored;
76 bool eod;
77 ue2_literal s;
78
79 explicit sls_literal(bool a) : anchored(a), eod(false) {}
80
81 sls_literal append(char c, bool nocase) const {
82 sls_literal rv(anchored);
83 rv.s = s;
84 rv.s.push_back(ue2_literal::elem(c, nocase));
85
86 return rv;
87 }
88};
89
90static
91bool operator<(const sls_literal &a, const sls_literal &b) {
92 ORDER_CHECK(anchored);
93 ORDER_CHECK(eod);
94 ORDER_CHECK(s);
95
96 return false;
97}
98
99} // namespace
100
101static
102bool checkLongMixedSensitivityLiterals(
103 const map<sls_literal, flat_set<ReportID>> &literals) {
104 const size_t len = MAX_MASK2_WIDTH;
105
106 for (const sls_literal &lit : literals | map_keys) {
107 if (mixed_sensitivity(lit.s) && lit.s.length() > len) {
108 return false;
109 }
110 }
111
112 return true;
113}
114
115static
116bool findLiterals(const NGHolder &g,
117 map<sls_literal, flat_set<ReportID>> *literals) {
118 vector<NFAVertex> order = getTopoOrdering(g);
119
120 vector<set<sls_literal>> built(num_vertices(g));
121 vector<size_t> read_count(num_vertices(g));
122
123 for (auto it = order.rbegin(); it != order.rend(); ++it) {
124 NFAVertex v = *it;
125 set<sls_literal> &out = built[g[v].index];
126 read_count[g[v].index] = out_degree(v, g);
127
128 DEBUG_PRINTF("setting read_count to %zu for %zu\n",
129 read_count[g[v].index], g[v].index);
130
131 assert(out.empty());
132 if (v == g.start) {
133 out.insert(sls_literal(true));
134 continue;
135 } else if (v == g.startDs) {
136 out.insert(sls_literal(false));
137 continue;
138 }
139
140 bool eod = v == g.acceptEod;
141 bool accept = v == g.accept || v == g.acceptEod;
142 const CharReach &cr = g[v].char_reach;
143
144 for (auto u : inv_adjacent_vertices_range(v, g)) {
145 if (u == g.accept) {
146 continue;
147 }
148
149 if (u == g.start && edge(g.startDs, v, g).second) {
150 /* floating start states may have connections to start and
151 * startDs - don't create duplicate anchored literals */
152 DEBUG_PRINTF("skipping as floating\n");
153 continue;
154 }
155
156 set<sls_literal> &in = built[g[u].index];
157 DEBUG_PRINTF("getting from %zu (%zu reads to go)\n",
158 g[u].index, read_count[g[u].index]);
159 assert(!in.empty());
160 assert(read_count[g[u].index]);
161
162 for (const sls_literal &lit : in) {
163 if (accept) {
164 sls_literal accept_lit = lit; // copy
165 accept_lit.eod = eod;
166 insert(&(*literals)[accept_lit], g[u].reports);
167 continue;
168 }
169
170 for (size_t c = cr.find_first(); c != cr.npos;
171 c = cr.find_next(c)) {
172 bool nocase = ourisalpha(c) && cr.test(mytoupper(c))
173 && cr.test(mytolower(c));
174
175 if (nocase && (char)c == mytolower(c)) {
176 continue; /* uppercase already handled us */
177 }
178
179 out.insert(lit.append((u8)c, nocase));
180
181 if (out.size() + literals->size() > MAX_LITERAL_SET_SIZE) {
182 DEBUG_PRINTF("too big %zu + %zu\n", out.size(),
183 literals->size());
184 return false;
185 }
186 }
187 }
188
189 read_count[g[u].index]--;
190 if (!read_count[g[u].index]) {
191 DEBUG_PRINTF("clearing %zu as finished reading\n", g[u].index);
192 in.clear();
193 }
194 }
195 }
196
197 return true;
198}
199
200static
201size_t min_period(const map<sls_literal, flat_set<ReportID>> &literals) {
202 size_t rv = SIZE_MAX;
203
204 for (const sls_literal &lit : literals | map_keys) {
205 rv = min(rv, minStringPeriod(lit.s));
206 }
207 DEBUG_PRINTF("min period %zu\n", rv);
208 return rv;
209}
210
211// If this component is just a small set of literals and can be handled by
212// Rose, feed it directly into rose.
213bool handleSmallLiteralSets(RoseBuild &rose, const NGHolder &g,
214 const CompileContext &cc) {
215 if (!cc.grey.allowSmallLiteralSet) {
216 return false;
217 }
218
219 if (!isAcyclic(g)) {
220 /* literal sets would typically be acyclic... */
221 DEBUG_PRINTF("not acyclic\n");
222 return false;
223 }
224
225 if (!hasNarrowReachVertex(g, MAX_LITERAL_SET_SIZE * 2 + 1)) {
226 DEBUG_PRINTF("vertex with wide reach found\n");
227 return false;
228 }
229
230 DEBUG_PRINTF("looking for literals\n");
231
232 map<sls_literal, flat_set<ReportID>> literals;
233 if (!findLiterals(g, &literals)) {
234 DEBUG_PRINTF(":(\n");
235 return false;
236 }
237
238 assert(!literals.empty());
239
240 if (literals.size() > MAX_LITERAL_SET_SIZE) {
241 /* try a mask instead */
242 DEBUG_PRINTF("too many literals\n");
243 return false;
244 }
245
246 size_t period = min_period(literals);
247 if (period < MIN_STRONG_PERIOD &&
248 literals.size() > MAX_WEAK_LITERAL_SET_SIZE) {
249 DEBUG_PRINTF("too many literals with weak period\n");
250 return false;
251 }
252
253 if (!checkLongMixedSensitivityLiterals(literals)) {
254 DEBUG_PRINTF("long mixed\n");
255 return false;
256 }
257
258 DEBUG_PRINTF("adding %zu literals\n", literals.size());
259 for (const auto &m : literals) {
260 const sls_literal &lit = m.first;
261 const auto &reports = m.second;
262 rose.add(lit.anchored, lit.eod, lit.s, reports);
263 }
264
265 return true;
266}
267
268} // namespace ue2
269