1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "rose/rose_build_infix.h"
30
31#include "ue2common.h"
32#include "nfa/castlecompile.h"
33#include "nfagraph/ng_dump.h"
34#include "nfagraph/ng_width.h"
35#include "nfagraph/ng_util.h"
36#include "rose/rose_build_impl.h"
37#include "util/container.h"
38#include "util/dump_charclass.h"
39#include "util/flat_containers.h"
40#include "util/graph_range.h"
41#include "util/graph.h"
42#include "util/hash.h"
43#include "util/ue2string.h"
44#include "util/unordered.h"
45
46#include <algorithm>
47#include <set>
48
49using namespace std;
50
51namespace ue2 {
52
53static
54bool couldEndLiteral(const ue2_literal &s, NFAVertex initial,
55 const NGHolder &h) {
56 flat_set<NFAVertex> curr, next;
57 curr.insert(initial);
58
59 for (auto it = s.rbegin(), ite = s.rend(); it != ite; ++it) {
60 const CharReach &cr_s = *it;
61 bool matched = false;
62 next.clear();
63
64 for (auto v : curr) {
65 if (v == h.start) {
66 // We can't see what we had before the start, so we must assume
67 // the literal could overlap with it.
68 return true;
69 }
70 const CharReach &cr_v = h[v].char_reach;
71 if (overlaps(cr_v, cr_s)) {
72 insert(&next, inv_adjacent_vertices(v, h));
73 matched = true;
74 }
75 }
76
77 if (!matched) {
78 return false;
79 }
80
81 curr.swap(next);
82 }
83
84 return true;
85}
86
87using EdgeCache = ue2_unordered_set<pair<NFAVertex, NFAVertex>>;
88
89static
90void contractVertex(NGHolder &g, NFAVertex v, EdgeCache &all_edges) {
91 for (auto u : inv_adjacent_vertices_range(v, g)) {
92 if (u == v) {
93 continue; // self-edge
94 }
95 for (auto w : adjacent_vertices_range(v, g)) {
96 if (w == v) {
97 continue; // self-edge
98 }
99
100 // Construct edge (u, v) only if it doesn't already exist. We use
101 // the all_edges container here, as checking existence inside the
102 // graph is expensive when u or v have large degree.
103 if (all_edges.emplace(u, w).second) {
104 add_edge(u, w, g);
105 }
106 }
107 }
108
109 // Note that edges to/from v will remain in all_edges.
110 clear_vertex(v, g);
111}
112
113static
114u32 findMaxLiteralMatches(const NGHolder &h, const set<ue2_literal> &lits) {
115 DEBUG_PRINTF("h=%p, %zu literals\n", &h, lits.size());
116 //dumpGraph("infix.dot", h);
117
118 // Indices of vertices that could terminate any of the literals in 'lits'.
119 set<u32> terms;
120
121 for (const auto &s : lits) {
122 DEBUG_PRINTF("lit s='%s'\n", escapeString(s).c_str());
123 if (s.empty()) {
124 // Likely an anchored case, be conservative here.
125 return NO_MATCH_LIMIT;
126 }
127
128 for (auto v : vertices_range(h)) {
129 if (is_special(v, h)) {
130 continue;
131 }
132
133 if (couldEndLiteral(s, v, h)) {
134 u32 idx = h[v].index;
135 DEBUG_PRINTF("vertex %u could terminate lit\n", idx);
136 terms.insert(idx);
137 }
138 }
139 }
140
141 if (terms.empty()) {
142 DEBUG_PRINTF("literals cannot match inside infix\n");
143 return 0;
144 }
145
146 NGHolder g;
147 cloneHolder(g, h);
148 vector<NFAVertex> dead;
149
150 // The set of all edges in the graph is used for existence checks in
151 // contractVertex.
152 EdgeCache all_edges;
153 for (const auto &e : edges_range(g)) {
154 all_edges.emplace(source(e, g), target(e, g));
155 }
156
157 for (auto v : vertices_range(g)) {
158 if (is_special(v, g)) {
159 continue;
160 }
161 if (contains(terms, g[v].index)) {
162 continue;
163 }
164
165 contractVertex(g, v, all_edges);
166 dead.push_back(v);
167 }
168
169 remove_vertices(dead, g);
170 //dumpGraph("relaxed.dot", g);
171
172 depth maxWidth = findMaxWidth(g);
173 DEBUG_PRINTF("maxWidth=%s\n", maxWidth.str().c_str());
174 assert(maxWidth.is_reachable());
175
176 if (maxWidth.is_infinite()) {
177 // Cycle detected, so we can likely squeeze an unlimited number of
178 // matches into this graph.
179 return NO_MATCH_LIMIT;
180 }
181
182 assert(terms.size() >= maxWidth);
183 return maxWidth;
184}
185
186namespace {
187struct ReachMismatch {
188 explicit ReachMismatch(const CharReach &cr_in) : cr(cr_in) {}
189 bool operator()(const CharReach &a) const { return !overlaps(cr, a); }
190
191private:
192 CharReach cr;
193};
194}
195
196static
197u32 findMaxInfixMatches(const CastleProto &castle,
198 const set<ue2_literal> &lits) {
199 DEBUG_PRINTF("castle=%p, %zu literals\n", &castle, lits.size());
200
201 if (castle.repeats.size() > 1) {
202 DEBUG_PRINTF("more than one top!\n");
203 return NO_MATCH_LIMIT;
204 }
205
206 assert(!castle.repeats.empty());
207 const PureRepeat &pr = castle.repeats.begin()->second;
208 DEBUG_PRINTF("repeat=%s reach=%s\n", pr.bounds.str().c_str(),
209 describeClass(pr.reach).c_str());
210
211 size_t max_count = 0;
212
213 for (const auto &s : lits) {
214 DEBUG_PRINTF("lit s='%s'\n", escapeString(s).c_str());
215 if (s.empty()) {
216 // Likely an anchored case, be conservative here.
217 return NO_MATCH_LIMIT;
218 }
219
220 size_t count = 0;
221
222 auto f = find_if(s.rbegin(), s.rend(), ReachMismatch(pr.reach));
223
224 if (f == s.rbegin()) {
225 DEBUG_PRINTF("lit can't terminate inside infix\n");
226 count = 0;
227 } else if (f != s.rend()) {
228 size_t suffix_len = distance(s.rbegin(), f);
229 DEBUG_PRINTF("suffix of len %zu matches at start\n", suffix_len);
230 if (pr.bounds.max.is_finite()) {
231 count = min(suffix_len, (size_t)pr.bounds.max);
232 } else {
233 count = suffix_len;
234 }
235 } else {
236 DEBUG_PRINTF("whole lit can match inside infix (repeatedly)\n");
237 if (pr.bounds.max.is_finite()) {
238 count = pr.bounds.max;
239 } else {
240 DEBUG_PRINTF("inf bound\n");
241 return NO_MATCH_LIMIT;
242 }
243 }
244
245 DEBUG_PRINTF("count=%zu\n", count);
246 max_count = max(max_count, count);
247 }
248
249 DEBUG_PRINTF("max_count %zu\n", max_count);
250
251 if (max_count > NO_MATCH_LIMIT) {
252 assert(0); // This would be a surprise.
253 return NO_MATCH_LIMIT;
254 }
255
256 return (u32)max_count;
257}
258
259u32 findMaxInfixMatches(const left_id &left, const set<ue2_literal> &lits) {
260 if (left.castle()) {
261 return findMaxInfixMatches(*left.castle(), lits);
262 }
263 if (left.graph()) {
264 if (!onlyOneTop(*left.graph())) {
265 DEBUG_PRINTF("more than one top!n");
266 return NO_MATCH_LIMIT;
267 }
268 return findMaxLiteralMatches(*left.graph(), lits);
269 }
270
271 return NO_MATCH_LIMIT;
272}
273
274void findCountingMiracleInfo(const left_id &left, const vector<u8> &stopTable,
275 u8 *cm_count, CharReach *cm_cr) {
276 DEBUG_PRINTF("hello\n");
277 *cm_count = 0;
278 cm_cr->clear();
279 if (!left.graph()) {
280 return;
281 }
282
283 const NGHolder &g = *left.graph();
284
285 auto cyclics = find_vertices_in_cycles(g);
286
287 if (!proper_out_degree(g.startDs, g)) {
288 cyclics.erase(g.startDs);
289 }
290
291 CharReach cyclic_cr;
292 for (NFAVertex v : cyclics) {
293 DEBUG_PRINTF("considering %zu ||=%zu\n", g[v].index,
294 g[v].char_reach.count());
295 cyclic_cr |= g[v].char_reach;
296 }
297
298 if (cyclic_cr.none() || cyclic_cr.all()) {
299 DEBUG_PRINTF("cyclic cr width %zu\n", cyclic_cr.count());
300 return; /* useless */
301 }
302
303 *cm_cr = ~cyclic_cr;
304
305 /* stop character will be part of normal miracles, no need to look for them
306 * here too */
307 assert(stopTable.size() == N_CHARS);
308 for (u32 i = 0; i < N_CHARS; i++) {
309 if (stopTable[i]) {
310 cm_cr->clear(i);
311 }
312 }
313
314 set<ue2_literal> lits;
315 for (size_t c = cm_cr->find_first(); c != CharReach::npos;
316 c = cm_cr->find_next(c)) {
317 DEBUG_PRINTF("considering %hhx as stop character\n", (u8)c);
318 lits.insert(ue2_literal(c, false));
319 }
320
321 u32 count = findMaxLiteralMatches(*left.graph(), lits);
322 DEBUG_PRINTF("counting miracle %u\n", count + 1);
323 if (count && count < 50) {
324 *cm_count = count + 1;
325 }
326}
327
328} // namespace ue2
329