1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "rose/rose_build_infix.h" |
30 | |
31 | #include "ue2common.h" |
32 | #include "nfa/castlecompile.h" |
33 | #include "nfagraph/ng_dump.h" |
34 | #include "nfagraph/ng_width.h" |
35 | #include "nfagraph/ng_util.h" |
36 | #include "rose/rose_build_impl.h" |
37 | #include "util/container.h" |
38 | #include "util/dump_charclass.h" |
39 | #include "util/flat_containers.h" |
40 | #include "util/graph_range.h" |
41 | #include "util/graph.h" |
42 | #include "util/hash.h" |
43 | #include "util/ue2string.h" |
44 | #include "util/unordered.h" |
45 | |
46 | #include <algorithm> |
47 | #include <set> |
48 | |
49 | using namespace std; |
50 | |
51 | namespace ue2 { |
52 | |
53 | static |
54 | bool couldEndLiteral(const ue2_literal &s, NFAVertex initial, |
55 | const NGHolder &h) { |
56 | flat_set<NFAVertex> curr, next; |
57 | curr.insert(initial); |
58 | |
59 | for (auto it = s.rbegin(), ite = s.rend(); it != ite; ++it) { |
60 | const CharReach &cr_s = *it; |
61 | bool matched = false; |
62 | next.clear(); |
63 | |
64 | for (auto v : curr) { |
65 | if (v == h.start) { |
66 | // We can't see what we had before the start, so we must assume |
67 | // the literal could overlap with it. |
68 | return true; |
69 | } |
70 | const CharReach &cr_v = h[v].char_reach; |
71 | if (overlaps(cr_v, cr_s)) { |
72 | insert(&next, inv_adjacent_vertices(v, h)); |
73 | matched = true; |
74 | } |
75 | } |
76 | |
77 | if (!matched) { |
78 | return false; |
79 | } |
80 | |
81 | curr.swap(next); |
82 | } |
83 | |
84 | return true; |
85 | } |
86 | |
87 | using EdgeCache = ue2_unordered_set<pair<NFAVertex, NFAVertex>>; |
88 | |
89 | static |
90 | void contractVertex(NGHolder &g, NFAVertex v, EdgeCache &all_edges) { |
91 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
92 | if (u == v) { |
93 | continue; // self-edge |
94 | } |
95 | for (auto w : adjacent_vertices_range(v, g)) { |
96 | if (w == v) { |
97 | continue; // self-edge |
98 | } |
99 | |
100 | // Construct edge (u, v) only if it doesn't already exist. We use |
101 | // the all_edges container here, as checking existence inside the |
102 | // graph is expensive when u or v have large degree. |
103 | if (all_edges.emplace(u, w).second) { |
104 | add_edge(u, w, g); |
105 | } |
106 | } |
107 | } |
108 | |
109 | // Note that edges to/from v will remain in all_edges. |
110 | clear_vertex(v, g); |
111 | } |
112 | |
113 | static |
114 | u32 findMaxLiteralMatches(const NGHolder &h, const set<ue2_literal> &lits) { |
115 | DEBUG_PRINTF("h=%p, %zu literals\n" , &h, lits.size()); |
116 | //dumpGraph("infix.dot", h); |
117 | |
118 | // Indices of vertices that could terminate any of the literals in 'lits'. |
119 | set<u32> terms; |
120 | |
121 | for (const auto &s : lits) { |
122 | DEBUG_PRINTF("lit s='%s'\n" , escapeString(s).c_str()); |
123 | if (s.empty()) { |
124 | // Likely an anchored case, be conservative here. |
125 | return NO_MATCH_LIMIT; |
126 | } |
127 | |
128 | for (auto v : vertices_range(h)) { |
129 | if (is_special(v, h)) { |
130 | continue; |
131 | } |
132 | |
133 | if (couldEndLiteral(s, v, h)) { |
134 | u32 idx = h[v].index; |
135 | DEBUG_PRINTF("vertex %u could terminate lit\n" , idx); |
136 | terms.insert(idx); |
137 | } |
138 | } |
139 | } |
140 | |
141 | if (terms.empty()) { |
142 | DEBUG_PRINTF("literals cannot match inside infix\n" ); |
143 | return 0; |
144 | } |
145 | |
146 | NGHolder g; |
147 | cloneHolder(g, h); |
148 | vector<NFAVertex> dead; |
149 | |
150 | // The set of all edges in the graph is used for existence checks in |
151 | // contractVertex. |
152 | EdgeCache all_edges; |
153 | for (const auto &e : edges_range(g)) { |
154 | all_edges.emplace(source(e, g), target(e, g)); |
155 | } |
156 | |
157 | for (auto v : vertices_range(g)) { |
158 | if (is_special(v, g)) { |
159 | continue; |
160 | } |
161 | if (contains(terms, g[v].index)) { |
162 | continue; |
163 | } |
164 | |
165 | contractVertex(g, v, all_edges); |
166 | dead.push_back(v); |
167 | } |
168 | |
169 | remove_vertices(dead, g); |
170 | //dumpGraph("relaxed.dot", g); |
171 | |
172 | depth maxWidth = findMaxWidth(g); |
173 | DEBUG_PRINTF("maxWidth=%s\n" , maxWidth.str().c_str()); |
174 | assert(maxWidth.is_reachable()); |
175 | |
176 | if (maxWidth.is_infinite()) { |
177 | // Cycle detected, so we can likely squeeze an unlimited number of |
178 | // matches into this graph. |
179 | return NO_MATCH_LIMIT; |
180 | } |
181 | |
182 | assert(terms.size() >= maxWidth); |
183 | return maxWidth; |
184 | } |
185 | |
186 | namespace { |
187 | struct ReachMismatch { |
188 | explicit ReachMismatch(const CharReach &cr_in) : cr(cr_in) {} |
189 | bool operator()(const CharReach &a) const { return !overlaps(cr, a); } |
190 | |
191 | private: |
192 | CharReach cr; |
193 | }; |
194 | } |
195 | |
196 | static |
197 | u32 findMaxInfixMatches(const CastleProto &castle, |
198 | const set<ue2_literal> &lits) { |
199 | DEBUG_PRINTF("castle=%p, %zu literals\n" , &castle, lits.size()); |
200 | |
201 | if (castle.repeats.size() > 1) { |
202 | DEBUG_PRINTF("more than one top!\n" ); |
203 | return NO_MATCH_LIMIT; |
204 | } |
205 | |
206 | assert(!castle.repeats.empty()); |
207 | const PureRepeat &pr = castle.repeats.begin()->second; |
208 | DEBUG_PRINTF("repeat=%s reach=%s\n" , pr.bounds.str().c_str(), |
209 | describeClass(pr.reach).c_str()); |
210 | |
211 | size_t max_count = 0; |
212 | |
213 | for (const auto &s : lits) { |
214 | DEBUG_PRINTF("lit s='%s'\n" , escapeString(s).c_str()); |
215 | if (s.empty()) { |
216 | // Likely an anchored case, be conservative here. |
217 | return NO_MATCH_LIMIT; |
218 | } |
219 | |
220 | size_t count = 0; |
221 | |
222 | auto f = find_if(s.rbegin(), s.rend(), ReachMismatch(pr.reach)); |
223 | |
224 | if (f == s.rbegin()) { |
225 | DEBUG_PRINTF("lit can't terminate inside infix\n" ); |
226 | count = 0; |
227 | } else if (f != s.rend()) { |
228 | size_t suffix_len = distance(s.rbegin(), f); |
229 | DEBUG_PRINTF("suffix of len %zu matches at start\n" , suffix_len); |
230 | if (pr.bounds.max.is_finite()) { |
231 | count = min(suffix_len, (size_t)pr.bounds.max); |
232 | } else { |
233 | count = suffix_len; |
234 | } |
235 | } else { |
236 | DEBUG_PRINTF("whole lit can match inside infix (repeatedly)\n" ); |
237 | if (pr.bounds.max.is_finite()) { |
238 | count = pr.bounds.max; |
239 | } else { |
240 | DEBUG_PRINTF("inf bound\n" ); |
241 | return NO_MATCH_LIMIT; |
242 | } |
243 | } |
244 | |
245 | DEBUG_PRINTF("count=%zu\n" , count); |
246 | max_count = max(max_count, count); |
247 | } |
248 | |
249 | DEBUG_PRINTF("max_count %zu\n" , max_count); |
250 | |
251 | if (max_count > NO_MATCH_LIMIT) { |
252 | assert(0); // This would be a surprise. |
253 | return NO_MATCH_LIMIT; |
254 | } |
255 | |
256 | return (u32)max_count; |
257 | } |
258 | |
259 | u32 findMaxInfixMatches(const left_id &left, const set<ue2_literal> &lits) { |
260 | if (left.castle()) { |
261 | return findMaxInfixMatches(*left.castle(), lits); |
262 | } |
263 | if (left.graph()) { |
264 | if (!onlyOneTop(*left.graph())) { |
265 | DEBUG_PRINTF("more than one top!n" ); |
266 | return NO_MATCH_LIMIT; |
267 | } |
268 | return findMaxLiteralMatches(*left.graph(), lits); |
269 | } |
270 | |
271 | return NO_MATCH_LIMIT; |
272 | } |
273 | |
274 | void findCountingMiracleInfo(const left_id &left, const vector<u8> &stopTable, |
275 | u8 *cm_count, CharReach *cm_cr) { |
276 | DEBUG_PRINTF("hello\n" ); |
277 | *cm_count = 0; |
278 | cm_cr->clear(); |
279 | if (!left.graph()) { |
280 | return; |
281 | } |
282 | |
283 | const NGHolder &g = *left.graph(); |
284 | |
285 | auto cyclics = find_vertices_in_cycles(g); |
286 | |
287 | if (!proper_out_degree(g.startDs, g)) { |
288 | cyclics.erase(g.startDs); |
289 | } |
290 | |
291 | CharReach cyclic_cr; |
292 | for (NFAVertex v : cyclics) { |
293 | DEBUG_PRINTF("considering %zu ||=%zu\n" , g[v].index, |
294 | g[v].char_reach.count()); |
295 | cyclic_cr |= g[v].char_reach; |
296 | } |
297 | |
298 | if (cyclic_cr.none() || cyclic_cr.all()) { |
299 | DEBUG_PRINTF("cyclic cr width %zu\n" , cyclic_cr.count()); |
300 | return; /* useless */ |
301 | } |
302 | |
303 | *cm_cr = ~cyclic_cr; |
304 | |
305 | /* stop character will be part of normal miracles, no need to look for them |
306 | * here too */ |
307 | assert(stopTable.size() == N_CHARS); |
308 | for (u32 i = 0; i < N_CHARS; i++) { |
309 | if (stopTable[i]) { |
310 | cm_cr->clear(i); |
311 | } |
312 | } |
313 | |
314 | set<ue2_literal> lits; |
315 | for (size_t c = cm_cr->find_first(); c != CharReach::npos; |
316 | c = cm_cr->find_next(c)) { |
317 | DEBUG_PRINTF("considering %hhx as stop character\n" , (u8)c); |
318 | lits.insert(ue2_literal(c, false)); |
319 | } |
320 | |
321 | u32 count = findMaxLiteralMatches(*left.graph(), lits); |
322 | DEBUG_PRINTF("counting miracle %u\n" , count + 1); |
323 | if (count && count < 50) { |
324 | *cm_count = count + 1; |
325 | } |
326 | } |
327 | |
328 | } // namespace ue2 |
329 | |