1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "rdfa_merge.h"
30
31#include "grey.h"
32#include "dfa_min.h"
33#include "mcclellancompile_util.h"
34#include "rdfa.h"
35#include "ue2common.h"
36#include "nfagraph/ng_mcclellan_internal.h"
37#include "util/container.h"
38#include "util/determinise.h"
39#include "util/flat_containers.h"
40#include "util/make_unique.h"
41#include "util/report_manager.h"
42#include "util/unordered.h"
43
44#include <algorithm>
45#include <queue>
46
47using namespace std;
48
49namespace ue2 {
50
51#define MAX_DFA_STATES 16383
52
53namespace {
54
55class Automaton_Merge {
56public:
57 using StateSet = vector<u16>;
58 using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;
59
60 Automaton_Merge(const raw_dfa *rdfa1, const raw_dfa *rdfa2,
61 const ReportManager *rm_in, const Grey &grey_in)
62 : rm(rm_in), grey(grey_in), nfas{rdfa1, rdfa2}, dead(2) {
63 calculateAlphabet();
64 populateAsFs();
65 prunable = isPrunable();
66 }
67
68 Automaton_Merge(const vector<const raw_dfa *> &dfas,
69 const ReportManager *rm_in, const Grey &grey_in)
70 : rm(rm_in), grey(grey_in), nfas(dfas), dead(nfas.size()) {
71 calculateAlphabet();
72 populateAsFs();
73 prunable = isPrunable();
74 }
75
76 void populateAsFs(void) {
77 bool fs_same = true;
78 bool fs_dead = true;
79
80 as.resize(nfas.size());
81 fs.resize(nfas.size());
82 for (size_t i = 0, end = nfas.size(); i < end; i++) {
83 as[i] = nfas[i]->start_anchored;
84 fs[i] = nfas[i]->start_floating;
85
86 if (fs[i]) {
87 fs_dead = false;
88 }
89
90 if (as[i] != fs[i]) {
91 fs_same = false;
92 }
93 }
94
95 start_anchored = DEAD_STATE + 1;
96 if (fs_same) {
97 start_floating = start_anchored;
98 } else if (fs_dead) {
99 start_floating = DEAD_STATE;
100 } else {
101 start_floating = start_anchored + 1;
102 }
103 }
104
105 void calculateAlphabet(void) {
106 DEBUG_PRINTF("calculating alphabet\n");
107 vector<CharReach> esets = {CharReach::dot()};
108
109 for (const auto &rdfa : nfas) {
110 DEBUG_PRINTF("...next dfa alphabet\n");
111 assert(rdfa);
112 const auto &alpha_remap = rdfa->alpha_remap;
113
114 for (size_t i = 0; i < esets.size(); i++) {
115 assert(esets[i].count());
116 if (esets[i].count() == 1) {
117 DEBUG_PRINTF("skipping singleton eq set\n");
118 continue;
119 }
120
121 CharReach t;
122 u8 leader_s = alpha_remap[esets[i].find_first()];
123
124 DEBUG_PRINTF("checking eq set, leader %02hhx \n", leader_s);
125
126 for (size_t s = esets[i].find_first(); s != CharReach::npos;
127 s = esets[i].find_next(s)) {
128 if (alpha_remap[s] != leader_s) {
129 t.set(s);
130 }
131 }
132
133 if (t.any() && t != esets[i]) {
134 esets[i] &= ~t;
135 esets.push_back(t);
136 }
137 }
138 }
139
140 // Sort so that our alphabet mapping isn't dependent on the order of
141 // rdfas passed in.
142 sort(esets.begin(), esets.end());
143
144 alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
145 }
146
147 bool isPrunable() const {
148 if (!grey.highlanderPruneDFA || !rm) {
149 DEBUG_PRINTF("disabled, or not managed reports\n");
150 return false;
151 }
152
153 assert(!nfas.empty());
154 if (!generates_callbacks(nfas.front()->kind)) {
155 DEBUG_PRINTF("doesn't generate callbacks\n");
156 return false;
157 }
158
159 // Collect all reports from all merge candidates.
160 flat_set<ReportID> merge_reports;
161 for (const auto &rdfa : nfas) {
162 insert(&merge_reports, all_reports(*rdfa));
163 }
164
165 DEBUG_PRINTF("all reports: %s\n", as_string_list(merge_reports).c_str());
166
167 // Return true if they're all exhaustible with the same exhaustion key.
168 u32 ekey = INVALID_EKEY;
169 for (const auto &report_id : merge_reports) {
170 const Report &r = rm->getReport(report_id);
171 if (!isSimpleExhaustible(r)) {
172 DEBUG_PRINTF("report %u not simple exhaustible\n", report_id);
173 return false;
174 }
175 assert(r.ekey != INVALID_EKEY);
176 if (ekey == INVALID_EKEY) {
177 ekey = r.ekey;
178 } else if (ekey != r.ekey) {
179 DEBUG_PRINTF("two different ekeys, %u and %u\n", ekey, r.ekey);
180 return false;
181 }
182 }
183
184 DEBUG_PRINTF("is prunable\n");
185 return true;
186 }
187
188
189 void transition(const StateSet &in, StateSet *next) {
190 u16 t[ALPHABET_SIZE];
191
192 for (u32 i = 0; i < alphasize; i++) {
193 next[i].resize(nfas.size());
194 }
195
196 for (size_t j = 0, j_end = nfas.size(); j < j_end; j++) {
197 getFullTransitionFromState(*nfas[j], in[j], t);
198 for (u32 i = 0; i < alphasize; i++) {
199 next[i][j] = t[unalpha[i]];
200 }
201 }
202 }
203
204 const vector<StateSet> initial() {
205 vector<StateSet> rv = {as};
206 if (start_floating != DEAD_STATE && start_floating != start_anchored) {
207 rv.push_back(fs);
208 }
209 return rv;
210 }
211
212private:
213 void reports_i(const StateSet &in, flat_set<ReportID> dstate::*r_set,
214 flat_set<ReportID> &r) const {
215 for (size_t i = 0, end = nfas.size(); i < end; i++) {
216 const auto &rs = nfas[i]->states[in[i]].*r_set;
217 insert(&r, rs);
218 }
219 }
220
221public:
222 void reports(const StateSet &in, flat_set<ReportID> &rv) const {
223 reports_i(in, &dstate::reports, rv);
224 }
225 void reportsEod(const StateSet &in, flat_set<ReportID> &rv) const {
226 reports_i(in, &dstate::reports_eod, rv);
227 }
228
229 bool canPrune(const flat_set<ReportID> &test_reports) const {
230 if (!grey.highlanderPruneDFA || !prunable) {
231 return false;
232 }
233
234 // Must all be external reports.
235 assert(rm);
236 for (const auto &report_id : test_reports) {
237 if (!isExternalReport(rm->getReport(report_id))) {
238 return false;
239 }
240 }
241
242 return true;
243 }
244
245 /** True if the minimization algorithm should be run after merging. */
246 bool shouldMinimize() const {
247 // We only need to run minimization if our merged DFAs shared a report.
248 flat_set<ReportID> seen_reports;
249 for (const auto &rdfa : nfas) {
250 for (const auto &report_id : all_reports(*rdfa)) {
251 if (!seen_reports.insert(report_id).second) {
252 DEBUG_PRINTF("report %u in several dfas\n", report_id);
253 return true;
254 }
255 }
256 }
257
258 return false;
259 }
260
261private:
262 const ReportManager *rm;
263 const Grey &grey;
264
265 vector<const raw_dfa *> nfas;
266 vector<dstate_id_t> as;
267 vector<dstate_id_t> fs;
268
269 bool prunable = false;
270
271public:
272 std::array<u16, ALPHABET_SIZE> alpha;
273 std::array<u16, ALPHABET_SIZE> unalpha;
274 u16 alphasize;
275 StateSet dead;
276
277 u16 start_anchored;
278 u16 start_floating;
279};
280
281} // namespace
282
283unique_ptr<raw_dfa> mergeTwoDfas(const raw_dfa *d1, const raw_dfa *d2,
284 size_t max_states, const ReportManager *rm,
285 const Grey &grey) {
286 assert(d1 && d2);
287 assert(d1->kind == d2->kind);
288 assert(max_states <= MAX_DFA_STATES);
289
290 auto rdfa = ue2::make_unique<raw_dfa>(d1->kind);
291
292 Automaton_Merge autom(d1, d2, rm, grey);
293 if (determinise(autom, rdfa->states, max_states)) {
294 rdfa->start_anchored = autom.start_anchored;
295 rdfa->start_floating = autom.start_floating;
296 rdfa->alpha_size = autom.alphasize;
297 rdfa->alpha_remap = autom.alpha;
298 DEBUG_PRINTF("merge succeeded, %zu states\n", rdfa->states.size());
299
300 if (autom.shouldMinimize()) {
301 minimize_hopcroft(*rdfa, grey);
302 DEBUG_PRINTF("minimized, %zu states\n", rdfa->states.size());
303 }
304
305 return rdfa;
306 }
307
308 return nullptr;
309}
310
311void mergeDfas(vector<unique_ptr<raw_dfa>> &dfas, size_t max_states,
312 const ReportManager *rm, const Grey &grey) {
313 assert(max_states <= MAX_DFA_STATES);
314
315 if (dfas.size() <= 1) {
316 return;
317 }
318
319 DEBUG_PRINTF("before merging, we have %zu dfas\n", dfas.size());
320
321 queue<unique_ptr<raw_dfa>> q;
322 for (auto &dfa : dfas) {
323 q.push(move(dfa));
324 }
325
326 // All DFAs are now on the queue, so we'll clear the vector and use it for
327 // output from here.
328 dfas.clear();
329
330 while (q.size() > 1) {
331 // Attempt to merge the two front elements of the queue.
332 unique_ptr<raw_dfa> d1 = move(q.front());
333 q.pop();
334 unique_ptr<raw_dfa> d2 = move(q.front());
335 q.pop();
336
337 auto rdfa = mergeTwoDfas(d1.get(), d2.get(), max_states, rm, grey);
338 if (rdfa) {
339 q.push(move(rdfa));
340 } else {
341 DEBUG_PRINTF("failed to merge\n");
342 // Put the larger of the two DFAs on the output list, retain the
343 // smaller one on the queue for further merge attempts.
344 if (d2->states.size() > d1->states.size()) {
345 dfas.push_back(move(d2));
346 q.push(move(d1));
347 } else {
348 dfas.push_back(move(d1));
349 q.push(move(d2));
350 }
351 }
352 }
353
354 while (!q.empty()) {
355 dfas.push_back(move(q.front()));
356 q.pop();
357 }
358
359 DEBUG_PRINTF("after merging, we have %zu dfas\n", dfas.size());
360}
361
362unique_ptr<raw_dfa> mergeAllDfas(const vector<const raw_dfa *> &dfas,
363 size_t max_states, const ReportManager *rm,
364 const Grey &grey) {
365 assert(max_states <= MAX_DFA_STATES);
366 assert(!dfas.empty());
367
368 // All the DFAs should be of the same kind.
369 const auto kind = dfas.front()->kind;
370 assert(all_of(begin(dfas), end(dfas),
371 [&kind](const raw_dfa *rdfa) { return rdfa->kind == kind; }));
372
373 auto rdfa = ue2::make_unique<raw_dfa>(kind);
374 Automaton_Merge n(dfas, rm, grey);
375
376 DEBUG_PRINTF("merging dfa\n");
377
378 if (!determinise(n, rdfa->states, max_states)) {
379 DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states);
380 return nullptr; /* over state limit */
381 }
382
383 rdfa->start_anchored = n.start_anchored;
384 rdfa->start_floating = n.start_floating;
385 rdfa->alpha_size = n.alphasize;
386 rdfa->alpha_remap = n.alpha;
387
388 DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n",
389 rdfa->start_anchored, rdfa->start_floating);
390
391 if (n.shouldMinimize()) {
392 minimize_hopcroft(*rdfa, grey);
393 DEBUG_PRINTF("minimized, %zu states\n", rdfa->states.size());
394 }
395
396 return rdfa;
397}
398
399} // namespace ue2
400