1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "mcclellancompile_util.h"
30
31#include "rdfa.h"
32#include "util/container.h"
33#include "util/hash.h"
34#include "ue2common.h"
35
36#include <deque>
37#include <map>
38
39using namespace std;
40
41namespace ue2 {
42
43#define INIT_STATE 1
44
45static
46bool state_has_reports(const raw_dfa &raw, dstate_id_t s) {
47 const auto &ds = raw.states[s];
48 return !ds.reports.empty() || !ds.reports_eod.empty();
49}
50
51static
52u32 count_dots(const raw_dfa &raw) {
53 assert(raw.start_anchored == INIT_STATE);
54
55 u32 i = INIT_STATE;
56 for (; i < raw.states.size() && i != raw.start_floating; i++) {
57 DEBUG_PRINTF("checking %u\n", i);
58 assert(raw.states[i].reports.empty());
59 assert(raw.states[i].reports_eod.empty());
60
61 for (symbol_t s = 0; s < raw.getImplAlphaSize(); s++) {
62 DEBUG_PRINTF("%hu -> %hu\n", s, raw.states[i].next[s]);
63 if (raw.states[i].next[s] != i + 1) {
64 goto validate;
65 }
66 }
67
68 if (state_has_reports(raw, raw.states[i].next[0])) {
69 goto validate;
70 }
71
72 DEBUG_PRINTF("got dot\n");
73 }
74
75 validate:
76 u32 dot_count = i - INIT_STATE;
77
78 /* we need to check that no later state has a transition into these leading
79 * dots */
80 for (; i < raw.states.size(); i++) {
81 for (symbol_t s = 0; s < raw.getImplAlphaSize(); s++) {
82 DEBUG_PRINTF("%hu -> %hu\n", s, raw.states[i].next[s]);
83 dstate_id_t n = raw.states[i].next[s];
84 if (n != DEAD_STATE && n <= dot_count) {
85 return 0;
86 }
87 }
88 }
89
90 return dot_count;
91}
92
93static
94void prune_leading_states(raw_dfa &raw, u32 count) {
95 if (!count) {
96 return;
97 }
98
99 for (u32 i = INIT_STATE + count; i < raw.states.size(); i++) {
100 dstate &curr = raw.states[i - count];
101 curr = raw.states[i];
102 if (curr.daddy > count) {
103 curr.daddy -= count;
104 } else {
105 curr.daddy = DEAD_STATE;
106 }
107
108 for (u32 j = 0; j < raw.alpha_size; j++) {
109 assert(curr.next[j] == DEAD_STATE || curr.next[j] > count);
110 if (curr.next[j]) {
111 curr.next[j] -= count;
112 }
113 }
114 }
115
116 raw.states.erase(raw.states.end() - count, raw.states.end());
117}
118
119u32 remove_leading_dots(raw_dfa &raw) {
120 u32 count = count_dots(raw);
121 prune_leading_states(raw, count);
122 DEBUG_PRINTF("removed %u leading dots\n", count);
123 return count;
124}
125
126static never_inline
127u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
128 vector<u32> &dist = *dist_in;
129 dist.assign(raw.states.size(), ~0U);
130
131 assert(raw.start_anchored != DEAD_STATE);
132
133 deque<dstate_id_t> to_visit = { raw.start_anchored };
134 dist[raw.start_anchored] = 0;
135
136 u32 last_d = 0;
137
138 while (!to_visit.empty()) {
139 dstate_id_t s = to_visit.front();
140 DEBUG_PRINTF("inspecting %u\n", s);
141 to_visit.pop_front();
142 assert(s != DEAD_STATE);
143
144 u32 d = dist[s];
145 assert(d >= last_d);
146 assert(d != ~0U);
147
148 for (dstate_id_t t : raw.states[s].next) {
149 if (t == DEAD_STATE) {
150 continue;
151 }
152 if (dist[t] == ~0U) {
153 to_visit.push_back(t);
154 dist[t] = d + 1;
155 } else {
156 assert(dist[t] <= d + 1);
157 }
158 }
159
160 last_d = d;
161 }
162
163 return last_d;
164}
165
166bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
167 DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset);
168 vector<u32> bob_dist;
169 u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
170
171 if (max_min_dist_bob <= max_offset) {
172 return false;
173 }
174
175 bool changed = false;
176 for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
177 if (bob_dist[s] > max_offset && state_has_reports(raw, s)) {
178 DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]);
179 auto &ds = raw.states[s];
180 ds.reports.clear();
181 ds.reports_eod.clear();
182 changed = true;
183 }
184 }
185
186 if (!changed) {
187 return false;
188 }
189
190 // We may have cleared all reports from the DFA, in which case it should
191 // become empty.
192 if (all_of_in(raw.states, [](const dstate &ds) {
193 return ds.reports.empty() && ds.reports_eod.empty();
194 })) {
195 DEBUG_PRINTF("no reports left at all, dfa is dead\n");
196 raw.start_anchored = DEAD_STATE;
197 raw.start_floating = DEAD_STATE;
198 }
199
200 return true;
201}
202
203set<ReportID> all_reports(const raw_dfa &rdfa) {
204 set<ReportID> all;
205 for (const auto &ds : rdfa.states) {
206 insert(&all, ds.reports);
207 insert(&all, ds.reports_eod);
208 }
209 return all;
210}
211
212bool has_eod_accepts(const raw_dfa &rdfa) {
213 for (const auto &ds : rdfa.states) {
214 if (!ds.reports_eod.empty()) {
215 return true;
216 }
217 }
218 return false;
219}
220
221bool has_non_eod_accepts(const raw_dfa &rdfa) {
222 for (const auto &ds : rdfa.states) {
223 if (!ds.reports.empty()) {
224 return true;
225 }
226 }
227 return false;
228}
229
230size_t hash_dfa_no_reports(const raw_dfa &rdfa) {
231 size_t v = 0;
232 hash_combine(v, rdfa.alpha_size);
233 hash_combine(v, rdfa.alpha_remap);
234
235 for (const auto &ds : rdfa.states) {
236 hash_combine(v, ds.next);
237 }
238
239 return v;
240}
241
242size_t hash_dfa(const raw_dfa &rdfa) {
243 size_t v = 0;
244 hash_combine(v, hash_dfa_no_reports(rdfa));
245 hash_combine(v, all_reports(rdfa));
246 return v;
247}
248
249static
250bool can_die_early(const raw_dfa &raw, dstate_id_t s,
251 map<dstate_id_t, u32> &visited, u32 age_limit) {
252 if (contains(visited, s) && visited[s] >= age_limit) {
253 /* we have already visited (or are in the process of visiting) here with
254 * a looser limit. */
255 return false;
256 }
257 visited[s] = age_limit;
258
259 if (s == DEAD_STATE) {
260 return true;
261 }
262
263 if (age_limit == 0) {
264 return false;
265 }
266
267 for (const auto &next : raw.states[s].next) {
268 if (can_die_early(raw, next, visited, age_limit - 1)) {
269 return true;
270 }
271 }
272
273 return false;
274}
275
276bool can_die_early(const raw_dfa &raw, u32 age_limit) {
277 map<dstate_id_t, u32> visited;
278 return can_die_early(raw, raw.start_anchored, visited, age_limit);
279}
280
281bool is_dead(const raw_dfa &rdfa) {
282 return rdfa.start_anchored == DEAD_STATE &&
283 rdfa.start_floating == DEAD_STATE;
284}
285
286} // namespace ue2
287