1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief UTF-8 transforms and operations.
31 */
32#include "ng_utf8.h"
33
34#include "ng.h"
35#include "ng_prune.h"
36#include "ng_util.h"
37#include "compiler/compiler.h"
38#include "util/graph_range.h"
39#include "util/unicode_def.h"
40
41#include <set>
42#include <vector>
43
44using namespace std;
45
46namespace ue2 {
47
48static
49void allowIllegal(NGHolder &g, NFAVertex v, u8 pred_char) {
50 if (in_degree(v, g) != 1) {
51 DEBUG_PRINTF("unexpected pred\n");
52 assert(0); /* should be true due to the early stage of this analysis */
53 return;
54 }
55
56 CharReach &cr = g[v].char_reach;
57 if (pred_char == 0xe0) {
58 assert(cr.isSubsetOf(CharReach(0xa0, 0xbf)));
59 if (cr == CharReach(0xa0, 0xbf)) {
60 cr |= CharReach(0x80, 0x9f);
61 }
62 } else if (pred_char == 0xf0) {
63 assert(cr.isSubsetOf(CharReach(0x90, 0xbf)));
64 if (cr == CharReach(0x90, 0xbf)) {
65 cr |= CharReach(0x80, 0x8f);
66 }
67 } else if (pred_char == 0xf4) {
68 assert(cr.isSubsetOf(CharReach(0x80, 0x8f)));
69 if (cr == CharReach(0x80, 0x8f)) {
70 cr |= CharReach(0x90, 0xbf);
71 }
72 } else {
73 assert(0); /* unexpected pred */
74 }
75}
76
77/** \brief Relax forbidden UTF-8 sequences.
78 *
79 * Some byte sequences can not appear in valid UTF-8 as they encode code points
80 * above \\x{10ffff} or they represent overlong encodings. As we require valid
81 * UTF-8 input, we have no defined behaviour in these cases, as a result we can
82 * accept them if it simplifies the graph. */
83void relaxForbiddenUtf8(NGHolder &g, const ExpressionInfo &expr) {
84 if (!expr.utf8) {
85 return;
86 }
87
88 const CharReach e0(0xe0);
89 const CharReach f0(0xf0);
90 const CharReach f4(0xf4);
91
92 for (auto v : vertices_range(g)) {
93 const CharReach &cr = g[v].char_reach;
94 if (cr == e0 || cr == f0 || cr == f4) {
95 u8 pred_char = cr.find_first();
96 for (auto t : adjacent_vertices_range(v, g)) {
97 allowIllegal(g, t, pred_char);
98 }
99 }
100 }
101}
102
103static
104bool hasPredInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
105 for (auto u : inv_adjacent_vertices_range(v, g)) {
106 if (contains(s, u)) {
107 return true;
108 }
109 }
110 return false;
111}
112
113static
114bool hasSuccInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
115 for (auto w : adjacent_vertices_range(v, g)) {
116 if (contains(s, w)) {
117 return true;
118 }
119 }
120 return false;
121}
122
123static
124void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) {
125 set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */
126 for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
127 const CharReach &cr = h[v].char_reach;
128 if (!isutf8ascii(cr) && !isutf8start(cr)) {
129 bad.insert(v);
130 }
131 }
132
133 for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
134 const CharReach &cr = h[v].char_reach;
135 if (!isutf8ascii(cr) && !isutf8start(cr)) {
136 bad.insert(v);
137 }
138 }
139
140 // we want to be careful with asserts connected to starts
141 // as well as they may not finish a code point
142 for (auto v : vertices_range(h)) {
143 if (is_virtual_start(v, h)) {
144 bad.insert(v);
145 insert(&bad, adjacent_vertices(v, h));
146 }
147 }
148
149 /* we cannot handle vertices connected to accept as would report matches in
150 * the middle of codepoints. acceptEod is not a problem as the input must
151 * end at a codepoint boundary */
152 bad.insert(h.accept);
153
154 // If we're in SOM mode, we don't want to mess with vertices that have a
155 // direct edge from startDs.
156 if (som) {
157 insert(&bad, adjacent_vertices(h.startDs, h));
158 }
159
160 set<NFAVertex> already_seeds; /* already marked as seeds */
161 for (auto v : vertices_range(h)) {
162 const CharReach &cr = h[v].char_reach;
163
164 if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) {
165 continue;
166 }
167
168 if (hasSuccInSet(h, v, bad)) {
169 continue;
170 }
171
172 // Skip vertices that are directly connected to other vertices already
173 // in the seeds list: we can't collapse two of these directly next to
174 // each other.
175 if (hasPredInSet(h, v, already_seeds) ||
176 hasSuccInSet(h, v, already_seeds)) {
177 continue;
178 }
179
180 DEBUG_PRINTF("%zu is a seed\n", h[v].index);
181 seeds->push_back(v);
182 already_seeds.insert(v);
183 }
184}
185
186static
187bool expandCyclic(NGHolder &h, NFAVertex v) {
188 DEBUG_PRINTF("inspecting %zu\n", h[v].index);
189 bool changes = false;
190
191 auto v_preds = preds(v, h);
192 auto v_succs = succs(v, h);
193
194 set<NFAVertex> start_siblings;
195 set<NFAVertex> end_siblings;
196
197 CharReach &v_cr = h[v].char_reach;
198
199 /* We need to find start vertices which have all of our preds.
200 * As we have a self loop, it must be one of our succs. */
201 for (auto a : adjacent_vertices_range(v, h)) {
202 auto a_preds = preds(a, h);
203
204 if (a_preds == v_preds && isutf8start(h[a].char_reach)) {
205 DEBUG_PRINTF("%zu is a start v\n", h[a].index);
206 start_siblings.insert(a);
207 }
208 }
209
210 /* We also need to find full cont vertices which have all our own succs;
211 * As we have a self loop, it must be one of our preds. */
212 for (auto a : inv_adjacent_vertices_range(v, h)) {
213 auto a_succs = succs(a, h);
214
215 if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) {
216 DEBUG_PRINTF("%zu is a full tail cont\n", h[a].index);
217 end_siblings.insert(a);
218 }
219 }
220
221 for (auto s : start_siblings) {
222 if (out_degree(s, h) != 1) {
223 continue;
224 }
225
226 const CharReach &cr = h[s].char_reach;
227 if (cr.isSubsetOf(UTF_TWO_START_CR)) {
228 if (end_siblings.find(*adjacent_vertices(s, h).first)
229 == end_siblings.end()) {
230 DEBUG_PRINTF("%zu is odd\n", h[s].index);
231 continue;
232 }
233 } else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
234 NFAVertex m = *adjacent_vertices(s, h).first;
235
236 if (h[m].char_reach != UTF_CONT_CR
237 || out_degree(m, h) != 1) {
238 continue;
239 }
240 if (end_siblings.find(*adjacent_vertices(m, h).first)
241 == end_siblings.end()) {
242 DEBUG_PRINTF("%zu is odd\n", h[s].index);
243 continue;
244 }
245 } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
246 NFAVertex m1 = *adjacent_vertices(s, h).first;
247
248 if (h[m1].char_reach != UTF_CONT_CR
249 || out_degree(m1, h) != 1) {
250 continue;
251 }
252
253 NFAVertex m2 = *adjacent_vertices(m1, h).first;
254
255 if (h[m2].char_reach != UTF_CONT_CR
256 || out_degree(m2, h) != 1) {
257 continue;
258 }
259
260 if (end_siblings.find(*adjacent_vertices(m2, h).first)
261 == end_siblings.end()) {
262 DEBUG_PRINTF("%zu is odd\n", h[s].index);
263 continue;
264 }
265 } else {
266 DEBUG_PRINTF("%zu is bad\n", h[s].index);
267 continue;
268 }
269
270 v_cr |= cr;
271 clear_vertex(s, h);
272 changes = true;
273 }
274
275 if (changes) {
276 v_cr |= UTF_CONT_CR; /* we need to add in cont reach */
277 v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require
278 * valid unicode data */
279 v_cr.set(0xc1);
280 v_cr |= CharReach(0xf5, 0xff);
281 }
282
283 return changes;
284}
285
286/** \brief Contract cycles of UTF-8 code points down to a single cyclic vertex
287 * where possible, based on the assumption that we will always be matching
288 * against well-formed input. */
289void utf8DotRestoration(NGHolder &h, bool som) {
290 vector<NFAVertex> seeds; /* cyclic ascii vertices */
291 findSeeds(h, som, &seeds);
292
293 bool changes = false;
294 for (auto v : seeds) {
295 changes |= expandCyclic(h, v);
296 }
297
298 if (changes) {
299 pruneUseless(h);
300 }
301}
302
303} // namespace ue2
304