| 1 | /* |
| 2 | * Copyright (c) 2015-2017, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief UTF-8 transforms and operations. |
| 31 | */ |
| 32 | #include "ng_utf8.h" |
| 33 | |
| 34 | #include "ng.h" |
| 35 | #include "ng_prune.h" |
| 36 | #include "ng_util.h" |
| 37 | #include "compiler/compiler.h" |
| 38 | #include "util/graph_range.h" |
| 39 | #include "util/unicode_def.h" |
| 40 | |
| 41 | #include <set> |
| 42 | #include <vector> |
| 43 | |
| 44 | using namespace std; |
| 45 | |
| 46 | namespace ue2 { |
| 47 | |
| 48 | static |
| 49 | void allowIllegal(NGHolder &g, NFAVertex v, u8 pred_char) { |
| 50 | if (in_degree(v, g) != 1) { |
| 51 | DEBUG_PRINTF("unexpected pred\n" ); |
| 52 | assert(0); /* should be true due to the early stage of this analysis */ |
| 53 | return; |
| 54 | } |
| 55 | |
| 56 | CharReach &cr = g[v].char_reach; |
| 57 | if (pred_char == 0xe0) { |
| 58 | assert(cr.isSubsetOf(CharReach(0xa0, 0xbf))); |
| 59 | if (cr == CharReach(0xa0, 0xbf)) { |
| 60 | cr |= CharReach(0x80, 0x9f); |
| 61 | } |
| 62 | } else if (pred_char == 0xf0) { |
| 63 | assert(cr.isSubsetOf(CharReach(0x90, 0xbf))); |
| 64 | if (cr == CharReach(0x90, 0xbf)) { |
| 65 | cr |= CharReach(0x80, 0x8f); |
| 66 | } |
| 67 | } else if (pred_char == 0xf4) { |
| 68 | assert(cr.isSubsetOf(CharReach(0x80, 0x8f))); |
| 69 | if (cr == CharReach(0x80, 0x8f)) { |
| 70 | cr |= CharReach(0x90, 0xbf); |
| 71 | } |
| 72 | } else { |
| 73 | assert(0); /* unexpected pred */ |
| 74 | } |
| 75 | } |
| 76 | |
| 77 | /** \brief Relax forbidden UTF-8 sequences. |
| 78 | * |
| 79 | * Some byte sequences can not appear in valid UTF-8 as they encode code points |
| 80 | * above \\x{10ffff} or they represent overlong encodings. As we require valid |
| 81 | * UTF-8 input, we have no defined behaviour in these cases, as a result we can |
| 82 | * accept them if it simplifies the graph. */ |
| 83 | void relaxForbiddenUtf8(NGHolder &g, const ExpressionInfo &expr) { |
| 84 | if (!expr.utf8) { |
| 85 | return; |
| 86 | } |
| 87 | |
| 88 | const CharReach e0(0xe0); |
| 89 | const CharReach f0(0xf0); |
| 90 | const CharReach f4(0xf4); |
| 91 | |
| 92 | for (auto v : vertices_range(g)) { |
| 93 | const CharReach &cr = g[v].char_reach; |
| 94 | if (cr == e0 || cr == f0 || cr == f4) { |
| 95 | u8 pred_char = cr.find_first(); |
| 96 | for (auto t : adjacent_vertices_range(v, g)) { |
| 97 | allowIllegal(g, t, pred_char); |
| 98 | } |
| 99 | } |
| 100 | } |
| 101 | } |
| 102 | |
| 103 | static |
| 104 | bool hasPredInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) { |
| 105 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 106 | if (contains(s, u)) { |
| 107 | return true; |
| 108 | } |
| 109 | } |
| 110 | return false; |
| 111 | } |
| 112 | |
| 113 | static |
| 114 | bool hasSuccInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) { |
| 115 | for (auto w : adjacent_vertices_range(v, g)) { |
| 116 | if (contains(s, w)) { |
| 117 | return true; |
| 118 | } |
| 119 | } |
| 120 | return false; |
| 121 | } |
| 122 | |
| 123 | static |
| 124 | void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) { |
| 125 | set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */ |
| 126 | for (auto v : inv_adjacent_vertices_range(h.accept, h)) { |
| 127 | const CharReach &cr = h[v].char_reach; |
| 128 | if (!isutf8ascii(cr) && !isutf8start(cr)) { |
| 129 | bad.insert(v); |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { |
| 134 | const CharReach &cr = h[v].char_reach; |
| 135 | if (!isutf8ascii(cr) && !isutf8start(cr)) { |
| 136 | bad.insert(v); |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | // we want to be careful with asserts connected to starts |
| 141 | // as well as they may not finish a code point |
| 142 | for (auto v : vertices_range(h)) { |
| 143 | if (is_virtual_start(v, h)) { |
| 144 | bad.insert(v); |
| 145 | insert(&bad, adjacent_vertices(v, h)); |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | /* we cannot handle vertices connected to accept as would report matches in |
| 150 | * the middle of codepoints. acceptEod is not a problem as the input must |
| 151 | * end at a codepoint boundary */ |
| 152 | bad.insert(h.accept); |
| 153 | |
| 154 | // If we're in SOM mode, we don't want to mess with vertices that have a |
| 155 | // direct edge from startDs. |
| 156 | if (som) { |
| 157 | insert(&bad, adjacent_vertices(h.startDs, h)); |
| 158 | } |
| 159 | |
| 160 | set<NFAVertex> already_seeds; /* already marked as seeds */ |
| 161 | for (auto v : vertices_range(h)) { |
| 162 | const CharReach &cr = h[v].char_reach; |
| 163 | |
| 164 | if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) { |
| 165 | continue; |
| 166 | } |
| 167 | |
| 168 | if (hasSuccInSet(h, v, bad)) { |
| 169 | continue; |
| 170 | } |
| 171 | |
| 172 | // Skip vertices that are directly connected to other vertices already |
| 173 | // in the seeds list: we can't collapse two of these directly next to |
| 174 | // each other. |
| 175 | if (hasPredInSet(h, v, already_seeds) || |
| 176 | hasSuccInSet(h, v, already_seeds)) { |
| 177 | continue; |
| 178 | } |
| 179 | |
| 180 | DEBUG_PRINTF("%zu is a seed\n" , h[v].index); |
| 181 | seeds->push_back(v); |
| 182 | already_seeds.insert(v); |
| 183 | } |
| 184 | } |
| 185 | |
| 186 | static |
| 187 | bool expandCyclic(NGHolder &h, NFAVertex v) { |
| 188 | DEBUG_PRINTF("inspecting %zu\n" , h[v].index); |
| 189 | bool changes = false; |
| 190 | |
| 191 | auto v_preds = preds(v, h); |
| 192 | auto v_succs = succs(v, h); |
| 193 | |
| 194 | set<NFAVertex> start_siblings; |
| 195 | set<NFAVertex> end_siblings; |
| 196 | |
| 197 | CharReach &v_cr = h[v].char_reach; |
| 198 | |
| 199 | /* We need to find start vertices which have all of our preds. |
| 200 | * As we have a self loop, it must be one of our succs. */ |
| 201 | for (auto a : adjacent_vertices_range(v, h)) { |
| 202 | auto a_preds = preds(a, h); |
| 203 | |
| 204 | if (a_preds == v_preds && isutf8start(h[a].char_reach)) { |
| 205 | DEBUG_PRINTF("%zu is a start v\n" , h[a].index); |
| 206 | start_siblings.insert(a); |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | /* We also need to find full cont vertices which have all our own succs; |
| 211 | * As we have a self loop, it must be one of our preds. */ |
| 212 | for (auto a : inv_adjacent_vertices_range(v, h)) { |
| 213 | auto a_succs = succs(a, h); |
| 214 | |
| 215 | if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) { |
| 216 | DEBUG_PRINTF("%zu is a full tail cont\n" , h[a].index); |
| 217 | end_siblings.insert(a); |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | for (auto s : start_siblings) { |
| 222 | if (out_degree(s, h) != 1) { |
| 223 | continue; |
| 224 | } |
| 225 | |
| 226 | const CharReach &cr = h[s].char_reach; |
| 227 | if (cr.isSubsetOf(UTF_TWO_START_CR)) { |
| 228 | if (end_siblings.find(*adjacent_vertices(s, h).first) |
| 229 | == end_siblings.end()) { |
| 230 | DEBUG_PRINTF("%zu is odd\n" , h[s].index); |
| 231 | continue; |
| 232 | } |
| 233 | } else if (cr.isSubsetOf(UTF_THREE_START_CR)) { |
| 234 | NFAVertex m = *adjacent_vertices(s, h).first; |
| 235 | |
| 236 | if (h[m].char_reach != UTF_CONT_CR |
| 237 | || out_degree(m, h) != 1) { |
| 238 | continue; |
| 239 | } |
| 240 | if (end_siblings.find(*adjacent_vertices(m, h).first) |
| 241 | == end_siblings.end()) { |
| 242 | DEBUG_PRINTF("%zu is odd\n" , h[s].index); |
| 243 | continue; |
| 244 | } |
| 245 | } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) { |
| 246 | NFAVertex m1 = *adjacent_vertices(s, h).first; |
| 247 | |
| 248 | if (h[m1].char_reach != UTF_CONT_CR |
| 249 | || out_degree(m1, h) != 1) { |
| 250 | continue; |
| 251 | } |
| 252 | |
| 253 | NFAVertex m2 = *adjacent_vertices(m1, h).first; |
| 254 | |
| 255 | if (h[m2].char_reach != UTF_CONT_CR |
| 256 | || out_degree(m2, h) != 1) { |
| 257 | continue; |
| 258 | } |
| 259 | |
| 260 | if (end_siblings.find(*adjacent_vertices(m2, h).first) |
| 261 | == end_siblings.end()) { |
| 262 | DEBUG_PRINTF("%zu is odd\n" , h[s].index); |
| 263 | continue; |
| 264 | } |
| 265 | } else { |
| 266 | DEBUG_PRINTF("%zu is bad\n" , h[s].index); |
| 267 | continue; |
| 268 | } |
| 269 | |
| 270 | v_cr |= cr; |
| 271 | clear_vertex(s, h); |
| 272 | changes = true; |
| 273 | } |
| 274 | |
| 275 | if (changes) { |
| 276 | v_cr |= UTF_CONT_CR; /* we need to add in cont reach */ |
| 277 | v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require |
| 278 | * valid unicode data */ |
| 279 | v_cr.set(0xc1); |
| 280 | v_cr |= CharReach(0xf5, 0xff); |
| 281 | } |
| 282 | |
| 283 | return changes; |
| 284 | } |
| 285 | |
| 286 | /** \brief Contract cycles of UTF-8 code points down to a single cyclic vertex |
| 287 | * where possible, based on the assumption that we will always be matching |
| 288 | * against well-formed input. */ |
| 289 | void utf8DotRestoration(NGHolder &h, bool som) { |
| 290 | vector<NFAVertex> seeds; /* cyclic ascii vertices */ |
| 291 | findSeeds(h, som, &seeds); |
| 292 | |
| 293 | bool changes = false; |
| 294 | for (auto v : seeds) { |
| 295 | changes |= expandCyclic(h, v); |
| 296 | } |
| 297 | |
| 298 | if (changes) { |
| 299 | pruneUseless(h); |
| 300 | } |
| 301 | } |
| 302 | |
| 303 | } // namespace ue2 |
| 304 | |