/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Literal analysis and scoring.
 */
#include "ng_literal_analysis.h"

#include "ng_holder.h"
#include "ng_split.h"
#include "ng_util.h"
#include "ue2common.h"
#include "rose/rose_common.h"
#include "util/compare.h"
#include "util/depth.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/graph_small_color_map.h"
#include "util/ue2_graph.h"
#include "util/ue2string.h"

#include <algorithm>
#include <fstream>
#include <queue>

#include <boost/graph/boykov_kolmogorov_max_flow.hpp>

using namespace std;

namespace ue2 {

/** Maximum number of paths to generate. */
static const u32 MAX_WIDTH = 11;

/** Scoring adjustment for 'uniqueness' in literal. */
static const u64a WEIGHT_OF_UNIQUENESS = 250;
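// WEIGHT_OF_UNIQUENESS is taken out of a total weight of 1000: scoreLiteral()
// below weights raw literal length by (1000 - WEIGHT_OF_UNIQUENESS) and the
// count of distinct characters by WEIGHT_OF_UNIQUENESS, i.e. a 75/25 split.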

namespace {

/* Small literal graph type used for the suffix tree used in
 * compressAndScore. */

struct LitGraphVertexProps {
    LitGraphVertexProps() = default;
    explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {}
    ue2_literal::elem c; // string element (char + bool)
    size_t index; // managed by ue2_graph
};

struct LitGraphEdgeProps {
    LitGraphEdgeProps() = default;
    explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {}
    u64a score = NO_LITERAL_AT_EDGE_SCORE;
    size_t index; // managed by ue2_graph
};

struct LitGraph
    : public ue2_graph<LitGraph, LitGraphVertexProps, LitGraphEdgeProps> {

    LitGraph() : root(add_vertex(*this)), sink(add_vertex(*this)) {}

    const vertex_descriptor root;
    const vertex_descriptor sink;
};

typedef LitGraph::vertex_descriptor LitVertex;
typedef LitGraph::edge_descriptor LitEdge;

typedef pair<LitVertex, NFAVertex> VertexPair;
typedef std::queue<VertexPair> LitVertexQ;

} // namespace

#ifdef DUMP_SUPPORT

/** \brief Dump the literal graph in Graphviz format. */
static UNUSED
void dumpGraph(const char *filename, const LitGraph &lg) {
    ofstream fout(filename);

    fout << "digraph G {" << endl;

    for (auto v : vertices_range(lg)) {
        fout << lg[v].index;
        if (v == lg.root) {
            fout << "[label=\"ROOT\"];";
        } else if (v == lg.sink) {
            fout << "[label=\"SINK\"];";
        } else {
            ue2_literal s;
            s.push_back(lg[v].c);
            fout << "[label=\"" << dumpString(s) << "\"];";
        }
        fout << endl;
    }

    for (const auto &e : edges_range(lg)) {
        LitVertex u = source(e, lg), v = target(e, lg);
        fout << lg[u].index << " -> " << lg[v].index << "[label=\""
             << lg[e].score << "\"]"
             << ";" << endl;
    }

    fout << "}" << endl;
}

#endif // DUMP_SUPPORT

static
bool allowExpand(size_t numItems, size_t totalPathsSoFar) {
    if (numItems == 0) {
        return false;
    }

    if (numItems + totalPathsSoFar > MAX_WIDTH) {
        return false;
    }

    return true;
}

static
LitVertex addToLitGraph(LitGraph &lg, LitVertex pred,
                        const ue2_literal::elem &c) {
    // Check if we already have this in the graph.
    for (auto v : adjacent_vertices_range(pred, lg)) {
        if (v == lg.sink) {
            continue;
        }
        if (lg[v].c == c) {
            return v;
        }
    }

    LitVertex lv = add_vertex(LitGraphVertexProps(c), lg);
    add_edge(pred, lv, lg);
    return lv;
}

static
void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex pred,
                const CharReach &cr, NFAVertex v) {
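    // Walk the reachable characters; a cased pair (e.g. 'a'/'A') is folded
    // into a single nocase element on the lower-case character, so only one
    // literal-graph vertex is created for the pair.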
    for (size_t i = cr.find_first(); i != CharReach::npos;
         i = cr.find_next(i)) {
        if (myisupper(i) && cr.test(mytolower(i))) {
            // ignore upper half of a nocase pair
            continue;
        }

        bool nocase = myislower(i) && cr.test(mytoupper(i));
        ue2_literal::elem c((char)i, nocase);
        LitVertex lv = addToLitGraph(lg, pred, c);
        workQ.push(VertexPair(lv, v));
    }
}

static
void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, const NGHolder &g,
                   const NFAEdge &e) {
    NFAVertex u = source(e, g);
    NFAVertex v = target(e, g);
    const CharReach &cr = g[v].char_reach;

    if (!allowExpand(cr.count(), 0)) {
        return;
    }

    addToQueue(workQ, lg, lg.root, cr, u);
}

static
u32 crCardinality(const CharReach &cr) {
    // Special-case for handling dots, much faster than running the find_next
    // loop below.
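    // A full 256-character reach folds each of the 26 cased pairs into a
    // single nocase element, leaving 256 - 26 == 230 distinct elements.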
    if (cr.all()) {
        return 230; // [^A-Z]
    }

    u32 rv = 0;
    for (size_t i = cr.find_first(); i != CharReach::npos;
         i = cr.find_next(i)) {
        if (myisupper(i) && cr.test(mytolower(i))) {
            // ignore upper half of a nocase pair
            continue;
        }
        rv++;
    }

    return rv;
}

/** Filter out literals that include other literals as suffixes. We do this by
 * identifying vertices connected to the sink and removing their other
 * out-edges. */
static
void filterLitGraph(LitGraph &lg) {
    for (auto v : inv_adjacent_vertices_range(lg.sink, lg)) {
        remove_out_edge_if(v, [&lg](const LitEdge &e) {
            return target(e, lg) != lg.sink;
        }, lg);
    }

    // We could do a DFS-and-prune here, if we wanted. Right now, we just
    // handle it in extractLiterals by throwing away paths that don't run all
    // the way from sink to root.
}

/** Extracts all the literals from the given literal graph. Walks the graph
 * from each predecessor of the sink (note: it's a suffix tree except for this
 * convenience) towards the source, storing each string as we go. */
static
void extractLiterals(const LitGraph &lg, set<ue2_literal> &s) {
    ue2_literal lit;

    for (auto u : inv_adjacent_vertices_range(lg.sink, lg)) {
        lit.clear();
        while (u != lg.root) {
            lit.push_back(lg[u].c);
            assert(in_degree(u, lg) <= 1);
            LitGraph::inv_adjacency_iterator ai2, ae2;
            tie(ai2, ae2) = inv_adjacent_vertices(u, lg);
            if (ai2 == ae2) {
                // Path has been cut, time for the next literal.
                goto next_literal;
            }
            u = *ai2;
        }
        s.insert(lit);
next_literal:
        ;
    }
}

#ifndef NDEBUG
static
bool hasSuffixLiterals(const set<ue2_literal> &s) {
    for (auto it = s.begin(), ite = s.end(); it != ite; ++it) {
        for (auto jt = std::next(it); jt != ite; ++jt) {
            if (isSuffix(*it, *jt) || isSuffix(*jt, *it)) {
                DEBUG_PRINTF("'%s' and '%s' have suffix issues\n",
                             dumpString(*it).c_str(),
                             dumpString(*jt).c_str());
                return true;
            }
        }
    }
    return false;
}
#endif

static
void processWorkQueue(const NGHolder &g, const NFAEdge &e,
                      set<ue2_literal> &s) {
    if (is_special(target(e, g), g)) {
        return;
    }

    LitGraph lg;

    LitVertexQ workQ;
    initWorkQueue(workQ, lg, g, e);

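    // Each work item pairs a literal-graph vertex (the literal accumulated so
    // far) with the NFA vertex whose reach has yet to be consumed. For each
    // item we either: commit the literal to the sink on reaching start;
    // extend it by this vertex's characters and continue from each
    // predecessor, if that keeps the total path count within MAX_WIDTH;
    // extend it by this vertex's characters and stop; or commit it as it
    // stands.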
    while (!workQ.empty()) {
        const LitVertex lv = workQ.front().first;
        const NFAVertex &t = workQ.front().second;
        const CharReach &cr = g[t].char_reach;

        u32 cr_card = crCardinality(cr);
        size_t numItems = cr_card * in_degree(t, g);
        size_t committed_count = workQ.size() + in_degree(lg.sink, lg) - 1;

        if (g[t].index == NODE_START) {
            // reached start, add to literal set
            add_edge_if_not_present(lv, lg.sink, lg);
            goto next_work_elem;
        }

        // Expand next vertex
        if (allowExpand(numItems, committed_count)) {
            for (auto u : inv_adjacent_vertices_range(t, g)) {
                addToQueue(workQ, lg, lv, cr, u);
            }
            goto next_work_elem;
        }

        // Expand this vertex
        if (allowExpand(cr_card, committed_count)) {
            for (size_t i = cr.find_first(); i != CharReach::npos;
                 i = cr.find_next(i)) {
                if (myisupper(i) && cr.test(mytolower(i))) {
                    // ignore upper half of a nocase pair
                    continue;
                }

                bool nocase = myislower(i) && cr.test(mytoupper(i));
                ue2_literal::elem c((char)i, nocase);
                LitVertex lt = addToLitGraph(lg, lv, c);
                add_edge_if_not_present(lt, lg.sink, lg);
            }
            goto next_work_elem;
        }

        // add to literal set
        add_edge_if_not_present(lv, lg.sink, lg);
    next_work_elem:
        workQ.pop();
    }

    filterLitGraph(lg);
    //dumpGraph("litgraph.dot", lg);
    extractLiterals(lg, s);

    // Our literal set should contain no literal that is a suffix of another.
    assert(!hasSuffixLiterals(s));

    DEBUG_PRINTF("edge %zu (%zu->%zu) produced %zu literals\n", g[e].index,
                 g[source(e, g)].index, g[target(e, g)].index, s.size());
}

bool bad_mixed_sensitivity(const ue2_literal &s) {
    /* TODO: if the mixed-case region is entirely within MAX_MASK2_WIDTH of the
     * end, we should be able to handle it */
    return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH;
}

static
u64a litUniqueness(const string &s) {
    CharReach seen(s);
    return seen.count();
}

/** Count the significant bits of this literal (i.e. seven for nocase alpha,
 * eight for everything else). */
static
u64a litCountBits(const ue2_literal &lit) {
    u64a n = 0;
    for (const auto &c : lit) {
        n += c.nocase ? 7 : 8;
    }
    return n;
}

/** Returns a fairly arbitrary score for the given literal, used to compare the
 * suitability of different candidates. */
static
u64a scoreLiteral(const ue2_literal &s) {
    // old scoring scheme: SUM(s in S: 1/s.len()^2)
    // now weight (currently 75/25) with number of unique chars
    // in the string
    u64a len = litCountBits(s);
    u64a lenUnique = litUniqueness(s.get_string()) * 8;

    u64a weightedLen = (1000ULL - WEIGHT_OF_UNIQUENESS) * len +
                       WEIGHT_OF_UNIQUENESS * lenUnique;
    weightedLen /= 8;

    DEBUG_PRINTF("scored literal '%s' %llu\n",
                 escapeString(s.get_string()).c_str(), weightedLen);

    return weightedLen;
}

/**
 * calculateScore has the following properties:
 * - score of literal is the same as the score of the reversed literal;
 * - score of substring of literal is worse than the original literal's score;
 * - score of any literal should be non-zero.
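 *
 * For example, for all-caseful three-character literals: "abc" has
 * litCountBits 24 and 3 * 8 == 24 uniqueness bits, giving weightedLen
 * (750 * 24 + 250 * 24) / 8 == 3000 and a score of 10^15 / 3000^3 ≈ 37037,
 * while "aaa" gives weightedLen (750 * 24 + 250 * 8) / 8 == 2500 and a larger
 * (worse) score of 64000.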
 */
static
u64a calculateScore(const ue2_literal &s) {
    if (s.empty()) {
        return NO_LITERAL_AT_EDGE_SCORE;
    }

    u64a weightedLen = scoreLiteral(s);

    DEBUG_PRINTF("len %zu, wl %llu\n", s.length(), weightedLen);
    u64a rv = 1000000000000000ULL / (weightedLen * weightedLen * weightedLen);

    if (!rv) {
        rv = 1;
    }
    DEBUG_PRINTF("len %zu, score %llu\n", s.length(), rv);
    return rv;
}

/** Adds a literal in reverse order, building up a suffix tree. */
static
void addReversedLiteral(const ue2_literal &lit, LitGraph &lg) {
    DEBUG_PRINTF("literal: '%s'\n", escapeString(lit).c_str());
    ue2_literal suffix;
    LitVertex v = lg.root;
    for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
        suffix.push_back(*it);
        LitVertex w;
        for (auto v2 : adjacent_vertices_range(v, lg)) {
            if (v2 != lg.sink && lg[v2].c == *it) {
                w = v2;
                goto next_char;
            }
        }
        w = add_vertex(LitGraphVertexProps(*it), lg);
        add_edge(v, w, LitGraphEdgeProps(calculateScore(suffix)), lg);
next_char:
        v = w;
    }

    // Wire the last vertex to the sink.
    add_edge(v, lg.sink, lg);
}

static
void extractLiterals(const vector<LitEdge> &cutset, const LitGraph &lg,
                     set<ue2_literal> &s) {
    for (const auto &e : cutset) {
        LitVertex u = source(e, lg);
        LitVertex v = target(e, lg);
        ue2_literal lit;
        lit.push_back(lg[v].c);
        while (u != lg.root) {
            lit.push_back(lg[u].c);
            assert(in_degree(u, lg) == 1);
            LitGraph::inv_adjacency_iterator ai, ae;
            tie(ai, ae) = inv_adjacent_vertices(u, lg);
            if (ai == ae) {
                // Path has been cut, time for the next literal.
                goto next_literal;
            }
            u = *ai;
        }
        DEBUG_PRINTF("extracted: '%s'\n", escapeString(lit).c_str());
        s.insert(lit);
next_literal:
        ;
    }
}

#ifdef DEBUG
static UNUSED
const char *describeColor(small_color c) {
    switch (c) {
    case small_color::white:
        return "white";
    case small_color::gray:
        return "gray";
    case small_color::black:
        return "black";
    default:
        return "unknown";
    }
}
#endif

/**
 * The BGL's boykov_kolmogorov_max_flow requires that all edges have their
 * reverse edge in the graph. This function adds them, returning a vector
 * mapping edge index to reverse edge. Note: LitGraph should be a DAG so there
 * should be no existing reverse_edges.
 */
static
vector<LitEdge> add_reverse_edges_and_index(LitGraph &lg) {
    const size_t edge_count = num_edges(lg);
    vector<LitEdge> fwd_edges;
    fwd_edges.reserve(edge_count);
    for (const auto &e : edges_range(lg)) {
        fwd_edges.push_back(e);
    }

    vector<LitEdge> rev_map(2 * edge_count);

    for (const auto &e : fwd_edges) {
        LitVertex u = source(e, lg);
        LitVertex v = target(e, lg);

        assert(!edge(v, u, lg).second);

        LitEdge rev = add_edge(v, u, LitGraphEdgeProps(0), lg).first;
        rev_map[lg[e].index] = rev;
        rev_map[lg[rev].index] = e;
    }

    return rev_map;
}

static
void findMinCut(LitGraph &lg, vector<LitEdge> &cutset) {
    cutset.clear();

    //dumpGraph("litgraph.dot", lg);

    assert(!in_degree(lg.root, lg));
    assert(!out_degree(lg.sink, lg));
    size_t num_real_edges = num_edges(lg);

    // Add reverse edges for the convenience of the BGL's max flow algorithm.
    vector<LitEdge> rev_edges = add_reverse_edges_and_index(lg);

    const auto v_index_map = get(&LitGraphVertexProps::index, lg);
    const auto e_index_map = get(&LitGraphEdgeProps::index, lg);
    const size_t num_verts = num_vertices(lg);
    auto colors = make_small_color_map(lg);
    vector<s32> distances(num_verts);
    vector<LitEdge> predecessors(num_verts);
    vector<u64a> residuals(num_edges(lg));

    UNUSED u64a flow = boykov_kolmogorov_max_flow(lg,
            get(&LitGraphEdgeProps::score, lg),
            make_iterator_property_map(residuals.begin(), e_index_map),
            make_iterator_property_map(rev_edges.begin(), e_index_map),
            make_iterator_property_map(predecessors.begin(), v_index_map),
            colors,
            make_iterator_property_map(distances.begin(), v_index_map),
            v_index_map, lg.root, lg.sink);
    DEBUG_PRINTF("done, flow = %llu\n", flow);

    /* remove reverse edges */
    remove_edge_if([&](const LitEdge &e) {
                       return lg[e].index >= num_real_edges;
                   }, lg);

    vector<LitEdge> white_cut, black_cut;
    u64a white_flow = 0, black_flow = 0;

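    // boykov_kolmogorov_max_flow leaves a colouring behind: the root and the
    // sink end up in different colour classes, and gray marks vertices that
    // were reached by neither search tree. Edges crossing from the non-white
    // region into white, and edges crossing from black into non-black,
    // therefore each describe a valid root->sink cut; we gather both below
    // and keep the cheaper one. Edges into the sink itself carry the default
    // NO_LITERAL_AT_EDGE_SCORE weight and are not expected to appear in a
    // minimum cut (hence the asserts).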
    for (const auto &e : edges_range(lg)) {
        const LitVertex u = source(e, lg), v = target(e, lg);
        const auto ucolor = get(colors, u);
        const auto vcolor = get(colors, v);

        DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", lg[u].index,
                     describeColor(ucolor), lg[v].index, describeColor(vcolor),
                     lg[e].score);

        if (ucolor != small_color::white && vcolor == small_color::white) {
            assert(v != lg.sink);
            white_cut.push_back(e);
            white_flow += lg[e].score;
        }
        if (ucolor == small_color::black && vcolor != small_color::black) {
            assert(v != lg.sink);
            black_cut.push_back(e);
            black_flow += lg[e].score;
        }
    }

    DEBUG_PRINTF("white flow = %llu, black flow = %llu\n",
                 white_flow, black_flow);
    assert(white_flow && black_flow);

    if (white_flow <= black_flow) {
        DEBUG_PRINTF("selected white cut\n");
        cutset.swap(white_cut);
    } else {
        DEBUG_PRINTF("selected black cut\n");
        cutset.swap(black_cut);
    }

    DEBUG_PRINTF("min cut has %zu edges\n", cutset.size());
    assert(!cutset.empty());
}

/** Takes a set of literals and derives a better one from them, returning its
 * score. Literals with a common suffix S will be replaced with S. (for
 * example, {foobar, fooobar} -> {oobar}).
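 *
 * This works by building a suffix tree of the reversed literals, weighting
 * each tree edge with the score of the suffix accumulated so far, and then
 * taking a minimum root->sink cut of that tree: each cut edge corresponds to
 * a suffix whose score is that edge's weight, and every original literal's
 * path must cross the cut, so in effect the minimum cut selects the cheapest
 * set of suffixes that still covers all the input literals.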
 */
u64a compressAndScore(set<ue2_literal> &s) {
    if (s.empty()) {
        return NO_LITERAL_AT_EDGE_SCORE;
    }

    if (s.size() == 1) {
        return calculateScore(*s.begin());
    }

    UNUSED u64a initialScore = scoreSet(s);
    DEBUG_PRINTF("begin, initial literals have score %llu\n",
                 initialScore);

    LitGraph lg;

    for (const auto &lit : s) {
        addReversedLiteral(lit, lg);
    }

    DEBUG_PRINTF("suffix tree has %zu vertices and %zu edges\n",
                 num_vertices(lg), num_edges(lg));

    vector<LitEdge> cutset;
    findMinCut(lg, cutset);

    s.clear();
    extractLiterals(cutset, lg, s);

    u64a score = scoreSet(s);
    DEBUG_PRINTF("compressed score is %llu\n", score);
    assert(score <= initialScore);
    return score;
}

/* like compressAndScore, but replaces long mixed sensitivity literals with
 * something weaker. */
u64a sanitizeAndCompressAndScore(set<ue2_literal> &lits) {
    const size_t maxExploded = 8; // only case-explode this far

    /* TODO: the whole compression thing could be made better by systematically
     * considering replacing literal sets not just by common suffixes but also
     * by nocase literals. */

    vector<ue2_literal> replacements;

    for (auto it = lits.begin(); it != lits.end();) {
        auto jt = it;
        ++it;

        if (!bad_mixed_sensitivity(*jt)) {
            continue;
        }

        /* we have to replace *jt with something... */
        ue2_literal s = *jt;
        lits.erase(jt);

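        // Try to case-explode the literal; if that would produce more than
        // maxExploded variants, fall back to a fully caseless version of it
        // instead.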
        vector<ue2_literal> exploded;
        for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) {
            exploded.emplace_back(*cit, false);
            if (exploded.size() > maxExploded) {
                goto dont_explode;
            }
        }
        insert(&replacements, replacements.end(), exploded);

        continue;
    dont_explode:
        make_nocase(&s);
        replacements.push_back(s);
    }

    insert(&lits, replacements);
    return compressAndScore(lits);
}

u64a scoreSet(const set<ue2_literal> &s) {
    if (s.empty()) {
        return NO_LITERAL_AT_EDGE_SCORE;
    }

    u64a score = 1ULL;

    for (const auto &lit : s) {
        score += calculateScore(lit);
    }

    return score;
}

set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAEdge &e) {
    set<ue2_literal> s;
    processWorkQueue(g, e, s);
    return s;
}

set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
                               bool only_first_encounter) {
    set<ue2_literal> s;

    if (is_special(v, g)) {
        return s;
    }

    set<ue2_literal> ls;

    for (const auto &e : in_edges_range(v, g)) {
        if (source(e, g) == v && only_first_encounter) {
            continue; /* ignore self loop on root vertex as we are interested in
                       * the first time we visit the vertex on the way to
                       * accept. In fact, we can ignore any back edges - but
                       * they would require a bit of effort to discover. */
        }

        ls = getLiteralSet(g, e);
        if (ls.empty()) {
            s.clear();
            return s;
        } else {
            s.insert(ls.begin(), ls.end());
        }
    }

    return s;
}

vector<u64a> scoreEdges(const NGHolder &g, const flat_set<NFAEdge> &known_bad) {
    assert(hasCorrectlyNumberedEdges(g));

    vector<u64a> scores(num_edges(g));

    for (const auto &e : edges_range(g)) {
        u32 eidx = g[e].index;
        assert(eidx < scores.size());
        if (contains(known_bad, e)) {
            scores[eidx] = NO_LITERAL_AT_EDGE_SCORE;
        } else {
            set<ue2_literal> ls = getLiteralSet(g, e);
            scores[eidx] = compressAndScore(ls);
        }
    }

    return scores;
}

bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
                            NGHolder *rhs) {
    DEBUG_PRINTF("looking for leading floating literal\n");
    set<NFAVertex> s_succ;
    insert(&s_succ, adjacent_vertices(g.start, g));

    set<NFAVertex> sds_succ;
    insert(&sds_succ, adjacent_vertices(g.startDs, g));

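    // Only a floating (unanchored) leading literal can be split off here: if
    // any successor of start is not also a successor of startDs, the pattern
    // is anchored and we bail out.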
    bool floating = is_subset_of(s_succ, sds_succ);
    if (!floating) {
        DEBUG_PRINTF("not floating\n");
        return false;
    }

    sds_succ.erase(g.startDs);
    if (sds_succ.size() != 1) {
        DEBUG_PRINTF("branchy root\n");
        return false;
    }

    NFAVertex u = g.startDs;
    NFAVertex v = *sds_succ.begin();

    while (true) {
        DEBUG_PRINTF("validating vertex %zu\n", g[v].index);

        assert(v != g.acceptEod && v != g.accept);

        const CharReach &cr = g[v].char_reach;
        if (cr.count() != 1 && !cr.isCaselessChar()) {
            break;
        }

        // Rose can only handle mixed-sensitivity literals up to the max mask
        // length.
        if (lit_out->length() >= MAX_MASK2_WIDTH) {
            if (mixed_sensitivity(*lit_out)) {
                DEBUG_PRINTF("long and mixed sensitivity\n");
                break;
            }
            if (ourisalpha((char)cr.find_first())) {
                if (cr.isCaselessChar() != lit_out->any_nocase()) {
                    DEBUG_PRINTF("stop at mixed sensitivity on '%c'\n",
                                 (char)cr.find_first());
                    break;
                }
            }
        }

        if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) {
            DEBUG_PRINTF("connection to accept\n");
            break;
        }

        lit_out->push_back(cr.find_first(), cr.isCaselessChar());
        u = v;

        if (out_degree(v, g) != 1) {
            DEBUG_PRINTF("out_degree != 1\n");
            break;
        }

        v = *adjacent_vertices(v, g).first;

        if (in_degree(v, g) != 1) {
            DEBUG_PRINTF("blargh\n"); /* picks up cases where there is no path
                                       * to case accept (large cycles),
                                       * ensures term */
            break;
        }
    }

    if (lit_out->empty()) {
        return false;
    }
    assert(u != g.startDs);

    unordered_map<NFAVertex, NFAVertex> rhs_map;
    vector<NFAVertex> pivots = make_vector_from(adjacent_vertices(u, g));
    splitRHS(g, pivots, rhs, &rhs_map);

    DEBUG_PRINTF("literal is '%s' (len %zu)\n", dumpString(*lit_out).c_str(),
                 lit_out->length());
    assert(is_triggered(*rhs));
    return true;
}

bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) {
    if (in_degree(g.acceptEod, g) != 1) {
        return false;
    }

    NFAVertex v = getSoleSourceVertex(g, g.accept);

    if (!v) {
        return false;
    }

    set<ue2_literal> s = getLiteralSet(g, v, false);

    if (s.size() != 1) {
        return false;
    }

    const ue2_literal &lit = *s.begin();

    if (lit.length() > MAX_MASK2_WIDTH && mixed_sensitivity(lit)) {
        DEBUG_PRINTF("long & mixed-sensitivity, Rose can't handle this.\n");
        return false;
    }

    *lit_out = lit;
    return true;
}

bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) {
    NFAVertex v = g.accept;

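    // Walk backwards from accept, consuming one predecessor per literal
    // element; any branching, any special vertex, or a literal element whose
    // reach is not covered by the corresponding graph vertex means the graph
    // does not reduce to this literal.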
    for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
        NGHolder::inv_adjacency_iterator ai, ae;
        tie(ai, ae) = inv_adjacent_vertices(v, g);
        if (ai == ae) {
            assert(0); // no predecessors?
            return false;
        }
        v = *ai++;
        if (ai != ae) {
            DEBUG_PRINTF("branch, fail\n");
            return false;
        }

        if (is_special(v, g)) {
            DEBUG_PRINTF("special found, fail\n");
            return false;
        }

        const CharReach &cr_g = g[v].char_reach;
        const CharReach &cr_l = *it;

        if (!cr_l.isSubsetOf(cr_g)) {
            /* running over the prefix is needed to prevent false positives */
            DEBUG_PRINTF("reach fail\n");
            return false;
        }
    }

    // Our last value for v should have only start states for predecessors.
    for (auto u : inv_adjacent_vertices_range(v, g)) {
        if (!is_any_start(u, g)) {
            DEBUG_PRINTF("pred is not start\n");
            return false;
        }
    }

    assert(num_vertices(g) == lit.length() + N_SPECIALS);

    DEBUG_PRINTF("ok\n");
    return true;
}

} // namespace ue2