| 1 | /* |
| 2 | * Copyright (c) 2015-2018, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief Build code for Haig SOM DFA. |
| 31 | */ |
| 32 | #include "ng_haig.h" |
| 33 | |
| 34 | #include "grey.h" |
| 35 | #include "nfa/goughcompile.h" |
| 36 | #include "ng_holder.h" |
| 37 | #include "ng_mcclellan_internal.h" |
| 38 | #include "ng_som_util.h" |
| 39 | #include "ng_squash.h" |
| 40 | #include "util/bitfield.h" |
| 41 | #include "util/container.h" |
| 42 | #include "util/determinise.h" |
| 43 | #include "util/flat_containers.h" |
| 44 | #include "util/graph.h" |
| 45 | #include "util/graph_range.h" |
| 46 | #include "util/hash_dynamic_bitset.h" |
| 47 | #include "util/make_unique.h" |
| 48 | #include "util/unordered.h" |
| 49 | |
| 50 | #include <algorithm> |
| 51 | #include <functional> |
| 52 | #include <map> |
| 53 | #include <set> |
| 54 | #include <vector> |
| 55 | #include <boost/dynamic_bitset.hpp> |
| 56 | |
| 57 | using namespace std; |
| 58 | using boost::dynamic_bitset; |
| 59 | |
| 60 | namespace ue2 { |
| 61 | |
| 62 | #define NFA_STATE_LIMIT 256 |
| 63 | |
| 64 | #define HAIG_MAX_NFA_STATE 600 |
| 65 | #define HAIG_MAX_LIVE_SOM_SLOTS 32 |
| 66 | |
| 67 | namespace { |
| 68 | struct haig_too_wide { |
| 69 | }; |
| 70 | |
| 71 | template<typename stateset> |
| 72 | static |
| 73 | void populateInit(const NGHolder &g, const flat_set<NFAVertex> &unused, |
| 74 | stateset *init, stateset *initDS, |
| 75 | vector<NFAVertex> *v_by_index) { |
| 76 | DEBUG_PRINTF("graph kind: %s\n" , to_string(g.kind).c_str()); |
| 77 | for (auto v : vertices_range(g)) { |
| 78 | if (contains(unused, v)) { |
| 79 | continue; |
| 80 | } |
| 81 | u32 v_index = g[v].index; |
| 82 | if (is_any_start(v, g)) { |
| 83 | init->set(v_index); |
| 84 | if (hasSelfLoop(v, g) || is_triggered(g)) { |
| 85 | DEBUG_PRINTF("setting %u\n" , v_index); |
| 86 | initDS->set(v_index); |
| 87 | } |
| 88 | } |
| 89 | assert(v_index < init->size()); |
| 90 | } |
| 91 | |
| 92 | v_by_index->clear(); |
| 93 | v_by_index->resize(num_vertices(g), NGHolder::null_vertex()); |
| 94 | |
| 95 | for (auto v : vertices_range(g)) { |
| 96 | u32 v_index = g[v].index; |
| 97 | assert((*v_by_index)[v_index] == NGHolder::null_vertex()); |
| 98 | (*v_by_index)[v_index] = v; |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | template<typename StateSet> |
| 103 | void populateAccepts(const NGHolder &g, StateSet *accept, StateSet *acceptEod) { |
| 104 | for (auto v : inv_adjacent_vertices_range(g.accept, g)) { |
| 105 | accept->set(g[v].index); |
| 106 | } |
| 107 | for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { |
| 108 | if (v == g.accept) { |
| 109 | continue; |
| 110 | } |
| 111 | acceptEod->set(g[v].index); |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | template<typename Automaton_Traits> |
| 116 | class Automaton_Base { |
| 117 | public: |
| 118 | using StateSet = typename Automaton_Traits::StateSet; |
| 119 | using StateMap = typename Automaton_Traits::StateMap; |
| 120 | |
| 121 | protected: |
| 122 | Automaton_Base(const NGHolder &graph_in, som_type som, |
| 123 | const vector<vector<CharReach>> &triggers, |
| 124 | bool unordered_som) |
| 125 | : graph(graph_in), numStates(num_vertices(graph)), |
| 126 | unused(getRedundantStarts(graph_in)), |
| 127 | init(Automaton_Traits::init_states(numStates)), |
| 128 | initDS(Automaton_Traits::init_states(numStates)), |
| 129 | squash(Automaton_Traits::init_states(numStates)), |
| 130 | accept(Automaton_Traits::init_states(numStates)), |
| 131 | acceptEod(Automaton_Traits::init_states(numStates)), |
| 132 | toppable(Automaton_Traits::init_states(numStates)), |
| 133 | dead(Automaton_Traits::init_states(numStates)) { |
| 134 | calculateAlphabet(graph, alpha, unalpha, &alphasize); |
| 135 | assert(alphasize <= ALPHABET_SIZE); |
| 136 | |
| 137 | populateInit(graph, unused, &init, &initDS, &v_by_index); |
| 138 | populateAccepts(graph, &accept, &acceptEod); |
| 139 | |
| 140 | start_anchored = DEAD_STATE + 1; |
| 141 | if (initDS == init) { |
| 142 | start_floating = start_anchored; |
| 143 | } else if (initDS.any()) { |
| 144 | start_floating = start_anchored + 1; |
| 145 | } else { |
| 146 | start_floating = DEAD_STATE; |
| 147 | } |
| 148 | |
| 149 | cr_by_index = populateCR(graph, v_by_index, alpha); |
| 150 | |
| 151 | if (!unordered_som) { |
| 152 | for (const auto &sq : findSquashers(graph, som)) { |
| 153 | NFAVertex v = sq.first; |
| 154 | u32 vert_id = graph[v].index; |
| 155 | squash.set(vert_id); |
| 156 | squash_mask[vert_id] = shrinkStateSet(sq.second); |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | if (is_triggered(graph)) { |
| 161 | dynamic_bitset<> temp(numStates); |
| 162 | markToppableStarts(graph, unused, false, triggers, &temp); |
| 163 | toppable = Automaton_Traits::copy_states(temp, numStates); |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | private: |
| 168 | // Convert an NFAStateSet (as used by the squash code) into a StateSet. |
| 169 | StateSet shrinkStateSet(const NFAStateSet &in) const { |
| 170 | StateSet out = Automaton_Traits::init_states(numStates); |
| 171 | for (size_t i = in.find_first(); i != in.npos && i < out.size(); |
| 172 | i = in.find_next(i)) { |
| 173 | out.set(i); |
| 174 | } |
| 175 | return out; |
| 176 | } |
| 177 | |
| 178 | void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) { |
| 179 | StateSet acc = in & (eod ? acceptEod : accept); |
| 180 | for (size_t i = acc.find_first(); i != StateSet::npos; |
| 181 | i = acc.find_next(i)) { |
| 182 | NFAVertex v = v_by_index[i]; |
| 183 | DEBUG_PRINTF("marking report\n" ); |
| 184 | const auto &my_reports = graph[v].reports; |
| 185 | rv.insert(my_reports.begin(), my_reports.end()); |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | public: |
| 190 | void transition(const StateSet &in, StateSet *next) { |
| 191 | transition_graph(*this, v_by_index, in, next); |
| 192 | } |
| 193 | |
| 194 | const vector<StateSet> initial() { |
| 195 | vector<StateSet> rv = {init}; |
| 196 | if (start_floating != DEAD_STATE && start_floating != start_anchored) { |
| 197 | rv.push_back(initDS); |
| 198 | } |
| 199 | return rv; |
| 200 | } |
| 201 | |
| 202 | void reports(const StateSet &in, flat_set<ReportID> &rv) { |
| 203 | reports_i(in, false, rv); |
| 204 | } |
| 205 | |
| 206 | void reportsEod(const StateSet &in, flat_set<ReportID> &rv) { |
| 207 | reports_i(in, true, rv); |
| 208 | } |
| 209 | |
| 210 | static bool canPrune(const flat_set<ReportID> &) { return false; } |
| 211 | |
| 212 | const NGHolder &graph; |
| 213 | const u32 numStates; |
| 214 | const flat_set<NFAVertex> unused; |
| 215 | |
| 216 | array<u16, ALPHABET_SIZE> alpha; |
| 217 | array<u16, ALPHABET_SIZE> unalpha; |
| 218 | u16 alphasize; |
| 219 | |
| 220 | set<dstate_id_t> done_a; |
| 221 | set<dstate_id_t> done_b; |
| 222 | |
| 223 | u16 start_anchored; |
| 224 | u16 start_floating; |
| 225 | |
| 226 | vector<NFAVertex> v_by_index; |
| 227 | vector<CharReach> cr_by_index; /* pre alpha'ed */ |
| 228 | StateSet init; |
| 229 | StateSet initDS; |
| 230 | StateSet squash; /* states which allow us to mask out other states */ |
| 231 | StateSet accept; |
| 232 | StateSet acceptEod; |
| 233 | StateSet toppable; /* states which are allowed to be on when a top arrives, |
| 234 | * triggered dfas only */ |
| 235 | map<u32, StateSet> squash_mask; |
| 236 | StateSet dead; |
| 237 | }; |
| 238 | |
| 239 | struct Big_Traits { |
| 240 | using StateSet = dynamic_bitset<>; |
| 241 | using StateMap = unordered_map<StateSet, dstate_id_t, hash_dynamic_bitset>; |
| 242 | |
| 243 | static StateSet init_states(u32 num) { |
| 244 | return StateSet(num); |
| 245 | } |
| 246 | |
| 247 | static StateSet copy_states(const dynamic_bitset<> &in, UNUSED u32 num) { |
| 248 | assert(in.size() == num); |
| 249 | return in; |
| 250 | } |
| 251 | }; |
| 252 | |
| 253 | class Automaton_Big : public Automaton_Base<Big_Traits> { |
| 254 | public: |
| 255 | Automaton_Big(const NGHolder &graph_in, som_type som, |
| 256 | const vector<vector<CharReach>> &triggers, bool unordered_som) |
| 257 | : Automaton_Base(graph_in, som, triggers, unordered_som) {} |
| 258 | }; |
| 259 | |
| 260 | struct Graph_Traits { |
| 261 | using StateSet = bitfield<NFA_STATE_LIMIT>; |
| 262 | using StateMap = unordered_map<StateSet, dstate_id_t>; |
| 263 | |
| 264 | static StateSet init_states(UNUSED u32 num) { |
| 265 | assert(num <= NFA_STATE_LIMIT); |
| 266 | return StateSet(); |
| 267 | } |
| 268 | |
| 269 | static StateSet copy_states(const dynamic_bitset<> &in, u32 num) { |
| 270 | StateSet out = init_states(num); |
| 271 | for (size_t i = in.find_first(); i != in.npos && i < out.size(); |
| 272 | i = in.find_next(i)) { |
| 273 | out.set(i); |
| 274 | } |
| 275 | return out; |
| 276 | } |
| 277 | }; |
| 278 | |
| 279 | class Automaton_Graph : public Automaton_Base<Graph_Traits> { |
| 280 | public: |
| 281 | Automaton_Graph(const NGHolder &graph_in, som_type som, |
| 282 | const vector<vector<CharReach>> &triggers, |
| 283 | bool unordered_som) |
| 284 | : Automaton_Base(graph_in, som, triggers, unordered_som) {} |
| 285 | }; |
| 286 | |
| 287 | class Automaton_Haig_Merge { |
| 288 | public: |
| 289 | using StateSet = vector<u16>; |
| 290 | using StateMap = ue2_unordered_map<StateSet, dstate_id_t>; |
| 291 | |
| 292 | explicit Automaton_Haig_Merge(const vector<const raw_som_dfa *> &in) |
| 293 | : nfas(in.begin(), in.end()), dead(in.size()) { |
| 294 | calculateAlphabet(); |
| 295 | populateAsFs(); |
| 296 | } |
| 297 | |
| 298 | void populateAsFs(void) { |
| 299 | bool fs_same = true; |
| 300 | bool fs_dead = true; |
| 301 | |
| 302 | as.resize(nfas.size()); |
| 303 | fs.resize(nfas.size()); |
| 304 | for (u32 i = 0; i < nfas.size(); i++) { |
| 305 | as[i] = nfas[i]->start_anchored; |
| 306 | fs[i] = nfas[i]->start_floating; |
| 307 | |
| 308 | if (fs[i]) { |
| 309 | fs_dead = false; |
| 310 | } |
| 311 | |
| 312 | if (as[i] != fs[i]) { |
| 313 | fs_same = false; |
| 314 | } |
| 315 | } |
| 316 | |
| 317 | start_anchored = DEAD_STATE + 1; |
| 318 | if (fs_same) { |
| 319 | start_floating = start_anchored; |
| 320 | } else if (fs_dead) { |
| 321 | start_floating = DEAD_STATE; |
| 322 | } else { |
| 323 | start_floating = start_anchored + 1; |
| 324 | } |
| 325 | } |
| 326 | |
| 327 | void calculateAlphabet(void) { |
| 328 | DEBUG_PRINTF("calculating alphabet\n" ); |
| 329 | vector<CharReach> esets(1, CharReach::dot()); |
| 330 | |
| 331 | for (const auto &haig : nfas) { |
| 332 | DEBUG_PRINTF("...next dfa alphabet\n" ); |
| 333 | assert(haig); |
| 334 | const auto &alpha_remap = haig->alpha_remap; |
| 335 | |
| 336 | for (size_t i = 0; i < esets.size(); i++) { |
| 337 | assert(esets[i].any()); |
| 338 | if (esets[i].count() == 1) { |
| 339 | DEBUG_PRINTF("skipping singleton eq set\n" ); |
| 340 | continue; |
| 341 | } |
| 342 | |
| 343 | CharReach t; |
| 344 | u8 leader_s = alpha_remap[esets[i].find_first()]; |
| 345 | |
| 346 | DEBUG_PRINTF("checking eq set, leader %02hhx \n" , leader_s); |
| 347 | |
| 348 | for (size_t s = esets[i].find_first(); |
| 349 | s != CharReach::npos; s = esets[i].find_next(s)) { |
| 350 | if (alpha_remap[s] != leader_s) { |
| 351 | t.set(s); |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | if (t.any() && t != esets[i]) { |
| 356 | esets[i] &= ~t; |
| 357 | esets.push_back(t); |
| 358 | } |
| 359 | } |
| 360 | } |
| 361 | |
| 362 | alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha); |
| 363 | } |
| 364 | |
| 365 | void transition(const StateSet &in, StateSet *next) { |
| 366 | u16 t[ALPHABET_SIZE]; |
| 367 | |
| 368 | for (u32 i = 0; i < alphasize; i++) { |
| 369 | next[i].resize(nfas.size()); |
| 370 | } |
| 371 | |
| 372 | for (u32 j = 0; j < nfas.size(); j++) { |
| 373 | getFullTransitionFromState(*nfas[j], in[j], t); |
| 374 | for (u32 i = 0; i < alphasize; i++) { |
| 375 | next[i][j]= t[unalpha[i]]; |
| 376 | } |
| 377 | } |
| 378 | } |
| 379 | |
| 380 | const vector<StateSet> initial() { |
| 381 | vector<StateSet> rv(1, as); |
| 382 | if (start_floating != DEAD_STATE && start_floating != start_anchored) { |
| 383 | rv.push_back(fs); |
| 384 | } |
| 385 | return rv; |
| 386 | } |
| 387 | |
| 388 | private: |
| 389 | void reports_i(const StateSet &in, flat_set<ReportID> dstate::*r_set, |
| 390 | flat_set<ReportID> &r) { |
| 391 | for (u32 i = 0; i < nfas.size(); i++) { |
| 392 | const auto &rs = nfas[i]->states[in[i]].*r_set; |
| 393 | insert(&r, rs); |
| 394 | } |
| 395 | } |
| 396 | |
| 397 | public: |
| 398 | void reports(const StateSet &in, flat_set<ReportID> &rv) { |
| 399 | reports_i(in, &dstate::reports, rv); |
| 400 | } |
| 401 | void reportsEod(const StateSet &in, flat_set<ReportID> &rv) { |
| 402 | reports_i(in, &dstate::reports_eod, rv); |
| 403 | } |
| 404 | |
| 405 | static bool canPrune(const flat_set<ReportID> &) { return false; } |
| 406 | |
| 407 | private: |
| 408 | vector<const raw_som_dfa *> nfas; |
| 409 | vector<dstate_id_t> as; |
| 410 | vector<dstate_id_t> fs; |
| 411 | public: |
| 412 | array<u16, ALPHABET_SIZE> alpha; |
| 413 | array<u16, ALPHABET_SIZE> unalpha; |
| 414 | u16 alphasize; |
| 415 | StateSet dead; |
| 416 | |
| 417 | u16 start_anchored; |
| 418 | u16 start_floating; |
| 419 | }; |
| 420 | } |
| 421 | |
| 422 | enum bslm_mode { |
| 423 | ONLY_EXISTING, |
| 424 | INCLUDE_INVALID |
| 425 | }; |
| 426 | |
| 427 | static |
| 428 | bool is_any_start_inc_virtual(NFAVertex v, const NGHolder &g) { |
| 429 | return is_virtual_start(v, g) || is_any_start(v, g); |
| 430 | } |
| 431 | |
| 432 | static |
| 433 | s32 getSlotID(const NGHolder &g, UNUSED const flat_set<NFAVertex> &unused, |
| 434 | NFAVertex v) { |
| 435 | if (is_triggered(g) && v == g.start) { |
| 436 | assert(!contains(unused, v)); |
| 437 | } else if (is_any_start_inc_virtual(v, g)) { |
| 438 | return CREATE_NEW_SOM; |
| 439 | } |
| 440 | |
| 441 | return g[v].index; |
| 442 | } |
| 443 | |
| 444 | template<typename stateset> |
| 445 | static |
| 446 | void haig_do_preds(const NGHolder &g, const stateset &nfa_states, |
| 447 | const vector<NFAVertex> &state_mapping, |
| 448 | som_tran_info &preds) { |
| 449 | for (size_t i = nfa_states.find_first(); i != stateset::npos; |
| 450 | i = nfa_states.find_next(i)) { |
| 451 | NFAVertex v = state_mapping[i]; |
| 452 | s32 slot_id = g[v].index; |
| 453 | |
| 454 | DEBUG_PRINTF("d vertex %zu\n" , g[v].index); |
| 455 | vector<u32> &out_map = preds[slot_id]; |
| 456 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 457 | out_map.push_back(g[u].index); |
| 458 | } |
| 459 | |
| 460 | sort(out_map.begin(), out_map.end()); |
| 461 | assert(!out_map.empty() || v == g.start); |
| 462 | } |
| 463 | } |
| 464 | |
| 465 | template<typename stateset> |
| 466 | static |
| 467 | void haig_do_report(const NGHolder &g, const flat_set<NFAVertex> &unused, |
| 468 | NFAVertex accept_v, const stateset &source_nfa_states, |
| 469 | const vector<NFAVertex> &state_mapping, |
| 470 | set<som_report> &out) { |
| 471 | for (size_t i = source_nfa_states.find_first(); i != stateset::npos; |
| 472 | i = source_nfa_states.find_next(i)) { |
| 473 | NFAVertex v = state_mapping[i]; |
| 474 | if (!edge(v, accept_v, g).second) { |
| 475 | continue; |
| 476 | } |
| 477 | for (ReportID report_id : g[v].reports) { |
| 478 | out.insert(som_report(report_id, getSlotID(g, unused, v))); |
| 479 | } |
| 480 | } |
| 481 | } |
| 482 | |
| 483 | static |
| 484 | void haig_note_starts(const NGHolder &g, map<u32, u32> *out) { |
| 485 | if (is_triggered(g)) { |
| 486 | return; |
| 487 | } |
| 488 | |
| 489 | DEBUG_PRINTF("seeing who creates new som values\n" ); |
| 490 | |
| 491 | vector<DepthMinMax> depths = getDistancesFromSOM(g); |
| 492 | |
| 493 | for (auto v : vertices_range(g)) { |
| 494 | if (is_any_start_inc_virtual(v, g)) { |
| 495 | DEBUG_PRINTF("%zu creates new som value\n" , g[v].index); |
| 496 | out->emplace(g[v].index, 0U); |
| 497 | continue; |
| 498 | } |
| 499 | |
| 500 | if (is_any_accept(v, g)) { |
| 501 | continue; |
| 502 | } |
| 503 | |
| 504 | const DepthMinMax &d = depths[g[v].index]; |
| 505 | if (d.min == d.max && d.min.is_finite()) { |
| 506 | DEBUG_PRINTF("%zu is fixed at %u\n" , g[v].index, (u32)d.min); |
| 507 | out->emplace(g[v].index, d.min); |
| 508 | } |
| 509 | } |
| 510 | } |
| 511 | |
| 512 | template<class Auto> |
| 513 | static |
| 514 | bool doHaig(const NGHolder &g, som_type som, |
| 515 | const vector<vector<CharReach>> &triggers, bool unordered_som, |
| 516 | raw_som_dfa *rdfa) { |
| 517 | u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from |
| 518 | a fight */ |
| 519 | using StateSet = typename Auto::StateSet; |
| 520 | vector<StateSet> nfa_state_map; |
| 521 | Auto n(g, som, triggers, unordered_som); |
| 522 | try { |
| 523 | if (!determinise(n, rdfa->states, state_limit, &nfa_state_map)) { |
| 524 | DEBUG_PRINTF("state limit exceeded\n" ); |
| 525 | return false; |
| 526 | } |
| 527 | } catch (haig_too_wide &) { |
| 528 | DEBUG_PRINTF("too many live som states\n" ); |
| 529 | return false; |
| 530 | } |
| 531 | |
| 532 | rdfa->start_anchored = n.start_anchored; |
| 533 | rdfa->start_floating = n.start_floating; |
| 534 | rdfa->alpha_size = n.alphasize; |
| 535 | rdfa->alpha_remap = n.alpha; |
| 536 | |
| 537 | rdfa->state_som.reserve(rdfa->states.size()); |
| 538 | for (u32 i = 0; i < rdfa->states.size(); i++) { |
| 539 | rdfa->state_som.push_back(dstate_som()); |
| 540 | const StateSet &source_states = nfa_state_map[i]; |
| 541 | if (source_states.count() > HAIG_MAX_LIVE_SOM_SLOTS) { |
| 542 | DEBUG_PRINTF("too many live states\n" ); |
| 543 | return false; |
| 544 | } |
| 545 | |
| 546 | DEBUG_PRINTF("generating som info for %u\n" , i); |
| 547 | |
| 548 | haig_do_preds(g, source_states, n.v_by_index, |
| 549 | rdfa->state_som.back().preds); |
| 550 | |
| 551 | haig_do_report(g, n.unused, g.accept, source_states, n.v_by_index, |
| 552 | rdfa->state_som.back().reports); |
| 553 | haig_do_report(g, n.unused, g.acceptEod, source_states, n.v_by_index, |
| 554 | rdfa->state_som.back().reports_eod); |
| 555 | } |
| 556 | |
| 557 | haig_note_starts(g, &rdfa->new_som_nfa_states); |
| 558 | |
| 559 | return true; |
| 560 | } |
| 561 | |
| 562 | unique_ptr<raw_som_dfa> |
| 563 | attemptToBuildHaig(const NGHolder &g, som_type som, u32 somPrecision, |
| 564 | const vector<vector<CharReach>> &triggers, const Grey &grey, |
| 565 | bool unordered_som) { |
| 566 | assert(is_triggered(g) != triggers.empty()); |
| 567 | assert(!unordered_som || is_triggered(g)); |
| 568 | |
| 569 | if (!grey.allowGough) { |
| 570 | /* must be at least one engine capable of handling raw som dfas */ |
| 571 | return nullptr; |
| 572 | } |
| 573 | |
| 574 | DEBUG_PRINTF("attempting to build haig \n" ); |
| 575 | assert(allMatchStatesHaveReports(g)); |
| 576 | assert(hasCorrectlyNumberedVertices(g)); |
| 577 | |
| 578 | u32 numStates = num_vertices(g); |
| 579 | if (numStates > HAIG_MAX_NFA_STATE) { |
| 580 | DEBUG_PRINTF("giving up... looks too big\n" ); |
| 581 | return nullptr; |
| 582 | } |
| 583 | |
| 584 | auto rdfa = ue2::make_unique<raw_som_dfa>(g.kind, unordered_som, NODE_START, |
| 585 | somPrecision); |
| 586 | |
| 587 | DEBUG_PRINTF("determinising nfa with %u vertices\n" , numStates); |
| 588 | bool rv; |
| 589 | if (numStates <= NFA_STATE_LIMIT) { |
| 590 | /* fast path */ |
| 591 | rv = doHaig<Automaton_Graph>(g, som, triggers, unordered_som, |
| 592 | rdfa.get()); |
| 593 | } else { |
| 594 | /* not the fast path */ |
| 595 | rv = doHaig<Automaton_Big>(g, som, triggers, unordered_som, rdfa.get()); |
| 596 | } |
| 597 | |
| 598 | if (!rv) { |
| 599 | return nullptr; |
| 600 | } |
| 601 | |
| 602 | DEBUG_PRINTF("determinised, building impl dfa (a,f) = (%hu,%hu)\n" , |
| 603 | rdfa->start_anchored, rdfa->start_floating); |
| 604 | |
| 605 | assert(rdfa->kind == g.kind); |
| 606 | return rdfa; |
| 607 | } |
| 608 | |
| 609 | static |
| 610 | void haig_merge_do_preds(const vector<const raw_som_dfa *> &dfas, |
| 611 | const vector<u32> &per_dfa_adj, |
| 612 | const vector<dstate_id_t> &source_nfa_states, |
| 613 | som_tran_info &som_tran) { |
| 614 | for (u32 d = 0; d < dfas.size(); ++d) { |
| 615 | u32 adj = per_dfa_adj[d]; |
| 616 | |
| 617 | const som_tran_info &som_tran_d |
| 618 | = dfas[d]->state_som[source_nfa_states[d]].preds; |
| 619 | for (som_tran_info::const_iterator it = som_tran_d.begin(); |
| 620 | it != som_tran_d.end(); ++it) { |
| 621 | assert(it->first != CREATE_NEW_SOM); |
| 622 | u32 dest_slot = it->first < N_SPECIALS ? it->first |
| 623 | : it->first + adj; |
| 624 | vector<u32> &out = som_tran[dest_slot]; |
| 625 | |
| 626 | if (!out.empty()) { |
| 627 | /* stylised specials already done; it does not matter who builds |
| 628 | the preds */ |
| 629 | assert(dest_slot < N_SPECIALS); |
| 630 | continue; |
| 631 | } |
| 632 | for (vector<u32>::const_iterator jt = it->second.begin(); |
| 633 | jt != it->second.end(); ++jt) { |
| 634 | if (*jt < N_SPECIALS || *jt == CREATE_NEW_SOM) { |
| 635 | out.push_back(*jt); |
| 636 | } else { |
| 637 | out.push_back(*jt + adj); |
| 638 | } |
| 639 | } |
| 640 | } |
| 641 | } |
| 642 | } |
| 643 | |
| 644 | static |
| 645 | void haig_merge_note_starts(const vector<const raw_som_dfa *> &dfas, |
| 646 | const vector<u32> &per_dfa_adj, |
| 647 | map<u32, u32> *out) { |
| 648 | for (u32 d = 0; d < dfas.size(); ++d) { |
| 649 | u32 adj = per_dfa_adj[d]; |
| 650 | const map<u32, u32> &new_soms = dfas[d]->new_som_nfa_states; |
| 651 | for (map<u32, u32>::const_iterator it = new_soms.begin(); |
| 652 | it != new_soms.end(); ++it) { |
| 653 | if (it->first < N_SPECIALS) { |
| 654 | assert(!it->second); |
| 655 | out->emplace(it->first, 0U); |
| 656 | } else { |
| 657 | assert(d + 1 >= per_dfa_adj.size() |
| 658 | || it->first + adj < per_dfa_adj[d + 1]); |
| 659 | out->emplace(it->first + adj, it->second); |
| 660 | } |
| 661 | } |
| 662 | } |
| 663 | } |
| 664 | |
| 665 | static never_inline |
| 666 | void haig_merge_do_report(const vector<const raw_som_dfa *> &dfas, |
| 667 | const vector<u32> &per_dfa_adj, |
| 668 | const vector<dstate_id_t> &source_nfa_states, |
| 669 | bool eod, set<som_report> &out) { |
| 670 | for (u32 d = 0; d < dfas.size(); ++d) { |
| 671 | u32 adj = per_dfa_adj[d]; |
| 672 | |
| 673 | const set<som_report> &reps = eod |
| 674 | ? dfas[d]->state_som[source_nfa_states[d]].reports_eod |
| 675 | : dfas[d]->state_som[source_nfa_states[d]].reports; |
| 676 | for (set<som_report>::const_iterator it = reps.begin(); |
| 677 | it != reps.end(); ++it) { |
| 678 | u32 slot = it->slot; |
| 679 | if (slot != CREATE_NEW_SOM && slot >= N_SPECIALS) { |
| 680 | slot += adj; |
| 681 | } |
| 682 | out.insert(som_report(it->report, slot)); |
| 683 | } |
| 684 | } |
| 685 | } |
| 686 | |
| 687 | static |
| 688 | u32 total_slots_used(const raw_som_dfa &rdfa) { |
| 689 | u32 rv = 0; |
| 690 | for (vector<dstate_som>::const_iterator it = rdfa.state_som.begin(); |
| 691 | it != rdfa.state_som.end(); ++it) { |
| 692 | for (som_tran_info::const_iterator jt = it->preds.begin(); |
| 693 | jt != it->preds.end(); ++jt) { |
| 694 | assert(jt->first != CREATE_NEW_SOM); |
| 695 | ENSURE_AT_LEAST(&rv, jt->first + 1); |
| 696 | } |
| 697 | } |
| 698 | const map<u32, u32> &new_soms = rdfa.new_som_nfa_states; |
| 699 | for (map<u32, u32>::const_iterator it = new_soms.begin(); |
| 700 | it != new_soms.end(); ++it) { |
| 701 | ENSURE_AT_LEAST(&rv, it->first + 1); |
| 702 | } |
| 703 | return rv; |
| 704 | } |
| 705 | |
| 706 | unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &dfas, |
| 707 | u32 limit) { |
| 708 | assert(!dfas.empty()); |
| 709 | |
| 710 | Automaton_Haig_Merge n(dfas); |
| 711 | |
| 712 | DEBUG_PRINTF("merging %zu dfas\n" , dfas.size()); |
| 713 | |
| 714 | bool unordered_som = false; |
| 715 | for (const auto &haig : dfas) { |
| 716 | assert(haig); |
| 717 | assert(haig->kind == dfas.front()->kind); |
| 718 | unordered_som |= haig->unordered_som_triggers; |
| 719 | if (haig->states.size() > limit) { |
| 720 | DEBUG_PRINTF("too many states!\n" ); |
| 721 | return nullptr; |
| 722 | } |
| 723 | } |
| 724 | |
| 725 | using StateSet = Automaton_Haig_Merge::StateSet; |
| 726 | vector<StateSet> nfa_state_map; |
| 727 | auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som, |
| 728 | NODE_START, |
| 729 | dfas[0]->stream_som_loc_width); |
| 730 | |
| 731 | if (!determinise(n, rdfa->states, limit, &nfa_state_map)) { |
| 732 | DEBUG_PRINTF("state limit (%u) exceeded\n" , limit); |
| 733 | return nullptr; /* over state limit */ |
| 734 | } |
| 735 | |
| 736 | rdfa->start_anchored = n.start_anchored; |
| 737 | rdfa->start_floating = n.start_floating; |
| 738 | rdfa->alpha_size = n.alphasize; |
| 739 | rdfa->alpha_remap = n.alpha; |
| 740 | |
| 741 | vector<u32> per_dfa_adj; |
| 742 | u32 curr_adj = 0; |
| 743 | for (const auto &haig : dfas) { |
| 744 | per_dfa_adj.push_back(curr_adj); |
| 745 | curr_adj += total_slots_used(*haig); |
| 746 | if (curr_adj < per_dfa_adj.back()) { |
| 747 | /* overflowed our som slot count */ |
| 748 | return nullptr; |
| 749 | } |
| 750 | } |
| 751 | |
| 752 | rdfa->state_som.reserve(rdfa->states.size()); |
| 753 | for (u32 i = 0; i < rdfa->states.size(); i++) { |
| 754 | rdfa->state_som.push_back(dstate_som()); |
| 755 | const vector<dstate_id_t> &source_nfa_states = nfa_state_map[i]; |
| 756 | DEBUG_PRINTF("finishing state %u\n" , i); |
| 757 | |
| 758 | haig_merge_do_preds(dfas, per_dfa_adj, source_nfa_states, |
| 759 | rdfa->state_som.back().preds); |
| 760 | |
| 761 | if (rdfa->state_som.back().preds.size() > HAIG_MAX_LIVE_SOM_SLOTS) { |
| 762 | DEBUG_PRINTF("som slot limit exceeded (%zu)\n" , |
| 763 | rdfa->state_som.back().preds.size()); |
| 764 | return nullptr; |
| 765 | } |
| 766 | |
| 767 | haig_merge_do_report(dfas, per_dfa_adj, source_nfa_states, |
| 768 | false /* not eod */, |
| 769 | rdfa->state_som.back().reports); |
| 770 | haig_merge_do_report(dfas, per_dfa_adj, source_nfa_states, |
| 771 | true /* eod */, |
| 772 | rdfa->state_som.back().reports_eod); |
| 773 | } |
| 774 | |
| 775 | haig_merge_note_starts(dfas, per_dfa_adj, &rdfa->new_som_nfa_states); |
| 776 | |
| 777 | DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n" , |
| 778 | rdfa->start_anchored, rdfa->start_floating); |
| 779 | |
| 780 | return rdfa; |
| 781 | } |
| 782 | |
| 783 | } // namespace ue2 |
| 784 | |