| 1 | /* |
| 2 | * Copyright (c) 2015-2019, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief FDR literal matcher: build API. |
| 31 | */ |
| 32 | |
| 33 | #include "fdr_compile.h" |
| 34 | |
| 35 | #include "fdr_internal.h" |
| 36 | #include "fdr_confirm.h" |
| 37 | #include "fdr_compile_internal.h" |
| 38 | #include "fdr_engine_description.h" |
| 39 | #include "teddy_compile.h" |
| 40 | #include "teddy_engine_description.h" |
| 41 | #include "grey.h" |
| 42 | #include "ue2common.h" |
| 43 | #include "hwlm/hwlm_build.h" |
| 44 | #include "util/compare.h" |
| 45 | #include "util/container.h" |
| 46 | #include "util/dump_mask.h" |
| 47 | #include "util/make_unique.h" |
| 48 | #include "util/math.h" |
| 49 | #include "util/noncopyable.h" |
| 50 | #include "util/target_info.h" |
| 51 | #include "util/ue2string.h" |
| 52 | #include "util/verify_types.h" |
| 53 | |
| 54 | #include <algorithm> |
| 55 | #include <array> |
| 56 | #include <cassert> |
| 57 | #include <cctype> |
| 58 | #include <cstdio> |
| 59 | #include <cstdlib> |
| 60 | #include <cstring> |
| 61 | #include <limits> |
| 62 | #include <map> |
| 63 | #include <memory> |
| 64 | #include <numeric> |
| 65 | #include <set> |
| 66 | #include <string> |
| 67 | #include <unordered_map> |
| 68 | #include <unordered_set> |
| 69 | #include <vector> |
| 70 | |
| 71 | #include <boost/multi_array.hpp> |
| 72 | |
| 73 | using namespace std; |
| 74 | |
| 75 | namespace ue2 { |
| 76 | |
| 77 | namespace { |
| 78 | |
| 79 | class FDRCompiler : noncopyable { |
| 80 | private: |
| 81 | const FDREngineDescription ŋ |
| 82 | const Grey &grey; |
| 83 | vector<u8> tab; |
| 84 | vector<hwlmLiteral> lits; |
| 85 | map<BucketIndex, std::vector<LiteralIndex> > bucketToLits; |
| 86 | bool make_small; |
| 87 | |
| 88 | u8 *tabIndexToMask(u32 indexInTable); |
| 89 | #ifdef DEBUG |
| 90 | void dumpMasks(const u8 *defaultMask); |
| 91 | #endif |
| 92 | void setupTab(); |
| 93 | bytecode_ptr<FDR> setupFDR(); |
| 94 | void createInitialState(FDR *fdr); |
| 95 | |
| 96 | public: |
| 97 | FDRCompiler(vector<hwlmLiteral> lits_in, |
| 98 | map<BucketIndex, std::vector<LiteralIndex>> bucketToLits_in, |
| 99 | const FDREngineDescription &eng_in, |
| 100 | bool make_small_in, const Grey &grey_in) |
| 101 | : eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()), |
| 102 | lits(move(lits_in)), bucketToLits(move(bucketToLits_in)), |
| 103 | make_small(make_small_in) {} |
| 104 | |
| 105 | bytecode_ptr<FDR> build(); |
| 106 | }; |
| 107 | |
// Returns a pointer into the reach table for the mask belonging to the given
// table entry; each entry is (scheme width / 8) bytes wide.
u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) {
    assert(indexInTable < tab.size());
    return &tab[0] + (indexInTable * (eng.getSchemeWidth() / 8));
}
| 112 | |
| 113 | static |
| 114 | void setbit(u8 *msk, u32 bit) { |
| 115 | msk[bit / 8] |= 1U << (bit % 8); |
| 116 | } |
| 117 | |
| 118 | static |
| 119 | void clearbit(u8 *msk, u32 bit) { |
| 120 | msk[bit / 8] &= ~(1U << (bit % 8)); |
| 121 | } |
| 122 | |
| 123 | static |
| 124 | void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) { |
| 125 | for (u32 i = 0; i < num_bytes; i++) { |
| 126 | dest[i] = a[i] & b[i]; |
| 127 | } |
| 128 | } |
| 129 | |
// Initialise the engine's start state: for each bucket, set the state bits
// for positions before (bucket min literal length - 1) so that no match can
// be reported before enough bytes have been consumed.
void FDRCompiler::createInitialState(FDR *fdr) {
    u8 *start = (u8 *)&fdr->start;

    /* initial state should to be 1 in each slot in the bucket up to bucket
     * minlen - 1, and 0 thereafter */
    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
        // Find the minimum length for the literals in this bucket.
        const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
        u32 min_len = ~0U;
        for (const LiteralIndex &lit_idx : bucket_lits) {
            min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
        }

        DEBUG_PRINTF("bucket %u has min_len=%u\n" , b, min_len);
        assert(min_len);

        // Set bits only for positions 0 .. min_len - 2; later positions start
        // cleared so the engine can report once min_len bytes have matched.
        for (PositionInBucket i = 0; i < eng.getBucketWidth(b); i++) {
            if (i < min_len - 1) {
                setbit(start, eng.getSchemeBit(b, i));
            }
        }
    }
}
| 153 | |
| 154 | /** |
| 155 | * \brief Lay out FDR structures in bytecode. |
| 156 | * |
| 157 | * Note that each major structure (header, table, confirm, flood control) is |
| 158 | * cacheline-aligned. |
| 159 | */ |
| 160 | bytecode_ptr<FDR> FDRCompiler::setupFDR() { |
| 161 | auto floodTable = setupFDRFloodControl(lits, eng, grey); |
| 162 | auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); |
| 163 | |
| 164 | size_t = sizeof(FDR); |
| 165 | size_t tabSize = eng.getTabSizeBytes(); |
| 166 | |
| 167 | // Note: we place each major structure here on a cacheline boundary. |
| 168 | size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(tabSize) + |
| 169 | ROUNDUP_CL(confirmTable.size()) + floodTable.size(); |
| 170 | |
| 171 | DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu " |
| 172 | "total=%zu\n" , |
| 173 | headerSize, tabSize, confirmTable.size(), floodTable.size(), |
| 174 | size); |
| 175 | |
| 176 | auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64); |
| 177 | assert(fdr); // otherwise would have thrown std::bad_alloc |
| 178 | |
| 179 | u8 *fdr_base = (u8 *)fdr.get(); |
| 180 | |
| 181 | // Write header. |
| 182 | fdr->size = size; |
| 183 | fdr->engineID = eng.getID(); |
| 184 | fdr->maxStringLen = verify_u32(maxLen(lits)); |
| 185 | fdr->numStrings = verify_u32(lits.size()); |
| 186 | assert(eng.bits > 8 && eng.bits < 16); // we allow domains 9 to 15 only |
| 187 | fdr->domain = eng.bits; |
| 188 | fdr->domainMask = (1 << eng.bits) - 1; |
| 189 | fdr->tabSize = tabSize; |
| 190 | fdr->stride = eng.stride; |
| 191 | createInitialState(fdr.get()); |
| 192 | |
| 193 | // Write table. |
| 194 | u8 *ptr = fdr_base + ROUNDUP_CL(sizeof(FDR)); |
| 195 | assert(ISALIGNED_CL(ptr)); |
| 196 | copy(tab.begin(), tab.end(), ptr); |
| 197 | ptr += ROUNDUP_CL(tabSize); |
| 198 | |
| 199 | // Write confirm structures. |
| 200 | assert(ISALIGNED_CL(ptr)); |
| 201 | fdr->confOffset = verify_u32(ptr - fdr_base); |
| 202 | memcpy(ptr, confirmTable.get(), confirmTable.size()); |
| 203 | ptr += ROUNDUP_CL(confirmTable.size()); |
| 204 | |
| 205 | // Write flood control structures. |
| 206 | assert(ISALIGNED_CL(ptr)); |
| 207 | fdr->floodOffset = verify_u32(ptr - fdr_base); |
| 208 | memcpy(ptr, floodTable.get(), floodTable.size()); |
| 209 | ptr += floodTable.size(); // last write, no need to round up |
| 210 | |
| 211 | return fdr; |
| 212 | } |
| 213 | |
| 214 | //#define DEBUG_ASSIGNMENT |
| 215 | |
| 216 | /** |
| 217 | * Utility class for computing: |
| 218 | * |
| 219 | * score(count, len) = pow(count, 1.05) * pow(len, -3) |
| 220 | * |
| 221 | * Calling pow() is expensive. This is mitigated by using pre-computed LUTs for |
| 222 | * small inputs and a cache for larger ones. |
| 223 | */ |
| 224 | class Scorer { |
| 225 | unordered_map<u32, double> count_factor_cache; |
| 226 | |
| 227 | // LUT: pow(count, 1.05) for small values of count. |
| 228 | static const array<double, 100> count_lut; |
| 229 | |
| 230 | double count_factor(u32 count) { |
| 231 | if (count < count_lut.size()) { |
| 232 | return count_lut[count]; |
| 233 | } |
| 234 | |
| 235 | auto it = count_factor_cache.find(count); |
| 236 | if (it != count_factor_cache.end()) { |
| 237 | return it->second; |
| 238 | } |
| 239 | double r = our_pow(count, 1.05); |
| 240 | count_factor_cache.emplace(count, r); |
| 241 | return r; |
| 242 | } |
| 243 | |
| 244 | // LUT: pow(len, -3) for len in range [0,8]. |
| 245 | static const array<double, 9> len_lut; |
| 246 | |
| 247 | double len_factor(u32 len) { |
| 248 | assert(len <= len_lut.size()); |
| 249 | return len_lut[len]; |
| 250 | } |
| 251 | |
| 252 | public: |
| 253 | double operator()(u32 len, u32 count) { |
| 254 | if (len == 0) { |
| 255 | return numeric_limits<double>::max(); |
| 256 | } |
| 257 | return count_factor(count) * len_factor(len); |
| 258 | } |
| 259 | }; |
| 260 | |
// Definition of Scorer::count_lut: pow(count, 1.05) for count in [0,99].
const array<double, 100> Scorer::count_lut{{
    pow(0, 1.05), pow(1, 1.05), pow(2, 1.05), pow(3, 1.05), pow(4, 1.05),
    pow(5, 1.05), pow(6, 1.05), pow(7, 1.05), pow(8, 1.05), pow(9, 1.05),
    pow(10, 1.05), pow(11, 1.05), pow(12, 1.05), pow(13, 1.05), pow(14, 1.05),
    pow(15, 1.05), pow(16, 1.05), pow(17, 1.05), pow(18, 1.05), pow(19, 1.05),
    pow(20, 1.05), pow(21, 1.05), pow(22, 1.05), pow(23, 1.05), pow(24, 1.05),
    pow(25, 1.05), pow(26, 1.05), pow(27, 1.05), pow(28, 1.05), pow(29, 1.05),
    pow(30, 1.05), pow(31, 1.05), pow(32, 1.05), pow(33, 1.05), pow(34, 1.05),
    pow(35, 1.05), pow(36, 1.05), pow(37, 1.05), pow(38, 1.05), pow(39, 1.05),
    pow(40, 1.05), pow(41, 1.05), pow(42, 1.05), pow(43, 1.05), pow(44, 1.05),
    pow(45, 1.05), pow(46, 1.05), pow(47, 1.05), pow(48, 1.05), pow(49, 1.05),
    pow(50, 1.05), pow(51, 1.05), pow(52, 1.05), pow(53, 1.05), pow(54, 1.05),
    pow(55, 1.05), pow(56, 1.05), pow(57, 1.05), pow(58, 1.05), pow(59, 1.05),
    pow(60, 1.05), pow(61, 1.05), pow(62, 1.05), pow(63, 1.05), pow(64, 1.05),
    pow(65, 1.05), pow(66, 1.05), pow(67, 1.05), pow(68, 1.05), pow(69, 1.05),
    pow(70, 1.05), pow(71, 1.05), pow(72, 1.05), pow(73, 1.05), pow(74, 1.05),
    pow(75, 1.05), pow(76, 1.05), pow(77, 1.05), pow(78, 1.05), pow(79, 1.05),
    pow(80, 1.05), pow(81, 1.05), pow(82, 1.05), pow(83, 1.05), pow(84, 1.05),
    pow(85, 1.05), pow(86, 1.05), pow(87, 1.05), pow(88, 1.05), pow(89, 1.05),
    pow(90, 1.05), pow(91, 1.05), pow(92, 1.05), pow(93, 1.05), pow(94, 1.05),
    pow(95, 1.05), pow(96, 1.05), pow(97, 1.05), pow(98, 1.05), pow(99, 1.05),
}};
| 283 | |
// Definition of Scorer::len_lut: pow(len, -3) for len in [0,8].
const array<double, 9> Scorer::len_lut{{
    pow(0, -3.0), pow(1, -3.0), pow(2, -3.0), pow(3, -3.0), pow(4, -3.0),
    pow(5, -3.0), pow(6, -3.0), pow(7, -3.0), pow(8, -3.0)}};
| 287 | |
| 288 | /** |
| 289 | * Returns true if the two given literals should be placed in the same chunk as |
| 290 | * they are identical except for a difference in caselessness. |
| 291 | */ |
| 292 | static |
| 293 | bool isEquivLit(const hwlmLiteral &a, const hwlmLiteral &b, |
| 294 | const hwlmLiteral *last_nocase_lit) { |
| 295 | const size_t a_len = a.s.size(); |
| 296 | const size_t b_len = b.s.size(); |
| 297 | |
| 298 | if (a_len != b_len) { |
| 299 | return false; |
| 300 | } |
| 301 | |
| 302 | bool nocase = last_nocase_lit && a_len == last_nocase_lit->s.size() && |
| 303 | !cmp(a.s.c_str(), last_nocase_lit->s.c_str(), a_len, true); |
| 304 | return !cmp(a.s.c_str(), b.s.c_str(), a.s.size(), nocase); |
| 305 | } |
| 306 | |
/** \brief A contiguous run of literals (from the sorted literal list) that is
 * treated as a unit during bucket assignment. */
struct Chunk {
    Chunk(u32 first_id_in, u32 count_in, u32 length_in)
        : first_id(first_id_in), count(count_in), length(length_in) {}
    u32 first_id; //!< first id in this chunk
    u32 count; //!< how many are in this chunk
    u32 length; //!< how long things in the chunk are
};
| 314 | |
/**
 * \brief Partition the (length-sorted) literal list into chunks of
 * equal-length literals; chunks are the unit distributed over buckets by the
 * assignment DP. A trailing empty chunk is appended as a sentinel.
 */
static
vector<Chunk> assignChunks(const vector<hwlmLiteral> &lits,
                           const map<u32, u32> &lenCounts) {
    const u32 CHUNK_MAX = 512;
    const u32 MAX_CONSIDERED_LENGTH = 16;

    // TODO: detailed early stage literal analysis for v. small cases (actually
    // look at lits) yes - after we factor this out and merge in the Teddy
    // style of building we can look at this, although the teddy merge
    // modelling is quite different. It's still probably adaptable to some
    // extent for this class of problem.

    vector<Chunk> chunks;
    chunks.reserve(CHUNK_MAX);

    // Cap on chunk size, chosen so that we stay under CHUNK_MAX chunks.
    const u32 maxPerChunk = lits.size() /
        (CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;

    u32 currentSize = 0;
    u32 chunkStartID = 0;
    const hwlmLiteral *last_nocase_lit = nullptr;

    for (u32 i = 0; i < lits.size() && chunks.size() < CHUNK_MAX - 1; i++) {
        const auto &lit = lits[i];

        DEBUG_PRINTF("i=%u, lit=%s%s\n" , i, escapeString(lit.s).c_str(),
                     lit.nocase ? " (nocase)" : "" );

        // If this literal is identical to the last one (aside from differences
        // in caselessness), keep going even if we will "overfill" a chunk; we
        // don't want to split identical literals into different buckets.
        if (i != 0 && isEquivLit(lit, lits[i - 1], last_nocase_lit)) {
            DEBUG_PRINTF("identical lit\n" );
            goto next_literal;
        }

        // Start a new chunk when the literal length changes (for lengths we
        // still distinguish) or the current chunk has reached maxPerChunk.
        if ((currentSize < MAX_CONSIDERED_LENGTH &&
             (lit.s.size() != currentSize)) ||
            (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
            currentSize = lit.s.size();
            if (!chunks.empty()) {
                chunks.back().count = i - chunkStartID;
            }
            chunkStartID = i;
            chunks.emplace_back(i, 0, currentSize);
        }
next_literal:
        if (lit.nocase) {
            last_nocase_lit = &lit;
        }
    }

    assert(!chunks.empty());
    chunks.back().count = lits.size() - chunkStartID;
    // close off chunks with an empty row
    chunks.emplace_back(lits.size(), 0, 0);

#ifdef DEBUG_ASSIGNMENT
    for (size_t j = 0; j < chunks.size(); j++) {
        const auto &chunk = chunks[j];
        printf("chunk %zu first_id=%u count=%u length=%u\n" , j, chunk.first_id,
               chunk.count, chunk.length);
    }
#endif

    DEBUG_PRINTF("built %zu chunks (%zu lits)\n" , chunks.size(), lits.size());
    assert(chunks.size() <= CHUNK_MAX);
    return chunks;
}
| 384 | |
/**
 * \brief Assign literals to buckets.
 *
 * Sorts the literals by length, partitions them into chunks and then uses a
 * dynamic program over the chunk list to find the bucket split with the best
 * total score. Within each bucket, literal ids are stored longest-first.
 */
static
map<BucketIndex, vector<LiteralIndex>> assignStringsToBuckets(
                                vector<hwlmLiteral> &lits,
                                const FDREngineDescription &eng) {
    const double MAX_SCORE = numeric_limits<double>::max();

    assert(!lits.empty()); // Shouldn't be called with no literals.

    // Count the number of literals for each length.
    map<u32, u32> lenCounts;
    for (const auto &lit : lits) {
        lenCounts[lit.s.size()]++;
    }

#ifdef DEBUG_ASSIGNMENT
    for (const auto &m : lenCounts) {
        printf("l<%u>:%u " , m.first, m.second);
    }
    printf("\n" );
#endif

    // Sort literals by literal length. If tied on length, use lexicographic
    // ordering (of the reversed literals).
    stable_sort(lits.begin(), lits.end(),
                [](const hwlmLiteral &a, const hwlmLiteral &b) {
                    if (a.s.size() != b.s.size()) {
                        return a.s.size() < b.s.size();
                    }
                    auto p = mismatch(a.s.rbegin(), a.s.rend(), b.s.rbegin());
                    if (p.first != a.s.rend()) {
                        return *p.first < *p.second;
                    }
                    // Sort caseless variants first.
                    return a.nocase > b.nocase;
                });

    vector<Chunk> chunks = assignChunks(lits, lenCounts);

    const u32 numChunks = chunks.size();
    const u32 numBuckets = eng.getNumBuckets();

    // 2D array of (score, chunk index) pairs, indexed by
    // [chunk_index][bucket_index].
    boost::multi_array<pair<double, u32>, 2> t(
        boost::extents[numChunks][numBuckets]);

    Scorer scorer;

    // Base case: cost of placing chunks [j, end) into a single bucket.
    for (u32 j = 0; j < numChunks; j++) {
        u32 cnt = 0;
        for (u32 k = j; k < numChunks; ++k) {
            cnt += chunks[k].count;
        }
        t[j][0] = {scorer(chunks[j].length, cnt), 0};
    }

    // DP step: t[j][i] holds the best (score, split chunk) for covering
    // chunks [j, end) with i+1 buckets; k is the start of the next bucket.
    for (u32 i = 1; i < numBuckets; i++) {
        for (u32 j = 0; j < numChunks - 1; j++) { // don't do last, empty row
            pair<double, u32> best = {MAX_SCORE, 0};
            u32 cnt = chunks[j].count;
            for (u32 k = j + 1; k < numChunks - 1; k++) {
                auto score = scorer(chunks[j].length, cnt);
                if (score > best.first) {
                    break; // now worse locally than our best score, give up
                }
                score += t[k][i-1].first;
                if (score < best.first) {
                    best = {score, k};
                }
                cnt += chunks[k].count;
            }
            t[j][i] = best;
        }
        t[numChunks - 1][i] = {0,0}; // fill in empty final row for next iter
    }

#ifdef DEBUG_ASSIGNMENT
    for (u32 j = 0; j < numChunks; j++) {
        printf("%03u: " , j);
        for (u32 i = 0; i < numBuckets; i++) {
            const auto &v = t[j][i];
            printf("<%0.3f,%3d> " , v.first, v.second);
        }
        printf("\n" );
    }
#endif

    // our best score is in t[0][N_BUCKETS-1] and we can follow the links
    // to find where our buckets should start and what goes into them
    vector<vector<LiteralIndex>> buckets;
    for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) {
        u32 j = t[i][n - 1].second;
        if (j == 0) {
            j = numChunks - 1;
        }

        // put chunks between i - j into bucket (numBuckets - n).
        u32 first_id = chunks[i].first_id;
        u32 last_id = chunks[j].first_id;
        assert(first_id < last_id);
        UNUSED const auto &first_lit = lits[first_id];
        UNUSED const auto &last_lit = lits[last_id - 1];
        DEBUG_PRINTF("placing [%u-%u) in one bucket (%u lits, len %zu-%zu, "
                     "score %0.4f)\n" ,
                     first_id, last_id, last_id - first_id,
                     first_lit.s.length(), last_lit.s.length(),
                     scorer(first_lit.s.length(), last_id - first_id));

        vector<LiteralIndex> litIds;
        u32 cnt = last_id - first_id;
        // long literals first for included literals checking
        for (u32 k = 0; k < cnt; k++) {
            litIds.push_back(last_id - k - 1);
        }

        i = j;
        buckets.push_back(litIds);
    }

    // reverse bucket id, longer literals come first
    map<BucketIndex, vector<LiteralIndex>> bucketToLits;
    size_t bucketCnt = buckets.size();
    for (size_t i = 0; i < bucketCnt; i++) {
        bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i]));
    }

    return bucketToLits;
}
| 513 | |
#ifdef DEBUG
// Debug helper: print every table entry whose mask differs from the default.
void FDRCompiler::dumpMasks(const u8 *defaultMask) {
    const size_t width = eng.getSchemeWidth();
    printf("default mask: %s\n" , dumpMask(defaultMask, width).c_str());
    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
        u8 *m = tabIndexToMask(i);
        if (memcmp(m, defaultMask, width / 8)) {
            printf("tab %04x: %s\n" , i, dumpMask(m, width).c_str());
        }
    }
}
#endif
| 526 | |
/**
 * \brief For each literal in \p vl, build the (mask, don't-care) pair for the
 * domain-sized window of bytes ending at suffix position \p pos, and collect
 * the masks in \p m2 keyed by their don't-care bits.
 *
 * Returns true if some literal's window is entirely don't-care (every table
 * entry would match), so the caller can treat this position as wildcarded.
 */
static
bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
                               const vector<LiteralIndex> &vl,
                               const vector<hwlmLiteral> &lits,
                               SuffixPositionInString pos,
                               map<u32, unordered_set<u32>> &m2) {
    assert(eng.bits < 32);

    // Number of bytes covered by the domain (eng.bits wide).
    u32 distance = 0;
    if (eng.bits <= 8) {
        distance = 1;
    } else if (eng.bits <= 16) {
        distance = 2;
    } else {
        distance = 4;
    }

    for (auto i = vl.begin(), e = vl.end(); i != e; ++i) {
        // Prefetch a literal a few iterations ahead to hide memory latency.
        if (e - i > 5) {
            __builtin_prefetch(&lits[*(i + 5)]);
        }
        const hwlmLiteral &lit = lits[*i];
        const size_t sz = lit.s.size();
        u32 mask = 0;
        u32 dontCares = 0;
        for (u32 cnt = 0; cnt < distance; cnt++) {
            int newPos = pos - cnt;
            u8 dontCareByte = 0x0;
            u8 maskByte = 0x0;
            // Positions outside the literal are entirely don't-care.
            if (newPos < 0 || ((u32)newPos >= sz)) {
                dontCareByte = 0xff;
            } else {
                u8 c = lit.s[sz - newPos - 1];
                maskByte = c;
                u32 remainder = eng.bits - cnt * 8;
                assert(remainder != 0);
                // Partial byte at the top of the domain: mask off the bits
                // beyond the domain width.
                if (remainder < 8) {
                    u8 cmask = (1U << remainder) - 1;
                    maskByte &= cmask;
                    dontCareByte |= ~cmask;
                }
                // Caseless alpha: the 0x20 case bit becomes don't-care.
                if (lit.nocase && ourisalpha(c)) {
                    maskByte &= 0xdf;
                    dontCareByte |= 0x20;
                }
            }
            u32 loc = cnt * 8;
            mask |= maskByte << loc;
            dontCares |= dontCareByte << loc;
        }

        // truncate m and dc down to nBits
        mask &= (1U << eng.bits) - 1;
        dontCares &= (1U << eng.bits) - 1;
        if (dontCares == ((1U << eng.bits) - 1)) {
            return true;
        }
        m2[dontCares].insert(mask);
    }
    return false;
}
| 588 | |
// Populate the reach table: for every table entry (domain value), clear the
// scheme bits corresponding to (bucket, position) pairs that the entry can
// match, starting from an all-ones default mask.
void FDRCompiler::setupTab() {
    const size_t mask_size = eng.getSchemeWidth() / 8;
    assert(mask_size);

    // Start with every entry set to the all-ones default mask.
    vector<u8> defaultMask(mask_size, 0xff);
    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
        memcpy(tabIndexToMask(i), &defaultMask[0], mask_size);
    }

    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
        const vector<LiteralIndex> &vl = bucketToLits[b];
        SuffixPositionInString pLimit = eng.getBucketWidth(b);
        for (SuffixPositionInString pos = 0; pos < pLimit; pos++) {
            u32 bit = eng.getSchemeBit(b, pos);
            map<u32, unordered_set<u32>> m2;
            bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2);
            if (done) {
                // Every domain value matches here: clear the bit in the
                // default mask instead of touching every table entry.
                clearbit(&defaultMask[0], bit);
                continue;
            }
            for (const auto &elem : m2) {
                u32 dc = elem.first;
                const unordered_set<u32> &mskSet = elem.second;
                // Enumerate all combinations of the don't-care bits: starting
                // from v = ~dc, "v = (v + (dc & -dc)) | ~dc" steps b2 = v & dc
                // through every subset of dc.
                u32 v = ~dc;
                do {
                    u32 b2 = v & dc;
                    for (const u32 &mskVal : mskSet) {
                        u32 val = (mskVal & ~dc) | b2;
                        clearbit(tabIndexToMask(val), bit);
                    }
                    v = (v + (dc & -dc)) | ~dc;
                } while (v != ~dc);
            }
        }
    }

    // Fold the default mask into every table entry.
    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
        u8 *m = tabIndexToMask(i);
        andMask(m, m, &defaultMask[0], mask_size);
    }
#ifdef DEBUG
    dumpMasks(&defaultMask[0]);
#endif
}
| 633 | |
// Build the complete engine: populate the reach table, then lay out the
// bytecode structures.
bytecode_ptr<FDR> FDRCompiler::build() {
    setupTab();
    return setupFDR();
}
| 638 | |
| 639 | static |
| 640 | bool isSuffix(const hwlmLiteral &lit1, const hwlmLiteral &lit2) { |
| 641 | const auto &s1 = lit1.s; |
| 642 | const auto &s2 = lit2.s; |
| 643 | size_t len1 = s1.length(); |
| 644 | size_t len2 = s2.length(); |
| 645 | assert(len1 >= len2); |
| 646 | |
| 647 | if (lit1.nocase || lit2.nocase) { |
| 648 | return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2, |
| 649 | [](char a, char b) { return mytoupper(a) == mytoupper(b); }); |
| 650 | } else { |
| 651 | return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2); |
| 652 | } |
| 653 | } |
| 654 | |
| 655 | /* |
| 656 | * if lit2 is a suffix of lit1 but the case sensitivity, groups or mask info |
| 657 | * of lit2 is a subset of lit1, then lit1 can't squash lit2 and lit2 can |
| 658 | * possibly match when lit1 matches. In this case, we can't do bucket |
| 659 | * squashing. e.g. AAA(no case) in bucket 0, AA(no case) and aa in bucket 1, |
| 660 | * we can't squash bucket 1 if we have input like "aaa" as aa can also match. |
| 661 | */ |
static
bool includedCheck(const hwlmLiteral &lit1, const hwlmLiteral &lit2) {
    /* lit1 is caseless and lit2 is case sensitive */
    if ((lit1.nocase && !lit2.nocase)) {
        return true;
    }

    /* lit2's group is a subset of lit1 */
    if (lit1.groups != lit2.groups &&
        (lit2.groups == (lit1.groups & lit2.groups))) {
        return true;
    }

    /* TODO: narrow down cases for mask check */
    // Any difference in the supplementary mask/compare info means lit2 may
    // match where lit1 does not, so squashing is unsafe.
    if (lit1.cmp != lit2.cmp || lit1.msk != lit2.msk) {
        return true;
    }

    return false;
}
| 682 | |
| 683 | /* |
| 684 | * if lit2 is an included literal of both lit0 and lit1, then lit0 and lit1 |
| 685 | * shouldn't match at the same offset, otherwise we give up squashing for lit1. |
| 686 | * e.g. lit0:AAA(no case), lit1:aa, lit2:A(no case). We can have duplicate |
| 687 | * matches for input "aaa" if lit0 and lit1 both squash lit2. |
| 688 | */ |
static
bool checkParentLit(
    const vector<hwlmLiteral> &lits, u32 pos1,
    const unordered_set<u32> &parent_map,
    const unordered_map<u32, unordered_set<u32>> &exception_map) {
    assert(pos1 < lits.size());
    const auto &lit1 = lits[pos1];
    // Walk every literal that already squashes via this child.
    for (const auto pos2 : parent_map) {
        // If lit1 is recorded as an exception of lit2, both can match at the
        // same offset: squashing would drop a match.
        if (contains(exception_map, pos2)) {
            const auto &exception_pos = exception_map.at(pos2);
            if (contains(exception_pos, pos1)) {
                return false;
            }
        }

        /* if lit1 isn't an exception of lit2, then we have to do further
         * exclusive check.
         * TODO: More mask checks. Note if two literals are group exclusive,
         * it is possible that they match at the same offset. */
        assert(pos2 < lits.size());
        const auto &lit2 = lits[pos2];
        if (isSuffix(lit2, lit1)) {
            return false;
        }
    }

    return true;
}
| 717 | |
/**
 * \brief Try to build a squash mask for literal \p id1 (in bucket \p bucket1)
 * by scanning \p group (literals sharing the same last character, ordered by
 * bucket) from \p start for literals that are suffixes of it.
 *
 * On success, records the included literal in lit1.included_id and sets the
 * corresponding bucket bit in lit1.squash. parent_map/exception_map track the
 * inclusion relationships discovered so far.
 */
static
void buildSquashMask(vector<hwlmLiteral> &lits, u32 id1, u32 bucket1,
                     size_t start, const vector<pair<u32, u32>> &group,
                     unordered_map<u32, unordered_set<u32>> &parent_map,
                     unordered_map<u32, unordered_set<u32>> &exception_map) {
    auto &lit1 = lits[id1];
    DEBUG_PRINTF("b:%u len:%zu\n" , bucket1, lit1.s.length());

    size_t cnt = group.size();
    bool included = false;
    bool exception = false;
    u32 child_id = ~0U;
    for (size_t i = start; i < cnt; i++) {
        u32 bucket2 = group[i].first;
        assert(bucket2 >= bucket1);

        u32 id2 = group[i].second;
        auto &lit2 = lits[id2];
        // check if lit2 is a suffix of lit1
        if (isSuffix(lit1, lit2)) {
            /* if we have a included literal in the same bucket,
             * quit and let the included literal to do possible squashing */
            if (bucket1 == bucket2) {
                DEBUG_PRINTF("same bucket\n" );
                return;
            }
            /* if lit2 is a suffix but doesn't pass included checks for
             * extra info, we give up sqaushing */
            if (includedCheck(lit1, lit2)) {
                DEBUG_PRINTF("find exceptional suffix %u\n" , lit2.id);
                exception_map[id1].insert(id2);
                exception = true;
            } else if (checkParentLit(lits, id1, parent_map[id2],
                       exception_map)) {
                if (lit1.included_id == INVALID_LIT_ID) {
                    DEBUG_PRINTF("find suffix lit1 %u lit2 %u\n" ,
                                 lit1.id, lit2.id);
                    lit1.included_id = lit2.id;
                } else {
                    /* if we have multiple included literals in one bucket,
                     * give up squashing. */
                    DEBUG_PRINTF("multiple included literals\n" );
                    lit1.included_id = INVALID_LIT_ID;
                    return;
                }
                child_id = id2;
                included = true;
            }
        }

        // At a bucket boundary, decide whether the bucket just scanned can be
        // squashed for lit1.
        size_t next = i + 1;
        u32 nextBucket = next < cnt ? group[next].first : ~0U;
        if (bucket2 != nextBucket) {
            if (included) {
                if (exception) {
                    /* give up if we have exception literals
                     * in the same bucket as the included literal. */
                    lit1.included_id = INVALID_LIT_ID;
                } else {
                    parent_map[child_id].insert(id1);

                    lit1.squash |= 1U << bucket2;
                    DEBUG_PRINTF("build squash mask %2x for %u\n" ,
                                 lit1.squash, lit1.id);
                }
                return;
            }
            exception = false;
        }
    }
}
| 789 | |
// Skip inclusion analysis for groups with more literals than this; the scan
// below is quadratic in the group size.
static constexpr u32 INCLUDED_LIMIT = 1000;

// For every (bucket, literal) group sharing a last character, attempt to
// build squash masks via buildSquashMask.
static
void findIncludedLits(vector<hwlmLiteral> &lits,
                      const vector<vector<pair<u32, u32>>> &lastCharMap) {
    /* Map for finding the positions of literal which includes a literal
     * in FDR hwlm literal vector. */
    unordered_map<u32, unordered_set<u32>> parent_map;

    /* Map for finding the positions of exception literals which could
     * sometimes match if a literal matches in FDR hwlm literal vector. */
    unordered_map<u32, unordered_set<u32>> exception_map;
    for (const auto &group : lastCharMap) {
        size_t cnt = group.size();
        if (cnt > INCLUDED_LIMIT) {
            continue;
        }
        for (size_t i = 0; i < cnt; i++) {
            u32 bucket1 = group[i].first;
            u32 id1 = group[i].second;
            // "pure" literals are never squashed.
            if (lits[id1].pure) {
                continue;
            }
            buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map,
                            exception_map);
        }
    }
}
| 818 | |
| 819 | static |
| 820 | void addIncludedInfo( |
| 821 | vector<hwlmLiteral> &lits, u32 nBuckets, |
| 822 | map<BucketIndex, vector<LiteralIndex>> &bucketToLits) { |
| 823 | vector<vector<pair<u32, u32>>> lastCharMap(256); |
| 824 | |
| 825 | for (BucketIndex b = 0; b < nBuckets; b++) { |
| 826 | if (!bucketToLits[b].empty()) { |
| 827 | for (const LiteralIndex &lit_idx : bucketToLits[b]) { |
| 828 | const auto &lit = lits[lit_idx]; |
| 829 | u8 c = mytoupper(lit.s.back()); |
| 830 | lastCharMap[c].emplace_back(b, lit_idx); |
| 831 | } |
| 832 | } |
| 833 | } |
| 834 | |
| 835 | findIncludedLits(lits, lastCharMap); |
| 836 | } |
| 837 | |
| 838 | } // namespace |
| 839 | |
/**
 * \brief Build an HWLM prototype for the given literals: attempt Teddy first
 * (when allowed by the grey box), then fall back to FDR.
 *
 * \param hint engine hint used by unit tests; HINT_INVALID in production.
 * \return the prototype, or nullptr if no suitable engine can be selected.
 */
static
unique_ptr<HWLMProto> fdrBuildProtoInternal(u8 engType,
                                            vector<hwlmLiteral> &lits,
                                            bool make_small,
                                            const target_t &target,
                                            const Grey &grey, u32 hint) {
    DEBUG_PRINTF("cpu has %s\n" , target.has_avx2() ? "avx2" : "no-avx2" );

    if (grey.fdrAllowTeddy) {
        auto proto = teddyBuildProtoHinted(engType, lits, make_small, hint,
                                           target);
        if (proto) {
            DEBUG_PRINTF("build with teddy succeeded\n" );
            return proto;
        } else {
            DEBUG_PRINTF("build with teddy failed, will try with FDR\n" );
        }
    }

    auto des = (hint == HINT_INVALID) ? chooseEngine(target, lits, make_small)
                                      : getFdrDescription(hint);
    if (!des) {
        return nullptr;
    }

    // temporary hack for unit testing
    if (hint != HINT_INVALID) {
        des->bits = 9;
        des->stride = 1;
    }

    auto bucketToLits = assignStringsToBuckets(lits, *des);
    addIncludedInfo(lits, des->getNumBuckets(), bucketToLits);
    auto proto =
        ue2::make_unique<HWLMProto>(engType, move(des), lits, bucketToLits,
                                    make_small);
    return proto;
}
| 878 | |
// Public entry point: build a prototype with no engine hint.
unique_ptr<HWLMProto> fdrBuildProto(u8 engType, vector<hwlmLiteral> lits,
                                    bool make_small, const target_t &target,
                                    const Grey &grey) {
    return fdrBuildProtoInternal(engType, lits, make_small, target, grey,
                                 HINT_INVALID);
}
| 885 | |
// Dispatch bytecode generation to the Teddy builder or the FDR compiler,
// depending on which engine the prototype selected.
static
bytecode_ptr<FDR> fdrBuildTableInternal(const HWLMProto &proto,
                                        const Grey &grey) {

    if (proto.teddyEng) {
        return teddyBuildTable(proto, grey);
    }

    FDRCompiler fc(proto.lits, proto.bucketToLits, *(proto.fdrEng),
                   proto.make_small, grey);
    return fc.build();
}
| 898 | |
// Public entry point: generate FDR (or Teddy) bytecode from a prototype.
bytecode_ptr<FDR> fdrBuildTable(const HWLMProto &proto, const Grey &grey) {
    return fdrBuildTableInternal(proto, grey);
}
| 902 | |
| 903 | #if !defined(RELEASE_BUILD) |
| 904 | |
// Test-only entry point: build a prototype for a specific hinted engine.
unique_ptr<HWLMProto> fdrBuildProtoHinted(u8 engType,
                                          vector<hwlmLiteral> lits,
                                          bool make_small, u32 hint,
                                          const target_t &target,
                                          const Grey &grey) {
    return fdrBuildProtoInternal(engType, lits, make_small, target, grey,
                                 hint);
}
| 913 | |
| 914 | #endif |
| 915 | |
// Returns the total size in bytes of a built FDR engine, as recorded in its
// header.
size_t fdrSize(const FDR *fdr) {
    assert(fdr);
    return fdr->size;
}
| 920 | |
| 921 | } // namespace ue2 |
| 922 | |