1 | /* |
2 | * Copyright (c) 2015-2019, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief FDR literal matcher: build API. |
31 | */ |
32 | |
33 | #include "fdr_compile.h" |
34 | |
35 | #include "fdr_internal.h" |
36 | #include "fdr_confirm.h" |
37 | #include "fdr_compile_internal.h" |
38 | #include "fdr_engine_description.h" |
39 | #include "teddy_compile.h" |
40 | #include "teddy_engine_description.h" |
41 | #include "grey.h" |
42 | #include "ue2common.h" |
43 | #include "hwlm/hwlm_build.h" |
44 | #include "util/compare.h" |
45 | #include "util/container.h" |
46 | #include "util/dump_mask.h" |
47 | #include "util/make_unique.h" |
48 | #include "util/math.h" |
49 | #include "util/noncopyable.h" |
50 | #include "util/target_info.h" |
51 | #include "util/ue2string.h" |
52 | #include "util/verify_types.h" |
53 | |
54 | #include <algorithm> |
55 | #include <array> |
56 | #include <cassert> |
57 | #include <cctype> |
58 | #include <cstdio> |
59 | #include <cstdlib> |
60 | #include <cstring> |
61 | #include <limits> |
62 | #include <map> |
63 | #include <memory> |
64 | #include <numeric> |
65 | #include <set> |
66 | #include <string> |
67 | #include <unordered_map> |
68 | #include <unordered_set> |
69 | #include <vector> |
70 | |
71 | #include <boost/multi_array.hpp> |
72 | |
73 | using namespace std; |
74 | |
75 | namespace ue2 { |
76 | |
77 | namespace { |
78 | |
79 | class FDRCompiler : noncopyable { |
80 | private: |
81 | const FDREngineDescription ŋ |
82 | const Grey &grey; |
83 | vector<u8> tab; |
84 | vector<hwlmLiteral> lits; |
85 | map<BucketIndex, std::vector<LiteralIndex> > bucketToLits; |
86 | bool make_small; |
87 | |
88 | u8 *tabIndexToMask(u32 indexInTable); |
89 | #ifdef DEBUG |
90 | void dumpMasks(const u8 *defaultMask); |
91 | #endif |
92 | void setupTab(); |
93 | bytecode_ptr<FDR> setupFDR(); |
94 | void createInitialState(FDR *fdr); |
95 | |
96 | public: |
97 | FDRCompiler(vector<hwlmLiteral> lits_in, |
98 | map<BucketIndex, std::vector<LiteralIndex>> bucketToLits_in, |
99 | const FDREngineDescription &eng_in, |
100 | bool make_small_in, const Grey &grey_in) |
101 | : eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()), |
102 | lits(move(lits_in)), bucketToLits(move(bucketToLits_in)), |
103 | make_small(make_small_in) {} |
104 | |
105 | bytecode_ptr<FDR> build(); |
106 | }; |
107 | |
108 | u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) { |
109 | assert(indexInTable < tab.size()); |
110 | return &tab[0] + (indexInTable * (eng.getSchemeWidth() / 8)); |
111 | } |
112 | |
113 | static |
114 | void setbit(u8 *msk, u32 bit) { |
115 | msk[bit / 8] |= 1U << (bit % 8); |
116 | } |
117 | |
118 | static |
119 | void clearbit(u8 *msk, u32 bit) { |
120 | msk[bit / 8] &= ~(1U << (bit % 8)); |
121 | } |
122 | |
123 | static |
124 | void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) { |
125 | for (u32 i = 0; i < num_bytes; i++) { |
126 | dest[i] = a[i] & b[i]; |
127 | } |
128 | } |
129 | |
/**
 * \brief Fill in the initial state bitmask stored in the FDR header.
 *
 * For each bucket, the slots up to (minimum literal length - 1) are set to 1
 * and the rest left 0, so that no bucket can report a match before its
 * shortest literal could possibly have been seen.
 */
void FDRCompiler::createInitialState(FDR *fdr) {
    u8 *start = (u8 *)&fdr->start;

    /* initial state should to be 1 in each slot in the bucket up to bucket
     * minlen - 1, and 0 thereafter */
    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
        // Find the minimum length for the literals in this bucket.
        const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
        u32 min_len = ~0U;
        for (const LiteralIndex &lit_idx : bucket_lits) {
            min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
        }

        DEBUG_PRINTF("bucket %u has min_len=%u\n", b, min_len);
        assert(min_len);

        // Set the first (min_len - 1) slots of this bucket's lane.
        for (PositionInBucket i = 0; i < eng.getBucketWidth(b); i++) {
            if (i < min_len - 1) {
                setbit(start, eng.getSchemeBit(b, i));
            }
        }
    }
}
153 | |
154 | /** |
155 | * \brief Lay out FDR structures in bytecode. |
156 | * |
157 | * Note that each major structure (header, table, confirm, flood control) is |
158 | * cacheline-aligned. |
159 | */ |
160 | bytecode_ptr<FDR> FDRCompiler::setupFDR() { |
161 | auto floodTable = setupFDRFloodControl(lits, eng, grey); |
162 | auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); |
163 | |
164 | size_t = sizeof(FDR); |
165 | size_t tabSize = eng.getTabSizeBytes(); |
166 | |
167 | // Note: we place each major structure here on a cacheline boundary. |
168 | size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(tabSize) + |
169 | ROUNDUP_CL(confirmTable.size()) + floodTable.size(); |
170 | |
171 | DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu " |
172 | "total=%zu\n" , |
173 | headerSize, tabSize, confirmTable.size(), floodTable.size(), |
174 | size); |
175 | |
176 | auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64); |
177 | assert(fdr); // otherwise would have thrown std::bad_alloc |
178 | |
179 | u8 *fdr_base = (u8 *)fdr.get(); |
180 | |
181 | // Write header. |
182 | fdr->size = size; |
183 | fdr->engineID = eng.getID(); |
184 | fdr->maxStringLen = verify_u32(maxLen(lits)); |
185 | fdr->numStrings = verify_u32(lits.size()); |
186 | assert(eng.bits > 8 && eng.bits < 16); // we allow domains 9 to 15 only |
187 | fdr->domain = eng.bits; |
188 | fdr->domainMask = (1 << eng.bits) - 1; |
189 | fdr->tabSize = tabSize; |
190 | fdr->stride = eng.stride; |
191 | createInitialState(fdr.get()); |
192 | |
193 | // Write table. |
194 | u8 *ptr = fdr_base + ROUNDUP_CL(sizeof(FDR)); |
195 | assert(ISALIGNED_CL(ptr)); |
196 | copy(tab.begin(), tab.end(), ptr); |
197 | ptr += ROUNDUP_CL(tabSize); |
198 | |
199 | // Write confirm structures. |
200 | assert(ISALIGNED_CL(ptr)); |
201 | fdr->confOffset = verify_u32(ptr - fdr_base); |
202 | memcpy(ptr, confirmTable.get(), confirmTable.size()); |
203 | ptr += ROUNDUP_CL(confirmTable.size()); |
204 | |
205 | // Write flood control structures. |
206 | assert(ISALIGNED_CL(ptr)); |
207 | fdr->floodOffset = verify_u32(ptr - fdr_base); |
208 | memcpy(ptr, floodTable.get(), floodTable.size()); |
209 | ptr += floodTable.size(); // last write, no need to round up |
210 | |
211 | return fdr; |
212 | } |
213 | |
214 | //#define DEBUG_ASSIGNMENT |
215 | |
216 | /** |
217 | * Utility class for computing: |
218 | * |
219 | * score(count, len) = pow(count, 1.05) * pow(len, -3) |
220 | * |
221 | * Calling pow() is expensive. This is mitigated by using pre-computed LUTs for |
222 | * small inputs and a cache for larger ones. |
223 | */ |
224 | class Scorer { |
225 | unordered_map<u32, double> count_factor_cache; |
226 | |
227 | // LUT: pow(count, 1.05) for small values of count. |
228 | static const array<double, 100> count_lut; |
229 | |
230 | double count_factor(u32 count) { |
231 | if (count < count_lut.size()) { |
232 | return count_lut[count]; |
233 | } |
234 | |
235 | auto it = count_factor_cache.find(count); |
236 | if (it != count_factor_cache.end()) { |
237 | return it->second; |
238 | } |
239 | double r = our_pow(count, 1.05); |
240 | count_factor_cache.emplace(count, r); |
241 | return r; |
242 | } |
243 | |
244 | // LUT: pow(len, -3) for len in range [0,8]. |
245 | static const array<double, 9> len_lut; |
246 | |
247 | double len_factor(u32 len) { |
248 | assert(len <= len_lut.size()); |
249 | return len_lut[len]; |
250 | } |
251 | |
252 | public: |
253 | double operator()(u32 len, u32 count) { |
254 | if (len == 0) { |
255 | return numeric_limits<double>::max(); |
256 | } |
257 | return count_factor(count) * len_factor(len); |
258 | } |
259 | }; |
260 | |
261 | const array<double, 100> Scorer::count_lut{{ |
262 | pow(0, 1.05), pow(1, 1.05), pow(2, 1.05), pow(3, 1.05), pow(4, 1.05), |
263 | pow(5, 1.05), pow(6, 1.05), pow(7, 1.05), pow(8, 1.05), pow(9, 1.05), |
264 | pow(10, 1.05), pow(11, 1.05), pow(12, 1.05), pow(13, 1.05), pow(14, 1.05), |
265 | pow(15, 1.05), pow(16, 1.05), pow(17, 1.05), pow(18, 1.05), pow(19, 1.05), |
266 | pow(20, 1.05), pow(21, 1.05), pow(22, 1.05), pow(23, 1.05), pow(24, 1.05), |
267 | pow(25, 1.05), pow(26, 1.05), pow(27, 1.05), pow(28, 1.05), pow(29, 1.05), |
268 | pow(30, 1.05), pow(31, 1.05), pow(32, 1.05), pow(33, 1.05), pow(34, 1.05), |
269 | pow(35, 1.05), pow(36, 1.05), pow(37, 1.05), pow(38, 1.05), pow(39, 1.05), |
270 | pow(40, 1.05), pow(41, 1.05), pow(42, 1.05), pow(43, 1.05), pow(44, 1.05), |
271 | pow(45, 1.05), pow(46, 1.05), pow(47, 1.05), pow(48, 1.05), pow(49, 1.05), |
272 | pow(50, 1.05), pow(51, 1.05), pow(52, 1.05), pow(53, 1.05), pow(54, 1.05), |
273 | pow(55, 1.05), pow(56, 1.05), pow(57, 1.05), pow(58, 1.05), pow(59, 1.05), |
274 | pow(60, 1.05), pow(61, 1.05), pow(62, 1.05), pow(63, 1.05), pow(64, 1.05), |
275 | pow(65, 1.05), pow(66, 1.05), pow(67, 1.05), pow(68, 1.05), pow(69, 1.05), |
276 | pow(70, 1.05), pow(71, 1.05), pow(72, 1.05), pow(73, 1.05), pow(74, 1.05), |
277 | pow(75, 1.05), pow(76, 1.05), pow(77, 1.05), pow(78, 1.05), pow(79, 1.05), |
278 | pow(80, 1.05), pow(81, 1.05), pow(82, 1.05), pow(83, 1.05), pow(84, 1.05), |
279 | pow(85, 1.05), pow(86, 1.05), pow(87, 1.05), pow(88, 1.05), pow(89, 1.05), |
280 | pow(90, 1.05), pow(91, 1.05), pow(92, 1.05), pow(93, 1.05), pow(94, 1.05), |
281 | pow(95, 1.05), pow(96, 1.05), pow(97, 1.05), pow(98, 1.05), pow(99, 1.05), |
282 | }}; |
283 | |
284 | const array<double, 9> Scorer::len_lut{{ |
285 | pow(0, -3.0), pow(1, -3.0), pow(2, -3.0), pow(3, -3.0), pow(4, -3.0), |
286 | pow(5, -3.0), pow(6, -3.0), pow(7, -3.0), pow(8, -3.0)}}; |
287 | |
288 | /** |
289 | * Returns true if the two given literals should be placed in the same chunk as |
290 | * they are identical except for a difference in caselessness. |
291 | */ |
292 | static |
293 | bool isEquivLit(const hwlmLiteral &a, const hwlmLiteral &b, |
294 | const hwlmLiteral *last_nocase_lit) { |
295 | const size_t a_len = a.s.size(); |
296 | const size_t b_len = b.s.size(); |
297 | |
298 | if (a_len != b_len) { |
299 | return false; |
300 | } |
301 | |
302 | bool nocase = last_nocase_lit && a_len == last_nocase_lit->s.size() && |
303 | !cmp(a.s.c_str(), last_nocase_lit->s.c_str(), a_len, true); |
304 | return !cmp(a.s.c_str(), b.s.c_str(), a.s.size(), nocase); |
305 | } |
306 | |
307 | struct Chunk { |
308 | Chunk(u32 first_id_in, u32 count_in, u32 length_in) |
309 | : first_id(first_id_in), count(count_in), length(length_in) {} |
310 | u32 first_id; //!< first id in this chunk |
311 | u32 count; //!< how many are in this chunk |
312 | u32 length; //!< how long things in the chunk are |
313 | }; |
314 | |
/**
 * \brief Partition the (sorted) literal list into chunks of equal-length
 * literals, to be distributed over buckets by the dynamic program in
 * assignStringsToBuckets().
 *
 * A new chunk starts when the literal length changes (for lengths below
 * MAX_CONSIDERED_LENGTH) or a chunk grows past maxPerChunk; literals that are
 * identical up to case are never split across chunks. A terminating empty
 * chunk is appended.
 */
static
vector<Chunk> assignChunks(const vector<hwlmLiteral> &lits,
                           const map<u32, u32> &lenCounts) {
    const u32 CHUNK_MAX = 512;
    const u32 MAX_CONSIDERED_LENGTH = 16;

    // TODO: detailed early stage literal analysis for v. small cases (actually
    // look at lits) yes - after we factor this out and merge in the Teddy
    // style of building we can look at this, although the teddy merge
    // modelling is quite different. It's still probably adaptable to some
    // extent for this class of problem.

    vector<Chunk> chunks;
    chunks.reserve(CHUNK_MAX);

    // Cap on chunk size, leaving room for one chunk per considered length.
    const u32 maxPerChunk = lits.size() /
        (CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;

    u32 currentSize = 0;
    u32 chunkStartID = 0;
    const hwlmLiteral *last_nocase_lit = nullptr;

    for (u32 i = 0; i < lits.size() && chunks.size() < CHUNK_MAX - 1; i++) {
        const auto &lit = lits[i];

        DEBUG_PRINTF("i=%u, lit=%s%s\n", i, escapeString(lit.s).c_str(),
                     lit.nocase ? " (nocase)" : "");

        // If this literal is identical to the last one (aside from differences
        // in caselessness), keep going even if we will "overfill" a chunk; we
        // don't want to split identical literals into different buckets.
        if (i != 0 && isEquivLit(lit, lits[i - 1], last_nocase_lit)) {
            DEBUG_PRINTF("identical lit\n");
            goto next_literal;
        }

        // Open a new chunk on a length change (for considered lengths) or
        // when the current chunk is full.
        if ((currentSize < MAX_CONSIDERED_LENGTH &&
             (lit.s.size() != currentSize)) ||
            (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
            currentSize = lit.s.size();
            if (!chunks.empty()) {
                chunks.back().count = i - chunkStartID;
            }
            chunkStartID = i;
            chunks.emplace_back(i, 0, currentSize);
        }
next_literal:
        if (lit.nocase) {
            last_nocase_lit = &lit;
        }
    }

    assert(!chunks.empty());
    chunks.back().count = lits.size() - chunkStartID;
    // close off chunks with an empty row
    chunks.emplace_back(lits.size(), 0, 0);

#ifdef DEBUG_ASSIGNMENT
    for (size_t j = 0; j < chunks.size(); j++) {
        const auto &chunk = chunks[j];
        printf("chunk %zu first_id=%u count=%u length=%u\n", j, chunk.first_id,
               chunk.count, chunk.length);
    }
#endif

    DEBUG_PRINTF("built %zu chunks (%zu lits)\n", chunks.size(), lits.size());
    assert(chunks.size() <= CHUNK_MAX);
    return chunks;
}
384 | |
/**
 * \brief Assign literals to the engine's buckets.
 *
 * Literals are sorted (by length, then reversed-lexicographically), split
 * into chunks, and then a dynamic program over (chunk, bucket) pairs picks
 * chunk boundaries that minimise the bucket scores computed by Scorer.
 * Note: may reorder \a lits (stable sort).
 */
static
map<BucketIndex, vector<LiteralIndex>> assignStringsToBuckets(
                                        vector<hwlmLiteral> &lits,
                                        const FDREngineDescription &eng) {
    const double MAX_SCORE = numeric_limits<double>::max();

    assert(!lits.empty()); // Shouldn't be called with no literals.

    // Count the number of literals for each length.
    map<u32, u32> lenCounts;
    for (const auto &lit : lits) {
        lenCounts[lit.s.size()]++;
    }

#ifdef DEBUG_ASSIGNMENT
    for (const auto &m : lenCounts) {
        printf("l<%u>:%u ", m.first, m.second);
    }
    printf("\n");
#endif

    // Sort literals by literal length. If tied on length, use lexicographic
    // ordering (of the reversed literals).
    stable_sort(lits.begin(), lits.end(),
                [](const hwlmLiteral &a, const hwlmLiteral &b) {
                    if (a.s.size() != b.s.size()) {
                        return a.s.size() < b.s.size();
                    }
                    auto p = mismatch(a.s.rbegin(), a.s.rend(), b.s.rbegin());
                    if (p.first != a.s.rend()) {
                        return *p.first < *p.second;
                    }
                    // Sort caseless variants first.
                    return a.nocase > b.nocase;
                });

    vector<Chunk> chunks = assignChunks(lits, lenCounts);

    const u32 numChunks = chunks.size();
    const u32 numBuckets = eng.getNumBuckets();

    // 2D array of (score, chunk index) pairs, indexed by
    // [chunk_index][bucket_index].
    boost::multi_array<pair<double, u32>, 2> t(
        boost::extents[numChunks][numBuckets]);

    Scorer scorer;

    // Base case: with one bucket, chunk j and everything after it share that
    // bucket, scored at chunk j's length.
    for (u32 j = 0; j < numChunks; j++) {
        u32 cnt = 0;
        for (u32 k = j; k < numChunks; ++k) {
            cnt += chunks[k].count;
        }
        t[j][0] = {scorer(chunks[j].length, cnt), 0};
    }

    // DP: t[j][i] = best (score, split point k) for covering chunks [j, end)
    // with i+1 buckets, where chunks [j, k) form the first bucket.
    for (u32 i = 1; i < numBuckets; i++) {
        for (u32 j = 0; j < numChunks - 1; j++) { // don't do last, empty row
            pair<double, u32> best = {MAX_SCORE, 0};
            u32 cnt = chunks[j].count;
            for (u32 k = j + 1; k < numChunks - 1; k++) {
                auto score = scorer(chunks[j].length, cnt);
                if (score > best.first) {
                    break; // now worse locally than our best score, give up
                }
                score += t[k][i-1].first;
                if (score < best.first) {
                    best = {score, k};
                }
                cnt += chunks[k].count;
            }
            t[j][i] = best;
        }
        t[numChunks - 1][i] = {0,0}; // fill in empty final row for next iter
    }

#ifdef DEBUG_ASSIGNMENT
    for (u32 j = 0; j < numChunks; j++) {
        printf("%03u: ", j);
        for (u32 i = 0; i < numBuckets; i++) {
            const auto &v = t[j][i];
            printf("<%0.3f,%3d> ", v.first, v.second);
        }
        printf("\n");
    }
#endif

    // our best score is in t[0][N_BUCKETS-1] and we can follow the links
    // to find where our buckets should start and what goes into them
    vector<vector<LiteralIndex>> buckets;
    for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) {
        u32 j = t[i][n - 1].second;
        if (j == 0) {
            j = numChunks - 1;
        }

        // put chunks between i - j into bucket (numBuckets - n).
        u32 first_id = chunks[i].first_id;
        u32 last_id = chunks[j].first_id;
        assert(first_id < last_id);
        UNUSED const auto &first_lit = lits[first_id];
        UNUSED const auto &last_lit = lits[last_id - 1];
        DEBUG_PRINTF("placing [%u-%u) in one bucket (%u lits, len %zu-%zu, "
                     "score %0.4f)\n",
                     first_id, last_id, last_id - first_id,
                     first_lit.s.length(), last_lit.s.length(),
                     scorer(first_lit.s.length(), last_id - first_id));

        vector<LiteralIndex> litIds;
        u32 cnt = last_id - first_id;
        // long literals first for included literals checking
        for (u32 k = 0; k < cnt; k++) {
            litIds.push_back(last_id - k - 1);
        }

        i = j;
        buckets.push_back(litIds);
    }

    // reverse bucket id, longer literals come first
    map<BucketIndex, vector<LiteralIndex>> bucketToLits;
    size_t bucketCnt = buckets.size();
    for (size_t i = 0; i < bucketCnt; i++) {
        bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i]));
    }

    return bucketToLits;
}
513 | |
#ifdef DEBUG
/** \brief Debug helper: print the default mask and every table entry whose
 * mask differs from it. */
void FDRCompiler::dumpMasks(const u8 *defaultMask) {
    const size_t width = eng.getSchemeWidth();
    printf("default mask: %s\n", dumpMask(defaultMask, width).c_str());
    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
        u8 *m = tabIndexToMask(i);
        if (memcmp(m, defaultMask, width / 8)) {
            printf("tab %04x: %s\n", i, dumpMask(m, width).c_str());
        }
    }
}
#endif
526 | |
/**
 * \brief Collect the (mask, don't-care) entries implied by the literals of one
 * bucket at suffix position \a pos.
 *
 * Results are added to \a m2, keyed by don't-care mask, with the set of
 * concrete masked values as the mapped value. Returns true if any literal
 * yields an all-don't-care entry at this position (i.e. every table entry
 * would match, so the caller can clear the bit in the default mask instead).
 */
static
bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
                               const vector<LiteralIndex> &vl,
                               const vector<hwlmLiteral> &lits,
                               SuffixPositionInString pos,
                               map<u32, unordered_set<u32>> &m2) {
    assert(eng.bits < 32);

    // Number of bytes of the literal that feed the domain of eng.bits bits.
    u32 distance = 0;
    if (eng.bits <= 8) {
        distance = 1;
    } else if (eng.bits <= 16) {
        distance = 2;
    } else {
        distance = 4;
    }

    for (auto i = vl.begin(), e = vl.end(); i != e; ++i) {
        if (e - i > 5) {
            __builtin_prefetch(&lits[*(i + 5)]);
        }
        const hwlmLiteral &lit = lits[*i];
        const size_t sz = lit.s.size();
        u32 mask = 0;
        u32 dontCares = 0;
        for (u32 cnt = 0; cnt < distance; cnt++) {
            int newPos = pos - cnt;
            u8 dontCareByte = 0x0;
            u8 maskByte = 0x0;
            if (newPos < 0 || ((u32)newPos >= sz)) {
                // Position is outside the literal: whole byte is don't-care.
                dontCareByte = 0xff;
            } else {
                // Index from the end of the literal (suffix position).
                u8 c = lit.s[sz - newPos - 1];
                maskByte = c;
                u32 remainder = eng.bits - cnt * 8;
                assert(remainder != 0);
                if (remainder < 8) {
                    // Only 'remainder' bits of this byte are in the domain.
                    u8 cmask = (1U << remainder) - 1;
                    maskByte &= cmask;
                    dontCareByte |= ~cmask;
                }
                if (lit.nocase && ourisalpha(c)) {
                    // Caseless alpha: ignore the 0x20 case bit.
                    maskByte &= 0xdf;
                    dontCareByte |= 0x20;
                }
            }
            u32 loc = cnt * 8;
            mask |= maskByte << loc;
            dontCares |= dontCareByte << loc;
        }

        // truncate m and dc down to nBits
        mask &= (1U << eng.bits) - 1;
        dontCares &= (1U << eng.bits) - 1;
        if (dontCares == ((1U << eng.bits) - 1)) {
            return true;
        }
        m2[dontCares].insert(mask);
    }
    return false;
}
588 | |
/**
 * \brief Build the reach table: for every table entry, clear the scheme bits
 * that correspond to (bucket, position) pairs the entry's domain value could
 * match.
 */
void FDRCompiler::setupTab() {
    const size_t mask_size = eng.getSchemeWidth() / 8;
    assert(mask_size);

    // Begin with every bit set in every entry.
    vector<u8> defaultMask(mask_size, 0xff);
    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
        memcpy(tabIndexToMask(i), &defaultMask[0], mask_size);
    }

    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
        const vector<LiteralIndex> &vl = bucketToLits[b];
        SuffixPositionInString pLimit = eng.getBucketWidth(b);
        for (SuffixPositionInString pos = 0; pos < pLimit; pos++) {
            u32 bit = eng.getSchemeBit(b, pos);
            map<u32, unordered_set<u32>> m2;
            bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2);
            if (done) {
                // All entries match here: clear the bit in the default mask
                // rather than in every table entry.
                clearbit(&defaultMask[0], bit);
                continue;
            }
            for (const auto &elem : m2) {
                u32 dc = elem.first;
                const unordered_set<u32> &mskSet = elem.second;
                // Enumerate every assignment of the don't-care bits: v cycles
                // through all values with the ~dc bits held set (the standard
                // "iterate subsets" bit trick on the final line).
                u32 v = ~dc;
                do {
                    u32 b2 = v & dc;
                    for (const u32 &mskVal : mskSet) {
                        u32 val = (mskVal & ~dc) | b2;
                        clearbit(tabIndexToMask(val), bit);
                    }
                    v = (v + (dc & -dc)) | ~dc;
                } while (v != ~dc);
            }
        }
    }

    // Fold the default mask into every table entry.
    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
        u8 *m = tabIndexToMask(i);
        andMask(m, m, &defaultMask[0], mask_size);
    }
#ifdef DEBUG
    dumpMasks(&defaultMask[0]);
#endif
}
633 | |
634 | bytecode_ptr<FDR> FDRCompiler::build() { |
635 | setupTab(); |
636 | return setupFDR(); |
637 | } |
638 | |
639 | static |
640 | bool isSuffix(const hwlmLiteral &lit1, const hwlmLiteral &lit2) { |
641 | const auto &s1 = lit1.s; |
642 | const auto &s2 = lit2.s; |
643 | size_t len1 = s1.length(); |
644 | size_t len2 = s2.length(); |
645 | assert(len1 >= len2); |
646 | |
647 | if (lit1.nocase || lit2.nocase) { |
648 | return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2, |
649 | [](char a, char b) { return mytoupper(a) == mytoupper(b); }); |
650 | } else { |
651 | return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2); |
652 | } |
653 | } |
654 | |
655 | /* |
656 | * if lit2 is a suffix of lit1 but the case sensitivity, groups or mask info |
657 | * of lit2 is a subset of lit1, then lit1 can't squash lit2 and lit2 can |
658 | * possibly match when lit1 matches. In this case, we can't do bucket |
659 | * squashing. e.g. AAA(no case) in bucket 0, AA(no case) and aa in bucket 1, |
660 | * we can't squash bucket 1 if we have input like "aaa" as aa can also match. |
661 | */ |
662 | static |
663 | bool includedCheck(const hwlmLiteral &lit1, const hwlmLiteral &lit2) { |
664 | /* lit1 is caseless and lit2 is case sensitive */ |
665 | if ((lit1.nocase && !lit2.nocase)) { |
666 | return true; |
667 | } |
668 | |
669 | /* lit2's group is a subset of lit1 */ |
670 | if (lit1.groups != lit2.groups && |
671 | (lit2.groups == (lit1.groups & lit2.groups))) { |
672 | return true; |
673 | } |
674 | |
675 | /* TODO: narrow down cases for mask check */ |
676 | if (lit1.cmp != lit2.cmp || lit1.msk != lit2.msk) { |
677 | return true; |
678 | } |
679 | |
680 | return false; |
681 | } |
682 | |
683 | /* |
684 | * if lit2 is an included literal of both lit0 and lit1, then lit0 and lit1 |
685 | * shouldn't match at the same offset, otherwise we give up squashing for lit1. |
686 | * e.g. lit0:AAA(no case), lit1:aa, lit2:A(no case). We can have duplicate |
687 | * matches for input "aaa" if lit0 and lit1 both squash lit2. |
688 | */ |
689 | static |
690 | bool checkParentLit( |
691 | const vector<hwlmLiteral> &lits, u32 pos1, |
692 | const unordered_set<u32> &parent_map, |
693 | const unordered_map<u32, unordered_set<u32>> &exception_map) { |
694 | assert(pos1 < lits.size()); |
695 | const auto &lit1 = lits[pos1]; |
696 | for (const auto pos2 : parent_map) { |
697 | if (contains(exception_map, pos2)) { |
698 | const auto &exception_pos = exception_map.at(pos2); |
699 | if (contains(exception_pos, pos1)) { |
700 | return false; |
701 | } |
702 | } |
703 | |
704 | /* if lit1 isn't an exception of lit2, then we have to do further |
705 | * exclusive check. |
706 | * TODO: More mask checks. Note if two literals are group exclusive, |
707 | * it is possible that they match at the same offset. */ |
708 | assert(pos2 < lits.size()); |
709 | const auto &lit2 = lits[pos2]; |
710 | if (isSuffix(lit2, lit1)) { |
711 | return false; |
712 | } |
713 | } |
714 | |
715 | return true; |
716 | } |
717 | |
/**
 * \brief Scan the rest of the last-char group for literals that lit1
 * includes, updating lit1's included_id/squash mask and the parent and
 * exception maps.
 *
 * \param lits full literal vector (lit1 may be modified)
 * \param id1 index of the literal under consideration
 * \param bucket1 bucket holding lit1
 * \param start index into \a group to begin scanning from
 * \param group (bucket, literal index) pairs sharing a last character,
 *        ordered by bucket
 * \param parent_map maps an included literal to the literals including it
 * \param exception_map maps a literal to suffixes that failed includedCheck
 */
static
void buildSquashMask(vector<hwlmLiteral> &lits, u32 id1, u32 bucket1,
                     size_t start, const vector<pair<u32, u32>> &group,
                     unordered_map<u32, unordered_set<u32>> &parent_map,
                     unordered_map<u32, unordered_set<u32>> &exception_map) {
    auto &lit1 = lits[id1];
    DEBUG_PRINTF("b:%u len:%zu\n", bucket1, lit1.s.length());

    size_t cnt = group.size();
    bool included = false;
    bool exception = false;
    u32 child_id = ~0U;
    for (size_t i = start; i < cnt; i++) {
        u32 bucket2 = group[i].first;
        assert(bucket2 >= bucket1);

        u32 id2 = group[i].second;
        auto &lit2 = lits[id2];
        // check if lit2 is a suffix of lit1
        if (isSuffix(lit1, lit2)) {
            /* if we have a included literal in the same bucket,
             * quit and let the included literal to do possible squashing */
            if (bucket1 == bucket2) {
                DEBUG_PRINTF("same bucket\n");
                return;
            }
            /* if lit2 is a suffix but doesn't pass included checks for
             * extra info, we give up sqaushing */
            if (includedCheck(lit1, lit2)) {
                DEBUG_PRINTF("find exceptional suffix %u\n", lit2.id);
                exception_map[id1].insert(id2);
                exception = true;
            } else if (checkParentLit(lits, id1, parent_map[id2],
                                      exception_map)) {
                if (lit1.included_id == INVALID_LIT_ID) {
                    DEBUG_PRINTF("find suffix lit1 %u lit2 %u\n",
                                 lit1.id, lit2.id);
                    lit1.included_id = lit2.id;
                } else {
                    /* if we have multiple included literals in one bucket,
                     * give up squashing. */
                    DEBUG_PRINTF("multiple included literals\n");
                    lit1.included_id = INVALID_LIT_ID;
                    return;
                }
                child_id = id2;
                included = true;
            }
        }

        // At a bucket boundary, decide whether the bucket just scanned can
        // be squashed by lit1.
        size_t next = i + 1;
        u32 nextBucket = next < cnt ? group[next].first : ~0U;
        if (bucket2 != nextBucket) {
            if (included) {
                if (exception) {
                    /* give up if we have exception literals
                     * in the same bucket as the included literal. */
                    lit1.included_id = INVALID_LIT_ID;
                } else {
                    parent_map[child_id].insert(id1);

                    lit1.squash |= 1U << bucket2;
                    DEBUG_PRINTF("build squash mask %2x for %u\n",
                                 lit1.squash, lit1.id);
                }
                return;
            }
            exception = false;
        }
    }
}
789 | |
/** \brief Skip included-literal analysis for last-character groups larger
 * than this, to bound compile time. */
static constexpr u32 INCLUDED_LIMIT = 1000;
791 | |
792 | static |
793 | void findIncludedLits(vector<hwlmLiteral> &lits, |
794 | const vector<vector<pair<u32, u32>>> &lastCharMap) { |
795 | /* Map for finding the positions of literal which includes a literal |
796 | * in FDR hwlm literal vector. */ |
797 | unordered_map<u32, unordered_set<u32>> parent_map; |
798 | |
799 | /* Map for finding the positions of exception literals which could |
800 | * sometimes match if a literal matches in FDR hwlm literal vector. */ |
801 | unordered_map<u32, unordered_set<u32>> exception_map; |
802 | for (const auto &group : lastCharMap) { |
803 | size_t cnt = group.size(); |
804 | if (cnt > INCLUDED_LIMIT) { |
805 | continue; |
806 | } |
807 | for (size_t i = 0; i < cnt; i++) { |
808 | u32 bucket1 = group[i].first; |
809 | u32 id1 = group[i].second; |
810 | if (lits[id1].pure) { |
811 | continue; |
812 | } |
813 | buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map, |
814 | exception_map); |
815 | } |
816 | } |
817 | } |
818 | |
819 | static |
820 | void addIncludedInfo( |
821 | vector<hwlmLiteral> &lits, u32 nBuckets, |
822 | map<BucketIndex, vector<LiteralIndex>> &bucketToLits) { |
823 | vector<vector<pair<u32, u32>>> lastCharMap(256); |
824 | |
825 | for (BucketIndex b = 0; b < nBuckets; b++) { |
826 | if (!bucketToLits[b].empty()) { |
827 | for (const LiteralIndex &lit_idx : bucketToLits[b]) { |
828 | const auto &lit = lits[lit_idx]; |
829 | u8 c = mytoupper(lit.s.back()); |
830 | lastCharMap[c].emplace_back(b, lit_idx); |
831 | } |
832 | } |
833 | } |
834 | |
835 | findIncludedLits(lits, lastCharMap); |
836 | } |
837 | |
838 | } // namespace |
839 | |
/**
 * \brief Build an HWLM prototype for the given literals, trying Teddy first
 * (if permitted by the grey box) and falling back to FDR.
 *
 * \param hint engine hint, or HINT_INVALID to let chooseEngine() decide;
 *        a concrete hint also forces bits=9/stride=1 (unit-test hack below).
 * \return the prototype, or nullptr if no suitable engine exists.
 */
static
unique_ptr<HWLMProto> fdrBuildProtoInternal(u8 engType,
                                            vector<hwlmLiteral> &lits,
                                            bool make_small,
                                            const target_t &target,
                                            const Grey &grey, u32 hint) {
    DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2");

    if (grey.fdrAllowTeddy) {
        auto proto = teddyBuildProtoHinted(engType, lits, make_small, hint,
                                           target);
        if (proto) {
            DEBUG_PRINTF("build with teddy succeeded\n");
            return proto;
        } else {
            DEBUG_PRINTF("build with teddy failed, will try with FDR\n");
        }
    }

    auto des = (hint == HINT_INVALID) ? chooseEngine(target, lits, make_small)
                                      : getFdrDescription(hint);
    if (!des) {
        return nullptr;
    }

    // temporary hack for unit testing
    if (hint != HINT_INVALID) {
        des->bits = 9;
        des->stride = 1;
    }

    auto bucketToLits = assignStringsToBuckets(lits, *des);
    addIncludedInfo(lits, des->getNumBuckets(), bucketToLits);
    auto proto =
        ue2::make_unique<HWLMProto>(engType, move(des), lits, bucketToLits,
                                    make_small);
    return proto;
}
878 | |
/** \brief Public entry point: build an HWLM prototype with no engine hint. */
unique_ptr<HWLMProto> fdrBuildProto(u8 engType, vector<hwlmLiteral> lits,
                                    bool make_small, const target_t &target,
                                    const Grey &grey) {
    return fdrBuildProtoInternal(engType, lits, make_small, target, grey,
                                 HINT_INVALID);
}
885 | |
886 | static |
887 | bytecode_ptr<FDR> fdrBuildTableInternal(const HWLMProto &proto, |
888 | const Grey &grey) { |
889 | |
890 | if (proto.teddyEng) { |
891 | return teddyBuildTable(proto, grey); |
892 | } |
893 | |
894 | FDRCompiler fc(proto.lits, proto.bucketToLits, *(proto.fdrEng), |
895 | proto.make_small, grey); |
896 | return fc.build(); |
897 | } |
898 | |
/** \brief Public entry point: build the FDR (or Teddy) bytecode for a
 * prototype. */
bytecode_ptr<FDR> fdrBuildTable(const HWLMProto &proto, const Grey &grey) {
    return fdrBuildTableInternal(proto, grey);
}
902 | |
903 | #if !defined(RELEASE_BUILD) |
904 | |
/** \brief Testing-only entry point: build a prototype with an explicit
 * engine hint. */
unique_ptr<HWLMProto> fdrBuildProtoHinted(u8 engType,
                                          vector<hwlmLiteral> lits,
                                          bool make_small, u32 hint,
                                          const target_t &target,
                                          const Grey &grey) {
    return fdrBuildProtoInternal(engType, lits, make_small, target, grey,
                                 hint);
}
913 | |
914 | #endif |
915 | |
/** \brief Returns the total size in bytes of the given FDR bytecode, as
 * recorded in its header. */
size_t fdrSize(const FDR *fdr) {
    assert(fdr);
    return fdr->size;
}
920 | |
921 | } // namespace ue2 |
922 | |