1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Corpus Generation tool. |
31 | */ |
32 | |
33 | #include "config.h" |
34 | |
35 | #include "ng_corpus_generator.h" |
36 | |
37 | #include "ng_corpus_editor.h" |
38 | #include "compiler/compiler.h" |
39 | #include "nfagraph/ng.h" |
40 | #include "nfagraph/ng_util.h" |
41 | #include "ue2common.h" |
42 | #include "util/container.h" |
43 | #include "util/graph_range.h" |
44 | #include "util/make_unique.h" |
45 | #include "util/ue2string.h" |
46 | #include "util/unicode_def.h" |
47 | #include "util/unicode_set.h" |
48 | |
49 | #include <algorithm> |
50 | #include <deque> |
51 | #include <memory> |
52 | #include <set> |
53 | #include <sstream> |
54 | #include <unordered_set> |
55 | #include <vector> |
56 | |
57 | #include <boost/utility.hpp> |
58 | |
59 | using namespace std; |
60 | using namespace ue2; |
61 | |
62 | typedef vector<NFAVertex> VertexPath; |
63 | |
64 | #if defined(DEBUG) |
65 | // For debugging output |
66 | static |
67 | string pathToString(const NGHolder &g, const VertexPath &p) { |
68 | ostringstream oss; |
69 | oss << '['; |
70 | for (auto i = p.begin(); i != p.end(); ++i) { |
71 | if (i != p.begin()) { |
72 | oss << ','; |
73 | } |
74 | oss << g[*i].index; |
75 | } |
76 | oss << ']'; |
77 | return oss.str(); |
78 | } |
79 | #endif |
80 | |
81 | /** True if this graph has no non-special successors of start or startDs. */ |
82 | static |
83 | bool graph_is_empty(const NGHolder &g) { |
84 | for (const auto &v : adjacent_vertices_range(g.start, g)) { |
85 | if (!is_special(v, g)) { |
86 | return false; |
87 | } |
88 | } |
89 | for (const auto &v : adjacent_vertices_range(g.start, g)) { |
90 | if (!is_special(v, g)) { |
91 | return false; |
92 | } |
93 | } |
94 | |
95 | return true; |
96 | } |
97 | |
98 | static |
99 | string encodeUtf8(const vector<unichar> &v) { |
100 | string rv; |
101 | for (const unichar &cp : v) { |
102 | if (cp < UTF_2CHAR_MIN) { |
103 | rv.push_back(cp); |
104 | } else if (cp < UTF_3CHAR_MIN) { |
105 | rv.push_back(UTF_TWO_BYTE_HEADER | (cp >> UTF_CONT_SHIFT)); |
106 | rv.push_back(makeContByte(cp)); |
107 | } else if (cp < UTF_4CHAR_MIN) { |
108 | rv.push_back(UTF_THREE_BYTE_HEADER | (cp >> (2 * UTF_CONT_SHIFT))); |
109 | rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT)); |
110 | rv.push_back(makeContByte(cp)); |
111 | } else { |
112 | rv.push_back(UTF_FOUR_BYTE_HEADER | (cp >> (3 * UTF_CONT_SHIFT))); |
113 | rv.push_back(makeContByte(cp >> (2 * UTF_CONT_SHIFT))); |
114 | rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT)); |
115 | rv.push_back(makeContByte(cp)); |
116 | } |
117 | } |
118 | return rv; |
119 | } |
120 | |
121 | template<class Iter, class Val> |
122 | static |
123 | bool has_greater_than(Iter it, Iter end, const Val &v, size_t limit) { |
124 | for (; it != end; ++it) { |
125 | if (*it == v) { |
126 | if (limit == 0) { |
127 | return true; |
128 | } |
129 | --limit; |
130 | } |
131 | } |
132 | return false; |
133 | } |
134 | |
135 | static |
136 | void findPaths(const NGHolder &g, CorpusProperties &cProps, |
137 | vector<VertexPath> &allPaths, size_t cycleLimit, |
138 | size_t corpusLimit) { |
139 | // The maximum number of open (in progress) paths. New paths beyond this |
140 | // limit will evict a random existing one. |
141 | const size_t MAX_OPEN = min((size_t)1000, corpusLimit * 10); |
142 | |
143 | vector<unique_ptr<VertexPath>> open; |
144 | open.push_back(ue2::make_unique<VertexPath>(1, g.start)); |
145 | |
146 | unordered_set<NFAVertex> one_way_in; |
147 | for (const auto &v : vertices_range(g)) { |
148 | if (in_degree(v, g) <= 1) { |
149 | one_way_in.insert(v); |
150 | } |
151 | } |
152 | |
153 | while (!open.empty()) { |
154 | u32 slot = cProps.rand(0, open.size() - 1); |
155 | swap(open.at(slot), open.back()); |
156 | auto p = std::move(open.back()); |
157 | open.pop_back(); |
158 | NFAVertex u = p->back(); |
159 | |
160 | DEBUG_PRINTF("dequeuing path %s, back %zu\n" , |
161 | pathToString(g, *p).c_str(), g[u].index); |
162 | |
163 | NGHolder::adjacency_iterator ai, ae; |
164 | for (tie(ai, ae) = adjacent_vertices(u, g); ai != ae; ++ai) { |
165 | NFAVertex v = *ai; |
166 | |
167 | if (u == g.startDs && v == g.startDs) { |
168 | // explicitly avoid following startDs self-loop, as we have |
169 | // other mechanisms for adding prefixes to our corpora. |
170 | continue; |
171 | } |
172 | |
173 | // Accept vertices generate completed paths. |
174 | if (v == g.accept || v == g.acceptEod) { |
175 | DEBUG_PRINTF("path complete: %s\n" , |
176 | pathToString(g, *p).c_str()); |
177 | allPaths.push_back(*p); |
178 | if (allPaths.size() >= corpusLimit) { |
179 | DEBUG_PRINTF("full, going home\n" ); |
180 | return; |
181 | } |
182 | |
183 | // No meaningful edges out of accept or acceptEod. |
184 | continue; |
185 | } |
186 | |
187 | if (!contains(one_way_in, v) && |
188 | has_greater_than(p->begin(), p->end(), v, cycleLimit)) { |
189 | // Note that vertices that only have one predecessor don't need |
190 | // their cycle limit checked, as their predecessors will have |
191 | // the same count. |
192 | DEBUG_PRINTF("exceeded cycle limit for v=%zu, pruning path\n" , |
193 | g[v].index); |
194 | continue; |
195 | } |
196 | |
197 | // If we've got no further adjacent vertices, re-use p rather than |
198 | // copying it for the next path. |
199 | unique_ptr<VertexPath> new_path; |
200 | if (boost::next(ai) == ae) { |
201 | new_path = std::move(p); |
202 | } else { |
203 | new_path = make_unique<VertexPath>(*p); |
204 | } |
205 | |
206 | new_path->push_back(v); |
207 | if (open.size() < MAX_OPEN) { |
208 | open.push_back(std::move(new_path)); |
209 | } else { |
210 | u32 victim = cProps.rand(0, open.size() - 1); |
211 | open[victim] = std::move(new_path); |
212 | } |
213 | } |
214 | } |
215 | DEBUG_PRINTF("bored, going home\n" ); |
216 | } |
217 | |
218 | namespace { |
219 | |
220 | /** \brief Concrete implementation */ |
221 | class CorpusGeneratorImpl : public CorpusGenerator { |
222 | public: |
223 | CorpusGeneratorImpl(const NGHolder &graph_in, const ExpressionInfo &expr_in, |
224 | CorpusProperties &props); |
225 | ~CorpusGeneratorImpl() = default; |
226 | |
227 | void generateCorpus(vector<string> &data); |
228 | |
229 | private: |
230 | unsigned char getRandomChar(); |
231 | unsigned char getMatchChar(const CharReach &cr); |
232 | unsigned char getUnmatchChar(const CharReach &cr); |
233 | |
234 | unsigned char getChar(NFAVertex v); |
235 | void newGenerator(vector<string> &data); |
236 | string pathToCorpus(const VertexPath &path); |
237 | |
238 | /** \brief Generate a string of random bytes between minLen and maxLen |
239 | * bytes in length. */ |
240 | void addRandom(const min_max &mm, string *out); |
241 | |
242 | /** \brief Info about this expression. */ |
243 | const ExpressionInfo &expr; |
244 | |
245 | /** \brief The NFA graph we operate over. */ |
246 | const NGHolder &graph; |
247 | |
248 | /** \brief Reference to our corpus generator properties object (stores some |
249 | * state) */ |
250 | CorpusProperties &cProps; |
251 | }; |
252 | |
253 | CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in, |
254 | const ExpressionInfo &expr_in, |
255 | CorpusProperties &props) |
256 | : expr(expr_in), graph(graph_in), cProps(props) { |
257 | // if this pattern is to be matched approximately |
258 | if ((expr.edit_distance || expr.hamm_distance) && !props.editDistance) { |
259 | props.editDistance = |
260 | props.rand(0, expr.hamm_distance + expr.edit_distance + 1); |
261 | } |
262 | } |
263 | |
264 | void CorpusGeneratorImpl::generateCorpus(vector<string> &data) { |
265 | newGenerator(data); |
266 | |
267 | if (cProps.editDistance && !data.empty() && |
268 | data.size() < cProps.corpusLimit) { |
269 | // Create more entries by copying the corpora and applying edits |
270 | size_t diff = cProps.corpusLimit - data.size(); |
271 | size_t repeats = diff / data.size(); |
272 | size_t remains = diff % data.size(); |
273 | vector<string> newdata; |
274 | for (size_t i = 0; i < repeats; i++) { |
275 | std::copy(data.begin(), data.end(), std::back_inserter(newdata)); |
276 | } |
277 | if (remains) { |
278 | std::copy_n(data.begin(), remains, std::back_inserter(newdata)); |
279 | } |
280 | for (auto &s : newdata) { |
281 | editCorpus(&s, cProps); |
282 | } |
283 | std::move(newdata.begin(), newdata.end(), back_inserter(data)); |
284 | } else if (cProps.editDistance) { |
285 | // If the caller has asked us, apply edit distance to corpora |
286 | for (auto &s : data) { |
287 | editCorpus(&s, cProps); |
288 | } |
289 | } |
290 | } |
291 | |
292 | /** \brief Generate a random character, taking care to stick to the alphabet |
293 | * that we've been asked for. */ |
294 | u8 CorpusGeneratorImpl::getRandomChar() { |
295 | return 'a' + cProps.rand(0, min(cProps.alphabetSize, (u32)CharReach::npos)); |
296 | } |
297 | |
298 | /** \brief Select a random character from the given string of valid match |
299 | * characters. */ |
300 | unsigned char CorpusGeneratorImpl::getMatchChar(const CharReach &cr) { |
301 | unsigned int num = cr.count(); |
302 | if (num == 0) { |
303 | return 0; |
304 | } else if (num == 1) { |
305 | return (unsigned char)cr.find_first(); |
306 | } else if (num == 256) { |
307 | // Dot class, any character is OK! |
308 | return (unsigned char)cProps.rand(0, 255); |
309 | } |
310 | else { |
311 | unsigned idx = cProps.rand(0, num - 1); |
312 | return (unsigned char)cr.find_nth(idx); |
313 | } |
314 | } |
315 | |
316 | /** \brief Select a character that does not belong to the given bitset. This |
317 | * makes no guarantees on unmatchability if the bitset is full. */ |
318 | unsigned char CorpusGeneratorImpl::getUnmatchChar(const CharReach &cr) { |
319 | return getMatchChar(~cr); |
320 | } |
321 | |
322 | void CorpusGeneratorImpl::addRandom(const min_max &mm, string *out) { |
323 | assert(mm.min <= mm.max); |
324 | u32 range = mm.max - mm.min; |
325 | u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0); |
326 | for (u32 i = 0; i < len; ++i) { |
327 | out->push_back(getRandomChar()); |
328 | } |
329 | } |
330 | |
331 | unsigned char CorpusGeneratorImpl::getChar(NFAVertex v) { |
332 | const CharReach &cr = graph[v].char_reach; |
333 | |
334 | switch (cProps.throwDice()) { |
335 | case CorpusProperties::ROLLED_MATCH: |
336 | return getMatchChar(cr); |
337 | case CorpusProperties::ROLLED_UNMATCH: |
338 | return getUnmatchChar(cr); |
339 | case CorpusProperties::ROLLED_RANDOM: /* character pulled from hat */ |
340 | return getRandomChar(); |
341 | } |
342 | assert(0); |
343 | return 0; |
344 | } |
345 | |
346 | /** \brief Convert a path through the graph to a corpus string. */ |
347 | string CorpusGeneratorImpl::pathToCorpus(const VertexPath &path) { |
348 | string s; |
349 | |
350 | // Add random prefix |
351 | if (cProps.prefixRange.max) { |
352 | addRandom(cProps.prefixRange, &s); |
353 | } |
354 | |
355 | // Generate a corpus from our path |
356 | for (const auto &e : path) { |
357 | if (!is_special(e, graph)) { |
358 | s += getChar(e); |
359 | } |
360 | } |
361 | |
362 | // Add random suffix |
363 | if (cProps.suffixRange.max) { |
364 | addRandom(cProps.suffixRange, &s); |
365 | } |
366 | |
367 | return s; |
368 | } |
369 | |
370 | void CorpusGeneratorImpl::newGenerator(vector<string> &outdata) { |
371 | const unsigned int maxCycles = cProps.getCycleLimit().second; |
372 | DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n" , |
373 | cProps.corpusLimit, maxCycles); |
374 | |
375 | vector<VertexPath> allPaths; |
376 | |
377 | // Special case: if the graph has ONLY special vertices, then this is |
378 | // likely to be an odd vacuous pattern or a pattern that can never match. |
379 | // In these cases, an empty corpus is useful. |
380 | if (graph_is_empty(graph)) { |
381 | VertexPath empty(1, graph.start); |
382 | allPaths.push_back(empty); |
383 | } |
384 | |
385 | // build a set of unique paths |
386 | findPaths(graph, cProps, allPaths, maxCycles, cProps.corpusLimit); |
387 | |
388 | // transform paths into corpora: we do this repeatedly until we (a) hit our |
389 | // limit, or (b) don't generate any new corpora for any of our paths. |
390 | set<string> data; |
391 | while (data.size() < cProps.corpusLimit) { |
392 | size_t count = data.size(); |
393 | for (const auto &path : allPaths) { |
394 | string s = pathToCorpus(path); |
395 | if (data.insert(s).second) { |
396 | DEBUG_PRINTF("corpus %zu (%zu bytes): '%s'\n" , data.size(), |
397 | s.size(), escapeString(s).c_str()); |
398 | if (data.size() == cProps.corpusLimit) { |
399 | goto hit_limit; |
400 | } |
401 | } |
402 | } |
403 | if (data.size() == count) { |
404 | break; // we're finding it hard to generate more corpora |
405 | } |
406 | } |
407 | |
408 | hit_limit: |
409 | DEBUG_PRINTF("%zu corpora built\n" , data.size()); |
410 | |
411 | // populate the output vector from the set we built. |
412 | outdata.reserve(data.size()); |
413 | copy(data.begin(), data.end(), back_inserter(outdata)); |
414 | } |
415 | |
416 | /** \brief Concrete implementation for UTF-8 */ |
417 | class CorpusGeneratorUtf8 : public CorpusGenerator { |
418 | public: |
419 | CorpusGeneratorUtf8(const NGHolder &graph_in, const ExpressionInfo &expr_in, |
420 | CorpusProperties &props); |
421 | ~CorpusGeneratorUtf8() = default; |
422 | |
423 | void generateCorpus(vector<string> &data); |
424 | |
425 | private: |
426 | unichar getRandomChar(); |
427 | unichar getMatchChar(CodePointSet cps); |
428 | unichar getUnmatchChar(const CodePointSet &cps); |
429 | |
430 | unichar getChar(const CodePointSet &cps); |
431 | void newGenerator(vector<vector<unichar> > &data); |
432 | vector<unichar> pathToCorpus(const vector<CodePointSet> &path); |
433 | |
434 | /** \brief Generate a random string between min and max codepoints in |
435 | * length. */ |
436 | void addRandom(const min_max &mm, vector<unichar> *out); |
437 | |
438 | /** \brief Info about this expression. */ |
439 | const ExpressionInfo &expr; |
440 | |
441 | /** \brief The NFA graph we operate over. */ |
442 | const NGHolder &graph; |
443 | |
444 | /** \brief Reference to our corpus generator properties object (stores some |
445 | * state) */ |
446 | CorpusProperties &cProps; |
447 | }; |
448 | |
449 | CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in, |
450 | const ExpressionInfo &expr_in, |
451 | CorpusProperties &props) |
452 | : expr(expr_in), graph(graph_in), cProps(props) { |
453 | // we do not support Utf8 for approximate matching |
454 | if (expr.edit_distance) { |
455 | throw CorpusGenerationFailure("UTF-8 for edited patterns is not " |
456 | "supported." ); |
457 | } |
458 | } |
459 | |
460 | void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) { |
461 | vector<vector<unichar>> raw; |
462 | newGenerator(raw); |
463 | |
464 | // If the caller has asked us, apply edit distance to corpora |
465 | if (cProps.editDistance) { |
466 | for (auto &e : raw) { |
467 | editCorpus(&e, cProps); |
468 | } |
469 | } |
470 | |
471 | for (const auto &e : raw) { |
472 | data.push_back(encodeUtf8(e)); |
473 | } |
474 | } |
475 | |
476 | /** \brief Generate a random character, taking care to stick to the alphabet |
477 | * that we've been asked for. */ |
478 | unichar CorpusGeneratorUtf8::getRandomChar() { |
479 | u32 range = MAX_UNICODE + 1 |
480 | - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); |
481 | range = min(cProps.alphabetSize, range); |
482 | assert(range); |
483 | |
484 | unichar c = 'a' + cProps.rand(0, range - 1); |
485 | |
486 | if (c >= UNICODE_SURROGATE_MIN) { |
487 | c =+ UNICODE_SURROGATE_MAX + 1; |
488 | } |
489 | |
490 | return c % (MAX_UNICODE + 1); |
491 | } |
492 | |
493 | /** \brief Select a random character from the given string of valid match |
494 | * characters. */ |
495 | unichar CorpusGeneratorUtf8::getMatchChar(CodePointSet cps) { |
496 | cps.unsetRange(UNICODE_SURROGATE_MIN, UNICODE_SURROGATE_MAX); |
497 | u32 num = cps.count(); |
498 | if (num == 0) { |
499 | return 0; |
500 | } else if (num == 1) { |
501 | return lower(*cps.begin()); |
502 | } else { |
503 | unichar rv = cps.at(cProps.rand(0, num - 1)); |
504 | assert(rv != INVALID_UNICODE); |
505 | return rv; |
506 | } |
507 | } |
508 | |
509 | /** \brief Select a character that does not belong to the given bitset. This |
510 | * makes no guarantees on unmatchability if the bitset is full. */ |
511 | unichar CorpusGeneratorUtf8::getUnmatchChar(const CodePointSet &cps) { |
512 | return getMatchChar(~cps); |
513 | } |
514 | |
515 | void CorpusGeneratorUtf8::addRandom(const min_max &mm, vector<unichar> *out) { |
516 | assert(mm.min <= mm.max); |
517 | u32 range = mm.max - mm.min; |
518 | u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0); |
519 | for (u32 i = 0; i < len; ++i) { |
520 | out->push_back(getRandomChar()); |
521 | } |
522 | } |
523 | |
524 | unichar CorpusGeneratorUtf8::getChar(const CodePointSet &cps) { |
525 | switch (cProps.throwDice()) { |
526 | case CorpusProperties::ROLLED_MATCH: |
527 | return getMatchChar(cps); |
528 | case CorpusProperties::ROLLED_UNMATCH: |
529 | return getUnmatchChar(cps); |
530 | case CorpusProperties::ROLLED_RANDOM: /* character pulled from hat */ |
531 | return getRandomChar(); |
532 | } |
533 | assert(0); |
534 | return 0; |
535 | } |
536 | |
537 | /** \brief Convert a path through the graph to a corpus string. */ |
538 | vector<unichar> |
539 | CorpusGeneratorUtf8::pathToCorpus(const vector<CodePointSet> &path) { |
540 | vector<unichar> s; |
541 | |
542 | // Add random prefix |
543 | if (cProps.prefixRange.max) { |
544 | addRandom(cProps.prefixRange, &s); |
545 | } |
546 | |
547 | // Generate a corpus from our path |
548 | for (const auto &e : path) { |
549 | s.push_back(getChar(e)); |
550 | } |
551 | |
552 | // Add random suffix |
553 | if (cProps.suffixRange.max) { |
554 | addRandom(cProps.suffixRange, &s); |
555 | } |
556 | |
557 | return s; |
558 | } |
559 | |
560 | static |
561 | u32 classify_vertex(const NGHolder &g, NFAVertex v) { |
562 | const CharReach &cr = g[v].char_reach; |
563 | if (cr.isSubsetOf(UTF_ASCII_CR)) { |
564 | return 1; |
565 | } else if (cr.isSubsetOf(UTF_TWO_START_CR)) { |
566 | return 2; |
567 | } else if (cr.isSubsetOf(UTF_THREE_START_CR)) { |
568 | return 3; |
569 | } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) { |
570 | return 4; |
571 | } |
572 | |
573 | /* this can happen due to dummy vertices from zwa */ |
574 | return 1; |
575 | } |
576 | |
577 | static |
578 | void fillCodePointSet(const CharReach &cr, CodePointSet *out, u8 mask = 0xff) { |
579 | for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) { |
580 | out->set(i & mask); |
581 | } |
582 | } |
583 | |
584 | static |
585 | void expandCodePointSet(const CharReach &cr, CodePointSet *out, u32 mask, |
586 | u32 n) { |
587 | CodePointSet base; |
588 | base.swap(*out); |
589 | for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) { |
590 | u32 val = (i & mask) << (n * UTF_CONT_SHIFT); |
591 | for (const auto &cp : base) { |
592 | unichar ll = lower(cp); |
593 | unichar uu = upper(cp); |
594 | out->setRange(val + ll, MIN(val + uu, MAX_UNICODE)); |
595 | } |
596 | } |
597 | } |
598 | |
599 | static |
600 | void decodePath(const NGHolder &g, const VertexPath &in, |
601 | vector<CodePointSet> &out) { |
602 | VertexPath::const_iterator it = in.begin(); |
603 | while (it != in.end()) { |
604 | if (is_special(*it, g)) { |
605 | ++it; |
606 | continue; |
607 | } |
608 | |
609 | out.push_back(CodePointSet()); |
610 | CodePointSet &cps = out.back(); |
611 | |
612 | switch (classify_vertex(g, *it)) { |
613 | case 1: |
614 | fillCodePointSet(g[*it].char_reach, &cps); |
615 | ++it; |
616 | break; |
617 | case 2: |
618 | fillCodePointSet(g[*(it + 1)].char_reach, &cps, |
619 | UTF_CONT_BYTE_VALUE_MASK); |
620 | expandCodePointSet(g[*it].char_reach, &cps, |
621 | ~UTF_TWO_BYTE_HEADER, 1); |
622 | it += 2; |
623 | break; |
624 | case 3: |
625 | fillCodePointSet(g[*(it + 2)].char_reach, &cps, |
626 | UTF_CONT_BYTE_VALUE_MASK); |
627 | expandCodePointSet(g[*(it + 1)].char_reach, &cps, |
628 | UTF_CONT_BYTE_VALUE_MASK, 1); |
629 | expandCodePointSet(g[*it].char_reach, &cps, |
630 | ~UTF_THREE_BYTE_HEADER, 2); |
631 | it += 3; |
632 | break; |
633 | case 4: |
634 | fillCodePointSet(g[*(it + 3)].char_reach, &cps, |
635 | UTF_CONT_BYTE_VALUE_MASK); |
636 | expandCodePointSet(g[*(it + 2)].char_reach, &cps, |
637 | UTF_CONT_BYTE_VALUE_MASK, 1); |
638 | expandCodePointSet(g[*(it + 1)].char_reach, &cps, |
639 | UTF_CONT_BYTE_VALUE_MASK, 2); |
640 | expandCodePointSet(g[*it].char_reach, &cps, |
641 | ~UTF_FOUR_BYTE_HEADER, 3); |
642 | it += 4; |
643 | break; |
644 | default:; |
645 | assert(0); |
646 | ++it; |
647 | } |
648 | } |
649 | } |
650 | |
651 | static |
652 | void translatePaths(const NGHolder &graph, |
653 | const vector<VertexPath> &allPathsTemp, |
654 | vector<vector<CodePointSet>> *out) { |
655 | assert(out); |
656 | for (const auto &path : allPathsTemp) { |
657 | out->push_back(vector<CodePointSet>()); |
658 | decodePath(graph, path, out->back()); |
659 | } |
660 | } |
661 | |
662 | void CorpusGeneratorUtf8::newGenerator(vector<vector<unichar>> &outdata) { |
663 | const u32 maxCycles = cProps.getCycleLimit().second; |
664 | DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n" , |
665 | cProps.corpusLimit, maxCycles); |
666 | |
667 | vector<vector<CodePointSet>> allPaths; |
668 | |
669 | // Special case: if the graph has ONLY special vertices, then this is |
670 | // likely to be an odd vacuous pattern or a pattern that can never match. |
671 | // In these cases, an empty corpus is useful. |
672 | if (graph_is_empty(graph)) { |
673 | allPaths.push_back(vector<CodePointSet>()); |
674 | } else { |
675 | // build a set of unique paths |
676 | vector<VertexPath> allPathsTemp; |
677 | findPaths(graph, cProps, allPathsTemp, maxCycles, cProps.corpusLimit); |
678 | translatePaths(graph, allPathsTemp, &allPaths); |
679 | } |
680 | |
681 | // transform paths into corpora: we do this repeatedly until we (a) hit our |
682 | // limit, or (b) don't generate any new corpora for any of our paths. |
683 | set<vector<unichar> > data; |
684 | while (data.size() < cProps.corpusLimit) { |
685 | size_t count = data.size(); |
686 | for (const auto &path : allPaths) { |
687 | vector<unichar> vu = pathToCorpus(path); |
688 | if (data.insert(vu).second) { |
689 | if (data.size() == cProps.corpusLimit) { |
690 | goto hit_limit; |
691 | } |
692 | } |
693 | } |
694 | if (data.size() == count) { |
695 | break; // we're finding it hard to generate more corpora |
696 | } |
697 | } |
698 | |
699 | hit_limit: |
700 | DEBUG_PRINTF("%zu corpora built\n" , data.size()); |
701 | |
702 | // populate the output vector from the set we built. |
703 | outdata.reserve(data.size()); |
704 | copy(data.begin(), data.end(), back_inserter(outdata)); |
705 | } |
706 | |
707 | } // namespace |
708 | |
709 | CorpusGenerator::~CorpusGenerator() { } |
710 | |
711 | // External entry point |
712 | |
713 | unique_ptr<CorpusGenerator> makeCorpusGenerator(const NGHolder &graph, |
714 | const ExpressionInfo &expr, |
715 | CorpusProperties &props) { |
716 | if (expr.utf8) { |
717 | return ue2::make_unique<CorpusGeneratorUtf8>(graph, expr, props); |
718 | } else { |
719 | return ue2::make_unique<CorpusGeneratorImpl>(graph, expr, props); |
720 | } |
721 | } |
722 | |