1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Corpus Generation tool.
31 */
32
33#include "config.h"
34
35#include "ng_corpus_generator.h"
36
37#include "ng_corpus_editor.h"
38#include "compiler/compiler.h"
39#include "nfagraph/ng.h"
40#include "nfagraph/ng_util.h"
41#include "ue2common.h"
42#include "util/container.h"
43#include "util/graph_range.h"
44#include "util/make_unique.h"
45#include "util/ue2string.h"
46#include "util/unicode_def.h"
47#include "util/unicode_set.h"
48
49#include <algorithm>
50#include <deque>
51#include <memory>
52#include <set>
53#include <sstream>
54#include <unordered_set>
55#include <vector>
56
57#include <boost/utility.hpp>
58
59using namespace std;
60using namespace ue2;
61
62typedef vector<NFAVertex> VertexPath;
63
64#if defined(DEBUG)
65// For debugging output
66static
67string pathToString(const NGHolder &g, const VertexPath &p) {
68 ostringstream oss;
69 oss << '[';
70 for (auto i = p.begin(); i != p.end(); ++i) {
71 if (i != p.begin()) {
72 oss << ',';
73 }
74 oss << g[*i].index;
75 }
76 oss << ']';
77 return oss.str();
78}
79#endif
80
81/** True if this graph has no non-special successors of start or startDs. */
82static
83bool graph_is_empty(const NGHolder &g) {
84 for (const auto &v : adjacent_vertices_range(g.start, g)) {
85 if (!is_special(v, g)) {
86 return false;
87 }
88 }
89 for (const auto &v : adjacent_vertices_range(g.start, g)) {
90 if (!is_special(v, g)) {
91 return false;
92 }
93 }
94
95 return true;
96}
97
98static
99string encodeUtf8(const vector<unichar> &v) {
100 string rv;
101 for (const unichar &cp : v) {
102 if (cp < UTF_2CHAR_MIN) {
103 rv.push_back(cp);
104 } else if (cp < UTF_3CHAR_MIN) {
105 rv.push_back(UTF_TWO_BYTE_HEADER | (cp >> UTF_CONT_SHIFT));
106 rv.push_back(makeContByte(cp));
107 } else if (cp < UTF_4CHAR_MIN) {
108 rv.push_back(UTF_THREE_BYTE_HEADER | (cp >> (2 * UTF_CONT_SHIFT)));
109 rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT));
110 rv.push_back(makeContByte(cp));
111 } else {
112 rv.push_back(UTF_FOUR_BYTE_HEADER | (cp >> (3 * UTF_CONT_SHIFT)));
113 rv.push_back(makeContByte(cp >> (2 * UTF_CONT_SHIFT)));
114 rv.push_back(makeContByte(cp >> UTF_CONT_SHIFT));
115 rv.push_back(makeContByte(cp));
116 }
117 }
118 return rv;
119}
120
121template<class Iter, class Val>
122static
123bool has_greater_than(Iter it, Iter end, const Val &v, size_t limit) {
124 for (; it != end; ++it) {
125 if (*it == v) {
126 if (limit == 0) {
127 return true;
128 }
129 --limit;
130 }
131 }
132 return false;
133}
134
135static
136void findPaths(const NGHolder &g, CorpusProperties &cProps,
137 vector<VertexPath> &allPaths, size_t cycleLimit,
138 size_t corpusLimit) {
139 // The maximum number of open (in progress) paths. New paths beyond this
140 // limit will evict a random existing one.
141 const size_t MAX_OPEN = min((size_t)1000, corpusLimit * 10);
142
143 vector<unique_ptr<VertexPath>> open;
144 open.push_back(ue2::make_unique<VertexPath>(1, g.start));
145
146 unordered_set<NFAVertex> one_way_in;
147 for (const auto &v : vertices_range(g)) {
148 if (in_degree(v, g) <= 1) {
149 one_way_in.insert(v);
150 }
151 }
152
153 while (!open.empty()) {
154 u32 slot = cProps.rand(0, open.size() - 1);
155 swap(open.at(slot), open.back());
156 auto p = std::move(open.back());
157 open.pop_back();
158 NFAVertex u = p->back();
159
160 DEBUG_PRINTF("dequeuing path %s, back %zu\n",
161 pathToString(g, *p).c_str(), g[u].index);
162
163 NGHolder::adjacency_iterator ai, ae;
164 for (tie(ai, ae) = adjacent_vertices(u, g); ai != ae; ++ai) {
165 NFAVertex v = *ai;
166
167 if (u == g.startDs && v == g.startDs) {
168 // explicitly avoid following startDs self-loop, as we have
169 // other mechanisms for adding prefixes to our corpora.
170 continue;
171 }
172
173 // Accept vertices generate completed paths.
174 if (v == g.accept || v == g.acceptEod) {
175 DEBUG_PRINTF("path complete: %s\n",
176 pathToString(g, *p).c_str());
177 allPaths.push_back(*p);
178 if (allPaths.size() >= corpusLimit) {
179 DEBUG_PRINTF("full, going home\n");
180 return;
181 }
182
183 // No meaningful edges out of accept or acceptEod.
184 continue;
185 }
186
187 if (!contains(one_way_in, v) &&
188 has_greater_than(p->begin(), p->end(), v, cycleLimit)) {
189 // Note that vertices that only have one predecessor don't need
190 // their cycle limit checked, as their predecessors will have
191 // the same count.
192 DEBUG_PRINTF("exceeded cycle limit for v=%zu, pruning path\n",
193 g[v].index);
194 continue;
195 }
196
197 // If we've got no further adjacent vertices, re-use p rather than
198 // copying it for the next path.
199 unique_ptr<VertexPath> new_path;
200 if (boost::next(ai) == ae) {
201 new_path = std::move(p);
202 } else {
203 new_path = make_unique<VertexPath>(*p);
204 }
205
206 new_path->push_back(v);
207 if (open.size() < MAX_OPEN) {
208 open.push_back(std::move(new_path));
209 } else {
210 u32 victim = cProps.rand(0, open.size() - 1);
211 open[victim] = std::move(new_path);
212 }
213 }
214 }
215 DEBUG_PRINTF("bored, going home\n");
216}
217
218namespace {
219
220/** \brief Concrete implementation */
221class CorpusGeneratorImpl : public CorpusGenerator {
222public:
223 CorpusGeneratorImpl(const NGHolder &graph_in, const ExpressionInfo &expr_in,
224 CorpusProperties &props);
225 ~CorpusGeneratorImpl() = default;
226
227 void generateCorpus(vector<string> &data);
228
229private:
230 unsigned char getRandomChar();
231 unsigned char getMatchChar(const CharReach &cr);
232 unsigned char getUnmatchChar(const CharReach &cr);
233
234 unsigned char getChar(NFAVertex v);
235 void newGenerator(vector<string> &data);
236 string pathToCorpus(const VertexPath &path);
237
238 /** \brief Generate a string of random bytes between minLen and maxLen
239 * bytes in length. */
240 void addRandom(const min_max &mm, string *out);
241
242 /** \brief Info about this expression. */
243 const ExpressionInfo &expr;
244
245 /** \brief The NFA graph we operate over. */
246 const NGHolder &graph;
247
248 /** \brief Reference to our corpus generator properties object (stores some
249 * state) */
250 CorpusProperties &cProps;
251};
252
253CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in,
254 const ExpressionInfo &expr_in,
255 CorpusProperties &props)
256 : expr(expr_in), graph(graph_in), cProps(props) {
257 // if this pattern is to be matched approximately
258 if ((expr.edit_distance || expr.hamm_distance) && !props.editDistance) {
259 props.editDistance =
260 props.rand(0, expr.hamm_distance + expr.edit_distance + 1);
261 }
262}
263
264void CorpusGeneratorImpl::generateCorpus(vector<string> &data) {
265 newGenerator(data);
266
267 if (cProps.editDistance && !data.empty() &&
268 data.size() < cProps.corpusLimit) {
269 // Create more entries by copying the corpora and applying edits
270 size_t diff = cProps.corpusLimit - data.size();
271 size_t repeats = diff / data.size();
272 size_t remains = diff % data.size();
273 vector<string> newdata;
274 for (size_t i = 0; i < repeats; i++) {
275 std::copy(data.begin(), data.end(), std::back_inserter(newdata));
276 }
277 if (remains) {
278 std::copy_n(data.begin(), remains, std::back_inserter(newdata));
279 }
280 for (auto &s : newdata) {
281 editCorpus(&s, cProps);
282 }
283 std::move(newdata.begin(), newdata.end(), back_inserter(data));
284 } else if (cProps.editDistance) {
285 // If the caller has asked us, apply edit distance to corpora
286 for (auto &s : data) {
287 editCorpus(&s, cProps);
288 }
289 }
290}
291
292/** \brief Generate a random character, taking care to stick to the alphabet
293 * that we've been asked for. */
294u8 CorpusGeneratorImpl::getRandomChar() {
295 return 'a' + cProps.rand(0, min(cProps.alphabetSize, (u32)CharReach::npos));
296}
297
298/** \brief Select a random character from the given string of valid match
299 * characters. */
300unsigned char CorpusGeneratorImpl::getMatchChar(const CharReach &cr) {
301 unsigned int num = cr.count();
302 if (num == 0) {
303 return 0;
304 } else if (num == 1) {
305 return (unsigned char)cr.find_first();
306 } else if (num == 256) {
307 // Dot class, any character is OK!
308 return (unsigned char)cProps.rand(0, 255);
309 }
310 else {
311 unsigned idx = cProps.rand(0, num - 1);
312 return (unsigned char)cr.find_nth(idx);
313 }
314}
315
316/** \brief Select a character that does not belong to the given bitset. This
317 * makes no guarantees on unmatchability if the bitset is full. */
318unsigned char CorpusGeneratorImpl::getUnmatchChar(const CharReach &cr) {
319 return getMatchChar(~cr);
320}
321
322void CorpusGeneratorImpl::addRandom(const min_max &mm, string *out) {
323 assert(mm.min <= mm.max);
324 u32 range = mm.max - mm.min;
325 u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0);
326 for (u32 i = 0; i < len; ++i) {
327 out->push_back(getRandomChar());
328 }
329}
330
331unsigned char CorpusGeneratorImpl::getChar(NFAVertex v) {
332 const CharReach &cr = graph[v].char_reach;
333
334 switch (cProps.throwDice()) {
335 case CorpusProperties::ROLLED_MATCH:
336 return getMatchChar(cr);
337 case CorpusProperties::ROLLED_UNMATCH:
338 return getUnmatchChar(cr);
339 case CorpusProperties::ROLLED_RANDOM: /* character pulled from hat */
340 return getRandomChar();
341 }
342 assert(0);
343 return 0;
344}
345
346/** \brief Convert a path through the graph to a corpus string. */
347string CorpusGeneratorImpl::pathToCorpus(const VertexPath &path) {
348 string s;
349
350 // Add random prefix
351 if (cProps.prefixRange.max) {
352 addRandom(cProps.prefixRange, &s);
353 }
354
355 // Generate a corpus from our path
356 for (const auto &e : path) {
357 if (!is_special(e, graph)) {
358 s += getChar(e);
359 }
360 }
361
362 // Add random suffix
363 if (cProps.suffixRange.max) {
364 addRandom(cProps.suffixRange, &s);
365 }
366
367 return s;
368}
369
370void CorpusGeneratorImpl::newGenerator(vector<string> &outdata) {
371 const unsigned int maxCycles = cProps.getCycleLimit().second;
372 DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n",
373 cProps.corpusLimit, maxCycles);
374
375 vector<VertexPath> allPaths;
376
377 // Special case: if the graph has ONLY special vertices, then this is
378 // likely to be an odd vacuous pattern or a pattern that can never match.
379 // In these cases, an empty corpus is useful.
380 if (graph_is_empty(graph)) {
381 VertexPath empty(1, graph.start);
382 allPaths.push_back(empty);
383 }
384
385 // build a set of unique paths
386 findPaths(graph, cProps, allPaths, maxCycles, cProps.corpusLimit);
387
388 // transform paths into corpora: we do this repeatedly until we (a) hit our
389 // limit, or (b) don't generate any new corpora for any of our paths.
390 set<string> data;
391 while (data.size() < cProps.corpusLimit) {
392 size_t count = data.size();
393 for (const auto &path : allPaths) {
394 string s = pathToCorpus(path);
395 if (data.insert(s).second) {
396 DEBUG_PRINTF("corpus %zu (%zu bytes): '%s'\n", data.size(),
397 s.size(), escapeString(s).c_str());
398 if (data.size() == cProps.corpusLimit) {
399 goto hit_limit;
400 }
401 }
402 }
403 if (data.size() == count) {
404 break; // we're finding it hard to generate more corpora
405 }
406 }
407
408hit_limit:
409 DEBUG_PRINTF("%zu corpora built\n", data.size());
410
411 // populate the output vector from the set we built.
412 outdata.reserve(data.size());
413 copy(data.begin(), data.end(), back_inserter(outdata));
414}
415
416/** \brief Concrete implementation for UTF-8 */
417class CorpusGeneratorUtf8 : public CorpusGenerator {
418public:
419 CorpusGeneratorUtf8(const NGHolder &graph_in, const ExpressionInfo &expr_in,
420 CorpusProperties &props);
421 ~CorpusGeneratorUtf8() = default;
422
423 void generateCorpus(vector<string> &data);
424
425private:
426 unichar getRandomChar();
427 unichar getMatchChar(CodePointSet cps);
428 unichar getUnmatchChar(const CodePointSet &cps);
429
430 unichar getChar(const CodePointSet &cps);
431 void newGenerator(vector<vector<unichar> > &data);
432 vector<unichar> pathToCorpus(const vector<CodePointSet> &path);
433
434 /** \brief Generate a random string between min and max codepoints in
435 * length. */
436 void addRandom(const min_max &mm, vector<unichar> *out);
437
438 /** \brief Info about this expression. */
439 const ExpressionInfo &expr;
440
441 /** \brief The NFA graph we operate over. */
442 const NGHolder &graph;
443
444 /** \brief Reference to our corpus generator properties object (stores some
445 * state) */
446 CorpusProperties &cProps;
447};
448
449CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in,
450 const ExpressionInfo &expr_in,
451 CorpusProperties &props)
452 : expr(expr_in), graph(graph_in), cProps(props) {
453 // we do not support Utf8 for approximate matching
454 if (expr.edit_distance) {
455 throw CorpusGenerationFailure("UTF-8 for edited patterns is not "
456 "supported.");
457 }
458}
459
460void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
461 vector<vector<unichar>> raw;
462 newGenerator(raw);
463
464 // If the caller has asked us, apply edit distance to corpora
465 if (cProps.editDistance) {
466 for (auto &e : raw) {
467 editCorpus(&e, cProps);
468 }
469 }
470
471 for (const auto &e : raw) {
472 data.push_back(encodeUtf8(e));
473 }
474}
475
476/** \brief Generate a random character, taking care to stick to the alphabet
477 * that we've been asked for. */
478unichar CorpusGeneratorUtf8::getRandomChar() {
479 u32 range = MAX_UNICODE + 1
480 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
481 range = min(cProps.alphabetSize, range);
482 assert(range);
483
484 unichar c = 'a' + cProps.rand(0, range - 1);
485
486 if (c >= UNICODE_SURROGATE_MIN) {
487 c =+ UNICODE_SURROGATE_MAX + 1;
488 }
489
490 return c % (MAX_UNICODE + 1);
491}
492
493/** \brief Select a random character from the given string of valid match
494 * characters. */
495unichar CorpusGeneratorUtf8::getMatchChar(CodePointSet cps) {
496 cps.unsetRange(UNICODE_SURROGATE_MIN, UNICODE_SURROGATE_MAX);
497 u32 num = cps.count();
498 if (num == 0) {
499 return 0;
500 } else if (num == 1) {
501 return lower(*cps.begin());
502 } else {
503 unichar rv = cps.at(cProps.rand(0, num - 1));
504 assert(rv != INVALID_UNICODE);
505 return rv;
506 }
507}
508
509/** \brief Select a character that does not belong to the given bitset. This
510 * makes no guarantees on unmatchability if the bitset is full. */
511unichar CorpusGeneratorUtf8::getUnmatchChar(const CodePointSet &cps) {
512 return getMatchChar(~cps);
513}
514
515void CorpusGeneratorUtf8::addRandom(const min_max &mm, vector<unichar> *out) {
516 assert(mm.min <= mm.max);
517 u32 range = mm.max - mm.min;
518 u32 len = mm.min + (range ? cProps.rand(0, range - 1) : 0);
519 for (u32 i = 0; i < len; ++i) {
520 out->push_back(getRandomChar());
521 }
522}
523
524unichar CorpusGeneratorUtf8::getChar(const CodePointSet &cps) {
525 switch (cProps.throwDice()) {
526 case CorpusProperties::ROLLED_MATCH:
527 return getMatchChar(cps);
528 case CorpusProperties::ROLLED_UNMATCH:
529 return getUnmatchChar(cps);
530 case CorpusProperties::ROLLED_RANDOM: /* character pulled from hat */
531 return getRandomChar();
532 }
533 assert(0);
534 return 0;
535}
536
537/** \brief Convert a path through the graph to a corpus string. */
538vector<unichar>
539CorpusGeneratorUtf8::pathToCorpus(const vector<CodePointSet> &path) {
540 vector<unichar> s;
541
542 // Add random prefix
543 if (cProps.prefixRange.max) {
544 addRandom(cProps.prefixRange, &s);
545 }
546
547 // Generate a corpus from our path
548 for (const auto &e : path) {
549 s.push_back(getChar(e));
550 }
551
552 // Add random suffix
553 if (cProps.suffixRange.max) {
554 addRandom(cProps.suffixRange, &s);
555 }
556
557 return s;
558}
559
560static
561u32 classify_vertex(const NGHolder &g, NFAVertex v) {
562 const CharReach &cr = g[v].char_reach;
563 if (cr.isSubsetOf(UTF_ASCII_CR)) {
564 return 1;
565 } else if (cr.isSubsetOf(UTF_TWO_START_CR)) {
566 return 2;
567 } else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
568 return 3;
569 } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
570 return 4;
571 }
572
573 /* this can happen due to dummy vertices from zwa */
574 return 1;
575}
576
577static
578void fillCodePointSet(const CharReach &cr, CodePointSet *out, u8 mask = 0xff) {
579 for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
580 out->set(i & mask);
581 }
582}
583
584static
585void expandCodePointSet(const CharReach &cr, CodePointSet *out, u32 mask,
586 u32 n) {
587 CodePointSet base;
588 base.swap(*out);
589 for (u32 i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
590 u32 val = (i & mask) << (n * UTF_CONT_SHIFT);
591 for (const auto &cp : base) {
592 unichar ll = lower(cp);
593 unichar uu = upper(cp);
594 out->setRange(val + ll, MIN(val + uu, MAX_UNICODE));
595 }
596 }
597}
598
599static
600void decodePath(const NGHolder &g, const VertexPath &in,
601 vector<CodePointSet> &out) {
602 VertexPath::const_iterator it = in.begin();
603 while (it != in.end()) {
604 if (is_special(*it, g)) {
605 ++it;
606 continue;
607 }
608
609 out.push_back(CodePointSet());
610 CodePointSet &cps = out.back();
611
612 switch (classify_vertex(g, *it)) {
613 case 1:
614 fillCodePointSet(g[*it].char_reach, &cps);
615 ++it;
616 break;
617 case 2:
618 fillCodePointSet(g[*(it + 1)].char_reach, &cps,
619 UTF_CONT_BYTE_VALUE_MASK);
620 expandCodePointSet(g[*it].char_reach, &cps,
621 ~UTF_TWO_BYTE_HEADER, 1);
622 it += 2;
623 break;
624 case 3:
625 fillCodePointSet(g[*(it + 2)].char_reach, &cps,
626 UTF_CONT_BYTE_VALUE_MASK);
627 expandCodePointSet(g[*(it + 1)].char_reach, &cps,
628 UTF_CONT_BYTE_VALUE_MASK, 1);
629 expandCodePointSet(g[*it].char_reach, &cps,
630 ~UTF_THREE_BYTE_HEADER, 2);
631 it += 3;
632 break;
633 case 4:
634 fillCodePointSet(g[*(it + 3)].char_reach, &cps,
635 UTF_CONT_BYTE_VALUE_MASK);
636 expandCodePointSet(g[*(it + 2)].char_reach, &cps,
637 UTF_CONT_BYTE_VALUE_MASK, 1);
638 expandCodePointSet(g[*(it + 1)].char_reach, &cps,
639 UTF_CONT_BYTE_VALUE_MASK, 2);
640 expandCodePointSet(g[*it].char_reach, &cps,
641 ~UTF_FOUR_BYTE_HEADER, 3);
642 it += 4;
643 break;
644 default:;
645 assert(0);
646 ++it;
647 }
648 }
649}
650
651static
652void translatePaths(const NGHolder &graph,
653 const vector<VertexPath> &allPathsTemp,
654 vector<vector<CodePointSet>> *out) {
655 assert(out);
656 for (const auto &path : allPathsTemp) {
657 out->push_back(vector<CodePointSet>());
658 decodePath(graph, path, out->back());
659 }
660}
661
662void CorpusGeneratorUtf8::newGenerator(vector<vector<unichar>> &outdata) {
663 const u32 maxCycles = cProps.getCycleLimit().second;
664 DEBUG_PRINTF("generating up to %u corpora, cycle limit of %u\n",
665 cProps.corpusLimit, maxCycles);
666
667 vector<vector<CodePointSet>> allPaths;
668
669 // Special case: if the graph has ONLY special vertices, then this is
670 // likely to be an odd vacuous pattern or a pattern that can never match.
671 // In these cases, an empty corpus is useful.
672 if (graph_is_empty(graph)) {
673 allPaths.push_back(vector<CodePointSet>());
674 } else {
675 // build a set of unique paths
676 vector<VertexPath> allPathsTemp;
677 findPaths(graph, cProps, allPathsTemp, maxCycles, cProps.corpusLimit);
678 translatePaths(graph, allPathsTemp, &allPaths);
679 }
680
681 // transform paths into corpora: we do this repeatedly until we (a) hit our
682 // limit, or (b) don't generate any new corpora for any of our paths.
683 set<vector<unichar> > data;
684 while (data.size() < cProps.corpusLimit) {
685 size_t count = data.size();
686 for (const auto &path : allPaths) {
687 vector<unichar> vu = pathToCorpus(path);
688 if (data.insert(vu).second) {
689 if (data.size() == cProps.corpusLimit) {
690 goto hit_limit;
691 }
692 }
693 }
694 if (data.size() == count) {
695 break; // we're finding it hard to generate more corpora
696 }
697 }
698
699hit_limit:
700 DEBUG_PRINTF("%zu corpora built\n", data.size());
701
702 // populate the output vector from the set we built.
703 outdata.reserve(data.size());
704 copy(data.begin(), data.end(), back_inserter(outdata));
705}
706
707} // namespace
708
709CorpusGenerator::~CorpusGenerator() { }
710
711// External entry point
712
713unique_ptr<CorpusGenerator> makeCorpusGenerator(const NGHolder &graph,
714 const ExpressionInfo &expr,
715 CorpusProperties &props) {
716 if (expr.utf8) {
717 return ue2::make_unique<CorpusGeneratorUtf8>(graph, expr, props);
718 } else {
719 return ue2::make_unique<CorpusGeneratorImpl>(graph, expr, props);
720 }
721}
722