1/*
2 * Copyright (c) 2015, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
 * \brief Corpus Editor: applies random transformations to a corpus.
31 */
32
33#include "config.h"
34
35#include "ng_corpus_editor.h"
36#include "ng_corpus_properties.h"
37#include "ue2common.h"
38#include "util/compare.h"
39#include "util/unicode_def.h"
40#include "parser/ucp_table.h"
41
42#include <algorithm>
43#include <cassert>
44#include <string>
45
46using namespace std;
47using namespace ue2;
48
49namespace {
50
/**
 * \brief Kinds of single edit that can be applied to a corpus.
 *
 * The enumerators are numbered contiguously starting at zero, so an operation
 * can be selected by drawing an integer between EDIT_INSERT and
 * EDIT_FLIP_CASE.
 */
enum Operation {
    EDIT_INSERT = 0, //!< add one new character
    EDIT_REMOVE,     //!< delete one character
    EDIT_SUBSTITUTE, //!< overwrite one character with another
    EDIT_TRANSPOSE,  //!< exchange the characters at two positions
    EDIT_FLIP_CASE,  //!< toggle the case of an alpha character
};
58
59template<typename SeqT>
60static
61size_t choosePosition(const SeqT &corpus, CorpusProperties &props) {
62 assert(!corpus.empty());
63 unsigned pos = props.rand(0, corpus.size() - 1);
64 return pos;
65}
66
67class CorpusEditor {
68public:
69 CorpusEditor(CorpusProperties &p) : props(p) {}
70
71 // Apply edits to a corpus
72 void applyEdits(string &corpus);
73
74private:
75 // operations
76 void insert(string &corpus);
77 void remove(string &corpus);
78 void substitute(string &corpus);
79 void transpose(string &corpus);
80 void flip_case(string &corpus);
81
82 Operation chooseOperation();
83 u8 chooseByte();
84
85 CorpusProperties &props;
86};
87
88Operation CorpusEditor::chooseOperation() {
89 return (Operation)props.rand(EDIT_INSERT, EDIT_FLIP_CASE);
90}
91
92void CorpusEditor::applyEdits(string &corpus) {
93 for (size_t i = 0; i != props.editDistance; i++) {
94 Operation op = chooseOperation();
95 switch (op) {
96 case EDIT_INSERT:
97 insert(corpus);
98 break;
99 case EDIT_REMOVE:
100 remove(corpus);
101 break;
102 case EDIT_SUBSTITUTE:
103 substitute(corpus);
104 break;
105 case EDIT_TRANSPOSE:
106 transpose(corpus);
107 break;
108 case EDIT_FLIP_CASE:
109 flip_case(corpus);
110 break;
111 }
112 }
113}
114
115void CorpusEditor::insert(string &corpus) {
116 unsigned pos = props.rand(0, corpus.size());
117 u8 c = chooseByte();
118 corpus.insert(pos, 1, (char)c);
119}
120
121void CorpusEditor::remove(string &corpus) {
122 if (corpus.empty()) return;
123 size_t pos = choosePosition(corpus, props);
124 corpus.erase(pos, 1);
125}
126
127void CorpusEditor::substitute(string &corpus) {
128 if (corpus.empty()) return;
129 size_t pos = choosePosition(corpus, props);
130 corpus[pos] = chooseByte();
131}
132
133void CorpusEditor::transpose(string &corpus) {
134 if (corpus.empty()) return;
135 size_t a = choosePosition(corpus, props);
136 size_t b = choosePosition(corpus, props);
137 u8 tmp = corpus[a];
138 corpus[a] = corpus[b];
139 corpus[b] = tmp;
140}
141
142void CorpusEditor::flip_case(string &corpus) {
143 if (corpus.empty()) return;
144
145 // Pick a random starting position and walk forward (wrapping at the end)
146 // until we find an alpha character.
147 const size_t len = corpus.size();
148 const size_t pos = choosePosition(corpus, props);
149
150 size_t i = pos;
151 for (;;) {
152 char c = corpus[i];
153 if (ourisalpha(c)) {
154 char upper = mytoupper(c), lower = mytolower(c);
155 corpus[i] = c == upper ? lower : upper;
156 DEBUG_PRINTF("flipped c=%c to %c\n", c, corpus[i]);
157 return;
158 }
159 if (++i == len) {
160 i = 0;
161 }
162 if (i == pos) { // wrapped, no alpha characters
163 break;
164 }
165 }
166}
167
168u8 CorpusEditor::chooseByte() {
169 return (u8)props.rand(0, 255);
170}
171
172class CorpusEditorUtf8 {
173public:
174 CorpusEditorUtf8(CorpusProperties &p) : props(p) {}
175
176 // Apply edits to a corpus.
177 void applyEdits(vector<unichar> &corpus);
178
179private:
180 // operations
181 void insert(vector<unichar> &corpus);
182 void remove(vector<unichar> &corpus);
183 void substitute(vector<unichar> &corpus);
184 void transpose(vector<unichar> &corpus);
185 void flip_case(vector<unichar> &corpus);
186
187 Operation chooseOperation();
188 unichar chooseCodePoint();
189
190 CorpusProperties &props;
191};
192
193Operation CorpusEditorUtf8::chooseOperation() {
194 return (Operation)props.rand(EDIT_INSERT, EDIT_FLIP_CASE);
195}
196
197void CorpusEditorUtf8::applyEdits(vector<unichar> &corpus) {
198 for (size_t i = 0; i != props.editDistance; i++) {
199 Operation op = chooseOperation();
200 switch (op) {
201 case EDIT_INSERT:
202 insert(corpus);
203 break;
204 case EDIT_REMOVE:
205 remove(corpus);
206 break;
207 case EDIT_SUBSTITUTE:
208 substitute(corpus);
209 break;
210 case EDIT_TRANSPOSE:
211 transpose(corpus);
212 break;
213 case EDIT_FLIP_CASE:
214 flip_case(corpus);
215 break;
216 }
217 }
218}
219
220void CorpusEditorUtf8::insert(vector<unichar> &corpus) {
221 unsigned pos = props.rand(0, corpus.size());
222 corpus.insert(corpus.begin() + pos, chooseCodePoint());
223}
224
225void CorpusEditorUtf8::remove(vector<unichar> &corpus) {
226 if (corpus.empty()) return;
227 size_t pos = choosePosition(corpus, props);
228 corpus.erase(corpus.begin() + pos);
229}
230
231void CorpusEditorUtf8::substitute(vector<unichar> &corpus) {
232 if (corpus.empty()) return;
233 size_t pos = choosePosition(corpus, props);
234 corpus[pos] = chooseCodePoint();
235}
236
237void CorpusEditorUtf8::transpose(vector<unichar> &corpus) {
238 if (corpus.empty()) return;
239 size_t a = choosePosition(corpus, props);
240 size_t b = choosePosition(corpus, props);
241 unichar tmp = corpus[a];
242 corpus[a] = corpus[b];
243 corpus[b] = tmp;
244}
245
246void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) {
247 if (corpus.empty()) return;
248
249 // Pick a random starting position and walk forward (wrapping at the end)
250 // until we find an alpha character.
251 const size_t len = corpus.size();
252 const size_t pos = choosePosition(corpus, props);
253
254 size_t i = pos;
255 for (;;) {
256 if (::flip_case(&corpus[i])) {
257 return;
258 }
259 if (++i == len) {
260 i = 0;
261 }
262 if (i == pos) { // wrapped, no alpha characters
263 break;
264 }
265 }
266}
267
268unichar CorpusEditorUtf8::chooseCodePoint(void) {
269 /* We need to ensure that we don't pick a surrogate cp */
270 const u32 range =
271 MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
272 unichar raw = props.rand(0, range - 1);
273 if (raw < UNICODE_SURROGATE_MIN) {
274 return raw;
275 } else {
276 return raw + UNICODE_SURROGATE_MAX + 1;
277 }
278}
279
280} // namespace
281
282void editCorpus(string *corpus, CorpusProperties &props) {
283 CorpusEditor ed(props);
284 ed.applyEdits(*corpus);
285}
286
287void editCorpus(vector<unichar> *corpus, CorpusProperties &props) {
288 CorpusEditorUtf8 ed(props);
289 ed.applyEdits(*corpus);
290}
291