1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 1999-2011, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 11/17/99 aliu Creation. |
10 | ********************************************************************** |
11 | */ |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_TRANSLITERATION |
16 | |
17 | #include "unicode/rep.h" |
18 | #include "unicode/unifilt.h" |
19 | #include "unicode/uniset.h" |
20 | #include "unicode/utf16.h" |
21 | #include "rbt_rule.h" |
22 | #include "rbt_data.h" |
23 | #include "cmemory.h" |
24 | #include "strmatch.h" |
25 | #include "strrepl.h" |
26 | #include "util.h" |
27 | #include "putilimp.h" |
28 | |
29 | static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " |
30 | |
31 | U_NAMESPACE_BEGIN |
32 | |
33 | /** |
34 | * Construct a new rule with the given input, output text, and other |
35 | * attributes. A cursor position may be specified for the output text. |
36 | * @param input input string, including key and optional ante and |
37 | * post context |
38 | * @param anteContextPos offset into input to end of ante context, or -1 if |
39 | * none. Must be <= input.length() if not -1. |
40 | * @param postContextPos offset into input to start of post context, or -1 |
41 | * if none. Must be <= input.length() if not -1, and must be >= |
42 | * anteContextPos. |
43 | * @param output output string |
44 | * @param cursorPosition offset into output at which cursor is located, or -1 if |
45 | * none. If less than zero, then the cursor is placed after the |
46 | * <code>output</code>; that is, -1 is equivalent to |
47 | * <code>output.length()</code>. If greater than |
48 | * <code>output.length()</code> then an exception is thrown. |
49 | * @param segs array of UnicodeFunctors corresponding to input pattern |
50 | * segments, or null if there are none. The array itself is adopted, |
51 | * but the pointers within it are not. |
52 | * @param segsCount number of elements in segs[] |
53 | * @param anchorStart TRUE if the the rule is anchored on the left to |
54 | * the context start |
55 | * @param anchorEnd TRUE if the rule is anchored on the right to the |
56 | * context limit |
57 | */ |
58 | TransliterationRule::TransliterationRule(const UnicodeString& input, |
59 | int32_t anteContextPos, int32_t postContextPos, |
60 | const UnicodeString& outputStr, |
61 | int32_t cursorPosition, int32_t cursorOffset, |
62 | UnicodeFunctor** segs, |
63 | int32_t segsCount, |
64 | UBool anchorStart, UBool anchorEnd, |
65 | const TransliterationRuleData* theData, |
66 | UErrorCode& status) : |
67 | UMemory(), |
68 | segments(0), |
69 | data(theData) { |
70 | |
71 | if (U_FAILURE(status)) { |
72 | return; |
73 | } |
74 | // Do range checks only when warranted to save time |
75 | if (anteContextPos < 0) { |
76 | anteContextLength = 0; |
77 | } else { |
78 | if (anteContextPos > input.length()) { |
79 | // throw new IllegalArgumentException("Invalid ante context"); |
80 | status = U_ILLEGAL_ARGUMENT_ERROR; |
81 | return; |
82 | } |
83 | anteContextLength = anteContextPos; |
84 | } |
85 | if (postContextPos < 0) { |
86 | keyLength = input.length() - anteContextLength; |
87 | } else { |
88 | if (postContextPos < anteContextLength || |
89 | postContextPos > input.length()) { |
90 | // throw new IllegalArgumentException("Invalid post context"); |
91 | status = U_ILLEGAL_ARGUMENT_ERROR; |
92 | return; |
93 | } |
94 | keyLength = postContextPos - anteContextLength; |
95 | } |
96 | if (cursorPosition < 0) { |
97 | cursorPosition = outputStr.length(); |
98 | } else if (cursorPosition > outputStr.length()) { |
99 | // throw new IllegalArgumentException("Invalid cursor position"); |
100 | status = U_ILLEGAL_ARGUMENT_ERROR; |
101 | return; |
102 | } |
103 | // We don't validate the segments array. The caller must |
104 | // guarantee that the segments are well-formed (that is, that |
105 | // all $n references in the output refer to indices of this |
106 | // array, and that no array elements are null). |
107 | this->segments = segs; |
108 | this->segmentsCount = segsCount; |
109 | |
110 | pattern = input; |
111 | flags = 0; |
112 | if (anchorStart) { |
113 | flags |= ANCHOR_START; |
114 | } |
115 | if (anchorEnd) { |
116 | flags |= ANCHOR_END; |
117 | } |
118 | |
119 | anteContext = NULL; |
120 | if (anteContextLength > 0) { |
121 | anteContext = new StringMatcher(pattern, 0, anteContextLength, |
122 | FALSE, *data); |
123 | /* test for NULL */ |
124 | if (anteContext == 0) { |
125 | status = U_MEMORY_ALLOCATION_ERROR; |
126 | return; |
127 | } |
128 | } |
129 | |
130 | key = NULL; |
131 | if (keyLength > 0) { |
132 | key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, |
133 | FALSE, *data); |
134 | /* test for NULL */ |
135 | if (key == 0) { |
136 | status = U_MEMORY_ALLOCATION_ERROR; |
137 | return; |
138 | } |
139 | } |
140 | |
141 | int32_t postContextLength = pattern.length() - keyLength - anteContextLength; |
142 | postContext = NULL; |
143 | if (postContextLength > 0) { |
144 | postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), |
145 | FALSE, *data); |
146 | /* test for NULL */ |
147 | if (postContext == 0) { |
148 | status = U_MEMORY_ALLOCATION_ERROR; |
149 | return; |
150 | } |
151 | } |
152 | |
153 | this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); |
154 | /* test for NULL */ |
155 | if (this->output == 0) { |
156 | status = U_MEMORY_ALLOCATION_ERROR; |
157 | return; |
158 | } |
159 | } |
160 | |
161 | /** |
162 | * Copy constructor. |
163 | */ |
164 | TransliterationRule::TransliterationRule(TransliterationRule& other) : |
165 | UMemory(other), |
166 | anteContext(NULL), |
167 | key(NULL), |
168 | postContext(NULL), |
169 | pattern(other.pattern), |
170 | anteContextLength(other.anteContextLength), |
171 | keyLength(other.keyLength), |
172 | flags(other.flags), |
173 | data(other.data) { |
174 | |
175 | segments = NULL; |
176 | segmentsCount = 0; |
177 | if (other.segmentsCount > 0) { |
178 | segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); |
179 | uprv_memcpy(segments, other.segments, (size_t)other.segmentsCount*sizeof(segments[0])); |
180 | } |
181 | |
182 | if (other.anteContext != NULL) { |
183 | anteContext = other.anteContext->clone(); |
184 | } |
185 | if (other.key != NULL) { |
186 | key = other.key->clone(); |
187 | } |
188 | if (other.postContext != NULL) { |
189 | postContext = other.postContext->clone(); |
190 | } |
191 | output = other.output->clone(); |
192 | } |
193 | |
194 | TransliterationRule::~TransliterationRule() { |
195 | uprv_free(segments); |
196 | delete anteContext; |
197 | delete key; |
198 | delete postContext; |
199 | delete output; |
200 | } |
201 | |
202 | /** |
203 | * Return the preceding context length. This method is needed to |
204 | * support the <code>Transliterator</code> method |
205 | * <code>getMaximumContextLength()</code>. Internally, this is |
206 | * implemented as the anteContextLength, optionally plus one if |
207 | * there is a start anchor. The one character anchor gap is |
208 | * needed to make repeated incremental transliteration with |
209 | * anchors work. |
210 | */ |
211 | int32_t TransliterationRule::getContextLength(void) const { |
212 | return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); |
213 | } |
214 | |
215 | /** |
216 | * Internal method. Returns 8-bit index value for this rule. |
217 | * This is the low byte of the first character of the key, |
218 | * unless the first character of the key is a set. If it's a |
219 | * set, or otherwise can match multiple keys, the index value is -1. |
220 | */ |
221 | int16_t TransliterationRule::getIndexValue() const { |
222 | if (anteContextLength == pattern.length()) { |
223 | // A pattern with just ante context {such as foo)>bar} can |
224 | // match any key. |
225 | return -1; |
226 | } |
227 | UChar32 c = pattern.char32At(anteContextLength); |
228 | return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); |
229 | } |
230 | |
231 | /** |
232 | * Internal method. Returns true if this rule matches the given |
233 | * index value. The index value is an 8-bit integer, 0..255, |
234 | * representing the low byte of the first character of the key. |
235 | * It matches this rule if it matches the first character of the |
236 | * key, or if the first character of the key is a set, and the set |
237 | * contains any character with a low byte equal to the index |
238 | * value. If the rule contains only ante context, as in foo)>bar, |
239 | * then it will match any key. |
240 | */ |
241 | UBool TransliterationRule::matchesIndexValue(uint8_t v) const { |
242 | // Delegate to the key, or if there is none, to the postContext. |
243 | // If there is neither then we match any key; return true. |
244 | UnicodeMatcher *m = (key != NULL) ? key : postContext; |
245 | return (m != NULL) ? m->matchesIndexValue(v) : TRUE; |
246 | } |
247 | |
248 | /** |
249 | * Return true if this rule masks another rule. If r1 masks r2 then |
250 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks |
251 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". |
252 | * "[c]a>x" masks "[dc]a>y". |
253 | */ |
254 | UBool TransliterationRule::masks(const TransliterationRule& r2) const { |
255 | /* Rule r1 masks rule r2 if the string formed of the |
256 | * antecontext, key, and postcontext overlaps in the following |
257 | * way: |
258 | * |
259 | * r1: aakkkpppp |
260 | * r2: aaakkkkkpppp |
261 | * ^ |
262 | * |
263 | * The strings must be aligned at the first character of the |
264 | * key. The length of r1 to the left of the alignment point |
265 | * must be <= the length of r2 to the left; ditto for the |
266 | * right. The characters of r1 must equal (or be a superset |
267 | * of) the corresponding characters of r2. The superset |
268 | * operation should be performed to check for UnicodeSet |
269 | * masking. |
270 | * |
271 | * Anchors: Two patterns that differ only in anchors only |
272 | * mask one another if they are exactly equal, and r2 has |
273 | * all the anchors r1 has (optionally, plus some). Here Y |
274 | * means the row masks the column, N means it doesn't. |
275 | * |
276 | * ab ^ab ab$ ^ab$ |
277 | * ab Y Y Y Y |
278 | * ^ab N Y N Y |
279 | * ab$ N N Y Y |
280 | * ^ab$ N N N Y |
281 | * |
282 | * Post context: {a}b masks ab, but not vice versa, since {a}b |
283 | * matches everything ab matches, and {a}b matches {|a|}b but ab |
284 | * does not. Pre context is different (a{b} does not align with |
285 | * ab). |
286 | */ |
287 | |
288 | /* LIMITATION of the current mask algorithm: Some rule |
289 | * maskings are currently not detected. For example, |
290 | * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO |
291 | */ |
292 | |
293 | int32_t len = pattern.length(); |
294 | int32_t left = anteContextLength; |
295 | int32_t left2 = r2.anteContextLength; |
296 | int32_t right = len - left; |
297 | int32_t right2 = r2.pattern.length() - left2; |
298 | int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); |
299 | |
300 | // TODO Clean this up -- some logic might be combinable with the |
301 | // next statement. |
302 | |
303 | // Test for anchor masking |
304 | if (left == left2 && right == right2 && |
305 | keyLength <= r2.keyLength && |
306 | 0 == cachedCompare) { |
307 | // The following boolean logic implements the table above |
308 | return (flags == r2.flags) || |
309 | (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || |
310 | ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); |
311 | } |
312 | |
313 | return left <= left2 && |
314 | (right < right2 || |
315 | (right == right2 && keyLength <= r2.keyLength)) && |
316 | (0 == cachedCompare); |
317 | } |
318 | |
319 | static inline int32_t posBefore(const Replaceable& str, int32_t pos) { |
320 | return (pos > 0) ? |
321 | pos - U16_LENGTH(str.char32At(pos-1)) : |
322 | pos - 1; |
323 | } |
324 | |
325 | static inline int32_t posAfter(const Replaceable& str, int32_t pos) { |
326 | return (pos >= 0 && pos < str.length()) ? |
327 | pos + U16_LENGTH(str.char32At(pos)) : |
328 | pos + 1; |
329 | } |
330 | |
331 | /** |
332 | * Attempt a match and replacement at the given position. Return |
333 | * the degree of match between this rule and the given text. The |
334 | * degree of match may be mismatch, a partial match, or a full |
335 | * match. A mismatch means at least one character of the text |
336 | * does not match the context or key. A partial match means some |
337 | * context and key characters match, but the text is not long |
338 | * enough to match all of them. A full match means all context |
339 | * and key characters match. |
340 | * |
341 | * If a full match is obtained, perform a replacement, update pos, |
342 | * and return U_MATCH. Otherwise both text and pos are unchanged. |
343 | * |
344 | * @param text the text |
345 | * @param pos the position indices |
346 | * @param incremental if TRUE, test for partial matches that may |
347 | * be completed by additional text inserted at pos.limit. |
348 | * @return one of <code>U_MISMATCH</code>, |
349 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If |
350 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. |
351 | */ |
352 | UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, |
353 | UTransPosition& pos, |
354 | UBool incremental) const { |
355 | // Matching and replacing are done in one method because the |
356 | // replacement operation needs information obtained during the |
357 | // match. Another way to do this is to have the match method |
358 | // create a match result struct with relevant offsets, and to pass |
359 | // this into the replace method. |
360 | |
361 | // ============================ MATCH =========================== |
362 | |
363 | // Reset segment match data |
364 | if (segments != NULL) { |
365 | for (int32_t i=0; i<segmentsCount; ++i) { |
366 | ((StringMatcher*) segments[i])->resetMatch(); |
367 | } |
368 | } |
369 | |
370 | // int32_t lenDelta, keyLimit; |
371 | int32_t keyLimit; |
372 | |
373 | // ------------------------ Ante Context ------------------------ |
374 | |
375 | // A mismatch in the ante context, or with the start anchor, |
376 | // is an outright U_MISMATCH regardless of whether we are |
377 | // incremental or not. |
378 | int32_t oText; // offset into 'text' |
379 | // int32_t newStart = 0; |
380 | int32_t minOText; |
381 | |
382 | // Note (1): We process text in 16-bit code units, rather than |
383 | // 32-bit code points. This works because stand-ins are |
384 | // always in the BMP and because we are doing a literal match |
385 | // operation, which can be done 16-bits at a time. |
386 | |
387 | int32_t anteLimit = posBefore(text, pos.contextStart); |
388 | |
389 | UMatchDegree match; |
390 | |
391 | // Start reverse match at char before pos.start |
392 | oText = posBefore(text, pos.start); |
393 | |
394 | if (anteContext != NULL) { |
395 | match = anteContext->matches(text, oText, anteLimit, FALSE); |
396 | if (match != U_MATCH) { |
397 | return U_MISMATCH; |
398 | } |
399 | } |
400 | |
401 | minOText = posAfter(text, oText); |
402 | |
403 | // ------------------------ Start Anchor ------------------------ |
404 | |
405 | if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { |
406 | return U_MISMATCH; |
407 | } |
408 | |
409 | // -------------------- Key and Post Context -------------------- |
410 | |
411 | oText = pos.start; |
412 | |
413 | if (key != NULL) { |
414 | match = key->matches(text, oText, pos.limit, incremental); |
415 | if (match != U_MATCH) { |
416 | return match; |
417 | } |
418 | } |
419 | |
420 | keyLimit = oText; |
421 | |
422 | if (postContext != NULL) { |
423 | if (incremental && keyLimit == pos.limit) { |
424 | // The key matches just before pos.limit, and there is |
425 | // a postContext. Since we are in incremental mode, |
426 | // we must assume more characters may be inserted at |
427 | // pos.limit -- this is a partial match. |
428 | return U_PARTIAL_MATCH; |
429 | } |
430 | |
431 | match = postContext->matches(text, oText, pos.contextLimit, incremental); |
432 | if (match != U_MATCH) { |
433 | return match; |
434 | } |
435 | } |
436 | |
437 | // ------------------------- Stop Anchor ------------------------ |
438 | |
439 | if (((flags & ANCHOR_END)) != 0) { |
440 | if (oText != pos.contextLimit) { |
441 | return U_MISMATCH; |
442 | } |
443 | if (incremental) { |
444 | return U_PARTIAL_MATCH; |
445 | } |
446 | } |
447 | |
448 | // =========================== REPLACE ========================== |
449 | |
450 | // We have a full match. The key is between pos.start and |
451 | // keyLimit. |
452 | |
453 | int32_t newStart; |
454 | int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); |
455 | int32_t lenDelta = newLength - (keyLimit - pos.start); |
456 | |
457 | oText += lenDelta; |
458 | pos.limit += lenDelta; |
459 | pos.contextLimit += lenDelta; |
460 | // Restrict new value of start to [minOText, min(oText, pos.limit)]. |
461 | pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); |
462 | return U_MATCH; |
463 | } |
464 | |
465 | /** |
466 | * Create a source string that represents this rule. Append it to the |
467 | * given string. |
468 | */ |
469 | UnicodeString& TransliterationRule::toRule(UnicodeString& rule, |
470 | UBool escapeUnprintable) const { |
471 | |
472 | // Accumulate special characters (and non-specials following them) |
473 | // into quoteBuf. Append quoteBuf, within single quotes, when |
474 | // a non-quoted element must be inserted. |
475 | UnicodeString str, quoteBuf; |
476 | |
477 | // Do not emit the braces '{' '}' around the pattern if there |
478 | // is neither anteContext nor postContext. |
479 | UBool emitBraces = |
480 | (anteContext != NULL) || (postContext != NULL); |
481 | |
482 | // Emit start anchor |
483 | if ((flags & ANCHOR_START) != 0) { |
484 | rule.append((UChar)94/*^*/); |
485 | } |
486 | |
487 | // Emit the input pattern |
488 | ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); |
489 | |
490 | if (emitBraces) { |
491 | ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); |
492 | } |
493 | |
494 | ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); |
495 | |
496 | if (emitBraces) { |
497 | ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); |
498 | } |
499 | |
500 | ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); |
501 | |
502 | // Emit end anchor |
503 | if ((flags & ANCHOR_END) != 0) { |
504 | rule.append((UChar)36/*$*/); |
505 | } |
506 | |
507 | ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); |
508 | |
509 | // Emit the output pattern |
510 | |
511 | ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), |
512 | TRUE, escapeUnprintable, quoteBuf); |
513 | |
514 | ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); |
515 | |
516 | return rule; |
517 | } |
518 | |
519 | void TransliterationRule::setData(const TransliterationRuleData* d) { |
520 | data = d; |
521 | if (anteContext != NULL) anteContext->setData(d); |
522 | if (postContext != NULL) postContext->setData(d); |
523 | if (key != NULL) key->setData(d); |
524 | // assert(output != NULL); |
525 | output->setData(d); |
526 | // Don't have to do segments since they are in the context or key |
527 | } |
528 | |
529 | /** |
530 | * Union the set of all characters that may be modified by this rule |
531 | * into the given set. |
532 | */ |
533 | void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { |
534 | int32_t limit = anteContextLength + keyLength; |
535 | for (int32_t i=anteContextLength; i<limit; ) { |
536 | UChar32 ch = pattern.char32At(i); |
537 | i += U16_LENGTH(ch); |
538 | const UnicodeMatcher* matcher = data->lookupMatcher(ch); |
539 | if (matcher == NULL) { |
540 | toUnionTo.add(ch); |
541 | } else { |
542 | matcher->addMatchSetTo(toUnionTo); |
543 | } |
544 | } |
545 | } |
546 | |
547 | /** |
548 | * Union the set of all characters that may be emitted by this rule |
549 | * into the given set. |
550 | */ |
551 | void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { |
552 | output->toReplacer()->addReplacementSetTo(toUnionTo); |
553 | } |
554 | |
555 | U_NAMESPACE_END |
556 | |
557 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
558 | |
559 | //eof |
560 | |