1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 1999-2011, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 11/17/99 aliu Creation. |
10 | ********************************************************************** |
11 | */ |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_TRANSLITERATION |
16 | |
17 | #include "unicode/unifilt.h" |
18 | #include "unicode/uniset.h" |
19 | #include "cpdtrans.h" |
20 | #include "uvector.h" |
21 | #include "tridpars.h" |
22 | #include "cmemory.h" |
23 | |
// File-local character constants used when building compound IDs and rule
// text in this file.
// keep in sync with Transliterator
//static const UChar ID_SEP = 0x002D; /*-*/
static const UChar ID_DELIM = 0x003B; /*;*/ // placed between component IDs and after each emitted rule
static const UChar NEWLINE = 10; // line feed; toRules() puts one between consecutive rules

static const UChar COLON_COLON[] = {0x3A, 0x3A, 0}; //"::" -- written before the global filter in toRules()
30 | |
31 | U_NAMESPACE_BEGIN |
32 | |
// ID prefix carried by anonymous rule-based components; toRules() compares
// the first five code units of each component ID against this string.
const UChar CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass"

// Expands the RTTI implementation macro for CompoundTransliterator.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator)
36 | |
37 | /** |
38 | * Constructs a new compound transliterator given an array of |
39 | * transliterators. The array of transliterators may be of any |
40 | * length, including zero or one, however, useful compound |
41 | * transliterators have at least two components. |
42 | * @param transliterators array of <code>Transliterator</code> |
43 | * objects |
44 | * @param transliteratorCount The number of |
45 | * <code>Transliterator</code> objects in transliterators. |
46 | * @param filter the filter. Any character for which |
47 | * <tt>filter.contains()</tt> returns <tt>false</tt> will not be |
48 | * altered by this transliterator. If <tt>filter</tt> is |
49 | * <tt>null</tt> then no filtering is applied. |
50 | */ |
51 | CompoundTransliterator::CompoundTransliterator( |
52 | Transliterator* const transliterators[], |
53 | int32_t transliteratorCount, |
54 | UnicodeFilter* adoptedFilter) : |
55 | Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), |
56 | trans(0), count(0), numAnonymousRBTs(0) { |
57 | setTransliterators(transliterators, transliteratorCount); |
58 | } |
59 | |
60 | /** |
61 | * Splits an ID of the form "ID;ID;..." into a compound using each |
62 | * of the IDs. |
63 | * @param id of above form |
64 | * @param forward if false, does the list in reverse order, and |
65 | * takes the inverse of each ID. |
66 | */ |
67 | CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
68 | UTransDirection direction, |
69 | UnicodeFilter* adoptedFilter, |
70 | UParseError& /*parseError*/, |
71 | UErrorCode& status) : |
72 | Transliterator(id, adoptedFilter), |
73 | trans(0), numAnonymousRBTs(0) { |
74 | // TODO add code for parseError...currently unused, but |
75 | // later may be used by parsing code... |
76 | init(id, direction, TRUE, status); |
77 | } |
78 | |
79 | CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
80 | UParseError& /*parseError*/, |
81 | UErrorCode& status) : |
82 | Transliterator(id, 0), // set filter to 0 here! |
83 | trans(0), numAnonymousRBTs(0) { |
84 | // TODO add code for parseError...currently unused, but |
85 | // later may be used by parsing code... |
86 | init(id, UTRANS_FORWARD, TRUE, status); |
87 | } |
88 | |
89 | |
90 | /** |
91 | * Private constructor for use of TransliteratorAlias |
92 | */ |
93 | CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID, |
94 | UVector& list, |
95 | UnicodeFilter* adoptedFilter, |
96 | int32_t anonymousRBTs, |
97 | UParseError& /*parseError*/, |
98 | UErrorCode& status) : |
99 | Transliterator(newID, adoptedFilter), |
100 | trans(0), numAnonymousRBTs(anonymousRBTs) |
101 | { |
102 | init(list, UTRANS_FORWARD, FALSE, status); |
103 | } |
104 | |
105 | /** |
106 | * Private constructor for Transliterator from a vector of |
107 | * transliterators. The caller is responsible for fixing up the |
108 | * ID. |
109 | */ |
110 | CompoundTransliterator::CompoundTransliterator(UVector& list, |
111 | UParseError& /*parseError*/, |
112 | UErrorCode& status) : |
113 | Transliterator(UnicodeString(), NULL), |
114 | trans(0), numAnonymousRBTs(0) |
115 | { |
116 | // TODO add code for parseError...currently unused, but |
117 | // later may be used by parsing code... |
118 | init(list, UTRANS_FORWARD, FALSE, status); |
119 | // assume caller will fixup ID |
120 | } |
121 | |
122 | CompoundTransliterator::CompoundTransliterator(UVector& list, |
123 | int32_t anonymousRBTs, |
124 | UParseError& /*parseError*/, |
125 | UErrorCode& status) : |
126 | Transliterator(UnicodeString(), NULL), |
127 | trans(0), numAnonymousRBTs(anonymousRBTs) |
128 | { |
129 | init(list, UTRANS_FORWARD, FALSE, status); |
130 | } |
131 | |
132 | /** |
133 | * Finish constructing a transliterator: only to be called by |
134 | * constructors. Before calling init(), set trans and filter to NULL. |
135 | * @param id the id containing ';'-separated entries |
136 | * @param direction either FORWARD or REVERSE |
137 | * @param idSplitPoint the index into id at which the |
138 | * adoptedSplitTransliterator should be inserted, if there is one, or |
139 | * -1 if there is none. |
140 | * @param adoptedSplitTransliterator a transliterator to be inserted |
141 | * before the entry at offset idSplitPoint in the id string. May be |
142 | * NULL to insert no entry. |
143 | * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
144 | * entries by calling getID() of component entries. Some constructors |
145 | * do not require this because they apply a facade ID anyway. |
146 | * @param status the error code indicating success or failure |
147 | */ |
148 | void CompoundTransliterator::init(const UnicodeString& id, |
149 | UTransDirection direction, |
150 | UBool fixReverseID, |
151 | UErrorCode& status) { |
152 | // assert(trans == 0); |
153 | |
154 | if (U_FAILURE(status)) { |
155 | return; |
156 | } |
157 | |
158 | UVector list(status); |
159 | UnicodeSet* compoundFilter = NULL; |
160 | UnicodeString regenID; |
161 | if (!TransliteratorIDParser::parseCompoundID(id, direction, |
162 | regenID, list, compoundFilter)) { |
163 | status = U_INVALID_ID; |
164 | delete compoundFilter; |
165 | return; |
166 | } |
167 | |
168 | TransliteratorIDParser::instantiateList(list, status); |
169 | |
170 | init(list, direction, fixReverseID, status); |
171 | |
172 | if (compoundFilter != NULL) { |
173 | adoptFilter(compoundFilter); |
174 | } |
175 | } |
176 | |
177 | /** |
178 | * Finish constructing a transliterator: only to be called by |
179 | * constructors. Before calling init(), set trans and filter to NULL. |
180 | * @param list a vector of transliterator objects to be adopted. It |
181 | * should NOT be empty. The list should be in declared order. That |
182 | * is, it should be in the FORWARD order; if direction is REVERSE then |
183 | * the list order will be reversed. |
184 | * @param direction either FORWARD or REVERSE |
185 | * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
186 | * entries by calling getID() of component entries. Some constructors |
187 | * do not require this because they apply a facade ID anyway. |
188 | * @param status the error code indicating success or failure |
189 | */ |
190 | void CompoundTransliterator::init(UVector& list, |
191 | UTransDirection direction, |
192 | UBool fixReverseID, |
193 | UErrorCode& status) { |
194 | // assert(trans == 0); |
195 | |
196 | // Allocate array |
197 | if (U_SUCCESS(status)) { |
198 | count = list.size(); |
199 | trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *)); |
200 | /* test for NULL */ |
201 | if (trans == 0) { |
202 | status = U_MEMORY_ALLOCATION_ERROR; |
203 | return; |
204 | } |
205 | } |
206 | |
207 | if (U_FAILURE(status) || trans == 0) { |
208 | // assert(trans == 0); |
209 | return; |
210 | } |
211 | |
212 | // Move the transliterators from the vector into an array. |
213 | // Reverse the order if necessary. |
214 | int32_t i; |
215 | for (i=0; i<count; ++i) { |
216 | int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i; |
217 | trans[i] = (Transliterator*) list.elementAt(j); |
218 | } |
219 | |
220 | // If the direction is UTRANS_REVERSE then we may need to fix the |
221 | // ID. |
222 | if (direction == UTRANS_REVERSE && fixReverseID) { |
223 | UnicodeString newID; |
224 | for (i=0; i<count; ++i) { |
225 | if (i > 0) { |
226 | newID.append(ID_DELIM); |
227 | } |
228 | newID.append(trans[i]->getID()); |
229 | } |
230 | setID(newID); |
231 | } |
232 | |
233 | computeMaximumContextLength(); |
234 | } |
235 | |
236 | /** |
237 | * Return the IDs of the given list of transliterators, concatenated |
238 | * with ID_DELIM delimiting them. Equivalent to the perlish expression |
239 | * join(ID_DELIM, map($_.getID(), transliterators). |
240 | */ |
241 | UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[], |
242 | int32_t transCount) { |
243 | UnicodeString id; |
244 | for (int32_t i=0; i<transCount; ++i) { |
245 | if (i > 0) { |
246 | id.append(ID_DELIM); |
247 | } |
248 | id.append(transliterators[i]->getID()); |
249 | } |
250 | return id; // Return temporary |
251 | } |
252 | |
253 | /** |
254 | * Copy constructor. |
255 | */ |
256 | CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) : |
257 | Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) { |
258 | *this = t; |
259 | } |
260 | |
261 | /** |
262 | * Destructor |
263 | */ |
264 | CompoundTransliterator::~CompoundTransliterator() { |
265 | freeTransliterators(); |
266 | } |
267 | |
268 | void CompoundTransliterator::freeTransliterators(void) { |
269 | if (trans != 0) { |
270 | for (int32_t i=0; i<count; ++i) { |
271 | delete trans[i]; |
272 | } |
273 | uprv_free(trans); |
274 | } |
275 | trans = 0; |
276 | count = 0; |
277 | } |
278 | |
279 | /** |
280 | * Assignment operator. |
281 | */ |
282 | CompoundTransliterator& CompoundTransliterator::operator=( |
283 | const CompoundTransliterator& t) |
284 | { |
285 | Transliterator::operator=(t); |
286 | int32_t i = 0; |
287 | UBool failed = FALSE; |
288 | if (trans != NULL) { |
289 | for (i=0; i<count; ++i) { |
290 | delete trans[i]; |
291 | trans[i] = 0; |
292 | } |
293 | } |
294 | if (t.count > count) { |
295 | if (trans != NULL) { |
296 | uprv_free(trans); |
297 | } |
298 | trans = (Transliterator **)uprv_malloc(t.count * sizeof(Transliterator *)); |
299 | } |
300 | count = t.count; |
301 | if (trans != NULL) { |
302 | for (i=0; i<count; ++i) { |
303 | trans[i] = t.trans[i]->clone(); |
304 | if (trans[i] == NULL) { |
305 | failed = TRUE; |
306 | break; |
307 | } |
308 | } |
309 | } |
310 | |
311 | // if memory allocation failed delete backwards trans array |
312 | if (failed && i > 0) { |
313 | int32_t n; |
314 | for (n = i-1; n >= 0; n--) { |
315 | uprv_free(trans[n]); |
316 | trans[n] = NULL; |
317 | } |
318 | } |
319 | numAnonymousRBTs = t.numAnonymousRBTs; |
320 | return *this; |
321 | } |
322 | |
323 | /** |
324 | * Transliterator API. |
325 | */ |
326 | CompoundTransliterator* CompoundTransliterator::clone() const { |
327 | return new CompoundTransliterator(*this); |
328 | } |
329 | |
330 | /** |
331 | * Returns the number of transliterators in this chain. |
332 | * @return number of transliterators in this chain. |
333 | */ |
334 | int32_t CompoundTransliterator::getCount(void) const { |
335 | return count; |
336 | } |
337 | |
338 | /** |
339 | * Returns the transliterator at the given index in this chain. |
340 | * @param index index into chain, from 0 to <code>getCount() - 1</code> |
341 | * @return transliterator at the given index |
342 | */ |
343 | const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const { |
344 | return *trans[index]; |
345 | } |
346 | |
347 | void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[], |
348 | int32_t transCount) { |
349 | Transliterator** a = (Transliterator **)uprv_malloc(transCount * sizeof(Transliterator *)); |
350 | if (a == NULL) { |
351 | return; |
352 | } |
353 | int32_t i = 0; |
354 | UBool failed = FALSE; |
355 | for (i=0; i<transCount; ++i) { |
356 | a[i] = transliterators[i]->clone(); |
357 | if (a[i] == NULL) { |
358 | failed = TRUE; |
359 | break; |
360 | } |
361 | } |
362 | if (failed && i > 0) { |
363 | int32_t n; |
364 | for (n = i-1; n >= 0; n--) { |
365 | uprv_free(a[n]); |
366 | a[n] = NULL; |
367 | } |
368 | return; |
369 | } |
370 | adoptTransliterators(a, transCount); |
371 | } |
372 | |
373 | void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[], |
374 | int32_t transCount) { |
375 | // First free trans[] and set count to zero. Once this is done, |
376 | // orphan the filter. Set up the new trans[]. |
377 | freeTransliterators(); |
378 | trans = adoptedTransliterators; |
379 | count = transCount; |
380 | computeMaximumContextLength(); |
381 | setID(joinIDs(trans, count)); |
382 | } |
383 | |
384 | /** |
385 | * Append c to buf, unless buf is empty or buf already ends in c. |
386 | */ |
387 | static void _smartAppend(UnicodeString& buf, UChar c) { |
388 | if (buf.length() != 0 && |
389 | buf.charAt(buf.length() - 1) != c) { |
390 | buf.append(c); |
391 | } |
392 | } |
393 | |
394 | UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, |
395 | UBool escapeUnprintable) const { |
396 | // We do NOT call toRules() on our component transliterators, in |
397 | // general. If we have several rule-based transliterators, this |
398 | // yields a concatenation of the rules -- not what we want. We do |
399 | // handle compound RBT transliterators specially -- those for which |
400 | // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex, |
401 | // we do call toRules() recursively. |
402 | rulesSource.truncate(0); |
403 | if (numAnonymousRBTs >= 1 && getFilter() != NULL) { |
404 | // If we are a compound RBT and if we have a global |
405 | // filter, then emit it at the top. |
406 | UnicodeString pat; |
407 | rulesSource.append(COLON_COLON, 2).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM); |
408 | } |
409 | for (int32_t i=0; i<count; ++i) { |
410 | UnicodeString rule; |
411 | |
412 | // Anonymous RuleBasedTransliterators (inline rules and |
413 | // ::BEGIN/::END blocks) are given IDs that begin with |
414 | // "%Pass": use toRules() to write all the rules to the output |
415 | // (and insert "::Null;" if we have two in a row) |
416 | if (trans[i]->getID().startsWith(PASS_STRING, 5)) { |
417 | trans[i]->toRules(rule, escapeUnprintable); |
418 | if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING, 5)) |
419 | rule = UNICODE_STRING_SIMPLE("::Null;" ) + rule; |
420 | |
421 | // we also use toRules() on CompoundTransliterators (which we |
422 | // check for by looking for a semicolon in the ID)-- this gets |
423 | // the list of their child transliterators output in the right |
424 | // format |
425 | } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) { |
426 | trans[i]->toRules(rule, escapeUnprintable); |
427 | |
428 | // for everything else, use Transliterator::toRules() |
429 | } else { |
430 | trans[i]->Transliterator::toRules(rule, escapeUnprintable); |
431 | } |
432 | _smartAppend(rulesSource, NEWLINE); |
433 | rulesSource.append(rule); |
434 | _smartAppend(rulesSource, ID_DELIM); |
435 | } |
436 | return rulesSource; |
437 | } |
438 | |
439 | /** |
440 | * Implement Transliterator framework |
441 | */ |
442 | void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
443 | UnicodeSet set; |
444 | result.clear(); |
445 | for (int32_t i=0; i<count; ++i) { |
446 | result.addAll(trans[i]->getSourceSet(set)); |
447 | // Take the example of Hiragana-Latin. This is really |
448 | // Hiragana-Katakana; Katakana-Latin. The source set of |
449 | // these two is roughly [:Hiragana:] and [:Katakana:]. |
450 | // But the source set for the entire transliterator is |
451 | // actually [:Hiragana:] ONLY -- that is, the first |
452 | // non-empty source set. |
453 | |
454 | // This is a heuristic, and not 100% reliable. |
455 | if (!result.isEmpty()) { |
456 | break; |
457 | } |
458 | } |
459 | } |
460 | |
461 | /** |
462 | * Override Transliterator framework |
463 | */ |
464 | UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const { |
465 | UnicodeSet set; |
466 | result.clear(); |
467 | for (int32_t i=0; i<count; ++i) { |
468 | // This is a heuristic, and not 100% reliable. |
469 | result.addAll(trans[i]->getTargetSet(set)); |
470 | } |
471 | return result; |
472 | } |
473 | |
474 | /** |
475 | * Implements {@link Transliterator#handleTransliterate}. |
476 | */ |
477 | void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
478 | UBool incremental) const { |
479 | /* Call each transliterator with the same contextStart and |
480 | * start, but with the limit as modified |
481 | * by preceding transliterators. The start index must be |
482 | * reset for each transliterator to give each a chance to |
483 | * transliterate the text. The initial contextStart index is known |
484 | * to still point to the same place after each transliterator |
485 | * is called because each transliterator will not change the |
486 | * text between contextStart and the initial start index. |
487 | * |
488 | * IMPORTANT: After the first transliterator, each subsequent |
489 | * transliterator only gets to transliterate text committed by |
490 | * preceding transliterators; that is, the start (output |
491 | * value) of transliterator i becomes the limit (input value) |
492 | * of transliterator i+1. Finally, the overall limit is fixed |
493 | * up before we return. |
494 | * |
495 | * Assumptions we make here: |
496 | * (1) contextStart <= start <= limit <= contextLimit <= text.length() |
497 | * (2) start <= start' <= limit' ;cursor doesn't move back |
498 | * (3) start <= limit' ;text before cursor unchanged |
499 | * - start' is the value of start after calling handleKT |
500 | * - limit' is the value of limit after calling handleKT |
501 | */ |
502 | |
503 | /** |
504 | * Example: 3 transliterators. This example illustrates the |
505 | * mechanics we need to implement. C, S, and L are the contextStart, |
506 | * start, and limit. gl is the globalLimit. contextLimit is |
507 | * equal to limit throughout. |
508 | * |
509 | * 1. h-u, changes hex to Unicode |
510 | * |
511 | * 4 7 a d 0 4 7 a |
512 | * abc/u0061/u => abca/u |
513 | * C S L C S L gl=f->a |
514 | * |
515 | * 2. upup, changes "x" to "XX" |
516 | * |
517 | * 4 7 a 4 7 a |
518 | * abca/u => abcAA/u |
519 | * C SL C S |
520 | * L gl=a->b |
521 | * 3. u-h, changes Unicode to hex |
522 | * |
523 | * 4 7 a 4 7 a d 0 3 |
524 | * abcAA/u => abc/u0041/u0041/u |
525 | * C S L C S |
526 | * L gl=b->15 |
527 | * 4. return |
528 | * |
529 | * 4 7 a d 0 3 |
530 | * abc/u0041/u0041/u |
531 | * C S L |
532 | */ |
533 | |
534 | if (count < 1) { |
535 | index.start = index.limit; |
536 | return; // Short circuit for empty compound transliterators |
537 | } |
538 | |
539 | // compoundLimit is the limit value for the entire compound |
540 | // operation. We overwrite index.limit with the previous |
541 | // index.start. After each transliteration, we update |
542 | // compoundLimit for insertions or deletions that have happened. |
543 | int32_t compoundLimit = index.limit; |
544 | |
545 | // compoundStart is the start for the entire compound |
546 | // operation. |
547 | int32_t compoundStart = index.start; |
548 | |
549 | int32_t delta = 0; // delta in length |
550 | |
551 | // Give each transliterator a crack at the run of characters. |
552 | // See comments at the top of the method for more detail. |
553 | for (int32_t i=0; i<count; ++i) { |
554 | index.start = compoundStart; // Reset start |
555 | int32_t limit = index.limit; |
556 | |
557 | if (index.start == index.limit) { |
558 | // Short circuit for empty range |
559 | break; |
560 | } |
561 | |
562 | trans[i]->filteredTransliterate(text, index, incremental); |
563 | |
564 | // In a properly written transliterator, start == limit after |
565 | // handleTransliterate() returns when incremental is false. |
566 | // Catch cases where the subclass doesn't do this, and throw |
567 | // an exception. (Just pinning start to limit is a bad idea, |
568 | // because what's probably happening is that the subclass |
569 | // isn't transliterating all the way to the end, and it should |
570 | // in non-incremental mode.) |
571 | if (!incremental && index.start != index.limit) { |
572 | // We can't throw an exception, so just fudge things |
573 | index.start = index.limit; |
574 | } |
575 | |
576 | // Cumulative delta for insertions/deletions |
577 | delta += index.limit - limit; |
578 | |
579 | if (incremental) { |
580 | // In the incremental case, only allow subsequent |
581 | // transliterators to modify what has already been |
582 | // completely processed by prior transliterators. In the |
583 | // non-incrmental case, allow each transliterator to |
584 | // process the entire text. |
585 | index.limit = index.start; |
586 | } |
587 | } |
588 | |
589 | compoundLimit += delta; |
590 | |
591 | // Start is good where it is -- where the last transliterator left |
592 | // it. Limit needs to be put back where it was, modulo |
593 | // adjustments for deletions/insertions. |
594 | index.limit = compoundLimit; |
595 | } |
596 | |
597 | /** |
598 | * Sets the length of the longest context required by this transliterator. |
599 | * This is <em>preceding</em> context. |
600 | */ |
601 | void CompoundTransliterator::computeMaximumContextLength(void) { |
602 | int32_t max = 0; |
603 | for (int32_t i=0; i<count; ++i) { |
604 | int32_t len = trans[i]->getMaximumContextLength(); |
605 | if (len > max) { |
606 | max = len; |
607 | } |
608 | } |
609 | setMaximumContextLength(max); |
610 | } |
611 | |
612 | U_NAMESPACE_END |
613 | |
614 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
615 | |
616 | /* eof */ |
617 | |