1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (c) 2002-2014, International Business Machines Corporation |
6 | * and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 01/14/2002 aliu Creation. |
10 | ********************************************************************** |
11 | */ |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_TRANSLITERATION |
16 | |
17 | #include "tridpars.h" |
18 | #include "hash.h" |
19 | #include "mutex.h" |
20 | #include "transreg.h" |
21 | #include "uassert.h" |
22 | #include "ucln_in.h" |
23 | #include "unicode/parsepos.h" |
24 | #include "unicode/translit.h" |
25 | #include "unicode/uchar.h" |
26 | #include "unicode/uniset.h" |
27 | #include "unicode/unistr.h" |
28 | #include "unicode/utrans.h" |
29 | #include "util.h" |
30 | #include "uvector.h" |
31 | |
32 | U_NAMESPACE_BEGIN |
33 | |
34 | static const UChar ID_DELIM = 0x003B; // ; |
35 | static const UChar TARGET_SEP = 0x002D; // - |
36 | static const UChar VARIANT_SEP = 0x002F; // / |
37 | static const UChar OPEN_REV = 0x0028; // ( |
38 | static const UChar CLOSE_REV = 0x0029; // ) |
39 | |
40 | //static const UChar EMPTY[] = {0}; // "" |
41 | static const UChar ANY[] = {65,110,121,0}; // "Any" |
42 | static const UChar ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" |
43 | |
44 | static const int32_t FORWARD = UTRANS_FORWARD; |
45 | static const int32_t REVERSE = UTRANS_REVERSE; |
46 | |
47 | static Hashtable* SPECIAL_INVERSES = NULL; |
48 | static UInitOnce gSpecialInversesInitOnce = U_INITONCE_INITIALIZER; |
49 | |
50 | /** |
51 | * The mutex controlling access to SPECIAL_INVERSES |
52 | */ |
53 | static UMutex LOCK; |
54 | |
55 | TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, |
56 | const UnicodeString& v, UBool sawS, |
57 | const UnicodeString& f) { |
58 | source = s; |
59 | target = t; |
60 | variant = v; |
61 | sawSource = sawS; |
62 | filter = f; |
63 | } |
64 | |
65 | TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, |
66 | const UnicodeString& f) { |
67 | canonID = c; |
68 | basicID = b; |
69 | filter = f; |
70 | } |
71 | |
72 | TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) { |
73 | canonID = c; |
74 | basicID = b; |
75 | } |
76 | |
77 | Transliterator* TransliteratorIDParser::SingleID::createInstance() { |
78 | Transliterator* t; |
79 | if (basicID.length() == 0) { |
80 | t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID); |
81 | } else { |
82 | t = createBasicInstance(basicID, &canonID); |
83 | } |
84 | if (t != NULL) { |
85 | if (filter.length() != 0) { |
86 | UErrorCode ec = U_ZERO_ERROR; |
87 | UnicodeSet *set = new UnicodeSet(filter, ec); |
88 | if (U_FAILURE(ec)) { |
89 | delete set; |
90 | } else { |
91 | t->adoptFilter(set); |
92 | } |
93 | } |
94 | } |
95 | return t; |
96 | } |
97 | |
98 | |
99 | /** |
100 | * Parse a single ID, that is, an ID of the general form |
101 | * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element |
102 | * optional, the filters optional, and the variants optional. |
103 | * @param id the id to be parsed |
104 | * @param pos INPUT-OUTPUT parameter. On input, the position of |
105 | * the first character to parse. On output, the position after |
106 | * the last character parsed. |
107 | * @param dir the direction. If the direction is REVERSE then the |
108 | * SingleID is constructed for the reverse direction. |
109 | * @return a SingleID object or NULL |
110 | */ |
111 | TransliteratorIDParser::SingleID* |
112 | TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, |
113 | int32_t dir, UErrorCode& status) { |
114 | |
115 | int32_t start = pos; |
116 | |
117 | // The ID will be of the form A, A(), A(B), or (B), where |
118 | // A and B are filter IDs. |
119 | Specs* specsA = NULL; |
120 | Specs* specsB = NULL; |
121 | UBool sawParen = FALSE; |
122 | |
123 | // On the first pass, look for (B) or (). If this fails, then |
124 | // on the second pass, look for A, A(B), or A(). |
125 | for (int32_t pass=1; pass<=2; ++pass) { |
126 | if (pass == 2) { |
127 | specsA = parseFilterID(id, pos, TRUE); |
128 | if (specsA == NULL) { |
129 | pos = start; |
130 | return NULL; |
131 | } |
132 | } |
133 | if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { |
134 | sawParen = TRUE; |
135 | if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { |
136 | specsB = parseFilterID(id, pos, TRUE); |
137 | // Must close with a ')' |
138 | if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { |
139 | delete specsA; |
140 | pos = start; |
141 | return NULL; |
142 | } |
143 | } |
144 | break; |
145 | } |
146 | } |
147 | |
148 | // Assemble return results |
149 | SingleID* single; |
150 | if (sawParen) { |
151 | if (dir == FORWARD) { |
152 | SingleID* b = specsToID(specsB, FORWARD); |
153 | single = specsToID(specsA, FORWARD); |
154 | // Null pointers check |
155 | if (b == NULL || single == NULL) { |
156 | delete b; |
157 | delete single; |
158 | status = U_MEMORY_ALLOCATION_ERROR; |
159 | return NULL; |
160 | } |
161 | single->canonID.append(OPEN_REV) |
162 | .append(b->canonID).append(CLOSE_REV); |
163 | if (specsA != NULL) { |
164 | single->filter = specsA->filter; |
165 | } |
166 | delete b; |
167 | } else { |
168 | SingleID* a = specsToID(specsA, FORWARD); |
169 | single = specsToID(specsB, FORWARD); |
170 | // Check for null pointer. |
171 | if (a == NULL || single == NULL) { |
172 | delete a; |
173 | delete single; |
174 | status = U_MEMORY_ALLOCATION_ERROR; |
175 | return NULL; |
176 | } |
177 | single->canonID.append(OPEN_REV) |
178 | .append(a->canonID).append(CLOSE_REV); |
179 | if (specsB != NULL) { |
180 | single->filter = specsB->filter; |
181 | } |
182 | delete a; |
183 | } |
184 | } else { |
185 | // assert(specsA != NULL); |
186 | if (dir == FORWARD) { |
187 | single = specsToID(specsA, FORWARD); |
188 | } else { |
189 | single = specsToSpecialInverse(*specsA, status); |
190 | if (single == NULL) { |
191 | single = specsToID(specsA, REVERSE); |
192 | } |
193 | } |
194 | // Check for NULL pointer |
195 | if (single == NULL) { |
196 | status = U_MEMORY_ALLOCATION_ERROR; |
197 | return NULL; |
198 | } |
199 | single->filter = specsA->filter; |
200 | } |
201 | |
202 | delete specsA; |
203 | delete specsB; |
204 | |
205 | return single; |
206 | } |
207 | |
208 | /** |
209 | * Parse a filter ID, that is, an ID of the general form |
210 | * "[f1] s1-t1/v1", with the filters optional, and the variants optional. |
211 | * @param id the id to be parsed |
212 | * @param pos INPUT-OUTPUT parameter. On input, the position of |
213 | * the first character to parse. On output, the position after |
214 | * the last character parsed. |
215 | * @return a SingleID object or null if the parse fails |
216 | */ |
217 | TransliteratorIDParser::SingleID* |
218 | TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { |
219 | |
220 | int32_t start = pos; |
221 | |
222 | Specs* specs = parseFilterID(id, pos, TRUE); |
223 | if (specs == NULL) { |
224 | pos = start; |
225 | return NULL; |
226 | } |
227 | |
228 | // Assemble return results |
229 | SingleID* single = specsToID(specs, FORWARD); |
230 | if (single != NULL) { |
231 | single->filter = specs->filter; |
232 | } |
233 | delete specs; |
234 | return single; |
235 | } |
236 | |
237 | /** |
238 | * Parse a global filter of the form "[f]" or "([f])", depending |
239 | * on 'withParens'. |
240 | * @param id the pattern the parse |
241 | * @param pos INPUT-OUTPUT parameter. On input, the position of |
242 | * the first character to parse. On output, the position after |
243 | * the last character parsed. |
244 | * @param dir the direction. |
245 | * @param withParens INPUT-OUTPUT parameter. On entry, if |
246 | * withParens is 0, then parens are disallowed. If it is 1, |
247 | * then parens are requires. If it is -1, then parens are |
248 | * optional, and the return result will be set to 0 or 1. |
249 | * @param canonID OUTPUT parameter. The pattern for the filter |
250 | * added to the canonID, either at the end, if dir is FORWARD, or |
251 | * at the start, if dir is REVERSE. The pattern will be enclosed |
252 | * in parentheses if appropriate, and will be suffixed with an |
253 | * ID_DELIM character. May be NULL. |
254 | * @return a UnicodeSet object or NULL. A non-NULL results |
255 | * indicates a successful parse, regardless of whether the filter |
256 | * applies to the given direction. The caller should discard it |
257 | * if withParens != (dir == REVERSE). |
258 | */ |
259 | UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, |
260 | int32_t dir, |
261 | int32_t& withParens, |
262 | UnicodeString* canonID) { |
263 | UnicodeSet* filter = NULL; |
264 | int32_t start = pos; |
265 | |
266 | if (withParens == -1) { |
267 | withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0; |
268 | } else if (withParens == 1) { |
269 | if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { |
270 | pos = start; |
271 | return NULL; |
272 | } |
273 | } |
274 | |
275 | ICU_Utility::skipWhitespace(id, pos, TRUE); |
276 | |
277 | if (UnicodeSet::resemblesPattern(id, pos)) { |
278 | ParsePosition ppos(pos); |
279 | UErrorCode ec = U_ZERO_ERROR; |
280 | filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec); |
281 | /* test for NULL */ |
282 | if (filter == 0) { |
283 | pos = start; |
284 | return 0; |
285 | } |
286 | if (U_FAILURE(ec)) { |
287 | delete filter; |
288 | pos = start; |
289 | return NULL; |
290 | } |
291 | |
292 | UnicodeString pattern; |
293 | id.extractBetween(pos, ppos.getIndex(), pattern); |
294 | pos = ppos.getIndex(); |
295 | |
296 | if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { |
297 | delete filter; |
298 | pos = start; |
299 | return NULL; |
300 | } |
301 | |
302 | // In the forward direction, append the pattern to the |
303 | // canonID. In the reverse, insert it at zero, and invert |
304 | // the presence of parens ("A" <-> "(A)"). |
305 | if (canonID != NULL) { |
306 | if (dir == FORWARD) { |
307 | if (withParens == 1) { |
308 | pattern.insert(0, OPEN_REV); |
309 | pattern.append(CLOSE_REV); |
310 | } |
311 | canonID->append(pattern).append(ID_DELIM); |
312 | } else { |
313 | if (withParens == 0) { |
314 | pattern.insert(0, OPEN_REV); |
315 | pattern.append(CLOSE_REV); |
316 | } |
317 | canonID->insert(0, pattern); |
318 | canonID->insert(pattern.length(), ID_DELIM); |
319 | } |
320 | } |
321 | } |
322 | |
323 | return filter; |
324 | } |
325 | |
326 | U_CDECL_BEGIN |
327 | static void U_CALLCONV _deleteSingleID(void* obj) { |
328 | delete (TransliteratorIDParser::SingleID*) obj; |
329 | } |
330 | |
331 | static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { |
332 | delete (Transliterator*) obj; |
333 | } |
334 | U_CDECL_END |
335 | |
336 | /** |
337 | * Parse a compound ID, consisting of an optional forward global |
338 | * filter, a separator, one or more single IDs delimited by |
339 | * separators, an an optional reverse global filter. The |
340 | * separator is a semicolon. The global filters are UnicodeSet |
341 | * patterns. The reverse global filter must be enclosed in |
342 | * parentheses. |
343 | * @param id the pattern the parse |
344 | * @param dir the direction. |
345 | * @param canonID OUTPUT parameter that receives the canonical ID, |
346 | * consisting of canonical IDs for all elements, as returned by |
347 | * parseSingleID(), separated by semicolons. Previous contents |
348 | * are discarded. |
349 | * @param list OUTPUT parameter that receives a list of SingleID |
350 | * objects representing the parsed IDs. Previous contents are |
351 | * discarded. |
352 | * @param globalFilter OUTPUT parameter that receives a pointer to |
353 | * a newly created global filter for this ID in this direction, or |
354 | * NULL if there is none. |
355 | * @return TRUE if the parse succeeds, that is, if the entire |
356 | * id is consumed without syntax error. |
357 | */ |
358 | UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, |
359 | UnicodeString& canonID, |
360 | UVector& list, |
361 | UnicodeSet*& globalFilter) { |
362 | UErrorCode ec = U_ZERO_ERROR; |
363 | int32_t i; |
364 | int32_t pos = 0; |
365 | int32_t withParens = 1; |
366 | list.removeAllElements(); |
367 | UnicodeSet* filter; |
368 | globalFilter = NULL; |
369 | canonID.truncate(0); |
370 | |
371 | // Parse leading global filter, if any |
372 | withParens = 0; // parens disallowed |
373 | filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); |
374 | if (filter != NULL) { |
375 | if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { |
376 | // Not a global filter; backup and resume |
377 | canonID.truncate(0); |
378 | pos = 0; |
379 | } |
380 | if (dir == FORWARD) { |
381 | globalFilter = filter; |
382 | } else { |
383 | delete filter; |
384 | } |
385 | filter = NULL; |
386 | } |
387 | |
388 | UBool sawDelimiter = TRUE; |
389 | for (;;) { |
390 | SingleID* single = parseSingleID(id, pos, dir, ec); |
391 | if (single == NULL) { |
392 | break; |
393 | } |
394 | if (dir == FORWARD) { |
395 | list.addElement(single, ec); |
396 | } else { |
397 | list.insertElementAt(single, 0, ec); |
398 | } |
399 | if (U_FAILURE(ec)) { |
400 | goto FAIL; |
401 | } |
402 | if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { |
403 | sawDelimiter = FALSE; |
404 | break; |
405 | } |
406 | } |
407 | |
408 | if (list.size() == 0) { |
409 | goto FAIL; |
410 | } |
411 | |
412 | // Construct canonical ID |
413 | for (i=0; i<list.size(); ++i) { |
414 | SingleID* single = (SingleID*) list.elementAt(i); |
415 | canonID.append(single->canonID); |
416 | if (i != (list.size()-1)) { |
417 | canonID.append(ID_DELIM); |
418 | } |
419 | } |
420 | |
421 | // Parse trailing global filter, if any, and only if we saw |
422 | // a trailing delimiter after the IDs. |
423 | if (sawDelimiter) { |
424 | withParens = 1; // parens required |
425 | filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); |
426 | if (filter != NULL) { |
427 | // Don't require trailing ';', but parse it if present |
428 | ICU_Utility::parseChar(id, pos, ID_DELIM); |
429 | |
430 | if (dir == REVERSE) { |
431 | globalFilter = filter; |
432 | } else { |
433 | delete filter; |
434 | } |
435 | filter = NULL; |
436 | } |
437 | } |
438 | |
439 | // Trailing unparsed text is a syntax error |
440 | ICU_Utility::skipWhitespace(id, pos, TRUE); |
441 | if (pos != id.length()) { |
442 | goto FAIL; |
443 | } |
444 | |
445 | return TRUE; |
446 | |
447 | FAIL: |
448 | UObjectDeleter *save = list.setDeleter(_deleteSingleID); |
449 | list.removeAllElements(); |
450 | list.setDeleter(save); |
451 | delete globalFilter; |
452 | globalFilter = NULL; |
453 | return FALSE; |
454 | } |
455 | |
456 | /** |
457 | * Convert the elements of the 'list' vector, which are SingleID |
458 | * objects, into actual Transliterator objects. In the course of |
459 | * this, some (or all) entries may be removed. If all entries |
460 | * are removed, the NULL transliterator will be added. |
461 | * |
462 | * Delete entries with empty basicIDs; these are generated by |
463 | * elements like "(A)" in the forward direction, or "A()" in |
464 | * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert |
465 | * SingleID entries to actual transliterators. |
466 | * |
467 | * @param list vector of SingleID objects. On exit, vector |
468 | * of one or more Transliterators. |
469 | * @return new value of insertIndex. The index will shift if |
470 | * there are empty items, like "(Lower)", with indices less than |
471 | * insertIndex. |
472 | */ |
473 | void TransliteratorIDParser::instantiateList(UVector& list, |
474 | UErrorCode& ec) { |
475 | UVector tlist(ec); |
476 | if (U_FAILURE(ec)) { |
477 | goto RETURN; |
478 | } |
479 | tlist.setDeleter(_deleteTransliteratorTrIDPars); |
480 | |
481 | Transliterator* t; |
482 | int32_t i; |
483 | for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() |
484 | // We run the loop too long by one, so we can |
485 | // do an insert after the last element |
486 | if (i==list.size()) { |
487 | break; |
488 | } |
489 | |
490 | SingleID* single = (SingleID*) list.elementAt(i); |
491 | if (single->basicID.length() != 0) { |
492 | t = single->createInstance(); |
493 | if (t == NULL) { |
494 | ec = U_INVALID_ID; |
495 | goto RETURN; |
496 | } |
497 | tlist.addElement(t, ec); |
498 | if (U_FAILURE(ec)) { |
499 | delete t; |
500 | goto RETURN; |
501 | } |
502 | } |
503 | } |
504 | |
505 | // An empty list is equivalent to a NULL transliterator. |
506 | if (tlist.size() == 0) { |
507 | t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL); |
508 | if (t == NULL) { |
509 | // Should never happen |
510 | ec = U_INTERNAL_TRANSLITERATOR_ERROR; |
511 | } |
512 | tlist.addElement(t, ec); |
513 | if (U_FAILURE(ec)) { |
514 | delete t; |
515 | } |
516 | } |
517 | |
518 | RETURN: |
519 | |
520 | UObjectDeleter *save = list.setDeleter(_deleteSingleID); |
521 | list.removeAllElements(); |
522 | |
523 | if (U_SUCCESS(ec)) { |
524 | list.setDeleter(_deleteTransliteratorTrIDPars); |
525 | |
526 | while (tlist.size() > 0) { |
527 | t = (Transliterator*) tlist.orphanElementAt(0); |
528 | list.addElement(t, ec); |
529 | if (U_FAILURE(ec)) { |
530 | delete t; |
531 | list.removeAllElements(); |
532 | break; |
533 | } |
534 | } |
535 | } |
536 | |
537 | list.setDeleter(save); |
538 | } |
539 | |
540 | /** |
541 | * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, |
542 | * S-T/V, or S/V-T. If the source is missing, return a source of |
543 | * ANY. |
544 | * @param id the id string, in any of several forms |
545 | * @return an array of 4 strings: source, target, variant, and |
546 | * isSourcePresent. If the source is not present, ANY will be |
547 | * given as the source, and isSourcePresent will be NULL. Otherwise |
548 | * isSourcePresent will be non-NULL. The target may be empty if the |
549 | * id is not well-formed. The variant may be empty. |
550 | */ |
551 | void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, |
552 | UnicodeString& source, |
553 | UnicodeString& target, |
554 | UnicodeString& variant, |
555 | UBool& isSourcePresent) { |
556 | source.setTo(ANY, 3); |
557 | target.truncate(0); |
558 | variant.truncate(0); |
559 | |
560 | int32_t sep = id.indexOf(TARGET_SEP); |
561 | int32_t var = id.indexOf(VARIANT_SEP); |
562 | if (var < 0) { |
563 | var = id.length(); |
564 | } |
565 | isSourcePresent = FALSE; |
566 | |
567 | if (sep < 0) { |
568 | // Form: T/V or T (or /V) |
569 | id.extractBetween(0, var, target); |
570 | id.extractBetween(var, id.length(), variant); |
571 | } else if (sep < var) { |
572 | // Form: S-T/V or S-T (or -T/V or -T) |
573 | if (sep > 0) { |
574 | id.extractBetween(0, sep, source); |
575 | isSourcePresent = TRUE; |
576 | } |
577 | id.extractBetween(++sep, var, target); |
578 | id.extractBetween(var, id.length(), variant); |
579 | } else { |
580 | // Form: (S/V-T or /V-T) |
581 | if (var > 0) { |
582 | id.extractBetween(0, var, source); |
583 | isSourcePresent = TRUE; |
584 | } |
585 | id.extractBetween(var, sep++, variant); |
586 | id.extractBetween(sep, id.length(), target); |
587 | } |
588 | |
589 | if (variant.length() > 0) { |
590 | variant.remove(0, 1); |
591 | } |
592 | } |
593 | |
594 | /** |
595 | * Given source, target, and variant strings, concatenate them into a |
596 | * full ID. If the source is empty, then "Any" will be used for the |
597 | * source, so the ID will always be of the form s-t/v or s-t. |
598 | */ |
599 | void TransliteratorIDParser::STVtoID(const UnicodeString& source, |
600 | const UnicodeString& target, |
601 | const UnicodeString& variant, |
602 | UnicodeString& id) { |
603 | id = source; |
604 | if (id.length() == 0) { |
605 | id.setTo(ANY, 3); |
606 | } |
607 | id.append(TARGET_SEP).append(target); |
608 | if (variant.length() != 0) { |
609 | id.append(VARIANT_SEP).append(variant); |
610 | } |
611 | // NUL-terminate the ID string for getTerminatedBuffer. |
612 | // This prevents valgrind and Purify warnings. |
613 | id.append((UChar)0); |
614 | id.truncate(id.length()-1); |
615 | } |
616 | |
617 | /** |
618 | * Register two targets as being inverses of one another. For |
619 | * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes |
620 | * Transliterator to form the following inverse relationships: |
621 | * |
622 | * <pre>NFC => NFD |
623 | * Any-NFC => Any-NFD |
624 | * NFD => NFC |
625 | * Any-NFD => Any-NFC</pre> |
626 | * |
627 | * (Without the special inverse registration, the inverse of NFC |
628 | * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but |
629 | * that the presence or absence of "Any-" is preserved. |
630 | * |
631 | * <p>The relationship is symmetrical; registering (a, b) is |
632 | * equivalent to registering (b, a). |
633 | * |
634 | * <p>The relevant IDs must still be registered separately as |
635 | * factories or classes. |
636 | * |
637 | * <p>Only the targets are specified. Special inverses always |
638 | * have the form Any-Target1 <=> Any-Target2. The target should |
639 | * have canonical casing (the casing desired to be produced when |
640 | * an inverse is formed) and should contain no whitespace or other |
641 | * extraneous characters. |
642 | * |
643 | * @param target the target against which to register the inverse |
644 | * @param inverseTarget the inverse of target, that is |
645 | * Any-target.getInverse() => Any-inverseTarget |
646 | * @param bidirectional if TRUE, register the reverse relation |
647 | * as well, that is, Any-inverseTarget.getInverse() => Any-target |
648 | */ |
649 | void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, |
650 | const UnicodeString& inverseTarget, |
651 | UBool bidirectional, |
652 | UErrorCode &status) { |
653 | umtx_initOnce(gSpecialInversesInitOnce, init, status); |
654 | if (U_FAILURE(status)) { |
655 | return; |
656 | } |
657 | |
658 | // If target == inverseTarget then force bidirectional => FALSE |
659 | if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { |
660 | bidirectional = FALSE; |
661 | } |
662 | |
663 | Mutex lock(&LOCK); |
664 | |
665 | UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. |
666 | if (tempus == NULL) { |
667 | status = U_MEMORY_ALLOCATION_ERROR; |
668 | return; |
669 | } |
670 | SPECIAL_INVERSES->put(target, tempus, status); |
671 | if (bidirectional) { |
672 | tempus = new UnicodeString(target); |
673 | if (tempus == NULL) { |
674 | status = U_MEMORY_ALLOCATION_ERROR; |
675 | return; |
676 | } |
677 | SPECIAL_INVERSES->put(inverseTarget, tempus, status); |
678 | } |
679 | } |
680 | |
681 | //---------------------------------------------------------------- |
682 | // Private implementation |
683 | //---------------------------------------------------------------- |
684 | |
685 | /** |
686 | * Parse an ID into component pieces. Take IDs of the form T, |
687 | * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a |
688 | * source of ANY. |
689 | * @param id the id string, in any of several forms |
690 | * @param pos INPUT-OUTPUT parameter. On input, pos is the |
691 | * offset of the first character to parse in id. On output, |
692 | * pos is the offset after the last parsed character. If the |
693 | * parse failed, pos will be unchanged. |
694 | * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed |
695 | * at any location between specs or delimiters, and is returned |
696 | * as the fifth string in the array. |
697 | * @return a Specs object, or NULL if the parse failed. If |
698 | * neither source nor target was seen in the parsed id, then the |
699 | * parse fails. If allowFilter is TRUE, then the parsed filter |
700 | * pattern is returned in the Specs object, otherwise the returned |
701 | * filter reference is NULL. If the parse fails for any reason |
702 | * NULL is returned. |
703 | */ |
704 | TransliteratorIDParser::Specs* |
705 | TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos, |
706 | UBool allowFilter) { |
707 | UnicodeString first; |
708 | UnicodeString source; |
709 | UnicodeString target; |
710 | UnicodeString variant; |
711 | UnicodeString filter; |
712 | UChar delimiter = 0; |
713 | int32_t specCount = 0; |
714 | int32_t start = pos; |
715 | |
716 | // This loop parses one of the following things with each |
717 | // pass: a filter, a delimiter character (either '-' or '/'), |
718 | // or a spec (source, target, or variant). |
719 | for (;;) { |
720 | ICU_Utility::skipWhitespace(id, pos, TRUE); |
721 | if (pos == id.length()) { |
722 | break; |
723 | } |
724 | |
725 | // Parse filters |
726 | if (allowFilter && filter.length() == 0 && |
727 | UnicodeSet::resemblesPattern(id, pos)) { |
728 | |
729 | ParsePosition ppos(pos); |
730 | UErrorCode ec = U_ZERO_ERROR; |
731 | UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec); |
732 | if (U_FAILURE(ec)) { |
733 | pos = start; |
734 | return NULL; |
735 | } |
736 | id.extractBetween(pos, ppos.getIndex(), filter); |
737 | pos = ppos.getIndex(); |
738 | continue; |
739 | } |
740 | |
741 | if (delimiter == 0) { |
742 | UChar c = id.charAt(pos); |
743 | if ((c == TARGET_SEP && target.length() == 0) || |
744 | (c == VARIANT_SEP && variant.length() == 0)) { |
745 | delimiter = c; |
746 | ++pos; |
747 | continue; |
748 | } |
749 | } |
750 | |
751 | // We are about to try to parse a spec with no delimiter |
752 | // when we can no longer do so (we can only do so at the |
753 | // start); break. |
754 | if (delimiter == 0 && specCount > 0) { |
755 | break; |
756 | } |
757 | |
758 | UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos); |
759 | if (spec.length() == 0) { |
760 | // Note that if there was a trailing delimiter, we |
761 | // consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar- |
762 | // are legal. |
763 | break; |
764 | } |
765 | |
766 | switch (delimiter) { |
767 | case 0: |
768 | first = spec; |
769 | break; |
770 | case TARGET_SEP: |
771 | target = spec; |
772 | break; |
773 | case VARIANT_SEP: |
774 | variant = spec; |
775 | break; |
776 | } |
777 | ++specCount; |
778 | delimiter = 0; |
779 | } |
780 | |
781 | // A spec with no prior character is either source or target, |
782 | // depending on whether an explicit "-target" was seen. |
783 | if (first.length() != 0) { |
784 | if (target.length() == 0) { |
785 | target = first; |
786 | } else { |
787 | source = first; |
788 | } |
789 | } |
790 | |
791 | // Must have either source or target |
792 | if (source.length() == 0 && target.length() == 0) { |
793 | pos = start; |
794 | return NULL; |
795 | } |
796 | |
797 | // Empty source or target defaults to ANY |
798 | UBool sawSource = TRUE; |
799 | if (source.length() == 0) { |
800 | source.setTo(ANY, 3); |
801 | sawSource = FALSE; |
802 | } |
803 | if (target.length() == 0) { |
804 | target.setTo(ANY, 3); |
805 | } |
806 | |
807 | return new Specs(source, target, variant, sawSource, filter); |
808 | } |
809 | |
810 | /** |
811 | * Givens a Spec object, convert it to a SingleID object. The |
812 | * Spec object is a more unprocessed parse result. The SingleID |
813 | * object contains information about canonical and basic IDs. |
814 | * @return a SingleID; never returns NULL. Returned object always |
815 | * has 'filter' field of NULL. |
816 | */ |
817 | TransliteratorIDParser::SingleID* |
818 | TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) { |
819 | UnicodeString canonID; |
820 | UnicodeString basicID; |
821 | UnicodeString basicPrefix; |
822 | if (specs != NULL) { |
823 | UnicodeString buf; |
824 | if (dir == FORWARD) { |
825 | if (specs->sawSource) { |
826 | buf.append(specs->source).append(TARGET_SEP); |
827 | } else { |
828 | basicPrefix = specs->source; |
829 | basicPrefix.append(TARGET_SEP); |
830 | } |
831 | buf.append(specs->target); |
832 | } else { |
833 | buf.append(specs->target).append(TARGET_SEP).append(specs->source); |
834 | } |
835 | if (specs->variant.length() != 0) { |
836 | buf.append(VARIANT_SEP).append(specs->variant); |
837 | } |
838 | basicID = basicPrefix; |
839 | basicID.append(buf); |
840 | if (specs->filter.length() != 0) { |
841 | buf.insert(0, specs->filter); |
842 | } |
843 | canonID = buf; |
844 | } |
845 | return new SingleID(canonID, basicID); |
846 | } |
847 | |
848 | /** |
849 | * Given a Specs object, return a SingleID representing the |
850 | * special inverse of that ID. If there is no special inverse |
851 | * then return NULL. |
852 | * @return a SingleID or NULL. Returned object always has |
853 | * 'filter' field of NULL. |
854 | */ |
855 | TransliteratorIDParser::SingleID* |
856 | TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) { |
857 | if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) { |
858 | return NULL; |
859 | } |
860 | umtx_initOnce(gSpecialInversesInitOnce, init, status); |
861 | if (U_FAILURE(status)) { |
862 | return NULL; |
863 | } |
864 | |
865 | UnicodeString* inverseTarget; |
866 | |
867 | umtx_lock(&LOCK); |
868 | inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target); |
869 | umtx_unlock(&LOCK); |
870 | |
871 | if (inverseTarget != NULL) { |
872 | // If the original ID contained "Any-" then make the |
873 | // special inverse "Any-Foo"; otherwise make it "Foo". |
874 | // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD". |
875 | UnicodeString buf; |
876 | if (specs.filter.length() != 0) { |
877 | buf.append(specs.filter); |
878 | } |
879 | if (specs.sawSource) { |
880 | buf.append(ANY, 3).append(TARGET_SEP); |
881 | } |
882 | buf.append(*inverseTarget); |
883 | |
884 | UnicodeString basicID(TRUE, ANY, 3); |
885 | basicID.append(TARGET_SEP).append(*inverseTarget); |
886 | |
887 | if (specs.variant.length() != 0) { |
888 | buf.append(VARIANT_SEP).append(specs.variant); |
889 | basicID.append(VARIANT_SEP).append(specs.variant); |
890 | } |
891 | return new SingleID(buf, basicID); |
892 | } |
893 | return NULL; |
894 | } |
895 | |
896 | /** |
897 | * Glue method to get around access problems in C++. This would |
898 | * ideally be inline but we want to avoid a circular header |
899 | * dependency. |
900 | */ |
901 | Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { |
902 | return Transliterator::createBasicInstance(id, canonID); |
903 | } |
904 | |
905 | /** |
906 | * Initialize static memory. Called through umtx_initOnce only. |
907 | */ |
908 | void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) { |
909 | U_ASSERT(SPECIAL_INVERSES == NULL); |
910 | ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); |
911 | |
912 | SPECIAL_INVERSES = new Hashtable(TRUE, status); |
913 | if (SPECIAL_INVERSES == NULL) { |
914 | status = U_MEMORY_ALLOCATION_ERROR; |
915 | return; |
916 | } |
917 | SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject); |
918 | } |
919 | |
920 | /** |
921 | * Free static memory. |
922 | */ |
923 | void TransliteratorIDParser::cleanup() { |
924 | if (SPECIAL_INVERSES) { |
925 | delete SPECIAL_INVERSES; |
926 | SPECIAL_INVERSES = NULL; |
927 | } |
928 | gSpecialInversesInitOnce.reset(); |
929 | } |
930 | |
931 | U_NAMESPACE_END |
932 | |
933 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
934 | |
935 | //eof |
936 | |