tridpars.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/tridpars.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (c) 2002-2014, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 01/14/2002 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "tridpars.h"
18	#include "hash.h"
19	#include "mutex.h"
20	#include "transreg.h"
21	#include "uassert.h"
22	#include "ucln_in.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/translit.h"
25	#include "unicode/uchar.h"
26	#include "unicode/uniset.h"
27	#include "unicode/unistr.h"
28	#include "unicode/utrans.h"
29	#include "util.h"
30	#include "uvector.h"
31
32	U_NAMESPACE_BEGIN
33
34	static const UChar ID_DELIM = `0x003B`; // ;
35	static const UChar TARGET_SEP = `0x002D`; // -
36	static const UChar VARIANT_SEP = `0x002F`; // /
37	static const UChar OPEN_REV = `0x0028`; // (
38	static const UChar CLOSE_REV = `0x0029`; // )
39
40	//static const UChar EMPTY[] = {0}; // ""
41	static const UChar ANY[] = {`65`,`110`,`121`,`0`}; // "Any"
42	static const UChar ANY_NULL[] = {`65`,`110`,`121`,`45`,`78`,`117`,`108`,`108`,`0`}; // "Any-Null"
43
44	static const int32_t FORWARD = UTRANS_FORWARD;
45	static const int32_t REVERSE = UTRANS_REVERSE;
46
47	static Hashtable* SPECIAL_INVERSES = NULL;
48	static UInitOnce gSpecialInversesInitOnce = U_INITONCE_INITIALIZER;
49
50	/**
51	* The mutex controlling access to SPECIAL_INVERSES
52	*/
53	static UMutex LOCK;
54
55	TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t,
56	const UnicodeString& v, UBool sawS,
57	const UnicodeString& f) {
58	source = s;
59	target = t;
60	variant = v;
61	sawSource = sawS;
62	filter = f;
63	}
64
65	TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b,
66	const UnicodeString& f) {
67	canonID = c;
68	basicID = b;
69	filter = f;
70	}
71
72	TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) {
73	canonID = c;
74	basicID = b;
75	}
76
77	Transliterator* TransliteratorIDParser::SingleID::createInstance() {
78	Transliterator* t;
79	if (basicID.length() == `0`) {
80	t = createBasicInstance(UnicodeString (TRUE, ANY_NULL, `8`), &canonID);
81	} else {
82	t = createBasicInstance(basicID, &canonID);
83	}
84	if (t != NULL) {
85	if (filter.length() != `0`) {
86	UErrorCode ec = U_ZERO_ERROR;
87	UnicodeSet set = new* UnicodeSet (filter, ec);
88	if (U_FAILURE(ec)) {
89	delete set;
90	} else {
91	t->adoptFilter(set);
92	}
93	}
94	}
95	return t;
96	}
97
98
99	/**
100	* Parse a single ID, that is, an ID of the general form
101	* "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
102	* optional, the filters optional, and the variants optional.
103	* @param id the id to be parsed
104	* @param pos INPUT-OUTPUT parameter. On input, the position of
105	* the first character to parse. On output, the position after
106	* the last character parsed.
107	* @param dir the direction. If the direction is REVERSE then the
108	* SingleID is constructed for the reverse direction.
109	* @return a SingleID object or NULL
110	*/
111	TransliteratorIDParser::SingleID*
112	TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos,
113	int32_t dir, UErrorCode& status) {
114
115	int32_t start = pos;
116
117	// The ID will be of the form A, A(), A(B), or (B), where
118	// A and B are filter IDs.
119	Specs* specsA = NULL;
120	Specs* specsB = NULL;
121	UBool sawParen = FALSE;
122
123	// On the first pass, look for (B) or (). If this fails, then
124	// on the second pass, look for A, A(B), or A().
125	for (int32_t pass=`1`; pass<=`2`; ++pass) {
126	if (pass == `2`) {
127	specsA = parseFilterID(id, pos, TRUE);
128	if (specsA == NULL) {
129	pos = start;
130	return NULL;
131	}
132	}
133	if (ICU_Utility::parseChar(id, pos, OPEN_REV)) {
134	sawParen = TRUE;
135	if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
136	specsB = parseFilterID(id, pos, TRUE);
137	// Must close with a ')'
138	if (specsB == NULL \|\| !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
139	delete specsA;
140	pos = start;
141	return NULL;
142	}
143	}
144	break;
145	}
146	}
147
148	// Assemble return results
149	SingleID* single;
150	if (sawParen) {
151	if (dir == FORWARD) {
152	SingleID* b = specsToID(specsB, FORWARD);
153	single = specsToID(specsA, FORWARD);
154	// Null pointers check
155	if (b == NULL \|\| single == NULL) {
156	delete b;
157	delete single;
158	status = U_MEMORY_ALLOCATION_ERROR;
159	return NULL;
160	}
161	single->canonID.append(OPEN_REV)
162	.append(b->canonID).append(CLOSE_REV);
163	if (specsA != NULL) {
164	single->filter = specsA->filter;
165	}
166	delete b;
167	} else {
168	SingleID* a = specsToID(specsA, FORWARD);
169	single = specsToID(specsB, FORWARD);
170	// Check for null pointer.
171	if (a == NULL \|\| single == NULL) {
172	delete a;
173	delete single;
174	status = U_MEMORY_ALLOCATION_ERROR;
175	return NULL;
176	}
177	single->canonID.append(OPEN_REV)
178	.append(a->canonID).append(CLOSE_REV);
179	if (specsB != NULL) {
180	single->filter = specsB->filter;
181	}
182	delete a;
183	}
184	} else {
185	// assert(specsA != NULL);
186	if (dir == FORWARD) {
187	single = specsToID(specsA, FORWARD);
188	} else {
189	single = specsToSpecialInverse(*specsA, status);
190	if (single == NULL) {
191	single = specsToID(specsA, REVERSE);
192	}
193	}
194	// Check for NULL pointer
195	if (single == NULL) {
196	status = U_MEMORY_ALLOCATION_ERROR;
197	return NULL;
198	}
199	single->filter = specsA->filter;
200	}
201
202	delete specsA;
203	delete specsB;
204
205	return single;
206	}
207
208	/**
209	* Parse a filter ID, that is, an ID of the general form
210	* "[f1] s1-t1/v1", with the filters optional, and the variants optional.
211	* @param id the id to be parsed
212	* @param pos INPUT-OUTPUT parameter. On input, the position of
213	* the first character to parse. On output, the position after
214	* the last character parsed.
215	* @return a SingleID object or null if the parse fails
216	*/
217	TransliteratorIDParser::SingleID*
218	TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) {
219
220	int32_t start = pos;
221
222	Specs* specs = parseFilterID(id, pos, TRUE);
223	if (specs == NULL) {
224	pos = start;
225	return NULL;
226	}
227
228	// Assemble return results
229	SingleID* single = specsToID(specs, FORWARD);
230	if (single != NULL) {
231	single->filter = specs->filter;
232	}
233	delete specs;
234	return single;
235	}
236
237	/**
238	* Parse a global filter of the form "[f]" or "([f])", depending
239	* on 'withParens'.
240	* @param id the pattern the parse
241	* @param pos INPUT-OUTPUT parameter. On input, the position of
242	* the first character to parse. On output, the position after
243	* the last character parsed.
244	* @param dir the direction.
245	* @param withParens INPUT-OUTPUT parameter. On entry, if
246	* withParens is 0, then parens are disallowed. If it is 1,
247	* then parens are requires. If it is -1, then parens are
248	* optional, and the return result will be set to 0 or 1.
249	* @param canonID OUTPUT parameter. The pattern for the filter
250	* added to the canonID, either at the end, if dir is FORWARD, or
251	* at the start, if dir is REVERSE. The pattern will be enclosed
252	* in parentheses if appropriate, and will be suffixed with an
253	* ID_DELIM character. May be NULL.
254	* @return a UnicodeSet object or NULL. A non-NULL results
255	* indicates a successful parse, regardless of whether the filter
256	* applies to the given direction. The caller should discard it
257	* if withParens != (dir == REVERSE).
258	*/
259	UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos,
260	int32_t dir,
261	int32_t& withParens,
262	UnicodeString* canonID) {
263	UnicodeSet* filter = NULL;
264	int32_t start = pos;
265
266	if (withParens == -`1`) {
267	withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? `1` : `0`;
268	} else if (withParens == `1`) {
269	if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) {
270	pos = start;
271	return NULL;
272	}
273	}
274
275	ICU_Utility::skipWhitespace(id, pos, TRUE);
276
277	if (UnicodeSet::resemblesPattern(id, pos)) {
278	ParsePosition ppos(pos);
279	UErrorCode ec = U_ZERO_ERROR;
280	filter = new UnicodeSet (id, ppos, USET_IGNORE_SPACE, NULL, ec);
281	/ test for NULL /
282	if (filter == `0`) {
283	pos = start;
284	return `0`;
285	}
286	if (U_FAILURE(ec)) {
287	delete filter;
288	pos = start;
289	return NULL;
290	}
291
292	UnicodeString pattern;
293	id.extractBetween(pos, ppos.getIndex(), pattern);
294	pos = ppos.getIndex();
295
296	if (withParens == `1` && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
297	delete filter;
298	pos = start;
299	return NULL;
300	}
301
302	// In the forward direction, append the pattern to the
303	// canonID. In the reverse, insert it at zero, and invert
304	// the presence of parens ("A" <-> "(A)").
305	if (canonID != NULL) {
306	if (dir == FORWARD) {
307	if (withParens == `1`) {
308	pattern.insert(`0`, OPEN_REV);
309	pattern.append(CLOSE_REV);
310	}
311	canonID->append(pattern).append(ID_DELIM);
312	} else {
313	if (withParens == `0`) {
314	pattern.insert(`0`, OPEN_REV);
315	pattern.append(CLOSE_REV);
316	}
317	canonID->insert(`0`, pattern);
318	canonID->insert(pattern.length(), ID_DELIM);
319	}
320	}
321	}
322
323	return filter;
324	}
325
326	U_CDECL_BEGIN
327	static void U_CALLCONV _deleteSingleID(void* obj) {
328	delete (TransliteratorIDParser::SingleID*) obj;
329	}
330
331	static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) {
332	delete (Transliterator*) obj;
333	}
334	U_CDECL_END
335
336	/**
337	* Parse a compound ID, consisting of an optional forward global
338	* filter, a separator, one or more single IDs delimited by
339	* separators, an an optional reverse global filter. The
340	* separator is a semicolon. The global filters are UnicodeSet
341	* patterns. The reverse global filter must be enclosed in
342	* parentheses.
343	* @param id the pattern the parse
344	* @param dir the direction.
345	* @param canonID OUTPUT parameter that receives the canonical ID,
346	* consisting of canonical IDs for all elements, as returned by
347	* parseSingleID(), separated by semicolons. Previous contents
348	* are discarded.
349	* @param list OUTPUT parameter that receives a list of SingleID
350	* objects representing the parsed IDs. Previous contents are
351	* discarded.
352	* @param globalFilter OUTPUT parameter that receives a pointer to
353	* a newly created global filter for this ID in this direction, or
354	* NULL if there is none.
355	* @return TRUE if the parse succeeds, that is, if the entire
356	* id is consumed without syntax error.
357	*/
358	UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
359	UnicodeString& canonID,
360	UVector& list,
361	UnicodeSet*& globalFilter) {
362	UErrorCode ec = U_ZERO_ERROR;
363	int32_t i;
364	int32_t pos = `0`;
365	int32_t withParens = `1`;
366	list.removeAllElements();
367	UnicodeSet* filter;
368	globalFilter = NULL;
369	canonID.truncate(`0`);
370
371	// Parse leading global filter, if any
372	withParens = `0`; // parens disallowed
373	filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
374	if (filter != NULL) {
375	if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
376	// Not a global filter; backup and resume
377	canonID.truncate(`0`);
378	pos = `0`;
379	}
380	if (dir == FORWARD) {
381	globalFilter = filter;
382	} else {
383	delete filter;
384	}
385	filter = NULL;
386	}
387
388	UBool sawDelimiter = TRUE;
389	for (;;) {
390	SingleID* single = parseSingleID(id, pos, dir, ec);
391	if (single == NULL) {
392	break;
393	}
394	if (dir == FORWARD) {
395	list.addElement(single, ec);
396	} else {
397	list.insertElementAt(single, `0`, ec);
398	}
399	if (U_FAILURE(ec)) {
400	goto FAIL;
401	}
402	if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
403	sawDelimiter = FALSE;
404	break;
405	}
406	}
407
408	if (list.size() == `0`) {
409	goto FAIL;
410	}
411
412	// Construct canonical ID
413	for (i=`0`; i<list.size(); ++i) {
414	SingleID* single = (SingleID*) list.elementAt(i);
415	canonID.append(single->canonID);
416	if (i != (list.size()-`1`)) {
417	canonID.append(ID_DELIM);
418	}
419	}
420
421	// Parse trailing global filter, if any, and only if we saw
422	// a trailing delimiter after the IDs.
423	if (sawDelimiter) {
424	withParens = `1`; // parens required
425	filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
426	if (filter != NULL) {
427	// Don't require trailing ';', but parse it if present
428	ICU_Utility::parseChar(id, pos, ID_DELIM);
429
430	if (dir == REVERSE) {
431	globalFilter = filter;
432	} else {
433	delete filter;
434	}
435	filter = NULL;
436	}
437	}
438
439	// Trailing unparsed text is a syntax error
440	ICU_Utility::skipWhitespace(id, pos, TRUE);
441	if (pos != id.length()) {
442	goto FAIL;
443	}
444
445	return TRUE;
446
447	FAIL:
448	UObjectDeleter *save = list.setDeleter(_deleteSingleID);
449	list.removeAllElements();
450	list.setDeleter(save);
451	delete globalFilter;
452	globalFilter = NULL;
453	return FALSE;
454	}
455
456	/**
457	* Convert the elements of the 'list' vector, which are SingleID
458	* objects, into actual Transliterator objects. In the course of
459	* this, some (or all) entries may be removed. If all entries
460	* are removed, the NULL transliterator will be added.
461	*
462	* Delete entries with empty basicIDs; these are generated by
463	* elements like "(A)" in the forward direction, or "A()" in
464	* the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
465	* SingleID entries to actual transliterators.
466	*
467	* @param list vector of SingleID objects. On exit, vector
468	* of one or more Transliterators.
469	* @return new value of insertIndex. The index will shift if
470	* there are empty items, like "(Lower)", with indices less than
471	* insertIndex.
472	*/
473	void TransliteratorIDParser::instantiateList(UVector& list,
474	UErrorCode& ec) {
475	UVector tlist(ec);
476	if (U_FAILURE(ec)) {
477	goto RETURN;
478	}
479	tlist.setDeleter(_deleteTransliteratorTrIDPars);
480
481	Transliterator* t;
482	int32_t i;
483	for (i=`0`; i<=list.size(); ++i) { // [sic]: i<=list.size()
484	// We run the loop too long by one, so we can
485	// do an insert after the last element
486	if (i==list.size()) {
487	break;
488	}
489
490	SingleID* single = (SingleID*) list.elementAt(i);
491	if (single->basicID.length() != `0`) {
492	t = single->createInstance();
493	if (t == NULL) {
494	ec = U_INVALID_ID;
495	goto RETURN;
496	}
497	tlist.addElement(t, ec);
498	if (U_FAILURE(ec)) {
499	delete t;
500	goto RETURN;
501	}
502	}
503	}
504
505	// An empty list is equivalent to a NULL transliterator.
506	if (tlist.size() == `0`) {
507	t = createBasicInstance(UnicodeString (TRUE, ANY_NULL, `8`), NULL);
508	if (t == NULL) {
509	// Should never happen
510	ec = U_INTERNAL_TRANSLITERATOR_ERROR;
511	}
512	tlist.addElement(t, ec);
513	if (U_FAILURE(ec)) {
514	delete t;
515	}
516	}
517
518	RETURN:
519
520	UObjectDeleter *save = list.setDeleter(_deleteSingleID);
521	list.removeAllElements();
522
523	if (U_SUCCESS(ec)) {
524	list.setDeleter(_deleteTransliteratorTrIDPars);
525
526	while (tlist.size() > `0`) {
527	t = (Transliterator*) tlist.orphanElementAt(`0`);
528	list.addElement(t, ec);
529	if (U_FAILURE(ec)) {
530	delete t;
531	list.removeAllElements();
532	break;
533	}
534	}
535	}
536
537	list.setDeleter(save);
538	}
539
540	/**
541	* Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
542	* S-T/V, or S/V-T. If the source is missing, return a source of
543	* ANY.
544	* @param id the id string, in any of several forms
545	* @return an array of 4 strings: source, target, variant, and
546	* isSourcePresent. If the source is not present, ANY will be
547	* given as the source, and isSourcePresent will be NULL. Otherwise
548	* isSourcePresent will be non-NULL. The target may be empty if the
549	* id is not well-formed. The variant may be empty.
550	*/
551	void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
552	UnicodeString& source,
553	UnicodeString& target,
554	UnicodeString& variant,
555	UBool& isSourcePresent) {
556	source.setTo(ANY, `3`);
557	target.truncate(`0`);
558	variant.truncate(`0`);
559
560	int32_t sep = id.indexOf(TARGET_SEP);
561	int32_t var = id.indexOf(VARIANT_SEP);
562	if (var < `0`) {
563	var = id.length();
564	}
565	isSourcePresent = FALSE;
566
567	if (sep < `0`) {
568	// Form: T/V or T (or /V)
569	id.extractBetween(`0`, var, target);
570	id.extractBetween(var, id.length(), variant);
571	} else if (sep < var) {
572	// Form: S-T/V or S-T (or -T/V or -T)
573	if (sep > `0`) {
574	id.extractBetween(`0`, sep, source);
575	isSourcePresent = TRUE;
576	}
577	id.extractBetween(++sep, var, target);
578	id.extractBetween(var, id.length(), variant);
579	} else {
580	// Form: (S/V-T or /V-T)
581	if (var > `0`) {
582	id.extractBetween(`0`, var, source);
583	isSourcePresent = TRUE;
584	}
585	id.extractBetween(var, sep++, variant);
586	id.extractBetween(sep, id.length(), target);
587	}
588
589	if (variant.length() > `0`) {
590	variant.remove(`0`, `1`);
591	}
592	}
593
594	/**
595	* Given source, target, and variant strings, concatenate them into a
596	* full ID. If the source is empty, then "Any" will be used for the
597	* source, so the ID will always be of the form s-t/v or s-t.
598	*/
599	void TransliteratorIDParser::STVtoID(const UnicodeString& source,
600	const UnicodeString& target,
601	const UnicodeString& variant,
602	UnicodeString& id) {
603	id = source;
604	if (id.length() == `0`) {
605	id.setTo(ANY, `3`);
606	}
607	id.append(TARGET_SEP).append(target);
608	if (variant.length() != `0`) {
609	id.append(VARIANT_SEP).append(variant);
610	}
611	// NUL-terminate the ID string for getTerminatedBuffer.
612	// This prevents valgrind and Purify warnings.
613	id.append((UChar)`0`);
614	id.truncate(id.length()-`1`);
615	}
616
617	/**
618	* Register two targets as being inverses of one another. For
619	* example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes
620	* Transliterator to form the following inverse relationships:
621	*
622	* <pre>NFC => NFD
623	* Any-NFC => Any-NFD
624	* NFD => NFC
625	* Any-NFD => Any-NFC</pre>
626	*
627	* (Without the special inverse registration, the inverse of NFC
628	* would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
629	* that the presence or absence of "Any-" is preserved.
630	*
631	* <p>The relationship is symmetrical; registering (a, b) is
632	* equivalent to registering (b, a).
633	*
634	* <p>The relevant IDs must still be registered separately as
635	* factories or classes.
636	*
637	* <p>Only the targets are specified. Special inverses always
638	* have the form Any-Target1 <=> Any-Target2. The target should
639	* have canonical casing (the casing desired to be produced when
640	* an inverse is formed) and should contain no whitespace or other
641	* extraneous characters.
642	*
643	* @param target the target against which to register the inverse
644	* @param inverseTarget the inverse of target, that is
645	* Any-target.getInverse() => Any-inverseTarget
646	* @param bidirectional if TRUE, register the reverse relation
647	* as well, that is, Any-inverseTarget.getInverse() => Any-target
648	*/
649	void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target,
650	const UnicodeString& inverseTarget,
651	UBool bidirectional,
652	UErrorCode &status) {
653	umtx_initOnce(gSpecialInversesInitOnce, init, status);
654	if (U_FAILURE(status)) {
655	return;
656	}
657
658	// If target == inverseTarget then force bidirectional => FALSE
659	if (bidirectional && `0`==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) {
660	bidirectional = FALSE;
661	}
662
663	Mutex lock(&LOCK);
664
665	UnicodeString tempus = new* UnicodeString (inverseTarget); // Used for null pointer check before usage.
666	if (tempus == NULL) {
667	status = U_MEMORY_ALLOCATION_ERROR;
668	return;
669	}
670	SPECIAL_INVERSES->put(target, tempus, status);
671	if (bidirectional) {
672	tempus = new UnicodeString (target);
673	if (tempus == NULL) {
674	status = U_MEMORY_ALLOCATION_ERROR;
675	return;
676	}
677	SPECIAL_INVERSES->put(inverseTarget, tempus, status);
678	}
679	}
680
681	//----------------------------------------------------------------
682	// Private implementation
683	//----------------------------------------------------------------
684
685	/**
686	* Parse an ID into component pieces. Take IDs of the form T,
687	* T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a
688	* source of ANY.
689	* @param id the id string, in any of several forms
690	* @param pos INPUT-OUTPUT parameter. On input, pos is the
691	* offset of the first character to parse in id. On output,
692	* pos is the offset after the last parsed character. If the
693	* parse failed, pos will be unchanged.
694	* @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed
695	* at any location between specs or delimiters, and is returned
696	* as the fifth string in the array.
697	* @return a Specs object, or NULL if the parse failed. If
698	* neither source nor target was seen in the parsed id, then the
699	* parse fails. If allowFilter is TRUE, then the parsed filter
700	* pattern is returned in the Specs object, otherwise the returned
701	* filter reference is NULL. If the parse fails for any reason
702	* NULL is returned.
703	*/
704	TransliteratorIDParser::Specs*
705	TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
706	UBool allowFilter) {
707	UnicodeString first;
708	UnicodeString source;
709	UnicodeString target;
710	UnicodeString variant;
711	UnicodeString filter;
712	UChar delimiter = `0`;
713	int32_t specCount = `0`;
714	int32_t start = pos;
715
716	// This loop parses one of the following things with each
717	// pass: a filter, a delimiter character (either '-' or '/'),
718	// or a spec (source, target, or variant).
719	for (;;) {
720	ICU_Utility::skipWhitespace(id, pos, TRUE);
721	if (pos == id.length()) {
722	break;
723	}
724
725	// Parse filters
726	if (allowFilter && filter.length() == `0` &&
727	UnicodeSet::resemblesPattern(id, pos)) {
728
729	ParsePosition ppos(pos);
730	UErrorCode ec = U_ZERO_ERROR;
731	UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec);
732	if (U_FAILURE(ec)) {
733	pos = start;
734	return NULL;
735	}
736	id.extractBetween(pos, ppos.getIndex(), filter);
737	pos = ppos.getIndex();
738	continue;
739	}
740
741	if (delimiter == `0`) {
742	UChar c = id.charAt(pos);
743	if ((c == TARGET_SEP && target.length() == `0`) \|\|
744	(c == VARIANT_SEP && variant.length() == `0`)) {
745	delimiter = c;
746	++pos;
747	continue;
748	}
749	}
750
751	// We are about to try to parse a spec with no delimiter
752	// when we can no longer do so (we can only do so at the
753	// start); break.
754	if (delimiter == `0` && specCount > `0`) {
755	break;
756	}
757
758	UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
759	if (spec.length() == `0`) {
760	// Note that if there was a trailing delimiter, we
761	// consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
762	// are legal.
763	break;
764	}
765
766	switch (delimiter) {
767	case `0`:
768	first = spec;
769	break;
770	case TARGET_SEP:
771	target = spec;
772	break;
773	case VARIANT_SEP:
774	variant = spec;
775	break;
776	}
777	++specCount;
778	delimiter = `0`;
779	}
780
781	// A spec with no prior character is either source or target,
782	// depending on whether an explicit "-target" was seen.
783	if (first.length() != `0`) {
784	if (target.length() == `0`) {
785	target = first;
786	} else {
787	source = first;
788	}
789	}
790
791	// Must have either source or target
792	if (source.length() == `0` && target.length() == `0`) {
793	pos = start;
794	return NULL;
795	}
796
797	// Empty source or target defaults to ANY
798	UBool sawSource = TRUE;
799	if (source.length() == `0`) {
800	source.setTo(ANY, `3`);
801	sawSource = FALSE;
802	}
803	if (target.length() == `0`) {
804	target.setTo(ANY, `3`);
805	}
806
807	return new Specs (source, target, variant, sawSource, filter);
808	}
809
810	/**
811	* Givens a Spec object, convert it to a SingleID object. The
812	* Spec object is a more unprocessed parse result. The SingleID
813	* object contains information about canonical and basic IDs.
814	* @return a SingleID; never returns NULL. Returned object always
815	* has 'filter' field of NULL.
816	*/
817	TransliteratorIDParser::SingleID*
818	TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
819	UnicodeString canonID;
820	UnicodeString basicID;
821	UnicodeString basicPrefix;
822	if (specs != NULL) {
823	UnicodeString buf;
824	if (dir == FORWARD) {
825	if (specs->sawSource) {
826	buf.append(specs->source).append(TARGET_SEP);
827	} else {
828	basicPrefix = specs->source;
829	basicPrefix.append(TARGET_SEP);
830	}
831	buf.append(specs->target);
832	} else {
833	buf.append(specs->target).append(TARGET_SEP).append(specs->source);
834	}
835	if (specs->variant.length() != `0`) {
836	buf.append(VARIANT_SEP).append(specs->variant);
837	}
838	basicID = basicPrefix;
839	basicID.append(buf);
840	if (specs->filter.length() != `0`) {
841	buf.insert(`0`, specs->filter);
842	}
843	canonID = buf;
844	}
845	return new SingleID (canonID, basicID);
846	}
847
848	/**
849	* Given a Specs object, return a SingleID representing the
850	* special inverse of that ID. If there is no special inverse
851	* then return NULL.
852	* @return a SingleID or NULL. Returned object always has
853	* 'filter' field of NULL.
854	*/
855	TransliteratorIDParser::SingleID*
856	TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) {
857	if (`0`!=specs.source.caseCompare(ANY, `3`, U_FOLD_CASE_DEFAULT)) {
858	return NULL;
859	}
860	umtx_initOnce(gSpecialInversesInitOnce, init, status);
861	if (U_FAILURE(status)) {
862	return NULL;
863	}
864
865	UnicodeString* inverseTarget;
866
867	umtx_lock(&LOCK);
868	inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target);
869	umtx_unlock(&LOCK);
870
871	if (inverseTarget != NULL) {
872	// If the original ID contained "Any-" then make the
873	// special inverse "Any-Foo"; otherwise make it "Foo".
874	// So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
875	UnicodeString buf;
876	if (specs.filter.length() != `0`) {
877	buf.append(specs.filter);
878	}
879	if (specs.sawSource) {
880	buf.append(ANY, `3`).append(TARGET_SEP);
881	}
882	buf.append(*inverseTarget);
883
884	UnicodeString basicID(TRUE, ANY, `3`);
885	basicID.append(TARGET_SEP).append(*inverseTarget);
886
887	if (specs.variant.length() != `0`) {
888	buf.append(VARIANT_SEP).append(specs.variant);
889	basicID.append(VARIANT_SEP).append(specs.variant);
890	}
891	return new SingleID (buf, basicID);
892	}
893	return NULL;
894	}
895
896	/**
897	* Glue method to get around access problems in C++. This would
898	* ideally be inline but we want to avoid a circular header
899	* dependency.
900	*/
901	Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
902	return Transliterator::createBasicInstance(id, canonID);
903	}
904
905	/**
906	* Initialize static memory. Called through umtx_initOnce only.
907	*/
908	void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) {
909	U_ASSERT(SPECIAL_INVERSES == NULL);
910	ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
911
912	SPECIAL_INVERSES = new Hashtable (TRUE, status);
913	if (SPECIAL_INVERSES == NULL) {
914	status = U_MEMORY_ALLOCATION_ERROR;
915	return;
916	}
917	SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject);
918	}
919
920	/**
921	* Free static memory.
922	*/
923	void TransliteratorIDParser::cleanup() {
924	if (SPECIAL_INVERSES) {
925	delete SPECIAL_INVERSES;
926	SPECIAL_INVERSES = NULL;
927	}
928	gSpecialInversesInitOnce.reset();
929	}
930
931	U_NAMESPACE_END
932
933	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
934
935	//eof
936

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/tridpars.cpp