collationruleparser.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/collationruleparser.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2013-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* collationruleparser.cpp
9	*
10	* (replaced the former ucol_tok.cpp)
11	*
12	* created on: 2013apr10
13	* created by: Markus W. Scherer
14	*/
15
16	#include "unicode/utypes.h"
17
18	#if !UCONFIG_NO_COLLATION
19
20	#include "unicode/normalizer2.h"
21	#include "unicode/parseerr.h"
22	#include "unicode/uchar.h"
23	#include "unicode/ucol.h"
24	#include "unicode/uloc.h"
25	#include "unicode/unistr.h"
26	#include "unicode/utf16.h"
27	#include "charstr.h"
28	#include "cmemory.h"
29	#include "collation.h"
30	#include "collationdata.h"
31	#include "collationruleparser.h"
32	#include "collationsettings.h"
33	#include "collationtailoring.h"
34	#include "cstring.h"
35	#include "patternprops.h"
36	#include "uassert.h"
37	#include "uvectr32.h"
38
39	U_NAMESPACE_BEGIN
40
41	namespace {
42
43	static const UChar BEFORE[] = { `0x5b`, `0x62`, `0x65`, `0x66`, `0x6f`, `0x72`, `0x65`, `0` }; // "[before"
44	const int32_t BEFORE_LENGTH = `7`;
45
46	} // namespace
47
48	CollationRuleParser::Sink::~Sink() {}
49
50	void
51	CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52
53	void
54	CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55
56	CollationRuleParser::Importer::~Importer() {}
57
58	CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59	: nfd(*Normalizer2::getNFDInstance(errorCode)),
60	nfc(*Normalizer2::getNFCInstance(errorCode)),
61	rules(NULL), baseData(base), settings(NULL),
62	parseError(NULL), errorReason(NULL),
63	sink(NULL), importer(NULL),
64	ruleIndex(`0`) {
65	}
66
67	CollationRuleParser::~CollationRuleParser() {
68	}
69
70	void
71	CollationRuleParser::parse(const UnicodeString &ruleString,
72	CollationSettings &outSettings,
73	UParseError *outParseError,
74	UErrorCode &errorCode) {
75	if(U_FAILURE(errorCode)) { return; }
76	settings = &outSettings;
77	parseError = outParseError;
78	if(parseError != NULL) {
79	parseError->line = `0`;
80	parseError->offset = -`1`;
81	parseError->preContext[`0`] = `0`;
82	parseError->postContext[`0`] = `0`;
83	}
84	errorReason = NULL;
85	parse(ruleString, errorCode);
86	}
87
88	void
89	CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90	if(U_FAILURE(errorCode)) { return; }
91	rules = &ruleString;
92	ruleIndex = `0`;
93
94	while(ruleIndex < rules->length()) {
95	UChar c = rules->charAt(ruleIndex);
96	if(PatternProps::isWhiteSpace(c)) {
97	++ruleIndex;
98	continue;
99	}
100	switch(c) {
101	case `0x26`: // '&'
102	parseRuleChain(errorCode);
103	break;
104	case `0x5b`: // '['
105	parseSetting(errorCode);
106	break;
107	case `0x23`: // '#' starts a comment, until the end of the line
108	ruleIndex = skipComment(ruleIndex + `1`);
109	break;
110	case `0x40`: // '@' is equivalent to [backwards 2]
111	settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112	UCOL_ON, `0`, errorCode);
113	++ruleIndex;
114	break;
115	case `0x21`: // '!' used to turn on Thai/Lao character reversal
116	// Accept but ignore. The root collator has contractions
117	// that are equivalent to the character reversal, where appropriate.
118	++ruleIndex;
119	break;
120	default:
121	setParseError("expected a reset or setting or comment", errorCode);
122	break;
123	}
124	if(U_FAILURE(errorCode)) { return; }
125	}
126	}
127
128	void
129	CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130	int32_t resetStrength = parseResetAndPosition(errorCode);
131	UBool isFirstRelation = TRUE;
132	for(;;) {
133	int32_t result = parseRelationOperator(errorCode);
134	if(U_FAILURE(errorCode)) { return; }
135	if(result < `0`) {
136	if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == `0x23`) {
137	// '#' starts a comment, until the end of the line
138	ruleIndex = skipComment(ruleIndex + `1`);
139	continue;
140	}
141	if(isFirstRelation) {
142	setParseError("reset not followed by a relation", errorCode);
143	}
144	return;
145	}
146	int32_t strength = result & STRENGTH_MASK;
147	if(resetStrength < UCOL_IDENTICAL) {
148	// reset-before rule chain
149	if(isFirstRelation) {
150	if(strength != resetStrength) {
151	setParseError("reset-before strength differs from its first relation", errorCode);
152	return;
153	}
154	} else {
155	if(strength < resetStrength) {
156	setParseError("reset-before strength followed by a stronger relation", errorCode);
157	return;
158	}
159	}
160	}
161	int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
162	if((result & STARRED_FLAG) == `0`) {
163	parseRelationStrings(strength, i, errorCode);
164	} else {
165	parseStarredCharacters(strength, i, errorCode);
166	}
167	if(U_FAILURE(errorCode)) { return; }
168	isFirstRelation = FALSE;
169	}
170	}
171
172	int32_t
173	CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174	if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175	int32_t i = skipWhiteSpace(ruleIndex + `1`);
176	int32_t j;
177	UChar c;
178	int32_t resetStrength;
179	if(rules->compare(i, BEFORE_LENGTH, BEFORE, `0`, BEFORE_LENGTH) == `0` &&
180	(j = i + BEFORE_LENGTH) < rules->length() &&
181	PatternProps::isWhiteSpace(rules->charAt(j)) &&
182	((j = skipWhiteSpace(j + `1`)) + `1`) < rules->length() &&
183	`0x31` <= (c = rules->charAt(j)) && c <= `0x33` &&
184	rules->charAt(j + `1`) == `0x5d`) {
185	// &[before n] with n=1 or 2 or 3
186	resetStrength = UCOL_PRIMARY + (c - `0x31`);
187	i = skipWhiteSpace(j + `2`);
188	} else {
189	resetStrength = UCOL_IDENTICAL;
190	}
191	if(i >= rules->length()) {
192	setParseError("reset without position", errorCode);
193	return UCOL_DEFAULT;
194	}
195	UnicodeString str;
196	if(rules->charAt(i) == `0x5b`) { // '['
197	i = parseSpecialPosition(i, str, errorCode);
198	} else {
199	i = parseTailoringString(i, str, errorCode);
200	}
201	sink->addReset(resetStrength, str, errorReason, errorCode);
202	if(U_FAILURE(errorCode)) { setErrorContext(); }
203	ruleIndex = i;
204	return resetStrength;
205	}
206
207	int32_t
208	CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209	if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210	ruleIndex = skipWhiteSpace(ruleIndex);
211	if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212	int32_t strength;
213	int32_t i = ruleIndex;
214	UChar c = rules->charAt(i++);
215	switch(c) {
216	case `0x3c`: // '<'
217	if(i < rules->length() && rules->charAt(i) == `0x3c`) { // <<
218	++i;
219	if(i < rules->length() && rules->charAt(i) == `0x3c`) { // <<<
220	++i;
221	if(i < rules->length() && rules->charAt(i) == `0x3c`) { // <<<<
222	++i;
223	strength = UCOL_QUATERNARY;
224	} else {
225	strength = UCOL_TERTIARY;
226	}
227	} else {
228	strength = UCOL_SECONDARY;
229	}
230	} else {
231	strength = UCOL_PRIMARY;
232	}
233	if(i < rules->length() && rules->charAt(i) == `0x2a`) { // ''*
234	++i;
235	strength \|= STARRED_FLAG;
236	}
237	break;
238	case `0x3b`: // ';' same as <<
239	strength = UCOL_SECONDARY;
240	break;
241	case `0x2c`: // ',' same as <<<
242	strength = UCOL_TERTIARY;
243	break;
244	case `0x3d`: // '='
245	strength = UCOL_IDENTICAL;
246	if(i < rules->length() && rules->charAt(i) == `0x2a`) { // ''*
247	++i;
248	strength \|= STARRED_FLAG;
249	}
250	break;
251	default:
252	return UCOL_DEFAULT;
253	}
254	return ((i - ruleIndex) << OFFSET_SHIFT) \| strength;
255	}
256
257	void
258	CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259	// Parse
260	// prefix \| str / extension
261	// where prefix and extension are optional.
262	UnicodeString prefix, str, extension;
263	i = parseTailoringString(i, str, errorCode);
264	if(U_FAILURE(errorCode)) { return; }
265	UChar next = (i < rules->length()) ? rules->charAt(i) : `0`;
266	if(next == `0x7c`) { // '\|' separates the context prefix from the string.
267	prefix = str;
268	i = parseTailoringString(i + `1`, str, errorCode);
269	if(U_FAILURE(errorCode)) { return; }
270	next = (i < rules->length()) ? rules->charAt(i) : `0`;
271	}
272	if(next == `0x2f`) { // '/' separates the string from the extension.
273	i = parseTailoringString(i + `1`, extension, errorCode);
274	}
275	if(!prefix.isEmpty()) {
276	UChar32 prefix0 = prefix.char32At(`0`);
277	UChar32 c = str.char32At(`0`);
278	if(!nfc.hasBoundaryBefore(prefix0) \|\| !nfc.hasBoundaryBefore(c)) {
279	setParseError("in 'prefix\|str', prefix and str must each start with an NFC boundary",
280	errorCode);
281	return;
282	}
283	}
284	sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285	if(U_FAILURE(errorCode)) { setErrorContext(); }
286	ruleIndex = i;
287	}
288
289	void
290	CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291	UnicodeString empty, raw;
292	i = parseString(skipWhiteSpace(i), raw, errorCode);
293	if(U_FAILURE(errorCode)) { return; }
294	if(raw.isEmpty()) {
295	setParseError("missing starred-relation string", errorCode);
296	return;
297	}
298	UChar32 prev = -`1`;
299	int32_t j = `0`;
300	for(;;) {
301	while(j < raw.length()) {
302	UChar32 c = raw.char32At(j);
303	if(!nfd.isInert(c)) {
304	setParseError("starred-relation string is not all NFD-inert", errorCode);
305	return;
306	}
307	sink->addRelation(strength, empty, UnicodeString (c), empty, errorReason, errorCode);
308	if(U_FAILURE(errorCode)) {
309	setErrorContext();
310	return;
311	}
312	j += U16_LENGTH(c);
313	prev = c;
314	}
315	if(i >= rules->length() \|\| rules->charAt(i) != `0x2d`) { // '-'
316	break;
317	}
318	if(prev < `0`) {
319	setParseError("range without start in starred-relation string", errorCode);
320	return;
321	}
322	i = parseString(i + `1`, raw, errorCode);
323	if(U_FAILURE(errorCode)) { return; }
324	if(raw.isEmpty()) {
325	setParseError("range without end in starred-relation string", errorCode);
326	return;
327	}
328	UChar32 c = raw.char32At(`0`);
329	if(c < prev) {
330	setParseError("range start greater than end in starred-relation string", errorCode);
331	return;
332	}
333	// range prev-c
334	UnicodeString s;
335	while(++prev <= c) {
336	if(!nfd.isInert(prev)) {
337	setParseError("starred-relation string range is not all NFD-inert", errorCode);
338	return;
339	}
340	if(U_IS_SURROGATE(prev)) {
341	setParseError("starred-relation string range contains a surrogate", errorCode);
342	return;
343	}
344	if(`0xfffd` <= prev && prev <= `0xffff`) {
345	setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346	return;
347	}
348	s.setTo(prev);
349	sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350	if(U_FAILURE(errorCode)) {
351	setErrorContext();
352	return;
353	}
354	}
355	prev = -`1`;
356	j = U16_LENGTH(c);
357	}
358	ruleIndex = skipWhiteSpace(i);
359	}
360
361	int32_t
362	CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363	i = parseString(skipWhiteSpace(i), raw, errorCode);
364	if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365	setParseError("missing relation string", errorCode);
366	}
367	return skipWhiteSpace(i);
368	}
369
370	int32_t
371	CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372	if(U_FAILURE(errorCode)) { return i; }
373	raw.remove();
374	while(i < rules->length()) {
375	UChar32 c = rules->charAt(i++);
376	if(isSyntaxChar(c)) {
377	if(c == `0x27`) { // apostrophe
378	if(i < rules->length() && rules->charAt(i) == `0x27`) {
379	// Double apostrophe, encodes a single one.
380	raw.append((UChar)`0x27`);
381	++i;
382	continue;
383	}
384	// Quote literal text until the next single apostrophe.
385	for(;;) {
386	if(i == rules->length()) {
387	setParseError("quoted literal text missing terminating apostrophe", errorCode);
388	return i;
389	}
390	c = rules->charAt(i++);
391	if(c == `0x27`) {
392	if(i < rules->length() && rules->charAt(i) == `0x27`) {
393	// Double apostrophe inside quoted literal text,
394	// still encodes a single apostrophe.
395	++i;
396	} else {
397	break;
398	}
399	}
400	raw.append((UChar)c);
401	}
402	} else if(c == `0x5c`) { // backslash
403	if(i == rules->length()) {
404	setParseError("backslash escape at the end of the rule string", errorCode);
405	return i;
406	}
407	c = rules->char32At(i);
408	raw.append(c);
409	i += U16_LENGTH(c);
410	} else {
411	// Any other syntax character terminates a string.
412	--i;
413	break;
414	}
415	} else if(PatternProps::isWhiteSpace(c)) {
416	// Unquoted white space terminates a string.
417	--i;
418	break;
419	} else {
420	raw.append((UChar)c);
421	}
422	}
423	for(int32_t j = `0`; j < raw.length();) {
424	UChar32 c = raw.char32At(j);
425	if(U_IS_SURROGATE(c)) {
426	setParseError("string contains an unpaired surrogate", errorCode);
427	return i;
428	}
429	if(`0xfffd` <= c && c <= `0xffff`) {
430	setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431	return i;
432	}
433	j += U16_LENGTH(c);
434	}
435	return i;
436	}
437
namespace {

// Names of the special reset positions that may appear in brackets
// after '&'. The array index of a name is the offset that
// parseSpecialPosition() adds to POS_BASE, so the order matters.
static const char *const positions[] = {
    "first tertiary ignorable", "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable", "last primary ignorable",
    "first variable", "last variable",
    "first regular", "last regular",
    "first implicit", "last implicit",
    "first trailing", "last trailing"
};

}  // namespace
458
459	int32_t
460	CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461	if(U_FAILURE(errorCode)) { return `0`; }
462	UnicodeString raw;
463	int32_t j = readWords(i + `1`, raw);
464	if(j > i && rules->charAt(j) == `0x5d` && !raw.isEmpty()) { // words end with ]
465	++j;
466	for(int32_t pos = `0`; pos < UPRV_LENGTHOF(positions); ++pos) {
467	if(raw == UnicodeString (positions[pos], -`1`, US_INV)) {
468	str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469	return j;
470	}
471	}
472	if(raw == UNICODE_STRING_SIMPLE("top")) {
473	str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474	return j;
475	}
476	if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477	str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478	return j;
479	}
480	}
481	setParseError("not a valid special reset position", errorCode);
482	return i;
483	}
484
485	void
486	CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487	if(U_FAILURE(errorCode)) { return; }
488	UnicodeString raw;
489	int32_t i = ruleIndex + `1`;
490	int32_t j = readWords(i, raw);
491	if(j <= i \|\| raw.isEmpty()) {
492	setParseError("expected a setting/option at '['", errorCode);
493	}
494	if(rules->charAt(j) == `0x5d`) { // words end with ]
495	++j;
496	if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497	(raw.length() == `7` \|\| raw.charAt(`7`) == `0x20`)) {
498	parseReordering(raw, errorCode);
499	ruleIndex = j;
500	return;
501	}
502	if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503	settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504	UCOL_ON, `0`, errorCode);
505	ruleIndex = j;
506	return;
507	}
508	UnicodeString v;
509	int32_t valueIndex = raw.lastIndexOf((UChar)`0x20`);
510	if(valueIndex >= `0`) {
511	v.setTo(raw, valueIndex + `1`);
512	raw.truncate(valueIndex);
513	}
514	if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == `1`) {
515	int32_t value = UCOL_DEFAULT;
516	UChar c = v.charAt(`0`);
517	if(`0x31` <= c && c <= `0x34`) { // 1..4
518	value = UCOL_PRIMARY + (c - `0x31`);
519	} else if(c == `0x49`) { // 'I'
520	value = UCOL_IDENTICAL;
521	}
522	if(value != UCOL_DEFAULT) {
523	settings->setStrength(value, `0`, errorCode);
524	ruleIndex = j;
525	return;
526	}
527	} else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528	UColAttributeValue value = UCOL_DEFAULT;
529	if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530	value = UCOL_NON_IGNORABLE;
531	} else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532	value = UCOL_SHIFTED;
533	}
534	if(value != UCOL_DEFAULT) {
535	settings->setAlternateHandling(value, `0`, errorCode);
536	ruleIndex = j;
537	return;
538	}
539	} else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540	int32_t value = UCOL_DEFAULT;
541	if(v == UNICODE_STRING_SIMPLE("space")) {
542	value = CollationSettings::MAX_VAR_SPACE;
543	} else if(v == UNICODE_STRING_SIMPLE("punct")) {
544	value = CollationSettings::MAX_VAR_PUNCT;
545	} else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546	value = CollationSettings::MAX_VAR_SYMBOL;
547	} else if(v == UNICODE_STRING_SIMPLE("currency")) {
548	value = CollationSettings::MAX_VAR_CURRENCY;
549	}
550	if(value != UCOL_DEFAULT) {
551	settings->setMaxVariable(value, `0`, errorCode);
552	settings->variableTop = baseData->getLastPrimaryForGroup(
553	UCOL_REORDER_CODE_FIRST + value);
554	U_ASSERT(settings->variableTop != `0`);
555	ruleIndex = j;
556	return;
557	}
558	} else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559	UColAttributeValue value = UCOL_DEFAULT;
560	if(v == UNICODE_STRING_SIMPLE("off")) {
561	value = UCOL_OFF;
562	} else if(v == UNICODE_STRING_SIMPLE("lower")) {
563	value = UCOL_LOWER_FIRST;
564	} else if(v == UNICODE_STRING_SIMPLE("upper")) {
565	value = UCOL_UPPER_FIRST;
566	}
567	if(value != UCOL_DEFAULT) {
568	settings->setCaseFirst(value, `0`, errorCode);
569	ruleIndex = j;
570	return;
571	}
572	} else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573	UColAttributeValue value = getOnOffValue(v);
574	if(value != UCOL_DEFAULT) {
575	settings->setFlag(CollationSettings::CASE_LEVEL, value, `0`, errorCode);
576	ruleIndex = j;
577	return;
578	}
579	} else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580	UColAttributeValue value = getOnOffValue(v);
581	if(value != UCOL_DEFAULT) {
582	settings->setFlag(CollationSettings::CHECK_FCD, value, `0`, errorCode);
583	ruleIndex = j;
584	return;
585	}
586	} else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587	UColAttributeValue value = getOnOffValue(v);
588	if(value != UCOL_DEFAULT) {
589	settings->setFlag(CollationSettings::NUMERIC, value, `0`, errorCode);
590	ruleIndex = j;
591	return;
592	}
593	} else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594	UColAttributeValue value = getOnOffValue(v);
595	if(value != UCOL_DEFAULT) {
596	if(value == UCOL_ON) {
597	setParseError("[hiraganaQ on] is not supported", errorCode);
598	}
599	ruleIndex = j;
600	return;
601	}
602	} else if(raw == UNICODE_STRING_SIMPLE("import")) {
603	CharString lang;
604	lang.appendInvariantChars(v, errorCode);
605	if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606	// BCP 47 language tag -> ICU locale ID
607	char localeID[ULOC_FULLNAME_CAPACITY];
608	int32_t parsedLength;
609	int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610	&parsedLength, &errorCode);
611	if(U_FAILURE(errorCode) \|\|
612	parsedLength != lang.length() \|\| length >= ULOC_FULLNAME_CAPACITY) {
613	errorCode = U_ZERO_ERROR;
614	setParseError("expected language tag in [import langTag]", errorCode);
615	return;
616	}
617	// localeID minus all keywords
618	char baseID[ULOC_FULLNAME_CAPACITY];
619	length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620	if(U_FAILURE(errorCode) \|\| length >= ULOC_KEYWORDS_CAPACITY) {
621	errorCode = U_ZERO_ERROR;
622	setParseError("expected language tag in [import langTag]", errorCode);
623	return;
624	}
625	if(length == `0`) {
626	uprv_strcpy(baseID, "root");
627	} else if(*baseID == `'_'`) {
628	uprv_memmove(baseID + `3`, baseID, length + `1`);
629	uprv_memcpy(baseID, "und", `3`);
630	}
631	// @collation=type, or length=0 if not specified
632	char collationType[ULOC_KEYWORDS_CAPACITY];
633	length = uloc_getKeywordValue(localeID, "collation",
634	collationType, ULOC_KEYWORDS_CAPACITY,
635	&errorCode);
636	if(U_FAILURE(errorCode) \|\| length >= ULOC_KEYWORDS_CAPACITY) {
637	errorCode = U_ZERO_ERROR;
638	setParseError("expected language tag in [import langTag]", errorCode);
639	return;
640	}
641	if(importer == NULL) {
642	setParseError("[import langTag] is not supported", errorCode);
643	} else {
644	UnicodeString importedRules;
645	importer->getRules(baseID, length > `0` ? collationType : "standard",
646	importedRules, errorReason, errorCode);
647	if(U_FAILURE(errorCode)) {
648	if(errorReason == NULL) {
649	errorReason = "[import langTag] failed";
650	}
651	setErrorContext();
652	return;
653	}
654	const UnicodeString *outerRules = rules;
655	int32_t outerRuleIndex = ruleIndex;
656	parse(importedRules, errorCode);
657	if(U_FAILURE(errorCode)) {
658	if(parseError != NULL) {
659	parseError->offset = outerRuleIndex;
660	}
661	}
662	rules = outerRules;
663	ruleIndex = j;
664	}
665	return;
666	}
667	} else if(rules->charAt(j) == `0x5b`) { // words end with [
668	UnicodeSet set;
669	j = parseUnicodeSet(j, set, errorCode);
670	if(U_FAILURE(errorCode)) { return; }
671	if(raw == UNICODE_STRING_SIMPLE("optimize")) {
672	sink->optimize(set, errorReason, errorCode);
673	if(U_FAILURE(errorCode)) { setErrorContext(); }
674	ruleIndex = j;
675	return;
676	} else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
677	sink->suppressContractions(set, errorReason, errorCode);
678	if(U_FAILURE(errorCode)) { setErrorContext(); }
679	ruleIndex = j;
680	return;
681	}
682	}
683	setParseError("not a valid setting/option", errorCode);
684	}
685
686	void
687	CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
688	if(U_FAILURE(errorCode)) { return; }
689	int32_t i = `7`; // after "reorder"
690	if(i == raw.length()) {
691	// empty [reorder] with no codes
692	settings->resetReordering();
693	return;
694	}
695	// Parse the codes in [reorder aa bb cc].
696	UVector32 reorderCodes(errorCode);
697	if(U_FAILURE(errorCode)) { return; }
698	CharString word;
699	while(i < raw.length()) {
700	++i; // skip the word-separating space
701	int32_t limit = raw.indexOf((UChar)`0x20`, i);
702	if(limit < `0`) { limit = raw.length(); }
703	word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
704	if(U_FAILURE(errorCode)) { return; }
705	int32_t code = getReorderCode(word.data());
706	if(code < `0`) {
707	setParseError("unknown script or reorder code", errorCode);
708	return;
709	}
710	reorderCodes.addElement(code, errorCode);
711	if(U_FAILURE(errorCode)) { return; }
712	i = limit;
713	}
714	settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
715	}
716
// Names of the special (non-script) reorder groups.
// getReorderCode() maps the name at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
720
721	int32_t
722	CollationRuleParser::getReorderCode(const char *word) {
723	for(int32_t i = `0`; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
724	if(uprv_stricmp(word, gSpecialReorderCodes[i]) == `0`) {
725	return UCOL_REORDER_CODE_FIRST + i;
726	}
727	}
728	int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
729	if(script >= `0`) {
730	return script;
731	}
732	if(uprv_stricmp(word, "others") == `0`) {
733	return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
734	}
735	return -`1`;
736	}
737
738	UColAttributeValue
739	CollationRuleParser::getOnOffValue(const UnicodeString &s) {
740	if(s == UNICODE_STRING_SIMPLE("on")) {
741	return UCOL_ON;
742	} else if(s == UNICODE_STRING_SIMPLE("off")) {
743	return UCOL_OFF;
744	} else {
745	return UCOL_DEFAULT;
746	}
747	}
748
749	int32_t
750	CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
751	// Collect a UnicodeSet pattern between a balanced pair of [brackets].
752	int32_t level = `0`;
753	int32_t j = i;
754	for(;;) {
755	if(j == rules->length()) {
756	setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
757	return j;
758	}
759	UChar c = rules->charAt(j++);
760	if(c == `0x5b`) { // '['
761	++level;
762	} else if(c == `0x5d`) { // ']'
763	if(--level == `0`) { break; }
764	}
765	}
766	set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
767	if(U_FAILURE(errorCode)) {
768	errorCode = U_ZERO_ERROR;
769	setParseError("not a valid UnicodeSet pattern", errorCode);
770	return j;
771	}
772	j = skipWhiteSpace(j);
773	if(j == rules->length() \|\| rules->charAt(j) != `0x5d`) {
774	setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
775	return j;
776	}
777	return ++j;
778	}
779
780	int32_t
781	CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
782	static const UChar sp = `0x20`;
783	raw.remove();
784	i = skipWhiteSpace(i);
785	for(;;) {
786	if(i >= rules->length()) { return `0`; }
787	UChar c = rules->charAt(i);
788	if(isSyntaxChar(c) && c != `0x2d` && c != `0x5f`) { // syntax except -_
789	if(raw.isEmpty()) { return i; }
790	if(raw.endsWith(&sp, `1`)) { // remove trailing space
791	raw.truncate(raw.length() - `1`);
792	}
793	return i;
794	}
795	if(PatternProps::isWhiteSpace(c)) {
796	raw.append(sp);
797	i = skipWhiteSpace(i + `1`);
798	} else {
799	raw.append(c);
800	++i;
801	}
802	}
803	}
804
805	int32_t
806	CollationRuleParser::skipComment(int32_t i) const {
807	// skip to past the newline
808	while(i < rules->length()) {
809	UChar c = rules->charAt(i++);
810	// LF or FF or CR or NEL or LS or PS
811	if(c == `0xa` \|\| c == `0xc` \|\| c == `0xd` \|\| c == `0x85` \|\| c == `0x2028` \|\| c == `0x2029`) {
812	// Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
813	// NLF (new line function) = CR or LF or CR+LF or NEL.
814	// No need to collect all of CR+LF because a following LF will be ignored anyway.
815	break;
816	}
817	}
818	return i;
819	}
820
821	void
822	CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
823	if(U_FAILURE(errorCode)) { return; }
824	// Error code consistent with the old parser (from ca. 2001),
825	// rather than U_PARSE_ERROR;
826	errorCode = U_INVALID_FORMAT_ERROR;
827	errorReason = reason;
828	if(parseError != NULL) { setErrorContext(); }
829	}
830
831	void
832	CollationRuleParser::setErrorContext() {
833	if(parseError == NULL) { return; }
834
835	// Note: This relies on the calling code maintaining the ruleIndex
836	// at a position that is useful for debugging.
837	// For example, at the beginning of a reset or relation etc.
838	parseError->offset = ruleIndex;
839	parseError->line = `0`; // We are not counting line numbers.
840
841	// before ruleIndex
842	int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - `1`);
843	if(start < `0`) {
844	start = `0`;
845	} else if(start > `0` && U16_IS_TRAIL(rules->charAt(start))) {
846	++start;
847	}
848	int32_t length = ruleIndex - start;
849	rules->extract(start, length, parseError->preContext);
850	parseError->preContext[length] = `0`;
851
852	// starting from ruleIndex
853	length = rules->length() - ruleIndex;
854	if(length >= U_PARSE_CONTEXT_LEN) {
855	length = U_PARSE_CONTEXT_LEN - `1`;
856	if(U16_IS_LEAD(rules->charAt(ruleIndex + length - `1`))) {
857	--length;
858	}
859	}
860	rules->extract(ruleIndex, length, parseError->postContext);
861	parseError->postContext[length] = `0`;
862	}
863
864	UBool
865	CollationRuleParser::isSyntaxChar(UChar32 c) {
866	return `0x21` <= c && c <= `0x7e` &&
867	(c <= `0x2f` \|\| (`0x3a` <= c && c <= `0x40`) \|\|
868	(`0x5b` <= c && c <= `0x60`) \|\| (`0x7b` <= c));
869	}
870
871	int32_t
872	CollationRuleParser::skipWhiteSpace(int32_t i) const {
873	while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
874	++i;
875	}
876	return i;
877	}
878
879	U_NAMESPACE_END
880
881	#endif // !UCONFIG_NO_COLLATION
882

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/collationruleparser.cpp