1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2013-2014, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | * collationruleparser.h |
9 | * |
10 | * created on: 2013apr10 |
11 | * created by: Markus W. Scherer |
12 | */ |
13 | |
14 | #ifndef __COLLATIONRULEPARSER_H__ |
15 | #define __COLLATIONRULEPARSER_H__ |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_COLLATION |
20 | |
21 | #include "unicode/ucol.h" |
22 | #include "unicode/uniset.h" |
23 | #include "unicode/unistr.h" |
24 | |
25 | struct UParseError; |
26 | |
27 | U_NAMESPACE_BEGIN |
28 | |
29 | struct CollationData; |
30 | struct CollationTailoring; |
31 | |
32 | class Locale; |
33 | class Normalizer2; |
34 | |
35 | struct CollationSettings; |
36 | |
37 | class U_I18N_API CollationRuleParser : public UMemory { |
38 | public: |
39 | /** Special reset positions. */ |
40 | enum Position { |
41 | FIRST_TERTIARY_IGNORABLE, |
42 | LAST_TERTIARY_IGNORABLE, |
43 | FIRST_SECONDARY_IGNORABLE, |
44 | LAST_SECONDARY_IGNORABLE, |
45 | FIRST_PRIMARY_IGNORABLE, |
46 | LAST_PRIMARY_IGNORABLE, |
47 | FIRST_VARIABLE, |
48 | LAST_VARIABLE, |
49 | FIRST_REGULAR, |
50 | LAST_REGULAR, |
51 | FIRST_IMPLICIT, |
52 | LAST_IMPLICIT, |
53 | FIRST_TRAILING, |
54 | LAST_TRAILING |
55 | }; |
56 | |
57 | /** |
58 | * First character of contractions that encode special reset positions. |
59 | * U+FFFE cannot be tailored via rule syntax. |
60 | * |
61 | * The second contraction character is POS_BASE + Position. |
62 | */ |
63 | static const UChar POS_LEAD = 0xfffe; |
64 | /** |
65 | * Base for the second character of contractions that encode special reset positions. |
66 | * Braille characters U+28xx are printable and normalization-inert. |
67 | * @see POS_LEAD |
68 | */ |
69 | static const UChar POS_BASE = 0x2800; |
70 | |
71 | class U_I18N_API Sink : public UObject { |
72 | public: |
73 | virtual ~Sink(); |
74 | /** |
75 | * Adds a reset. |
76 | * strength=UCOL_IDENTICAL for &str. |
77 | * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. |
78 | */ |
79 | virtual void addReset(int32_t strength, const UnicodeString &str, |
80 | const char *&errorReason, UErrorCode &errorCode) = 0; |
81 | /** |
82 | * Adds a relation with strength and prefix | str / extension. |
83 | */ |
84 | virtual void addRelation(int32_t strength, const UnicodeString &prefix, |
85 | const UnicodeString &str, const UnicodeString &extension, |
86 | const char *&errorReason, UErrorCode &errorCode) = 0; |
87 | |
88 | virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, |
89 | UErrorCode &errorCode); |
90 | |
91 | virtual void optimize(const UnicodeSet &set, const char *&errorReason, |
92 | UErrorCode &errorCode); |
93 | }; |
94 | |
95 | class U_I18N_API Importer : public UObject { |
96 | public: |
97 | virtual ~Importer(); |
98 | virtual void getRules( |
99 | const char *localeID, const char *collationType, |
100 | UnicodeString &rules, |
101 | const char *&errorReason, UErrorCode &errorCode) = 0; |
102 | }; |
103 | |
104 | /** |
105 | * Constructor. |
106 | * The Sink must be set before parsing. |
107 | * The Importer can be set, otherwise [import locale] syntax is not supported. |
108 | */ |
109 | CollationRuleParser(const CollationData *base, UErrorCode &errorCode); |
110 | ~CollationRuleParser(); |
111 | |
112 | /** |
113 | * Sets the pointer to a Sink object. |
114 | * The pointer is aliased: Pointer copy without cloning or taking ownership. |
115 | */ |
116 | void setSink(Sink *sinkAlias) { |
117 | sink = sinkAlias; |
118 | } |
119 | |
120 | /** |
121 | * Sets the pointer to an Importer object. |
122 | * The pointer is aliased: Pointer copy without cloning or taking ownership. |
123 | */ |
124 | void setImporter(Importer *importerAlias) { |
125 | importer = importerAlias; |
126 | } |
127 | |
128 | void parse(const UnicodeString &ruleString, |
129 | CollationSettings &outSettings, |
130 | UParseError *outParseError, |
131 | UErrorCode &errorCode); |
132 | |
133 | const char *getErrorReason() const { return errorReason; } |
134 | |
135 | /** |
136 | * Gets a script or reorder code from its string representation. |
137 | * @return the script/reorder code, or |
138 | * -1 if not recognized |
139 | */ |
140 | static int32_t getReorderCode(const char *word); |
141 | |
142 | private: |
143 | /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ |
144 | static const int32_t STRENGTH_MASK = 0xf; |
145 | static const int32_t STARRED_FLAG = 0x10; |
146 | static const int32_t OFFSET_SHIFT = 8; |
147 | |
148 | void parse(const UnicodeString &ruleString, UErrorCode &errorCode); |
149 | void parseRuleChain(UErrorCode &errorCode); |
150 | int32_t parseResetAndPosition(UErrorCode &errorCode); |
151 | int32_t parseRelationOperator(UErrorCode &errorCode); |
152 | void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); |
153 | void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); |
154 | int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); |
155 | int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); |
156 | |
157 | /** |
158 | * Sets str to a contraction of U+FFFE and (U+2800 + Position). |
159 | * @return rule index after the special reset position |
160 | */ |
161 | int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); |
162 | void parseSetting(UErrorCode &errorCode); |
163 | void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); |
164 | static UColAttributeValue getOnOffValue(const UnicodeString &s); |
165 | |
166 | int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); |
167 | int32_t readWords(int32_t i, UnicodeString &raw) const; |
168 | int32_t (int32_t i) const; |
169 | |
170 | void setParseError(const char *reason, UErrorCode &errorCode); |
171 | void setErrorContext(); |
172 | |
173 | /** |
174 | * ASCII [:P:] and [:S:]: |
175 | * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] |
176 | */ |
177 | static UBool isSyntaxChar(UChar32 c); |
178 | int32_t skipWhiteSpace(int32_t i) const; |
179 | |
180 | const Normalizer2 &nfd, &nfc; |
181 | |
182 | const UnicodeString *rules; |
183 | const CollationData *const baseData; |
184 | CollationSettings *settings; |
185 | UParseError *parseError; |
186 | const char *errorReason; |
187 | |
188 | Sink *sink; |
189 | Importer *importer; |
190 | |
191 | int32_t ruleIndex; |
192 | }; |
193 | |
194 | U_NAMESPACE_END |
195 | |
196 | #endif // !UCONFIG_NO_COLLATION |
197 | #endif // __COLLATIONRULEPARSER_H__ |
198 | |