// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationruleparser.h
*
* created on: 2013apr10
* created by: Markus W. Scherer
*/
13
14#ifndef __COLLATIONRULEPARSER_H__
15#define __COLLATIONRULEPARSER_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "unicode/ucol.h"
22#include "unicode/uniset.h"
23#include "unicode/unistr.h"
24
// Forward declaration at global scope (it precedes U_NAMESPACE_BEGIN).
// This header only ever uses UParseError through a pointer, so the full
// definition is not needed here.
struct UParseError;
26
27U_NAMESPACE_BEGIN
28
// Forward declarations: this header refers to these types only through
// pointers or references, so incomplete types are sufficient.
struct CollationData;       // used as `const CollationData *` (base data)
struct CollationTailoring;  // not referenced elsewhere in this header

class Locale;               // not referenced elsewhere in this header
class Normalizer2;          // used as `const Normalizer2 &` data members

struct CollationSettings;   // used as `CollationSettings &` / `CollationSettings *`
36
37class U_I18N_API CollationRuleParser : public UMemory {
38public:
39 /** Special reset positions. */
40 enum Position {
41 FIRST_TERTIARY_IGNORABLE,
42 LAST_TERTIARY_IGNORABLE,
43 FIRST_SECONDARY_IGNORABLE,
44 LAST_SECONDARY_IGNORABLE,
45 FIRST_PRIMARY_IGNORABLE,
46 LAST_PRIMARY_IGNORABLE,
47 FIRST_VARIABLE,
48 LAST_VARIABLE,
49 FIRST_REGULAR,
50 LAST_REGULAR,
51 FIRST_IMPLICIT,
52 LAST_IMPLICIT,
53 FIRST_TRAILING,
54 LAST_TRAILING
55 };
56
57 /**
58 * First character of contractions that encode special reset positions.
59 * U+FFFE cannot be tailored via rule syntax.
60 *
61 * The second contraction character is POS_BASE + Position.
62 */
63 static const UChar POS_LEAD = 0xfffe;
64 /**
65 * Base for the second character of contractions that encode special reset positions.
66 * Braille characters U+28xx are printable and normalization-inert.
67 * @see POS_LEAD
68 */
69 static const UChar POS_BASE = 0x2800;
70
71 class U_I18N_API Sink : public UObject {
72 public:
73 virtual ~Sink();
74 /**
75 * Adds a reset.
76 * strength=UCOL_IDENTICAL for &str.
77 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
78 */
79 virtual void addReset(int32_t strength, const UnicodeString &str,
80 const char *&errorReason, UErrorCode &errorCode) = 0;
81 /**
82 * Adds a relation with strength and prefix | str / extension.
83 */
84 virtual void addRelation(int32_t strength, const UnicodeString &prefix,
85 const UnicodeString &str, const UnicodeString &extension,
86 const char *&errorReason, UErrorCode &errorCode) = 0;
87
88 virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
89 UErrorCode &errorCode);
90
91 virtual void optimize(const UnicodeSet &set, const char *&errorReason,
92 UErrorCode &errorCode);
93 };
94
95 class U_I18N_API Importer : public UObject {
96 public:
97 virtual ~Importer();
98 virtual void getRules(
99 const char *localeID, const char *collationType,
100 UnicodeString &rules,
101 const char *&errorReason, UErrorCode &errorCode) = 0;
102 };
103
104 /**
105 * Constructor.
106 * The Sink must be set before parsing.
107 * The Importer can be set, otherwise [import locale] syntax is not supported.
108 */
109 CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
110 ~CollationRuleParser();
111
112 /**
113 * Sets the pointer to a Sink object.
114 * The pointer is aliased: Pointer copy without cloning or taking ownership.
115 */
116 void setSink(Sink *sinkAlias) {
117 sink = sinkAlias;
118 }
119
120 /**
121 * Sets the pointer to an Importer object.
122 * The pointer is aliased: Pointer copy without cloning or taking ownership.
123 */
124 void setImporter(Importer *importerAlias) {
125 importer = importerAlias;
126 }
127
128 void parse(const UnicodeString &ruleString,
129 CollationSettings &outSettings,
130 UParseError *outParseError,
131 UErrorCode &errorCode);
132
133 const char *getErrorReason() const { return errorReason; }
134
135 /**
136 * Gets a script or reorder code from its string representation.
137 * @return the script/reorder code, or
138 * -1 if not recognized
139 */
140 static int32_t getReorderCode(const char *word);
141
142private:
143 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
144 static const int32_t STRENGTH_MASK = 0xf;
145 static const int32_t STARRED_FLAG = 0x10;
146 static const int32_t OFFSET_SHIFT = 8;
147
148 void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
149 void parseRuleChain(UErrorCode &errorCode);
150 int32_t parseResetAndPosition(UErrorCode &errorCode);
151 int32_t parseRelationOperator(UErrorCode &errorCode);
152 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
153 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
154 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
155 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
156
157 /**
158 * Sets str to a contraction of U+FFFE and (U+2800 + Position).
159 * @return rule index after the special reset position
160 */
161 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
162 void parseSetting(UErrorCode &errorCode);
163 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
164 static UColAttributeValue getOnOffValue(const UnicodeString &s);
165
166 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
167 int32_t readWords(int32_t i, UnicodeString &raw) const;
168 int32_t skipComment(int32_t i) const;
169
170 void setParseError(const char *reason, UErrorCode &errorCode);
171 void setErrorContext();
172
173 /**
174 * ASCII [:P:] and [:S:]:
175 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
176 */
177 static UBool isSyntaxChar(UChar32 c);
178 int32_t skipWhiteSpace(int32_t i) const;
179
180 const Normalizer2 &nfd, &nfc;
181
182 const UnicodeString *rules;
183 const CollationData *const baseData;
184 CollationSettings *settings;
185 UParseError *parseError;
186 const char *errorReason;
187
188 Sink *sink;
189 Importer *importer;
190
191 int32_t ruleIndex;
192};
193
194U_NAMESPACE_END
195
196#endif // !UCONFIG_NO_COLLATION
197#endif // __COLLATIONRULEPARSER_H__
198