1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. |
5 | ********************************************************************** |
6 | * Date Name Description |
7 | * 11/17/99 aliu Creation. |
8 | ********************************************************************** |
9 | */ |
10 | #ifndef RBT_RULE_H |
11 | #define RBT_RULE_H |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_TRANSLITERATION |
16 | |
17 | #include "unicode/uobject.h" |
18 | #include "unicode/unistr.h" |
19 | #include "unicode/utrans.h" |
20 | #include "unicode/unimatch.h" |
21 | |
22 | U_NAMESPACE_BEGIN |
23 | |
24 | class Replaceable; |
25 | class TransliterationRuleData; |
26 | class StringMatcher; |
27 | class UnicodeFunctor; |
28 | |
29 | /** |
30 | * A transliteration rule used by |
31 | * <code>RuleBasedTransliterator</code>. |
32 | * <code>TransliterationRule</code> is an immutable object (the one declared mutator is setData(), which changes the rule data object the rule belongs to). |
33 | * |
34 | * <p>A rule consists of an input pattern and an output string. When |
35 | * the input pattern is matched, the output string is emitted. The |
36 | * input pattern consists of zero or more characters which are matched |
37 | * exactly (the key) and optional context. Context must match if it |
38 | * is specified. Context may be specified before the key, after the |
39 | * key, or both. The key, preceding context, and following context |
40 | * may contain variables. Variables represent a set of Unicode |
41 | * characters, such as the letters <i>a</i> through <i>z</i>. |
42 | * Variables are detected by looking up each character in a supplied |
43 | * variable list to see if it has been so defined. |
44 | * |
45 | * <p>A rule may contain segments in its input string and segment |
46 | * references in its output string. A segment is a substring of the |
47 | * input pattern, indicated by an offset and limit. The segment may |
48 | * be in the preceding or following context. It may not span a |
49 | * context boundary. A segment reference is a special character in |
50 | * the output string that causes a segment of the input string (not |
51 | * the input pattern) to be copied to the output string. The range of |
52 | * special characters that represent segment references is defined by |
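53 | * RuleBasedTransliterator.Data (in this C++ header the rule data type is presumably <code>TransliterationRuleData</code> -- confirm). |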
54 | * |
55 | * @author Alan Liu |
56 | */ |
57 | class TransliterationRule : public UMemory { |
58 | |
59 | private: |
60 | |
61 | // TODO Eliminate the pattern and keyLength data members. They |
62 | // are used only by masks() and getIndexValue() which are called |
63 | // only during build time, not during run-time. Perhaps these |
64 | // methods and pattern/keyLength can be isolated into a separate |
65 | // object. |
66 | |
67 | /** |
68 | * The matcher for the text that must occur before the key, or null |
69 | * if there is no preceding context. |
70 | */ |
71 | StringMatcher *anteContext; |
72 | |
73 | /** |
74 | * The matcher object for the key. If null, then the key is empty. |
75 | */ |
76 | StringMatcher *key; |
77 | |
78 | /** |
79 | * The matcher for the text that must occur after the key, or null |
80 | * if there is no following context. |
81 | */ |
82 | StringMatcher *postContext; |
83 | |
84 | /** |
85 | * The object that performs the replacement if the key, |
86 | * anteContext, and postContext are matched. Never null. |
87 | */ |
88 | UnicodeFunctor* output; |
89 | |
90 | /** |
91 | * The string that must be matched, consisting of the anteContext, key, |
92 | * and postContext, concatenated together, in that order. Some components |
93 | * may be empty (zero length). See the TODO at the top of this class: it is used only by masks() and getIndexValue(). |
94 | * @see anteContextLength |
95 | * @see keyLength |
96 | */ |
97 | UnicodeString pattern; |
98 | |
99 | /** |
100 | * An array of matcher objects corresponding to the input pattern |
101 | * segments. If there are no segments this is null. N.B. The elements |
102 | * are declared as UnicodeFunctor for generality, but in practice each is always a |
103 | * StringMatcher. In the future we may generalize this, but for |
104 | * now we sometimes cast down to StringMatcher. |
105 | * |
106 | * The array is owned, but the pointers within it are not. |
107 | */ |
108 | UnicodeFunctor** segments; |
109 | |
110 | /** |
111 | * The number of elements in segments[] or zero if segments is NULL. |
112 | */ |
113 | int32_t segmentsCount; |
114 | |
115 | /** |
116 | * The length of the string that must match before the key. If |
117 | * zero, then there is no matching requirement before the key. |
118 | * Substring [0,anteContextLength) of pattern is the anteContext. |
119 | */ |
120 | int32_t anteContextLength; |
121 | |
122 | /** |
123 | * The length of the key. Substring [anteContextLength, |
124 | * anteContextLength + keyLength) of pattern is the key. |
125 | * See the TODO at the top of this class: used only by masks() and getIndexValue(). |
126 | */ |
127 | int32_t keyLength; |
128 | |
129 | /** |
130 | * Miscellaneous attributes. NOTE(review): presumably a bitwise combination of the ANCHOR_START / ANCHOR_END flag values declared below -- confirm against the implementation. |
131 | */ |
132 | int8_t flags; |
133 | |
134 | /** |
135 | * Flag attributes. The two values are distinct single bits (1 and 2). |
136 | */ |
137 | enum { |
138 | ANCHOR_START = 1, |
139 | ANCHOR_END = 2 |
140 | }; |
141 | |
142 | /** |
143 | * An alias pointer to the data for this rule. The data provides |
144 | * lookup services for matchers and segments. It can be replaced with setData(). |
145 | */ |
146 | const TransliterationRuleData* data; |
147 | |
148 | public: |
149 | |
150 | /** |
151 | * Construct a new rule with the given input, output text, and other |
152 | * attributes. A cursor position may be specified for the output text. |
153 | * @param input input string, including key and optional ante and |
154 | * post context. |
155 | * @param anteContextPos offset into input to end of ante context, or -1 if |
156 | * none. Must be <= input.length() if not -1. |
157 | * @param postContextPos offset into input to start of post context, or -1 |
158 | * if none. Must be <= input.length() if not -1, and must be >= |
159 | * anteContextPos. |
160 | * @param outputStr output string. |
161 | * @param cursorPosition offset into outputStr at which cursor is located, or -1 if |
162 | * none. If less than zero, then the cursor is placed after the |
163 | * output string; that is, -1 is equivalent to |
164 | * <code>outputStr.length()</code>. If greater than |
165 | * <code>outputStr.length()</code> then this is an error (earlier wording: "an exception is thrown"; NOTE(review): presumably reported through status in this API -- confirm). |
166 | * @param cursorOffset an offset to be added to cursorPosition to position the |
167 | * cursor either in the ante context, if < 0, or in the post context, if > |
168 | * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to |
169 | * "xyz" and moves the cursor to before "a". It would have a cursorOffset |
170 | * of -3. |
171 | * @param segs array of UnicodeFunctor pointers (the matchers) corresponding to input pattern |
172 | * segments, or null if there are none. The array itself is adopted, |
173 | * but the pointers within it are not. |
174 | * @param segsCount number of elements in segs[]. |
175 | * @param anchorStart TRUE if the rule is anchored on the left to |
176 | * the context start. |
177 | * @param anchorEnd TRUE if the rule is anchored on the right to the |
178 | * context limit. |
179 | * @param data the rule data. |
180 | * @param status Output parameter filled in with success or failure status. |
181 | */ |
182 | TransliterationRule(const UnicodeString& input, |
183 | int32_t anteContextPos, int32_t postContextPos, |
184 | const UnicodeString& outputStr, |
185 | int32_t cursorPosition, int32_t cursorOffset, |
186 | UnicodeFunctor** segs, |
187 | int32_t segsCount, |
188 | UBool anchorStart, UBool anchorEnd, |
189 | const TransliterationRuleData* data, |
190 | UErrorCode& status); |
191 | |
192 | /** |
193 | * Copy constructor. Note that the argument is taken by non-const reference. |
194 | * @param other the object to be copied. |
195 | */ |
196 | TransliterationRule(TransliterationRule& other); |
197 | |
198 | /** |
199 | * Destructor. |
200 | */ |
201 | virtual ~TransliterationRule(); |
202 | |
203 | /** |
204 | * Change the data object that this rule belongs to. Used |
205 | * internally by the TransliterationRuleData copy constructor. |
206 | * @param data the new data value to be set. |
207 | */ |
208 | void setData(const TransliterationRuleData* data); |
209 | |
210 | /** |
211 | * Return the preceding context length. This method is needed to |
212 | * support the <code>Transliterator</code> method |
213 | * <code>getMaximumContextLength()</code>. Internally, this is |
214 | * implemented as the anteContextLength, optionally plus one if |
215 | * there is a start anchor. The one character anchor gap is |
216 | * needed to make repeated incremental transliteration with |
217 | * anchors work. |
218 | * @return the preceding context length. |
219 | */ |
220 | virtual int32_t getContextLength(void) const; |
221 | |
222 | /** |
223 | * Internal method. Returns the index value for this rule. |
224 | * This is the low byte of the first character of the key, |
225 | * unless the first character of the key is a set. If it's a |
226 | * set, or otherwise can match multiple keys, the index value is -1. |
227 | * @return the 8-bit index value for this rule, or -1; the int16_t return type can hold both. |
228 | */ |
229 | int16_t getIndexValue() const; |
230 | |
231 | /** |
232 | * Internal method. Returns true if this rule matches the given |
233 | * index value. The index value is an 8-bit integer, 0..255, |
234 | * representing the low byte of the first character of the key. |
235 | * It matches this rule if it matches the first character of the |
236 | * key, or if the first character of the key is a set, and the set |
237 | * contains any character with a low byte equal to the index |
238 | * value. If the rule contains only ante context, as in foo)>bar, |
239 | * then it will match any key. |
240 | * @param v the given index value. |
241 | * @return true if this rule matches the given index value. |
242 | */ |
243 | UBool matchesIndexValue(uint8_t v) const; |
244 | |
245 | /** |
246 | * Return true if this rule masks another rule. If r1 masks r2 then |
247 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks |
248 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". |
249 | * "[c]a>x" masks "[dc]a>y". |
250 | * @param r2 the given rule to be compared with. |
251 | * @return true if this rule masks 'r2' |
252 | */ |
253 | virtual UBool masks(const TransliterationRule& r2) const; |
254 | |
255 | /** |
256 | * Attempt a match and replacement at the given position. Return |
257 | * the degree of match between this rule and the given text. The |
258 | * degree of match may be mismatch, a partial match, or a full |
259 | * match. A mismatch means at least one character of the text |
260 | * does not match the context or key. A partial match means some |
261 | * context and key characters match, but the text is not long |
262 | * enough to match all of them. A full match means all context |
263 | * and key characters match. |
264 | * |
265 | * If a full match is obtained, perform a replacement, update pos, |
266 | * and return U_MATCH. Otherwise both text and pos are unchanged. |
267 | * |
268 | * @param text the text; modified only when a full match is obtained. |
269 | * @param pos the position indices; updated only when a full match is obtained. |
270 | * @param incremental if TRUE, test for partial matches that may |
271 | * be completed by additional text inserted at pos.limit. |
272 | * @return one of <code>U_MISMATCH</code>, |
273 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If |
274 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. |
275 | */ |
276 | UMatchDegree matchAndReplace(Replaceable& text, |
277 | UTransPosition& pos, |
278 | UBool incremental) const; |
279 | |
280 | /** |
281 | * Create a rule string that represents this rule object. Append |
282 | * it to the given string pat. NOTE(review): the returned reference is presumably pat, and escapeUnprintable presumably requests escaping of unprintable characters -- neither is visible in this header; confirm against the implementation. |
283 | */ |
284 | virtual UnicodeString& toRule(UnicodeString& pat, |
285 | UBool escapeUnprintable) const; |
286 | |
287 | /** |
288 | * Union the set of all characters that may be modified by this rule |
289 | * into the given set, toUnionTo. |
290 | */ |
291 | void addSourceSetTo(UnicodeSet& toUnionTo) const; |
292 | |
293 | /** |
294 | * Union the set of all characters that may be emitted by this rule |
295 | * into the given set, toUnionTo. |
296 | */ |
297 | void addTargetSetTo(UnicodeSet& toUnionTo) const; |
298 | |
299 | private: |
300 | |
301 | friend class StringMatcher; |
302 | |
303 | TransliterationRule &operator=(const TransliterationRule &other); // forbid copy assignment of this class (the copy constructor above is public) |
304 | }; |
305 | |
306 | U_NAMESPACE_END |
307 | |
308 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
309 | |
310 | #endif |
311 | |