1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 1999-2011, International Business Machines Corporation |
6 | * and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 11/17/99 aliu Creation. |
10 | ********************************************************************** |
11 | */ |
12 | #ifndef RBT_PARS_H |
13 | #define RBT_PARS_H |
14 | |
15 | #include "unicode/utypes.h" |
16 | |
17 | #if !UCONFIG_NO_TRANSLITERATION |
18 | #ifdef __cplusplus |
19 | |
20 | #include "unicode/uobject.h" |
21 | #include "unicode/parseerr.h" |
22 | #include "unicode/unorm.h" |
23 | #include "rbt.h" |
24 | #include "hash.h" |
25 | #include "uvector.h" |
26 | |
27 | U_NAMESPACE_BEGIN |
28 | |
29 | class TransliterationRuleData; |
30 | class UnicodeFunctor; |
31 | class ParseData; |
32 | class RuleHalf; |
33 | class ParsePosition; |
34 | class StringMatcher; |
35 | |
36 | class TransliteratorParser : public UMemory { |
37 | |
38 | public: |
39 | |
40 | /** |
41 | * A Vector of TransliterationRuleData objects, one for each discrete group |
42 | * of rules in the rule set |
43 | */ |
44 | UVector dataVector; |
45 | |
46 | /** |
47 | * PUBLIC data member. |
48 | * A Vector of UnicodeStrings containing all of the ID blocks in the rule set |
49 | */ |
50 | UVector idBlockVector; |
51 | |
52 | /** |
53 | * PUBLIC data member containing the parsed compound filter, if any. |
54 | */ |
55 | UnicodeSet* compoundFilter; |
56 | |
57 | private: |
58 | |
59 | /** |
60 | * The current data object for which we are parsing rules |
61 | */ |
62 | TransliterationRuleData* curData; |
63 | |
64 | UTransDirection direction; |
65 | |
66 | /** |
67 | * Parse error information. |
68 | */ |
69 | UParseError parseError; |
70 | |
71 | /** |
72 | * Temporary symbol table used during parsing. |
73 | */ |
74 | ParseData* parseData; |
75 | |
76 | /** |
77 | * Temporary vector of matcher variables. When parsing is complete, this |
78 | * is copied into the array data.variables. As with data.variables, |
79 | * element 0 corresponds to character data.variablesBase. |
80 | */ |
81 | UVector variablesVector; |
82 | |
83 | /** |
84 | * Temporary table of variable names. When parsing is complete, this is |
85 | * copied into data.variableNames. |
86 | */ |
87 | Hashtable variableNames; |
88 | |
89 | /** |
90 | * String of standins for segments. Used during the parsing of a single |
91 | * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds |
92 | * to StringMatcher object segmentObjects.elementAt(0), etc. |
93 | */ |
94 | UnicodeString segmentStandins; |
95 | |
96 | /** |
97 | * Vector of StringMatcher objects for segments. Used during the |
98 | * parsing of a single rule. |
99 | * segmentStandins.charAt(0) is the standin for "$1" and corresponds |
100 | * to StringMatcher object segmentObjects.elementAt(0), etc. |
101 | */ |
102 | UVector segmentObjects; |
103 | |
104 | /** |
105 | * The next available stand-in for variables. This starts at some point in |
106 | * the private use area (discovered dynamically) and increments up toward |
107 | * <code>variableLimit</code>. At any point during parsing, available |
108 | * variables are <code>variableNext..variableLimit-1</code>. |
109 | */ |
110 | UChar variableNext; |
111 | |
112 | /** |
113 | * The last available stand-in for variables. This is discovered |
114 | * dynamically. At any point during parsing, available variables are |
115 | * <code>variableNext..variableLimit-1</code>. |
116 | */ |
117 | UChar variableLimit; |
118 | |
119 | /** |
120 | * When we encounter an undefined variable, we do not immediately signal |
121 | * an error, in case we are defining this variable, e.g., "$a = [a-z];". |
122 | * Instead, we save the name of the undefined variable, and substitute |
123 | * in the placeholder char variableLimit - 1, and decrement |
124 | * variableLimit. |
125 | */ |
126 | UnicodeString undefinedVariableName; |
127 | |
128 | /** |
129 | * The stand-in character for the 'dot' set, represented by '.' in |
130 | * patterns. This is allocated the first time it is needed, and |
131 | * reused thereafter. |
132 | */ |
133 | UChar dotStandIn; |
134 | |
135 | public: |
136 | |
137 | /** |
138 | * Constructor. |
139 | */ |
140 | TransliteratorParser(UErrorCode &statusReturn); |
141 | |
142 | /** |
143 | * Destructor. |
144 | */ |
145 | ~TransliteratorParser(); |
146 | |
147 | /** |
148 | * Parse the given string as a sequence of rules, separated by newline |
149 | * characters ('\n'), and cause this object to implement those rules. Any |
150 | * previous rules are discarded. Typically this method is called exactly |
151 | * once after construction. |
152 | * |
153 | * Parse the given rules, in the given direction. After this call |
154 | * returns, query the public data members for results. The caller |
155 | * owns the 'data' and 'compoundFilter' data members after this |
156 | * call returns. |
157 | * @param rules rules, separated by ';' |
158 | * @param direction either FORWARD or REVERSE. |
159 | * @param pe Struct to recieve information on position |
160 | * of error if an error is encountered |
161 | * @param ec Output param set to success/failure code. |
162 | */ |
163 | void parse(const UnicodeString& rules, |
164 | UTransDirection direction, |
165 | UParseError& pe, |
166 | UErrorCode& ec); |
167 | |
168 | /** |
169 | * Return the compound filter parsed by parse(). Caller owns result. |
170 | * @return the compound filter parsed by parse(). |
171 | */ |
172 | UnicodeSet* orphanCompoundFilter(); |
173 | |
174 | private: |
175 | |
176 | /** |
177 | * Return a representation of this transliterator as source rules. |
178 | * @param rules Output param to receive the rules. |
179 | * @param direction either FORWARD or REVERSE. |
180 | */ |
181 | void parseRules(const UnicodeString& rules, |
182 | UTransDirection direction, |
183 | UErrorCode& status); |
184 | |
185 | /** |
186 | * MAIN PARSER. Parse the next rule in the given rule string, starting |
187 | * at pos. Return the index after the last character parsed. Do not |
188 | * parse characters at or after limit. |
189 | * |
190 | * Important: The character at pos must be a non-whitespace character |
191 | * that is not the comment character. |
192 | * |
193 | * This method handles quoting, escaping, and whitespace removal. It |
194 | * parses the end-of-rule character. It recognizes context and cursor |
195 | * indicators. Once it does a lexical breakdown of the rule at pos, it |
196 | * creates a rule object and adds it to our rule list. |
197 | * @param rules Output param to receive the rules. |
198 | * @param pos the starting position. |
199 | * @param limit pointer past the last character of the rule. |
200 | * @return the index after the last character parsed. |
201 | */ |
202 | int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
203 | |
204 | /** |
205 | * Set the variable range to [start, end] (inclusive). |
206 | * @param start the start value of the range. |
207 | * @param end the end value of the range. |
208 | */ |
209 | void setVariableRange(int32_t start, int32_t end, UErrorCode& status); |
210 | |
211 | /** |
212 | * Assert that the given character is NOT within the variable range. |
213 | * If it is, return FALSE. This is neccesary to ensure that the |
214 | * variable range does not overlap characters used in a rule. |
215 | * @param ch the given character. |
216 | * @return True, if the given character is NOT within the variable range. |
217 | */ |
218 | UBool checkVariableRange(UChar32 ch) const; |
219 | |
220 | /** |
221 | * Set the maximum backup to 'backup', in response to a pragma |
222 | * statement. |
223 | * @param backup the new value to be set. |
224 | */ |
225 | void pragmaMaximumBackup(int32_t backup); |
226 | |
227 | /** |
228 | * Begin normalizing all rules using the given mode, in response |
229 | * to a pragma statement. |
230 | * @param mode the given mode. |
231 | */ |
232 | void pragmaNormalizeRules(UNormalizationMode mode); |
233 | |
234 | /** |
235 | * Return true if the given rule looks like a pragma. |
236 | * @param pos offset to the first non-whitespace character |
237 | * of the rule. |
238 | * @param limit pointer past the last character of the rule. |
239 | * @return true if the given rule looks like a pragma. |
240 | */ |
241 | static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); |
242 | |
243 | /** |
244 | * Parse a pragma. This method assumes resemblesPragma() has |
245 | * already returned true. |
246 | * @param pos offset to the first non-whitespace character |
247 | * of the rule. |
248 | * @param limit pointer past the last character of the rule. |
249 | * @return the position index after the final ';' of the pragma, |
250 | * or -1 on failure. |
251 | */ |
252 | int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
253 | |
254 | /** |
255 | * Called by main parser upon syntax error. Search the rule string |
256 | * for the probable end of the rule. Of course, if the error is that |
257 | * the end of rule marker is missing, then the rule end will not be found. |
258 | * In any case the rule start will be correctly reported. |
259 | * @param parseErrorCode error code. |
260 | * @param msg error description. |
261 | * @param start position of first character of current rule. |
262 | * @return start position of first character of current rule. |
263 | */ |
264 | int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, |
265 | UErrorCode& status); |
266 | |
267 | /** |
268 | * Parse a UnicodeSet out, store it, and return the stand-in character |
269 | * used to represent it. |
270 | * |
271 | * @param rule the rule for UnicodeSet. |
272 | * @param pos the position in pattern at which to start parsing. |
273 | * @return the stand-in character used to represent it. |
274 | */ |
275 | UChar parseSet(const UnicodeString& rule, |
276 | ParsePosition& pos, |
277 | UErrorCode& status); |
278 | |
279 | /** |
280 | * Generate and return a stand-in for a new UnicodeFunctor. Store |
281 | * the matcher (adopt it). |
282 | * @param adopted the UnicodeFunctor to be adopted. |
283 | * @return a stand-in for a new UnicodeFunctor. |
284 | */ |
285 | UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); |
286 | |
287 | /** |
288 | * Return the standin for segment seg (1-based). |
289 | * @param seg the given segment. |
290 | * @return the standIn character for the given segment. |
291 | */ |
292 | UChar getSegmentStandin(int32_t seg, UErrorCode& status); |
293 | |
294 | /** |
295 | * Set the object for segment seg (1-based). |
296 | * @param seg the given segment. |
297 | * @param adopted the StringMatcher to be adopted. |
298 | */ |
299 | void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); |
300 | |
301 | /** |
302 | * Return the stand-in for the dot set. It is allocated the first |
303 | * time and reused thereafter. |
304 | * @return the stand-in for the dot set. |
305 | */ |
306 | UChar getDotStandIn(UErrorCode& status); |
307 | |
308 | /** |
309 | * Append the value of the given variable name to the given |
310 | * UnicodeString. |
311 | * @param name the variable name to be appended. |
312 | * @param buf the given UnicodeString to append to. |
313 | */ |
314 | void appendVariableDef(const UnicodeString& name, |
315 | UnicodeString& buf, |
316 | UErrorCode& status); |
317 | |
318 | /** |
319 | * Glue method to get around access restrictions in C++. |
320 | */ |
321 | /*static Transliterator* createBasicInstance(const UnicodeString& id, |
322 | const UnicodeString* canonID);*/ |
323 | |
324 | friend class RuleHalf; |
325 | |
326 | // Disallowed methods; no impl. |
327 | /** |
328 | * Copy constructor |
329 | */ |
330 | TransliteratorParser(const TransliteratorParser&); |
331 | |
332 | /** |
333 | * Assignment operator |
334 | */ |
335 | TransliteratorParser& operator=(const TransliteratorParser&); |
336 | }; |
337 | |
338 | U_NAMESPACE_END |
339 | |
340 | #endif /* #ifdef __cplusplus */ |
341 | |
342 | /** |
343 | * Strip/convert the following from the transliterator rules: |
344 | * comments |
345 | * newlines |
346 | * white space at the beginning and end of a line |
347 | * unescape \u notation |
348 | * |
349 | * The target must be equal in size as the source. |
350 | * @internal |
351 | */ |
352 | U_CAPI int32_t |
353 | utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); |
354 | |
355 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
356 | |
357 | #endif |
358 | |