rbt_pars.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/rbt_pars.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2011, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/17/99 aliu Creation.
10	**********************************************************************
11	*/
12	#ifndef RBT_PARS_H
13	#define RBT_PARS_H
14
15	#include "unicode/utypes.h"
16
17	#if !UCONFIG_NO_TRANSLITERATION
18	#ifdef __cplusplus
19
20	#include "unicode/uobject.h"
21	#include "unicode/parseerr.h"
22	#include "unicode/unorm.h"
23	#include "rbt.h"
24	#include "hash.h"
25	#include "uvector.h"
26
27	U_NAMESPACE_BEGIN
28
29	class TransliterationRuleData;
30	class UnicodeFunctor;
31	class ParseData;
32	class RuleHalf;
33	class ParsePosition;
34	class StringMatcher;
35
36	class TransliteratorParser : public UMemory {
37
38	public:
39
40	/**
41	* A Vector of TransliterationRuleData objects, one for each discrete group
42	* of rules in the rule set
43	*/
44	UVector dataVector;
45
46	/**
47	* PUBLIC data member.
48	* A Vector of UnicodeStrings containing all of the ID blocks in the rule set
49	*/
50	UVector idBlockVector;
51
52	/**
53	* PUBLIC data member containing the parsed compound filter, if any.
54	*/
55	UnicodeSet* compoundFilter;
56
57	private:
58
59	/**
60	* The current data object for which we are parsing rules
61	*/
62	TransliterationRuleData* curData;
63
64	UTransDirection direction;
65
66	/**
67	* Parse error information.
68	*/
69	UParseError parseError;
70
71	/**
72	* Temporary symbol table used during parsing.
73	*/
74	ParseData* parseData;
75
76	/**
77	* Temporary vector of matcher variables. When parsing is complete, this
78	* is copied into the array data.variables. As with data.variables,
79	* element 0 corresponds to character data.variablesBase.
80	*/
81	UVector variablesVector;
82
83	/**
84	* Temporary table of variable names. When parsing is complete, this is
85	* copied into data.variableNames.
86	*/
87	Hashtable variableNames;
88
89	/**
90	* String of standins for segments. Used during the parsing of a single
91	* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
92	* to StringMatcher object segmentObjects.elementAt(0), etc.
93	*/
94	UnicodeString segmentStandins;
95
96	/**
97	* Vector of StringMatcher objects for segments. Used during the
98	* parsing of a single rule.
99	* segmentStandins.charAt(0) is the standin for "$1" and corresponds
100	* to StringMatcher object segmentObjects.elementAt(0), etc.
101	*/
102	UVector segmentObjects;
103
104	/**
105	* The next available stand-in for variables. This starts at some point in
106	* the private use area (discovered dynamically) and increments up toward
107	* <code>variableLimit</code>. At any point during parsing, available
108	* variables are <code>variableNext..variableLimit-1</code>.
109	*/
110	UChar variableNext;
111
112	/**
113	* The last available stand-in for variables. This is discovered
114	* dynamically. At any point during parsing, available variables are
115	* <code>variableNext..variableLimit-1</code>.
116	*/
117	UChar variableLimit;
118
119	/**
120	* When we encounter an undefined variable, we do not immediately signal
121	* an error, in case we are defining this variable, e.g., "$a = [a-z];".
122	* Instead, we save the name of the undefined variable, and substitute
123	* in the placeholder char variableLimit - 1, and decrement
124	* variableLimit.
125	*/
126	UnicodeString undefinedVariableName;
127
128	/**
129	* The stand-in character for the 'dot' set, represented by '.' in
130	* patterns. This is allocated the first time it is needed, and
131	* reused thereafter.
132	*/
133	UChar dotStandIn;
134
135	public:
136
137	/**
138	* Constructor.
139	*/
140	TransliteratorParser(UErrorCode &statusReturn);
141
142	/**
143	* Destructor.
144	*/
145	~TransliteratorParser();
146
147	/**
148	* Parse the given string as a sequence of rules, separated by newline
149	* characters ('\n'), and cause this object to implement those rules. Any
150	* previous rules are discarded. Typically this method is called exactly
151	* once after construction.
152	*
153	* Parse the given rules, in the given direction. After this call
154	* returns, query the public data members for results. The caller
155	* owns the 'data' and 'compoundFilter' data members after this
156	* call returns.
157	* @param rules rules, separated by ';'
158	* @param direction either FORWARD or REVERSE.
159	* @param pe Struct to recieve information on position
160	* of error if an error is encountered
161	* @param ec Output param set to success/failure code.
162	*/
163	void parse(const UnicodeString& rules,
164	UTransDirection direction,
165	UParseError& pe,
166	UErrorCode& ec);
167
168	/**
169	* Return the compound filter parsed by parse(). Caller owns result.
170	* @return the compound filter parsed by parse().
171	*/
172	UnicodeSet* orphanCompoundFilter();
173
174	private:
175
176	/**
177	* Return a representation of this transliterator as source rules.
178	* @param rules Output param to receive the rules.
179	* @param direction either FORWARD or REVERSE.
180	*/
181	void parseRules(const UnicodeString& rules,
182	UTransDirection direction,
183	UErrorCode& status);
184
185	/**
186	* MAIN PARSER. Parse the next rule in the given rule string, starting
187	* at pos. Return the index after the last character parsed. Do not
188	* parse characters at or after limit.
189	*
190	* Important: The character at pos must be a non-whitespace character
191	* that is not the comment character.
192	*
193	* This method handles quoting, escaping, and whitespace removal. It
194	* parses the end-of-rule character. It recognizes context and cursor
195	* indicators. Once it does a lexical breakdown of the rule at pos, it
196	* creates a rule object and adds it to our rule list.
197	* @param rules Output param to receive the rules.
198	* @param pos the starting position.
199	* @param limit pointer past the last character of the rule.
200	* @return the index after the last character parsed.
201	*/
202	int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
203
204	/**
205	* Set the variable range to [start, end] (inclusive).
206	* @param start the start value of the range.
207	* @param end the end value of the range.
208	*/
209	void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
210
211	/**
212	* Assert that the given character is NOT within the variable range.
213	* If it is, return FALSE. This is neccesary to ensure that the
214	* variable range does not overlap characters used in a rule.
215	* @param ch the given character.
216	* @return True, if the given character is NOT within the variable range.
217	*/
218	UBool checkVariableRange(UChar32 ch) const;
219
220	/**
221	* Set the maximum backup to 'backup', in response to a pragma
222	* statement.
223	* @param backup the new value to be set.
224	*/
225	void pragmaMaximumBackup(int32_t backup);
226
227	/**
228	* Begin normalizing all rules using the given mode, in response
229	* to a pragma statement.
230	* @param mode the given mode.
231	*/
232	void pragmaNormalizeRules(UNormalizationMode mode);
233
234	/**
235	* Return true if the given rule looks like a pragma.
236	* @param pos offset to the first non-whitespace character
237	* of the rule.
238	* @param limit pointer past the last character of the rule.
239	* @return true if the given rule looks like a pragma.
240	*/
241	static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
242
243	/**
244	* Parse a pragma. This method assumes resemblesPragma() has
245	* already returned true.
246	* @param pos offset to the first non-whitespace character
247	* of the rule.
248	* @param limit pointer past the last character of the rule.
249	* @return the position index after the final ';' of the pragma,
250	* or -1 on failure.
251	*/
252	int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
253
254	/**
255	* Called by main parser upon syntax error. Search the rule string
256	* for the probable end of the rule. Of course, if the error is that
257	* the end of rule marker is missing, then the rule end will not be found.
258	* In any case the rule start will be correctly reported.
259	* @param parseErrorCode error code.
260	* @param msg error description.
261	* @param start position of first character of current rule.
262	* @return start position of first character of current rule.
263	*/
264	int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
265	UErrorCode& status);
266
267	/**
268	* Parse a UnicodeSet out, store it, and return the stand-in character
269	* used to represent it.
270	*
271	* @param rule the rule for UnicodeSet.
272	* @param pos the position in pattern at which to start parsing.
273	* @return the stand-in character used to represent it.
274	*/
275	UChar parseSet(const UnicodeString& rule,
276	ParsePosition& pos,
277	UErrorCode& status);
278
279	/**
280	* Generate and return a stand-in for a new UnicodeFunctor. Store
281	* the matcher (adopt it).
282	* @param adopted the UnicodeFunctor to be adopted.
283	* @return a stand-in for a new UnicodeFunctor.
284	*/
285	UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
286
287	/**
288	* Return the standin for segment seg (1-based).
289	* @param seg the given segment.
290	* @return the standIn character for the given segment.
291	*/
292	UChar getSegmentStandin(int32_t seg, UErrorCode& status);
293
294	/**
295	* Set the object for segment seg (1-based).
296	* @param seg the given segment.
297	* @param adopted the StringMatcher to be adopted.
298	*/
299	void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
300
301	/**
302	* Return the stand-in for the dot set. It is allocated the first
303	* time and reused thereafter.
304	* @return the stand-in for the dot set.
305	*/
306	UChar getDotStandIn(UErrorCode& status);
307
308	/**
309	* Append the value of the given variable name to the given
310	* UnicodeString.
311	* @param name the variable name to be appended.
312	* @param buf the given UnicodeString to append to.
313	*/
314	void appendVariableDef(const UnicodeString& name,
315	UnicodeString& buf,
316	UErrorCode& status);
317
318	/**
319	* Glue method to get around access restrictions in C++.
320	*/
321	/static Transliterator* createBasicInstance(const UnicodeString& id,*
322	const UnicodeString canonID);/
323
324	friend class RuleHalf;
325
326	// Disallowed methods; no impl.
327	/**
328	* Copy constructor
329	*/
330	TransliteratorParser(const TransliteratorParser&);
331
332	/**
333	* Assignment operator
334	*/
335	TransliteratorParser& operator=(const TransliteratorParser&);
336	};
337
338	U_NAMESPACE_END
339
340	#endif /* #ifdef __cplusplus */
341
342	/**
343	* Strip/convert the following from the transliterator rules:
344	* comments
345	* newlines
346	* white space at the beginning and end of a line
347	* unescape \u notation
348	*
349	* The target must be equal in size as the source.
350	* @internal
351	*/
352	U_CAPI int32_t
353	utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorCode *status);
354
355	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
356
357	#endif
358

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/rbt_pars.h