rbt_rule.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/rbt_rule.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
5	**********************************************************************
6	* Date Name Description
7	* 11/17/99 aliu Creation.
8	**********************************************************************
9	*/
10	#ifndef RBT_RULE_H
11	#define RBT_RULE_H
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/uobject.h"
18	#include "unicode/unistr.h"
19	#include "unicode/utrans.h"
20	#include "unicode/unimatch.h"
21
22	U_NAMESPACE_BEGIN
23
24	class Replaceable;
25	class TransliterationRuleData;
26	class StringMatcher;
27	class UnicodeFunctor;
28
29	/**
30	* A transliteration rule used by
31	* <code>RuleBasedTransliterator</code>.
32	* <code>TransliterationRule</code> is an immutable object.
33	*
34	* <p>A rule consists of an input pattern and an output string. When
35	* the input pattern is matched, the output string is emitted. The
36	* input pattern consists of zero or more characters which are matched
37	* exactly (the key) and optional context. Context must match if it
38	* is specified. Context may be specified before the key, after the
39	* key, or both. The key, preceding context, and following context
40	* may contain variables. Variables represent a set of Unicode
41	* characters, such as the letters <i>a</i> through <i>z</i>.
42	* Variables are detected by looking up each character in a supplied
43	* variable list to see if it has been so defined.
44	*
45	* <p>A rule may contain segments in its input string and segment
46	* references in its output string. A segment is a substring of the
47	* input pattern, indicated by an offset and limit. The segment may
48	* be in the preceding or following context. It may not span a
49	* context boundary. A segment reference is a special character in
50	* the output string that causes a segment of the input string (not
51	* the input pattern) to be copied to the output string. The range of
52	* special characters that represent segment references is defined by
53	* RuleBasedTransliterator.Data.
54	*
55	* @author Alan Liu
56	*/
57	class TransliterationRule : public UMemory {
58
59	private:
60
61	// TODO Eliminate the pattern and keyLength data members. They
62	// are used only by masks() and getIndexValue() which are called
63	// only during build time, not during run-time. Perhaps these
64	// methods and pattern/keyLength can be isolated into a separate
65	// object.
66
67	/**
68	* The match that must occur before the key, or null if there is no
69	* preceding context.
70	*/
71	StringMatcher *anteContext;
72
73	/**
74	* The matcher object for the key. If null, then the key is empty.
75	*/
76	StringMatcher *key;
77
78	/**
79	* The match that must occur after the key, or null if there is no
80	* following context.
81	*/
82	StringMatcher *postContext;
83
84	/**
85	* The object that performs the replacement if the key,
86	* anteContext, and postContext are matched. Never null.
87	*/
88	UnicodeFunctor* output;
89
90	/**
91	* The string that must be matched, consisting of the anteContext, key,
92	* and postContext, concatenated together, in that order. Some components
93	* may be empty (zero length).
94	* @see anteContextLength
95	* @see keyLength
96	*/
97	UnicodeString pattern;
98
99	/**
100	* An array of matcher objects corresponding to the input pattern
101	* segments. If there are no segments this is null. N.B. This is
102	* a UnicodeMatcher for generality, but in practice it is always a
103	* StringMatcher. In the future we may generalize this, but for
104	* now we sometimes cast down to StringMatcher.
105	*
106	* The array is owned, but the pointers within it are not.
107	*/
108	UnicodeFunctor** segments;
109
110	/**
111	* The number of elements in segments[] or zero if segments is NULL.
112	*/
113	int32_t segmentsCount;
114
115	/**
116	* The length of the string that must match before the key. If
117	* zero, then there is no matching requirement before the key.
118	* Substring [0,anteContextLength) of pattern is the anteContext.
119	*/
120	int32_t anteContextLength;
121
122	/**
123	* The length of the key. Substring [anteContextLength,
124	* anteContextLength + keyLength) is the key.
125
126	*/
127	int32_t keyLength;
128
129	/**
130	* Miscellaneous attributes.
131	*/
132	int8_t flags;
133
134	/**
135	* Flag attributes.
136	*/
137	enum {
138	ANCHOR_START = `1`,
139	ANCHOR_END = `2`
140	};
141
142	/**
143	* An alias pointer to the data for this rule. The data provides
144	* lookup services for matchers and segments.
145	*/
146	const TransliterationRuleData* data;
147
148	public:
149
150	/**
151	* Construct a new rule with the given input, output text, and other
152	* attributes. A cursor position may be specified for the output text.
153	* @param input input string, including key and optional ante and
154	* post context.
155	* @param anteContextPos offset into input to end of ante context, or -1 if
156	* none. Must be <= input.length() if not -1.
157	* @param postContextPos offset into input to start of post context, or -1
158	* if none. Must be <= input.length() if not -1, and must be >=
159	* anteContextPos.
160	* @param outputStr output string.
161	* @param cursorPosition offset into output at which cursor is located, or -1 if
162	* none. If less than zero, then the cursor is placed after the
163	* <code>output</code>; that is, -1 is equivalent to
164	* <code>output.length()</code>. If greater than
165	* <code>output.length()</code> then an exception is thrown.
166	* @param cursorOffset an offset to be added to cursorPos to position the
167	* cursor either in the ante context, if < 0, or in the post context, if >
168	* 0. For example, the rule "abc{def} > \| @@@ xyz;" changes "def" to
169	* "xyz" and moves the cursor to before "a". It would have a cursorOffset
170	* of -3.
171	* @param segs array of UnicodeMatcher corresponding to input pattern
172	* segments, or null if there are none. The array itself is adopted,
173	* but the pointers within it are not.
174	* @param segsCount number of elements in segs[].
175	* @param anchorStart TRUE if the the rule is anchored on the left to
176	* the context start.
177	* @param anchorEnd TRUE if the rule is anchored on the right to the
178	* context limit.
179	* @param data the rule data.
180	* @param status Output parameter filled in with success or failure status.
181	*/
182	TransliterationRule(const UnicodeString& input,
183	int32_t anteContextPos, int32_t postContextPos,
184	const UnicodeString& outputStr,
185	int32_t cursorPosition, int32_t cursorOffset,
186	UnicodeFunctor** segs,
187	int32_t segsCount,
188	UBool anchorStart, UBool anchorEnd,
189	const TransliterationRuleData* data,
190	UErrorCode& status);
191
192	/**
193	* Copy constructor.
194	* @param other the object to be copied.
195	*/
196	TransliterationRule(TransliterationRule& other);
197
198	/**
199	* Destructor.
200	*/
201	virtual ~TransliterationRule();
202
203	/**
204	* Change the data object that this rule belongs to. Used
205	* internally by the TransliterationRuleData copy constructor.
206	* @param data the new data value to be set.
207	*/
208	void setData(const TransliterationRuleData* data);
209
210	/**
211	* Return the preceding context length. This method is needed to
212	* support the <code>Transliterator</code> method
213	* <code>getMaximumContextLength()</code>. Internally, this is
214	* implemented as the anteContextLength, optionally plus one if
215	* there is a start anchor. The one character anchor gap is
216	* needed to make repeated incremental transliteration with
217	* anchors work.
218	* @return the preceding context length.
219	*/
220	virtual int32_t getContextLength(void) const;
221
222	/**
223	* Internal method. Returns 8-bit index value for this rule.
224	* This is the low byte of the first character of the key,
225	* unless the first character of the key is a set. If it's a
226	* set, or otherwise can match multiple keys, the index value is -1.
227	* @return 8-bit index value for this rule.
228	*/
229	int16_t getIndexValue() const;
230
231	/**
232	* Internal method. Returns true if this rule matches the given
233	* index value. The index value is an 8-bit integer, 0..255,
234	* representing the low byte of the first character of the key.
235	* It matches this rule if it matches the first character of the
236	* key, or if the first character of the key is a set, and the set
237	* contains any character with a low byte equal to the index
238	* value. If the rule contains only ante context, as in foo)>bar,
239	* then it will match any key.
240	* @param v the given index value.
241	* @return true if this rule matches the given index value.
242	*/
243	UBool matchesIndexValue(uint8_t v) const;
244
245	/**
246	* Return true if this rule masks another rule. If r1 masks r2 then
247	* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
248	* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
249	* "[c]a>x" masks "[dc]a>y".
250	* @param r2 the given rule to be compared with.
251	* @return true if this rule masks 'r2'
252	*/
253	virtual UBool masks(const TransliterationRule& r2) const;
254
255	/**
256	* Attempt a match and replacement at the given position. Return
257	* the degree of match between this rule and the given text. The
258	* degree of match may be mismatch, a partial match, or a full
259	* match. A mismatch means at least one character of the text
260	* does not match the context or key. A partial match means some
261	* context and key characters match, but the text is not long
262	* enough to match all of them. A full match means all context
263	* and key characters match.
264	*
265	* If a full match is obtained, perform a replacement, update pos,
266	* and return U_MATCH. Otherwise both text and pos are unchanged.
267	*
268	* @param text the text
269	* @param pos the position indices
270	* @param incremental if TRUE, test for partial matches that may
271	* be completed by additional text inserted at pos.limit.
272	* @return one of <code>U_MISMATCH</code>,
273	* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
274	* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
275	*/
276	UMatchDegree matchAndReplace(Replaceable& text,
277	UTransPosition& pos,
278	UBool incremental) const;
279
280	/**
281	* Create a rule string that represents this rule object. Append
282	* it to the given string.
283	*/
284	virtual UnicodeString& toRule(UnicodeString& pat,
285	UBool escapeUnprintable) const;
286
287	/**
288	* Union the set of all characters that may be modified by this rule
289	* into the given set.
290	*/
291	void addSourceSetTo(UnicodeSet& toUnionTo) const;
292
293	/**
294	* Union the set of all characters that may be emitted by this rule
295	* into the given set.
296	*/
297	void addTargetSetTo(UnicodeSet& toUnionTo) const;
298
299	private:
300
301	friend class StringMatcher;
302
303	TransliterationRule &operator=(const TransliterationRule &other); // forbid copying of this class
304	};
305
306	U_NAMESPACE_END
307
308	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
309
310	#endif
311

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/rbt_rule.h