1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | * Copyright (C) 2001-2011, International Business Machines Corporation |
5 | * and others. All Rights Reserved. |
6 | ********************************************************************** |
7 | * Date Name Description |
8 | * 07/23/01 aliu Creation. |
9 | ********************************************************************** |
10 | */ |
11 | #ifndef STRMATCH_H |
12 | #define STRMATCH_H |
13 | |
14 | #include "unicode/utypes.h" |
15 | |
16 | #if !UCONFIG_NO_TRANSLITERATION |
17 | |
18 | #include "unicode/unistr.h" |
19 | #include "unicode/unifunct.h" |
20 | #include "unicode/unimatch.h" |
21 | #include "unicode/unirepl.h" |
22 | |
23 | U_NAMESPACE_BEGIN |
24 | |
25 | class TransliterationRuleData; /* Forward declaration: this header only uses the type through a pointer or a reference. */ |
26 | |
27 | /** |
28 | * An object that matches a fixed input string, implementing the |
29 | * UnicodeMatcher API. This object also implements the |
30 | * UnicodeReplacer API, allowing it to emit the matched text as |
31 | * output. Since the match text may contain flexible match elements, |
32 | * such as UnicodeSets, the emitted text is not the match pattern, but |
33 | * instead a substring of the actual matched text. Following |
34 | * convention, the output text is the leftmost match seen up to this |
35 | * point. |
36 | * |
37 | * A StringMatcher may represent a segment, in which case it has a |
38 | * positive segment number. This affects how the matcher converts |
39 | * itself to a pattern but does not otherwise affect its function. |
40 | * |
41 | * A StringMatcher that is not a segment should not be used as a |
42 | * UnicodeReplacer. |
43 | */ |
44 | class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { |
45 | |
46 | public: |
47 | |
48 | /** |
49 | * Construct a matcher that matches the given pattern string. |
50 | * @param string the pattern to be matched, possibly containing |
51 | * stand-ins that represent nested UnicodeMatcher objects. |
52 | * @param start inclusive start index of text to be replaced |
53 | * @param limit exclusive end index of text to be replaced; must be greater than or equal to start. |
54 | * NOTE(review): the start/limit wording repeats replace() below; presumably both index into 'string' -- confirm. |
55 | * @param segmentNum the segment number from 1..n, or 0 if this is |
56 | * not a segment. |
57 | * @param data context object mapping stand-ins to |
58 | * UnicodeMatcher objects. |
59 | */ |
60 | StringMatcher(const UnicodeString& string, |
61 | int32_t start, |
62 | int32_t limit, |
63 | int32_t segmentNum, |
64 | const TransliterationRuleData& data); |
65 | |
66 | /** |
67 | * Copy constructor |
68 | * @param o the object to be copied. |
69 | */ |
70 | StringMatcher(const StringMatcher& o); |
71 | |
72 | /** |
73 | * Destructor |
74 | */ |
75 | virtual ~StringMatcher(); |
76 | |
77 | /** |
78 | * Implement UnicodeFunctor: produce a copy of this matcher. |
79 | * @return a copy of the object, typed as StringMatcher*. |
80 | */ |
81 | virtual StringMatcher* clone() const; |
82 | |
83 | /** |
84 | * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer |
85 | * and return the pointer. |
86 | * @return the UnicodeMatcher pointer (non-const, even though this method is const). |
87 | */ |
88 | virtual UnicodeMatcher* toMatcher() const; |
89 | |
90 | /** |
91 | * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer |
92 | * and return the pointer. |
93 | * @return the UnicodeReplacer pointer (non-const, even though this method is const). |
94 | */ |
95 | virtual UnicodeReplacer* toReplacer() const; |
96 | |
97 | /** |
98 | * Implement UnicodeMatcher |
99 | * @param text the text to be matched |
100 | * @param offset on input, the index into text at which to begin |
101 | * matching. On output, the limit of the matched text. The |
102 | * number of matched characters is the output value of offset |
103 | * minus the input value. Offset should always point to the |
104 | * HIGH SURROGATE (leading code unit) of a pair of surrogates, |
105 | * both on entry and upon return. |
106 | * @param limit the limit index of text to be matched. Greater |
107 | * than offset for a forward direction match, less than offset for |
108 | * a backward direction match. The last character to be |
109 | * considered for matching will be text.charAt(limit-1) in the |
110 | * forward direction or text.charAt(limit+1) in the backward |
111 | * direction. |
112 | * @param incremental if TRUE, then assume further characters may |
113 | * be inserted at limit and check for partial matching. Otherwise |
114 | * assume the text as given is complete. |
115 | * @return a match degree value indicating a full match, a partial |
116 | * match, or a mismatch. If incremental is FALSE then |
117 | * U_PARTIAL_MATCH should never be returned. |
118 | */ |
119 | virtual UMatchDegree matches(const Replaceable& text, |
120 | int32_t& offset, |
121 | int32_t limit, |
122 | UBool incremental); |
123 | |
124 | /** |
125 | * Implement UnicodeMatcher |
126 | * @param result Output param to receive the pattern. |
127 | * @param escapeUnprintable if TRUE then escape the unprintable characters; defaults to FALSE. |
128 | * @return A reference to 'result'. |
129 | */ |
130 | virtual UnicodeString& toPattern(UnicodeString& result, |
131 | UBool escapeUnprintable = FALSE) const; |
132 | |
133 | /** |
134 | * Implement UnicodeMatcher |
135 | * Returns TRUE if this matcher will match a character c, where c |
136 | * & 0xFF == v, at offset, in the forward direction (with limit > |
137 | * offset). This is used by <tt>RuleBasedTransliterator</tt> for |
138 | * indexing. |
139 | * @param v the given value |
140 | * @return TRUE if this matcher will match a character c, |
141 | * where c & 0xFF == v |
142 | */ |
143 | virtual UBool matchesIndexValue(uint8_t v) const; |
144 | |
145 | /** |
146 | * Implement UnicodeMatcher. NOTE(review): presumably the matcher-side counterpart of addReplacementSetTo(), unioning the characters this object may match into toUnionTo -- confirm against UnicodeMatcher. |
147 | */ |
148 | virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; |
149 | |
150 | /** |
151 | * Implement UnicodeFunctor. NOTE(review): presumably updates the 'data' context pointer held by this object -- confirm against the implementation. |
152 | */ |
153 | virtual void setData(const TransliterationRuleData*); |
154 | |
155 | /** |
156 | * Replace characters in 'text' from 'start' to 'limit' with the |
157 | * output text of this object. Update the 'cursor' parameter to |
158 | * give the cursor position and return the length of the |
159 | * replacement text. |
160 | * |
161 | * @param text the text in which characters are replaced |
162 | * @param start inclusive start index of text to be replaced |
163 | * @param limit exclusive end index of text to be replaced; |
164 | * must be greater than or equal to start |
165 | * @param cursor output parameter for the cursor position. |
166 | * Not all replacer objects will update this, but in a complete |
167 | * tree of replacer objects, representing the entire output side |
168 | * of a transliteration rule, at least one must update it. |
169 | * @return the number of 16-bit code units in the text replacing |
170 | * the characters at offsets start..(limit-1) in text |
171 | */ |
172 | virtual int32_t replace(Replaceable& text, |
173 | int32_t start, |
174 | int32_t limit, |
175 | int32_t& cursor); |
176 | |
177 | /** |
178 | * Returns a string representation of this replacer. If the |
179 | * result of calling this function is passed to the appropriate |
180 | * parser, typically TransliteratorParser, it will produce another |
181 | * replacer that is equal to this one. |
182 | * @param result the string to receive the pattern. Previous |
183 | * contents will be deleted. |
184 | * @param escapeUnprintable if TRUE then convert unprintable |
185 | * characters to their hex escape representations, \\uxxxx or |
186 | * \\Uxxxxxxxx. Unprintable characters are defined by |
187 | * Utility.isUnprintable(). |
188 | * @return a reference to 'result'. |
189 | */ |
190 | virtual UnicodeString& toReplacerPattern(UnicodeString& result, |
191 | UBool escapeUnprintable) const; |
192 | |
193 | /** |
194 | * Remove any match data. This must be called before performing a |
195 | * set of matches with this segment. |
196 | */ |
197 | void resetMatch(); |
198 | |
199 | /** |
200 | * ICU "poor man's RTTI", returns a UClassID for the actual class. |
201 | */ |
202 | virtual UClassID getDynamicClassID() const; |
203 | |
204 | /** |
205 | * ICU "poor man's RTTI", returns a UClassID for this class. |
206 | */ |
207 | static UClassID U_EXPORT2 getStaticClassID(); |
208 | |
209 | /** |
210 | * Union the set of all characters that may be output by this object |
211 | * into the given set. |
212 | * @param toUnionTo the set into which to union the output characters |
213 | */ |
214 | virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; |
215 | |
216 | private: |
217 | |
218 | /** |
219 | * The text to be matched. |
220 | */ |
221 | UnicodeString pattern; |
222 | |
223 | /** |
224 | * Context object that maps stand-ins to matcher and replacer |
225 | * objects. |
226 | */ |
227 | const TransliterationRuleData* data; |
228 | |
229 | /** |
230 | * The segment number, 1-based, or 0 if not a segment. |
231 | */ |
232 | int32_t segmentNumber; |
233 | |
234 | /** |
235 | * Start offset, in the match text, of the <em>rightmost</em> match. |
236 | * NOTE(review): the class comment says the output is the "leftmost match" -- confirm which is intended. |
237 | */ |
238 | int32_t matchStart; |
239 | |
240 | /** |
241 | * Limit offset, in the match text, of the <em>rightmost</em> match. |
242 | * NOTE(review): the class comment says the output is the "leftmost match" -- confirm which is intended. |
243 | */ |
244 | int32_t matchLimit; |
245 | |
246 | }; |
247 | |
248 | U_NAMESPACE_END |
249 | |
250 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
251 | |
252 | #endif |
253 | |