1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (c) 2002-2012, International Business Machines Corporation |
6 | * and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 01/21/2002 aliu Creation. |
10 | ********************************************************************** |
11 | */ |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_TRANSLITERATION |
16 | |
17 | #include "unicode/uniset.h" |
18 | #include "unicode/utf16.h" |
19 | #include "strrepl.h" |
20 | #include "rbt_data.h" |
21 | #include "util.h" |
22 | |
23 | U_NAMESPACE_BEGIN |
24 | |
// Out-of-line definition of the UnicodeReplacer destructor; the body is empty.
UnicodeReplacer::~UnicodeReplacer() {}
// RTTI boilerplate for StringReplacer, supplied by the
// UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
27 | |
28 | /** |
29 | * Construct a StringReplacer that sets the emits the given output |
30 | * text and sets the cursor to the given position. |
31 | * @param theOutput text that will replace input text when the |
32 | * replace() method is called. May contain stand-in characters |
33 | * that represent nested replacers. |
34 | * @param theCursorPos cursor position that will be returned by |
35 | * the replace() method |
36 | * @param theData transliterator context object that translates |
37 | * stand-in characters to UnicodeReplacer objects |
38 | */ |
39 | StringReplacer::StringReplacer(const UnicodeString& theOutput, |
40 | int32_t theCursorPos, |
41 | const TransliterationRuleData* theData) { |
42 | output = theOutput; |
43 | cursorPos = theCursorPos; |
44 | hasCursor = TRUE; |
45 | data = theData; |
46 | isComplex = TRUE; |
47 | } |
48 | |
49 | /** |
50 | * Construct a StringReplacer that sets the emits the given output |
51 | * text and does not modify the cursor. |
52 | * @param theOutput text that will replace input text when the |
53 | * replace() method is called. May contain stand-in characters |
54 | * that represent nested replacers. |
55 | * @param theData transliterator context object that translates |
56 | * stand-in characters to UnicodeReplacer objects |
57 | */ |
58 | StringReplacer::StringReplacer(const UnicodeString& theOutput, |
59 | const TransliterationRuleData* theData) { |
60 | output = theOutput; |
61 | cursorPos = 0; |
62 | hasCursor = FALSE; |
63 | data = theData; |
64 | isComplex = TRUE; |
65 | } |
66 | |
67 | /** |
68 | * Copy constructor. |
69 | */ |
70 | StringReplacer::StringReplacer(const StringReplacer& other) : |
71 | UnicodeFunctor(other), |
72 | UnicodeReplacer(other) |
73 | { |
74 | output = other.output; |
75 | cursorPos = other.cursorPos; |
76 | hasCursor = other.hasCursor; |
77 | data = other.data; |
78 | isComplex = other.isComplex; |
79 | } |
80 | |
/**
 * Destructor.  The body is empty: nothing is released here, and in
 * particular the object pointed to by 'data' is not deleted.
 */
StringReplacer::~StringReplacer() {
}
86 | |
87 | /** |
88 | * Implement UnicodeFunctor |
89 | */ |
90 | StringReplacer* StringReplacer::clone() const { |
91 | return new StringReplacer(*this); |
92 | } |
93 | |
94 | /** |
95 | * Implement UnicodeFunctor |
96 | */ |
97 | UnicodeReplacer* StringReplacer::toReplacer() const { |
98 | return const_cast<StringReplacer *>(this); |
99 | } |
100 | |
101 | /** |
102 | * UnicodeReplacer API |
103 | */ |
104 | int32_t StringReplacer::replace(Replaceable& text, |
105 | int32_t start, |
106 | int32_t limit, |
107 | int32_t& cursor) { |
108 | int32_t outLen; |
109 | int32_t newStart = 0; |
110 | |
111 | // NOTE: It should be possible to _always_ run the complex |
112 | // processing code; just slower. If not, then there is a bug |
113 | // in the complex processing code. |
114 | |
115 | // Simple (no nested replacers) Processing Code : |
116 | if (!isComplex) { |
117 | text.handleReplaceBetween(start, limit, output); |
118 | outLen = output.length(); |
119 | |
120 | // Setup default cursor position (for cursorPos within output) |
121 | newStart = cursorPos; |
122 | } |
123 | |
124 | // Complex (nested replacers) Processing Code : |
125 | else { |
126 | /* When there are segments to be copied, use the Replaceable.copy() |
127 | * API in order to retain out-of-band data. Copy everything to the |
128 | * end of the string, then copy them back over the key. This preserves |
129 | * the integrity of indices into the key and surrounding context while |
130 | * generating the output text. |
131 | */ |
132 | UnicodeString buf; |
133 | int32_t oOutput; // offset into 'output' |
134 | isComplex = FALSE; |
135 | |
136 | // The temporary buffer starts at tempStart, and extends |
137 | // to destLimit. The start of the buffer has a single |
138 | // character from before the key. This provides style |
139 | // data when addition characters are filled into the |
140 | // temporary buffer. If there is nothing to the left, use |
141 | // the non-character U+FFFF, which Replaceable subclasses |
142 | // should treat specially as a "no-style character." |
143 | // destStart points to the point after the style context |
144 | // character, so it is tempStart+1 or tempStart+2. |
145 | int32_t tempStart = text.length(); // start of temp buffer |
146 | int32_t destStart = tempStart; // copy new text to here |
147 | if (start > 0) { |
148 | int32_t len = U16_LENGTH(text.char32At(start-1)); |
149 | text.copy(start-len, start, tempStart); |
150 | destStart += len; |
151 | } else { |
152 | UnicodeString str((UChar) 0xFFFF); |
153 | text.handleReplaceBetween(tempStart, tempStart, str); |
154 | destStart++; |
155 | } |
156 | int32_t destLimit = destStart; |
157 | |
158 | for (oOutput=0; oOutput<output.length(); ) { |
159 | if (oOutput == cursorPos) { |
160 | // Record the position of the cursor |
161 | newStart = destLimit - destStart; // relative to start |
162 | } |
163 | UChar32 c = output.char32At(oOutput); |
164 | UnicodeReplacer* r = data->lookupReplacer(c); |
165 | if (r == NULL) { |
166 | // Accumulate straight (non-segment) text. |
167 | buf.append(c); |
168 | } else { |
169 | isComplex = TRUE; |
170 | |
171 | // Insert any accumulated straight text. |
172 | if (buf.length() > 0) { |
173 | text.handleReplaceBetween(destLimit, destLimit, buf); |
174 | destLimit += buf.length(); |
175 | buf.truncate(0); |
176 | } |
177 | |
178 | // Delegate output generation to replacer object |
179 | int32_t len = r->replace(text, destLimit, destLimit, cursor); |
180 | destLimit += len; |
181 | } |
182 | oOutput += U16_LENGTH(c); |
183 | } |
184 | // Insert any accumulated straight text. |
185 | if (buf.length() > 0) { |
186 | text.handleReplaceBetween(destLimit, destLimit, buf); |
187 | destLimit += buf.length(); |
188 | } |
189 | if (oOutput == cursorPos) { |
190 | // Record the position of the cursor |
191 | newStart = destLimit - destStart; // relative to start |
192 | } |
193 | |
194 | outLen = destLimit - destStart; |
195 | |
196 | // Copy new text to start, and delete it |
197 | text.copy(destStart, destLimit, start); |
198 | text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); |
199 | |
200 | // Delete the old text (the key) |
201 | text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); |
202 | } |
203 | |
204 | if (hasCursor) { |
205 | // Adjust the cursor for positions outside the key. These |
206 | // refer to code points rather than code units. If cursorPos |
207 | // is within the output string, then use newStart, which has |
208 | // already been set above. |
209 | if (cursorPos < 0) { |
210 | newStart = start; |
211 | int32_t n = cursorPos; |
212 | // Outside the output string, cursorPos counts code points |
213 | while (n < 0 && newStart > 0) { |
214 | newStart -= U16_LENGTH(text.char32At(newStart-1)); |
215 | ++n; |
216 | } |
217 | newStart += n; |
218 | } else if (cursorPos > output.length()) { |
219 | newStart = start + outLen; |
220 | int32_t n = cursorPos - output.length(); |
221 | // Outside the output string, cursorPos counts code points |
222 | while (n > 0 && newStart < text.length()) { |
223 | newStart += U16_LENGTH(text.char32At(newStart)); |
224 | --n; |
225 | } |
226 | newStart += n; |
227 | } else { |
228 | // Cursor is within output string. It has been set up above |
229 | // to be relative to start. |
230 | newStart += start; |
231 | } |
232 | |
233 | cursor = newStart; |
234 | } |
235 | |
236 | return outLen; |
237 | } |
238 | |
239 | /** |
240 | * UnicodeReplacer API |
241 | */ |
242 | UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, |
243 | UBool escapeUnprintable) const { |
244 | rule.truncate(0); |
245 | UnicodeString quoteBuf; |
246 | |
247 | int32_t cursor = cursorPos; |
248 | |
249 | // Handle a cursor preceding the output |
250 | if (hasCursor && cursor < 0) { |
251 | while (cursor++ < 0) { |
252 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
253 | } |
254 | // Fall through and append '|' below |
255 | } |
256 | |
257 | for (int32_t i=0; i<output.length(); ++i) { |
258 | if (hasCursor && i == cursor) { |
259 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
260 | } |
261 | UChar c = output.charAt(i); // Ok to use 16-bits here |
262 | |
263 | UnicodeReplacer* r = data->lookupReplacer(c); |
264 | if (r == NULL) { |
265 | ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); |
266 | } else { |
267 | UnicodeString buf; |
268 | r->toReplacerPattern(buf, escapeUnprintable); |
269 | buf.insert(0, (UChar)0x20); |
270 | buf.append((UChar)0x20); |
271 | ICU_Utility::appendToRule(rule, buf, |
272 | TRUE, escapeUnprintable, quoteBuf); |
273 | } |
274 | } |
275 | |
276 | // Handle a cursor after the output. Use > rather than >= because |
277 | // if cursor == output.length() it is at the end of the output, |
278 | // which is the default position, so we need not emit it. |
279 | if (hasCursor && cursor > output.length()) { |
280 | cursor -= output.length(); |
281 | while (cursor-- > 0) { |
282 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
283 | } |
284 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
285 | } |
286 | // Flush quoteBuf out to result |
287 | ICU_Utility::appendToRule(rule, -1, |
288 | TRUE, escapeUnprintable, quoteBuf); |
289 | |
290 | return rule; |
291 | } |
292 | |
293 | /** |
294 | * Implement UnicodeReplacer |
295 | */ |
296 | void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { |
297 | UChar32 ch; |
298 | for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { |
299 | ch = output.char32At(i); |
300 | UnicodeReplacer* r = data->lookupReplacer(ch); |
301 | if (r == NULL) { |
302 | toUnionTo.add(ch); |
303 | } else { |
304 | r->addReplacementSetTo(toUnionTo); |
305 | } |
306 | } |
307 | } |
308 | |
309 | /** |
310 | * UnicodeFunctor API |
311 | */ |
312 | void StringReplacer::setData(const TransliterationRuleData* d) { |
313 | data = d; |
314 | int32_t i = 0; |
315 | while (i<output.length()) { |
316 | UChar32 c = output.char32At(i); |
317 | UnicodeFunctor* f = data->lookup(c); |
318 | if (f != NULL) { |
319 | f->setData(data); |
320 | } |
321 | i += U16_LENGTH(c); |
322 | } |
323 | } |
324 | |
325 | U_NAMESPACE_END |
326 | |
327 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
328 | |
329 | //eof |
330 | |