| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ********************************************************************** | 
|---|
| 5 | *   Copyright (c) 2002-2012, International Business Machines Corporation | 
|---|
| 6 | *   and others.  All Rights Reserved. | 
|---|
| 7 | ********************************************************************** | 
|---|
| 8 | *   Date        Name        Description | 
|---|
| 9 | *   01/21/2002  aliu        Creation. | 
|---|
| 10 | ********************************************************************** | 
|---|
| 11 | */ | 
|---|
| 12 |  | 
|---|
| 13 | #include "unicode/utypes.h" | 
|---|
| 14 |  | 
|---|
| 15 | #if !UCONFIG_NO_TRANSLITERATION | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/uniset.h" | 
|---|
| 18 | #include "unicode/utf16.h" | 
|---|
| 19 | #include "strrepl.h" | 
|---|
| 20 | #include "rbt_data.h" | 
|---|
| 21 | #include "util.h" | 
|---|
| 22 |  | 
|---|
| 23 | U_NAMESPACE_BEGIN | 
|---|
| 24 |  | 
|---|
| 25 | UnicodeReplacer::~UnicodeReplacer() {} | 
|---|
| 26 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) | 
|---|
| 27 |  | 
|---|
| 28 | /** | 
|---|
| 29 | * Construct a StringReplacer that sets the emits the given output | 
|---|
| 30 | * text and sets the cursor to the given position. | 
|---|
| 31 | * @param theOutput text that will replace input text when the | 
|---|
| 32 | * replace() method is called.  May contain stand-in characters | 
|---|
| 33 | * that represent nested replacers. | 
|---|
| 34 | * @param theCursorPos cursor position that will be returned by | 
|---|
| 35 | * the replace() method | 
|---|
| 36 | * @param theData transliterator context object that translates | 
|---|
| 37 | * stand-in characters to UnicodeReplacer objects | 
|---|
| 38 | */ | 
|---|
| 39 | StringReplacer::StringReplacer(const UnicodeString& theOutput, | 
|---|
| 40 | int32_t theCursorPos, | 
|---|
| 41 | const TransliterationRuleData* theData) { | 
|---|
| 42 | output = theOutput; | 
|---|
| 43 | cursorPos = theCursorPos; | 
|---|
| 44 | hasCursor = TRUE; | 
|---|
| 45 | data = theData; | 
|---|
| 46 | isComplex = TRUE; | 
|---|
| 47 | } | 
|---|
| 48 |  | 
|---|
| 49 | /** | 
|---|
| 50 | * Construct a StringReplacer that sets the emits the given output | 
|---|
| 51 | * text and does not modify the cursor. | 
|---|
| 52 | * @param theOutput text that will replace input text when the | 
|---|
| 53 | * replace() method is called.  May contain stand-in characters | 
|---|
| 54 | * that represent nested replacers. | 
|---|
| 55 | * @param theData transliterator context object that translates | 
|---|
| 56 | * stand-in characters to UnicodeReplacer objects | 
|---|
| 57 | */ | 
|---|
| 58 | StringReplacer::StringReplacer(const UnicodeString& theOutput, | 
|---|
| 59 | const TransliterationRuleData* theData) { | 
|---|
| 60 | output = theOutput; | 
|---|
| 61 | cursorPos = 0; | 
|---|
| 62 | hasCursor = FALSE; | 
|---|
| 63 | data = theData; | 
|---|
| 64 | isComplex = TRUE; | 
|---|
| 65 | } | 
|---|
| 66 |  | 
|---|
| 67 | /** | 
|---|
| 68 | * Copy constructor. | 
|---|
| 69 | */ | 
|---|
| 70 | StringReplacer::StringReplacer(const StringReplacer& other) : | 
|---|
| 71 | UnicodeFunctor(other), | 
|---|
| 72 | UnicodeReplacer(other) | 
|---|
| 73 | { | 
|---|
| 74 | output = other.output; | 
|---|
| 75 | cursorPos = other.cursorPos; | 
|---|
| 76 | hasCursor = other.hasCursor; | 
|---|
| 77 | data = other.data; | 
|---|
| 78 | isComplex = other.isComplex; | 
|---|
| 79 | } | 
|---|
| 80 |  | 
|---|
| 81 | /** | 
|---|
| 82 | * Destructor | 
|---|
| 83 | */ | 
|---|
| 84 | StringReplacer::~StringReplacer() { | 
|---|
| 85 | } | 
|---|
| 86 |  | 
|---|
| 87 | /** | 
|---|
| 88 | * Implement UnicodeFunctor | 
|---|
| 89 | */ | 
|---|
| 90 | StringReplacer* StringReplacer::clone() const { | 
|---|
| 91 | return new StringReplacer(*this); | 
|---|
| 92 | } | 
|---|
| 93 |  | 
|---|
| 94 | /** | 
|---|
| 95 | * Implement UnicodeFunctor | 
|---|
| 96 | */ | 
|---|
| 97 | UnicodeReplacer* StringReplacer::toReplacer() const { | 
|---|
| 98 | return const_cast<StringReplacer *>(this); | 
|---|
| 99 | } | 
|---|
| 100 |  | 
|---|
| 101 | /** | 
|---|
| 102 | * UnicodeReplacer API | 
|---|
| 103 | */ | 
|---|
| 104 | int32_t StringReplacer::replace(Replaceable& text, | 
|---|
| 105 | int32_t start, | 
|---|
| 106 | int32_t limit, | 
|---|
| 107 | int32_t& cursor) { | 
|---|
| 108 | int32_t outLen; | 
|---|
| 109 | int32_t newStart = 0; | 
|---|
| 110 |  | 
|---|
| 111 | // NOTE: It should be possible to _always_ run the complex | 
|---|
| 112 | // processing code; just slower.  If not, then there is a bug | 
|---|
| 113 | // in the complex processing code. | 
|---|
| 114 |  | 
|---|
| 115 | // Simple (no nested replacers) Processing Code : | 
|---|
| 116 | if (!isComplex) { | 
|---|
| 117 | text.handleReplaceBetween(start, limit, output); | 
|---|
| 118 | outLen = output.length(); | 
|---|
| 119 |  | 
|---|
| 120 | // Setup default cursor position (for cursorPos within output) | 
|---|
| 121 | newStart = cursorPos; | 
|---|
| 122 | } | 
|---|
| 123 |  | 
|---|
| 124 | // Complex (nested replacers) Processing Code : | 
|---|
| 125 | else { | 
|---|
| 126 | /* When there are segments to be copied, use the Replaceable.copy() | 
|---|
| 127 | * API in order to retain out-of-band data.  Copy everything to the | 
|---|
| 128 | * end of the string, then copy them back over the key.  This preserves | 
|---|
| 129 | * the integrity of indices into the key and surrounding context while | 
|---|
| 130 | * generating the output text. | 
|---|
| 131 | */ | 
|---|
| 132 | UnicodeString buf; | 
|---|
| 133 | int32_t oOutput; // offset into 'output' | 
|---|
| 134 | isComplex = FALSE; | 
|---|
| 135 |  | 
|---|
| 136 | // The temporary buffer starts at tempStart, and extends | 
|---|
| 137 | // to destLimit.  The start of the buffer has a single | 
|---|
| 138 | // character from before the key.  This provides style | 
|---|
| 139 | // data when addition characters are filled into the | 
|---|
| 140 | // temporary buffer.  If there is nothing to the left, use | 
|---|
| 141 | // the non-character U+FFFF, which Replaceable subclasses | 
|---|
| 142 | // should treat specially as a "no-style character." | 
|---|
| 143 | // destStart points to the point after the style context | 
|---|
| 144 | // character, so it is tempStart+1 or tempStart+2. | 
|---|
| 145 | int32_t tempStart = text.length(); // start of temp buffer | 
|---|
| 146 | int32_t destStart = tempStart; // copy new text to here | 
|---|
| 147 | if (start > 0) { | 
|---|
| 148 | int32_t len = U16_LENGTH(text.char32At(start-1)); | 
|---|
| 149 | text.copy(start-len, start, tempStart); | 
|---|
| 150 | destStart += len; | 
|---|
| 151 | } else { | 
|---|
| 152 | UnicodeString str((UChar) 0xFFFF); | 
|---|
| 153 | text.handleReplaceBetween(tempStart, tempStart, str); | 
|---|
| 154 | destStart++; | 
|---|
| 155 | } | 
|---|
| 156 | int32_t destLimit = destStart; | 
|---|
| 157 |  | 
|---|
| 158 | for (oOutput=0; oOutput<output.length(); ) { | 
|---|
| 159 | if (oOutput == cursorPos) { | 
|---|
| 160 | // Record the position of the cursor | 
|---|
| 161 | newStart = destLimit - destStart; // relative to start | 
|---|
| 162 | } | 
|---|
| 163 | UChar32 c = output.char32At(oOutput); | 
|---|
| 164 | UnicodeReplacer* r = data->lookupReplacer(c); | 
|---|
| 165 | if (r == NULL) { | 
|---|
| 166 | // Accumulate straight (non-segment) text. | 
|---|
| 167 | buf.append(c); | 
|---|
| 168 | } else { | 
|---|
| 169 | isComplex = TRUE; | 
|---|
| 170 |  | 
|---|
| 171 | // Insert any accumulated straight text. | 
|---|
| 172 | if (buf.length() > 0) { | 
|---|
| 173 | text.handleReplaceBetween(destLimit, destLimit, buf); | 
|---|
| 174 | destLimit += buf.length(); | 
|---|
| 175 | buf.truncate(0); | 
|---|
| 176 | } | 
|---|
| 177 |  | 
|---|
| 178 | // Delegate output generation to replacer object | 
|---|
| 179 | int32_t len = r->replace(text, destLimit, destLimit, cursor); | 
|---|
| 180 | destLimit += len; | 
|---|
| 181 | } | 
|---|
| 182 | oOutput += U16_LENGTH(c); | 
|---|
| 183 | } | 
|---|
| 184 | // Insert any accumulated straight text. | 
|---|
| 185 | if (buf.length() > 0) { | 
|---|
| 186 | text.handleReplaceBetween(destLimit, destLimit, buf); | 
|---|
| 187 | destLimit += buf.length(); | 
|---|
| 188 | } | 
|---|
| 189 | if (oOutput == cursorPos) { | 
|---|
| 190 | // Record the position of the cursor | 
|---|
| 191 | newStart = destLimit - destStart; // relative to start | 
|---|
| 192 | } | 
|---|
| 193 |  | 
|---|
| 194 | outLen = destLimit - destStart; | 
|---|
| 195 |  | 
|---|
| 196 | // Copy new text to start, and delete it | 
|---|
| 197 | text.copy(destStart, destLimit, start); | 
|---|
| 198 | text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); | 
|---|
| 199 |  | 
|---|
| 200 | // Delete the old text (the key) | 
|---|
| 201 | text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); | 
|---|
| 202 | } | 
|---|
| 203 |  | 
|---|
| 204 | if (hasCursor) { | 
|---|
| 205 | // Adjust the cursor for positions outside the key.  These | 
|---|
| 206 | // refer to code points rather than code units.  If cursorPos | 
|---|
| 207 | // is within the output string, then use newStart, which has | 
|---|
| 208 | // already been set above. | 
|---|
| 209 | if (cursorPos < 0) { | 
|---|
| 210 | newStart = start; | 
|---|
| 211 | int32_t n = cursorPos; | 
|---|
| 212 | // Outside the output string, cursorPos counts code points | 
|---|
| 213 | while (n < 0 && newStart > 0) { | 
|---|
| 214 | newStart -= U16_LENGTH(text.char32At(newStart-1)); | 
|---|
| 215 | ++n; | 
|---|
| 216 | } | 
|---|
| 217 | newStart += n; | 
|---|
| 218 | } else if (cursorPos > output.length()) { | 
|---|
| 219 | newStart = start + outLen; | 
|---|
| 220 | int32_t n = cursorPos - output.length(); | 
|---|
| 221 | // Outside the output string, cursorPos counts code points | 
|---|
| 222 | while (n > 0 && newStart < text.length()) { | 
|---|
| 223 | newStart += U16_LENGTH(text.char32At(newStart)); | 
|---|
| 224 | --n; | 
|---|
| 225 | } | 
|---|
| 226 | newStart += n; | 
|---|
| 227 | } else { | 
|---|
| 228 | // Cursor is within output string.  It has been set up above | 
|---|
| 229 | // to be relative to start. | 
|---|
| 230 | newStart += start; | 
|---|
| 231 | } | 
|---|
| 232 |  | 
|---|
| 233 | cursor = newStart; | 
|---|
| 234 | } | 
|---|
| 235 |  | 
|---|
| 236 | return outLen; | 
|---|
| 237 | } | 
|---|
| 238 |  | 
|---|
| 239 | /** | 
|---|
| 240 | * UnicodeReplacer API | 
|---|
| 241 | */ | 
|---|
| 242 | UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, | 
|---|
| 243 | UBool escapeUnprintable) const { | 
|---|
| 244 | rule.truncate(0); | 
|---|
| 245 | UnicodeString quoteBuf; | 
|---|
| 246 |  | 
|---|
| 247 | int32_t cursor = cursorPos; | 
|---|
| 248 |  | 
|---|
| 249 | // Handle a cursor preceding the output | 
|---|
| 250 | if (hasCursor && cursor < 0) { | 
|---|
| 251 | while (cursor++ < 0) { | 
|---|
| 252 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); | 
|---|
| 253 | } | 
|---|
| 254 | // Fall through and append '|' below | 
|---|
| 255 | } | 
|---|
| 256 |  | 
|---|
| 257 | for (int32_t i=0; i<output.length(); ++i) { | 
|---|
| 258 | if (hasCursor && i == cursor) { | 
|---|
| 259 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); | 
|---|
| 260 | } | 
|---|
| 261 | UChar c = output.charAt(i); // Ok to use 16-bits here | 
|---|
| 262 |  | 
|---|
| 263 | UnicodeReplacer* r = data->lookupReplacer(c); | 
|---|
| 264 | if (r == NULL) { | 
|---|
| 265 | ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); | 
|---|
| 266 | } else { | 
|---|
| 267 | UnicodeString buf; | 
|---|
| 268 | r->toReplacerPattern(buf, escapeUnprintable); | 
|---|
| 269 | buf.insert(0, (UChar)0x20); | 
|---|
| 270 | buf.append((UChar)0x20); | 
|---|
| 271 | ICU_Utility::appendToRule(rule, buf, | 
|---|
| 272 | TRUE, escapeUnprintable, quoteBuf); | 
|---|
| 273 | } | 
|---|
| 274 | } | 
|---|
| 275 |  | 
|---|
| 276 | // Handle a cursor after the output.  Use > rather than >= because | 
|---|
| 277 | // if cursor == output.length() it is at the end of the output, | 
|---|
| 278 | // which is the default position, so we need not emit it. | 
|---|
| 279 | if (hasCursor && cursor > output.length()) { | 
|---|
| 280 | cursor -= output.length(); | 
|---|
| 281 | while (cursor-- > 0) { | 
|---|
| 282 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); | 
|---|
| 283 | } | 
|---|
| 284 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); | 
|---|
| 285 | } | 
|---|
| 286 | // Flush quoteBuf out to result | 
|---|
| 287 | ICU_Utility::appendToRule(rule, -1, | 
|---|
| 288 | TRUE, escapeUnprintable, quoteBuf); | 
|---|
| 289 |  | 
|---|
| 290 | return rule; | 
|---|
| 291 | } | 
|---|
| 292 |  | 
|---|
| 293 | /** | 
|---|
| 294 | * Implement UnicodeReplacer | 
|---|
| 295 | */ | 
|---|
| 296 | void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { | 
|---|
| 297 | UChar32 ch; | 
|---|
| 298 | for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { | 
|---|
| 299 | ch = output.char32At(i); | 
|---|
| 300 | UnicodeReplacer* r = data->lookupReplacer(ch); | 
|---|
| 301 | if (r == NULL) { | 
|---|
| 302 | toUnionTo.add(ch); | 
|---|
| 303 | } else { | 
|---|
| 304 | r->addReplacementSetTo(toUnionTo); | 
|---|
| 305 | } | 
|---|
| 306 | } | 
|---|
| 307 | } | 
|---|
| 308 |  | 
|---|
| 309 | /** | 
|---|
| 310 | * UnicodeFunctor API | 
|---|
| 311 | */ | 
|---|
| 312 | void StringReplacer::setData(const TransliterationRuleData* d) { | 
|---|
| 313 | data = d; | 
|---|
| 314 | int32_t i = 0; | 
|---|
| 315 | while (i<output.length()) { | 
|---|
| 316 | UChar32 c = output.char32At(i); | 
|---|
| 317 | UnicodeFunctor* f = data->lookup(c); | 
|---|
| 318 | if (f != NULL) { | 
|---|
| 319 | f->setData(data); | 
|---|
| 320 | } | 
|---|
| 321 | i += U16_LENGTH(c); | 
|---|
| 322 | } | 
|---|
| 323 | } | 
|---|
| 324 |  | 
|---|
| 325 | U_NAMESPACE_END | 
|---|
| 326 |  | 
|---|
| 327 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | 
|---|
| 328 |  | 
|---|
| 329 | //eof | 
|---|
| 330 |  | 
|---|