| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ********************************************************************** |
| 5 | * Copyright (C) 1999-2015, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ********************************************************************** |
| 8 | * Date Name Description |
| 9 | * 11/17/99 aliu Creation. |
| 10 | ********************************************************************** |
| 11 | */ |
| 12 | |
| 13 | #include "unicode/utypes.h" |
| 14 | |
| 15 | #if !UCONFIG_NO_TRANSLITERATION |
| 16 | |
| 17 | #include "unicode/rep.h" |
| 18 | #include "unicode/uniset.h" |
| 19 | #include "rbt_pars.h" |
| 20 | #include "rbt_data.h" |
| 21 | #include "rbt_rule.h" |
| 22 | #include "rbt.h" |
| 23 | #include "mutex.h" |
| 24 | #include "umutex.h" |
| 25 | |
| 26 | U_NAMESPACE_BEGIN |
| 27 | |
| 28 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) |
| 29 | |
| 30 | static Replaceable *gLockedText = NULL; |
| 31 | |
| 32 | void RuleBasedTransliterator::_construct(const UnicodeString& rules, |
| 33 | UTransDirection direction, |
| 34 | UParseError& parseError, |
| 35 | UErrorCode& status) { |
| 36 | fData = 0; |
| 37 | isDataOwned = TRUE; |
| 38 | if (U_FAILURE(status)) { |
| 39 | return; |
| 40 | } |
| 41 | |
| 42 | TransliteratorParser parser(status); |
| 43 | parser.parse(rules, direction, parseError, status); |
| 44 | if (U_FAILURE(status)) { |
| 45 | return; |
| 46 | } |
| 47 | |
| 48 | if (parser.idBlockVector.size() != 0 || |
| 49 | parser.compoundFilter != NULL || |
| 50 | parser.dataVector.size() == 0) { |
| 51 | status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT |
| 52 | return; |
| 53 | } |
| 54 | |
| 55 | fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
| 56 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
| 57 | } |
| 58 | |
| 59 | /** |
| 60 | * Constructs a new transliterator from the given rules. |
| 61 | * @param id the id for the transliterator. |
| 62 | * @param rules rules, separated by ';' |
| 63 | * @param direction either FORWARD or REVERSE. |
| 64 | * @param adoptedFilter the filter for this transliterator. |
| 65 | * @param parseError Struct to recieve information on position |
| 66 | * of error if an error is encountered |
| 67 | * @param status Output param set to success/failure code. |
| 68 | * @exception IllegalArgumentException if rules are malformed |
| 69 | * or direction is invalid. |
| 70 | */ |
| 71 | RuleBasedTransliterator::RuleBasedTransliterator( |
| 72 | const UnicodeString& id, |
| 73 | const UnicodeString& rules, |
| 74 | UTransDirection direction, |
| 75 | UnicodeFilter* adoptedFilter, |
| 76 | UParseError& parseError, |
| 77 | UErrorCode& status) : |
| 78 | Transliterator(id, adoptedFilter) { |
| 79 | _construct(rules, direction,parseError,status); |
| 80 | } |
| 81 | |
| 82 | /** |
| 83 | * Constructs a new transliterator from the given rules. |
| 84 | * @param id the id for the transliterator. |
| 85 | * @param rules rules, separated by ';' |
| 86 | * @param direction either FORWARD or REVERSE. |
| 87 | * @param adoptedFilter the filter for this transliterator. |
| 88 | * @param status Output param set to success/failure code. |
| 89 | * @exception IllegalArgumentException if rules are malformed |
| 90 | * or direction is invalid. |
| 91 | */ |
| 92 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
| 93 | const UnicodeString& id, |
| 94 | const UnicodeString& rules, |
| 95 | UTransDirection direction, |
| 96 | UnicodeFilter* adoptedFilter, |
| 97 | UErrorCode& status) : |
| 98 | Transliterator(id, adoptedFilter) { |
| 99 | UParseError parseError; |
| 100 | _construct(rules, direction,parseError, status); |
| 101 | }*/ |
| 102 | |
| 103 | /** |
| 104 | * Convenience constructor with no filter. |
| 105 | */ |
| 106 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
| 107 | const UnicodeString& id, |
| 108 | const UnicodeString& rules, |
| 109 | UTransDirection direction, |
| 110 | UErrorCode& status) : |
| 111 | Transliterator(id, 0) { |
| 112 | UParseError parseError; |
| 113 | _construct(rules, direction,parseError, status); |
| 114 | }*/ |
| 115 | |
| 116 | /** |
| 117 | * Convenience constructor with no filter and FORWARD direction. |
| 118 | */ |
| 119 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
| 120 | const UnicodeString& id, |
| 121 | const UnicodeString& rules, |
| 122 | UErrorCode& status) : |
| 123 | Transliterator(id, 0) { |
| 124 | UParseError parseError; |
| 125 | _construct(rules, UTRANS_FORWARD, parseError, status); |
| 126 | }*/ |
| 127 | |
| 128 | /** |
| 129 | * Convenience constructor with FORWARD direction. |
| 130 | */ |
| 131 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
| 132 | const UnicodeString& id, |
| 133 | const UnicodeString& rules, |
| 134 | UnicodeFilter* adoptedFilter, |
| 135 | UErrorCode& status) : |
| 136 | Transliterator(id, adoptedFilter) { |
| 137 | UParseError parseError; |
| 138 | _construct(rules, UTRANS_FORWARD,parseError, status); |
| 139 | }*/ |
| 140 | |
| 141 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
| 142 | const TransliterationRuleData* theData, |
| 143 | UnicodeFilter* adoptedFilter) : |
| 144 | Transliterator(id, adoptedFilter), |
| 145 | fData((TransliterationRuleData*)theData), // cast away const |
| 146 | isDataOwned(FALSE) { |
| 147 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
| 148 | } |
| 149 | |
| 150 | /** |
| 151 | * Internal constructor. |
| 152 | */ |
| 153 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
| 154 | TransliterationRuleData* theData, |
| 155 | UBool isDataAdopted) : |
| 156 | Transliterator(id, 0), |
| 157 | fData(theData), |
| 158 | isDataOwned(isDataAdopted) { |
| 159 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
| 160 | } |
| 161 | |
| 162 | /** |
| 163 | * Copy constructor. |
| 164 | */ |
| 165 | RuleBasedTransliterator::RuleBasedTransliterator( |
| 166 | const RuleBasedTransliterator& other) : |
| 167 | Transliterator(other), fData(other.fData), |
| 168 | isDataOwned(other.isDataOwned) { |
| 169 | |
| 170 | // The data object may or may not be owned. If it is not owned we |
| 171 | // share it; it is invariant. If it is owned, it's still |
| 172 | // invariant, but we need to copy it to prevent double-deletion. |
| 173 | // If this becomes a performance issue (if people do a lot of RBT |
| 174 | // copying -- unlikely) we can reference count the data object. |
| 175 | |
| 176 | // Only do a deep copy if this is owned data, that is, data that |
| 177 | // will be later deleted. System transliterators contain |
| 178 | // non-owned data. |
| 179 | if (isDataOwned) { |
| 180 | fData = new TransliterationRuleData(*other.fData); |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | /** |
| 185 | * Destructor. |
| 186 | */ |
| 187 | RuleBasedTransliterator::~RuleBasedTransliterator() { |
| 188 | // Delete the data object only if we own it. |
| 189 | if (isDataOwned) { |
| 190 | delete fData; |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | RuleBasedTransliterator* |
| 195 | RuleBasedTransliterator::clone() const { |
| 196 | return new RuleBasedTransliterator(*this); |
| 197 | } |
| 198 | |
| 199 | /** |
| 200 | * Implements {@link Transliterator#handleTransliterate}. |
| 201 | */ |
| 202 | void |
| 203 | RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
| 204 | UBool isIncremental) const { |
| 205 | /* We keep contextStart and contextLimit fixed the entire time, |
| 206 | * relative to the text -- contextLimit may move numerically if |
| 207 | * text is inserted or removed. The start offset moves toward |
| 208 | * limit, with replacements happening under it. |
| 209 | * |
| 210 | * Example: rules 1. ab>x|y |
| 211 | * 2. yc>z |
| 212 | * |
| 213 | * |eabcd begin - no match, advance start |
| 214 | * e|abcd match rule 1 - change text & adjust start |
| 215 | * ex|ycd match rule 2 - change text & adjust start |
| 216 | * exz|d no match, advance start |
| 217 | * exzd| done |
| 218 | */ |
| 219 | |
| 220 | /* A rule like |
| 221 | * a>b|a |
| 222 | * creates an infinite loop. To prevent that, we put an arbitrary |
| 223 | * limit on the number of iterations that we take, one that is |
| 224 | * high enough that any reasonable rules are ok, but low enough to |
| 225 | * prevent a server from hanging. The limit is 16 times the |
| 226 | * number of characters n, unless n is so large that 16n exceeds a |
| 227 | * uint32_t. |
| 228 | */ |
| 229 | uint32_t loopCount = 0; |
| 230 | uint32_t loopLimit = index.limit - index.start; |
| 231 | if (loopLimit >= 0x10000000) { |
| 232 | loopLimit = 0xFFFFFFFF; |
| 233 | } else { |
| 234 | loopLimit <<= 4; |
| 235 | } |
| 236 | |
| 237 | // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent |
| 238 | // operations must be prevented. |
| 239 | // A Complication: compound transliterators can result in recursive entries to this |
| 240 | // function, sometimes with different "This" objects, always with the same text. |
| 241 | // Double-locking must be prevented in these cases. |
| 242 | // |
| 243 | |
| 244 | UBool lockedMutexAtThisLevel = FALSE; |
| 245 | |
| 246 | // Test whether this request is operating on the same text string as |
| 247 | // some other transliteration that is still in progress and holding the |
| 248 | // transliteration mutex. If so, do not lock the transliteration |
| 249 | // mutex again. |
| 250 | // |
| 251 | // gLockedText variable is protected by the global ICU mutex. |
| 252 | // Shared RBT data protected by transliteratorDataMutex. |
| 253 | // |
| 254 | // TODO(andy): Need a better scheme for handling this. |
| 255 | |
| 256 | static UMutex transliteratorDataMutex; |
| 257 | UBool needToLock; |
| 258 | { |
| 259 | Mutex m; |
| 260 | needToLock = (&text != gLockedText); |
| 261 | } |
| 262 | if (needToLock) { |
| 263 | umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. |
| 264 | Mutex m; |
| 265 | gLockedText = &text; |
| 266 | lockedMutexAtThisLevel = TRUE; |
| 267 | } |
| 268 | |
| 269 | // Check to make sure we don't dereference a null pointer. |
| 270 | if (fData != NULL) { |
| 271 | while (index.start < index.limit && |
| 272 | loopCount <= loopLimit && |
| 273 | fData->ruleSet.transliterate(text, index, isIncremental)) { |
| 274 | ++loopCount; |
| 275 | } |
| 276 | } |
| 277 | if (lockedMutexAtThisLevel) { |
| 278 | { |
| 279 | Mutex m; |
| 280 | gLockedText = NULL; |
| 281 | } |
| 282 | umtx_unlock(&transliteratorDataMutex); |
| 283 | } |
| 284 | } |
| 285 | |
| 286 | UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, |
| 287 | UBool escapeUnprintable) const { |
| 288 | return fData->ruleSet.toRules(rulesSource, escapeUnprintable); |
| 289 | } |
| 290 | |
| 291 | /** |
| 292 | * Implement Transliterator framework |
| 293 | */ |
| 294 | void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
| 295 | fData->ruleSet.getSourceTargetSet(result, FALSE); |
| 296 | } |
| 297 | |
| 298 | /** |
| 299 | * Override Transliterator framework |
| 300 | */ |
| 301 | UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { |
| 302 | return fData->ruleSet.getSourceTargetSet(result, TRUE); |
| 303 | } |
| 304 | |
| 305 | U_NAMESPACE_END |
| 306 | |
| 307 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| 308 | |