1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 1999-2015, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 11/17/99 aliu Creation. |
10 | ********************************************************************** |
11 | */ |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_TRANSLITERATION |
16 | |
17 | #include "unicode/rep.h" |
18 | #include "unicode/uniset.h" |
19 | #include "rbt_pars.h" |
20 | #include "rbt_data.h" |
21 | #include "rbt_rule.h" |
22 | #include "rbt.h" |
23 | #include "mutex.h" |
24 | #include "umutex.h" |
25 | |
26 | U_NAMESPACE_BEGIN |
27 | |
28 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) |
29 | |
// The text object being processed by the handleTransliterate() call that
// currently holds the transliterator data mutex, or NULL when no such call
// is in progress. handleTransliterate() compares the address of its own text
// argument against this pointer to recognise a recursive entry on the same
// text, in which case it does not lock the data mutex a second time.
// All reads and writes of this variable are made while holding the global
// ICU mutex.
static Replaceable *gLockedText = NULL;
31 | |
32 | void RuleBasedTransliterator::_construct(const UnicodeString& rules, |
33 | UTransDirection direction, |
34 | UParseError& parseError, |
35 | UErrorCode& status) { |
36 | fData = 0; |
37 | isDataOwned = TRUE; |
38 | if (U_FAILURE(status)) { |
39 | return; |
40 | } |
41 | |
42 | TransliteratorParser parser(status); |
43 | parser.parse(rules, direction, parseError, status); |
44 | if (U_FAILURE(status)) { |
45 | return; |
46 | } |
47 | |
48 | if (parser.idBlockVector.size() != 0 || |
49 | parser.compoundFilter != NULL || |
50 | parser.dataVector.size() == 0) { |
51 | status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT |
52 | return; |
53 | } |
54 | |
55 | fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
56 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
57 | } |
58 | |
59 | /** |
60 | * Constructs a new transliterator from the given rules. |
61 | * @param id the id for the transliterator. |
62 | * @param rules rules, separated by ';' |
63 | * @param direction either FORWARD or REVERSE. |
64 | * @param adoptedFilter the filter for this transliterator. |
65 | * @param parseError Struct to recieve information on position |
66 | * of error if an error is encountered |
67 | * @param status Output param set to success/failure code. |
68 | * @exception IllegalArgumentException if rules are malformed |
69 | * or direction is invalid. |
70 | */ |
71 | RuleBasedTransliterator::RuleBasedTransliterator( |
72 | const UnicodeString& id, |
73 | const UnicodeString& rules, |
74 | UTransDirection direction, |
75 | UnicodeFilter* adoptedFilter, |
76 | UParseError& parseError, |
77 | UErrorCode& status) : |
78 | Transliterator(id, adoptedFilter) { |
79 | _construct(rules, direction,parseError,status); |
80 | } |
81 | |
82 | /** |
83 | * Constructs a new transliterator from the given rules. |
84 | * @param id the id for the transliterator. |
85 | * @param rules rules, separated by ';' |
86 | * @param direction either FORWARD or REVERSE. |
87 | * @param adoptedFilter the filter for this transliterator. |
88 | * @param status Output param set to success/failure code. |
89 | * @exception IllegalArgumentException if rules are malformed |
90 | * or direction is invalid. |
91 | */ |
92 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
93 | const UnicodeString& id, |
94 | const UnicodeString& rules, |
95 | UTransDirection direction, |
96 | UnicodeFilter* adoptedFilter, |
97 | UErrorCode& status) : |
98 | Transliterator(id, adoptedFilter) { |
99 | UParseError parseError; |
100 | _construct(rules, direction,parseError, status); |
101 | }*/ |
102 | |
103 | /** |
 * Convenience constructor with no filter.
105 | */ |
106 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
107 | const UnicodeString& id, |
108 | const UnicodeString& rules, |
109 | UTransDirection direction, |
110 | UErrorCode& status) : |
111 | Transliterator(id, 0) { |
112 | UParseError parseError; |
113 | _construct(rules, direction,parseError, status); |
114 | }*/ |
115 | |
116 | /** |
 * Convenience constructor with no filter and FORWARD direction.
118 | */ |
119 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
120 | const UnicodeString& id, |
121 | const UnicodeString& rules, |
122 | UErrorCode& status) : |
123 | Transliterator(id, 0) { |
124 | UParseError parseError; |
125 | _construct(rules, UTRANS_FORWARD, parseError, status); |
126 | }*/ |
127 | |
128 | /** |
 * Convenience constructor with FORWARD direction.
130 | */ |
131 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
132 | const UnicodeString& id, |
133 | const UnicodeString& rules, |
134 | UnicodeFilter* adoptedFilter, |
135 | UErrorCode& status) : |
136 | Transliterator(id, adoptedFilter) { |
137 | UParseError parseError; |
138 | _construct(rules, UTRANS_FORWARD,parseError, status); |
139 | }*/ |
140 | |
141 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
142 | const TransliterationRuleData* theData, |
143 | UnicodeFilter* adoptedFilter) : |
144 | Transliterator(id, adoptedFilter), |
145 | fData((TransliterationRuleData*)theData), // cast away const |
146 | isDataOwned(FALSE) { |
147 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
148 | } |
149 | |
150 | /** |
151 | * Internal constructor. |
152 | */ |
153 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
154 | TransliterationRuleData* theData, |
155 | UBool isDataAdopted) : |
156 | Transliterator(id, 0), |
157 | fData(theData), |
158 | isDataOwned(isDataAdopted) { |
159 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
160 | } |
161 | |
162 | /** |
163 | * Copy constructor. |
164 | */ |
165 | RuleBasedTransliterator::RuleBasedTransliterator( |
166 | const RuleBasedTransliterator& other) : |
167 | Transliterator(other), fData(other.fData), |
168 | isDataOwned(other.isDataOwned) { |
169 | |
170 | // The data object may or may not be owned. If it is not owned we |
171 | // share it; it is invariant. If it is owned, it's still |
172 | // invariant, but we need to copy it to prevent double-deletion. |
173 | // If this becomes a performance issue (if people do a lot of RBT |
174 | // copying -- unlikely) we can reference count the data object. |
175 | |
176 | // Only do a deep copy if this is owned data, that is, data that |
177 | // will be later deleted. System transliterators contain |
178 | // non-owned data. |
179 | if (isDataOwned) { |
180 | fData = new TransliterationRuleData(*other.fData); |
181 | } |
182 | } |
183 | |
184 | /** |
185 | * Destructor. |
186 | */ |
187 | RuleBasedTransliterator::~RuleBasedTransliterator() { |
188 | // Delete the data object only if we own it. |
189 | if (isDataOwned) { |
190 | delete fData; |
191 | } |
192 | } |
193 | |
194 | RuleBasedTransliterator* |
195 | RuleBasedTransliterator::clone() const { |
196 | return new RuleBasedTransliterator(*this); |
197 | } |
198 | |
199 | /** |
200 | * Implements {@link Transliterator#handleTransliterate}. |
201 | */ |
202 | void |
203 | RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
204 | UBool isIncremental) const { |
205 | /* We keep contextStart and contextLimit fixed the entire time, |
206 | * relative to the text -- contextLimit may move numerically if |
207 | * text is inserted or removed. The start offset moves toward |
208 | * limit, with replacements happening under it. |
209 | * |
210 | * Example: rules 1. ab>x|y |
211 | * 2. yc>z |
212 | * |
213 | * |eabcd begin - no match, advance start |
214 | * e|abcd match rule 1 - change text & adjust start |
215 | * ex|ycd match rule 2 - change text & adjust start |
216 | * exz|d no match, advance start |
217 | * exzd| done |
218 | */ |
219 | |
220 | /* A rule like |
221 | * a>b|a |
222 | * creates an infinite loop. To prevent that, we put an arbitrary |
223 | * limit on the number of iterations that we take, one that is |
224 | * high enough that any reasonable rules are ok, but low enough to |
225 | * prevent a server from hanging. The limit is 16 times the |
226 | * number of characters n, unless n is so large that 16n exceeds a |
227 | * uint32_t. |
228 | */ |
229 | uint32_t loopCount = 0; |
230 | uint32_t loopLimit = index.limit - index.start; |
231 | if (loopLimit >= 0x10000000) { |
232 | loopLimit = 0xFFFFFFFF; |
233 | } else { |
234 | loopLimit <<= 4; |
235 | } |
236 | |
237 | // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent |
238 | // operations must be prevented. |
239 | // A Complication: compound transliterators can result in recursive entries to this |
240 | // function, sometimes with different "This" objects, always with the same text. |
241 | // Double-locking must be prevented in these cases. |
242 | // |
243 | |
244 | UBool lockedMutexAtThisLevel = FALSE; |
245 | |
246 | // Test whether this request is operating on the same text string as |
247 | // some other transliteration that is still in progress and holding the |
248 | // transliteration mutex. If so, do not lock the transliteration |
249 | // mutex again. |
250 | // |
251 | // gLockedText variable is protected by the global ICU mutex. |
252 | // Shared RBT data protected by transliteratorDataMutex. |
253 | // |
254 | // TODO(andy): Need a better scheme for handling this. |
255 | |
256 | static UMutex transliteratorDataMutex; |
257 | UBool needToLock; |
258 | { |
259 | Mutex m; |
260 | needToLock = (&text != gLockedText); |
261 | } |
262 | if (needToLock) { |
263 | umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. |
264 | Mutex m; |
265 | gLockedText = &text; |
266 | lockedMutexAtThisLevel = TRUE; |
267 | } |
268 | |
269 | // Check to make sure we don't dereference a null pointer. |
270 | if (fData != NULL) { |
271 | while (index.start < index.limit && |
272 | loopCount <= loopLimit && |
273 | fData->ruleSet.transliterate(text, index, isIncremental)) { |
274 | ++loopCount; |
275 | } |
276 | } |
277 | if (lockedMutexAtThisLevel) { |
278 | { |
279 | Mutex m; |
280 | gLockedText = NULL; |
281 | } |
282 | umtx_unlock(&transliteratorDataMutex); |
283 | } |
284 | } |
285 | |
286 | UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, |
287 | UBool escapeUnprintable) const { |
288 | return fData->ruleSet.toRules(rulesSource, escapeUnprintable); |
289 | } |
290 | |
291 | /** |
292 | * Implement Transliterator framework |
293 | */ |
294 | void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
295 | fData->ruleSet.getSourceTargetSet(result, FALSE); |
296 | } |
297 | |
298 | /** |
299 | * Override Transliterator framework |
300 | */ |
301 | UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { |
302 | return fData->ruleSet.getSourceTargetSet(result, TRUE); |
303 | } |
304 | |
305 | U_NAMESPACE_END |
306 | |
307 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
308 | |