rbt.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/rbt.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/17/99 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/rep.h"
18	#include "unicode/uniset.h"
19	#include "rbt_pars.h"
20	#include "rbt_data.h"
21	#include "rbt_rule.h"
22	#include "rbt.h"
23	#include "mutex.h"
24	#include "umutex.h"
25
26	U_NAMESPACE_BEGIN
27
28	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
29
30	static Replaceable *gLockedText = NULL;
31
32	void RuleBasedTransliterator::_construct(const UnicodeString& rules,
33	UTransDirection direction,
34	UParseError& parseError,
35	UErrorCode& status) {
36	fData = `0`;
37	isDataOwned = TRUE;
38	if (U_FAILURE(status)) {
39	return;
40	}
41
42	TransliteratorParser parser(status);
43	parser.parse(rules, direction, parseError, status);
44	if (U_FAILURE(status)) {
45	return;
46	}
47
48	if (parser.idBlockVector.size() != `0` \|\|
49	parser.compoundFilter != NULL \|\|
50	parser.dataVector.size() == `0`) {
51	status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
52	return;
53	}
54
55	fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(`0`);
56	setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
57	}
58
59	/**
60	* Constructs a new transliterator from the given rules.
61	* @param id the id for the transliterator.
62	* @param rules rules, separated by ';'
63	* @param direction either FORWARD or REVERSE.
64	* @param adoptedFilter the filter for this transliterator.
65	* @param parseError Struct to recieve information on position
66	* of error if an error is encountered
67	* @param status Output param set to success/failure code.
68	* @exception IllegalArgumentException if rules are malformed
69	* or direction is invalid.
70	*/
71	RuleBasedTransliterator::RuleBasedTransliterator(
72	const UnicodeString& id,
73	const UnicodeString& rules,
74	UTransDirection direction,
75	UnicodeFilter* adoptedFilter,
76	UParseError& parseError,
77	UErrorCode& status) :
78	Transliterator (id, adoptedFilter) {
79	_construct(rules, direction,parseError,status);
80	}
81
82	/**
83	* Constructs a new transliterator from the given rules.
84	* @param id the id for the transliterator.
85	* @param rules rules, separated by ';'
86	* @param direction either FORWARD or REVERSE.
87	* @param adoptedFilter the filter for this transliterator.
88	* @param status Output param set to success/failure code.
89	* @exception IllegalArgumentException if rules are malformed
90	* or direction is invalid.
91	*/
/*RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status) :
    Transliterator(id, adoptedFilter) {
    UParseError parseError;
    _construct(rules, direction,parseError, status);
}*/
102
/**
 * Convenience constructor with no filter.
 */
/*RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UErrorCode& status) :
    Transliterator(id, 0) {
    UParseError parseError;
    _construct(rules, direction,parseError, status);
}*/
115
/**
 * Convenience constructor with no filter and FORWARD direction.
 */
/*RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UErrorCode& status) :
    Transliterator(id, 0) {
    UParseError parseError;
    _construct(rules, UTRANS_FORWARD, parseError, status);
}*/
127
/**
 * Convenience constructor with FORWARD direction.
 */
/*RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status) :
    Transliterator(id, adoptedFilter) {
    UParseError parseError;
    _construct(rules, UTRANS_FORWARD,parseError, status);
}*/
140
141	RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
142	const TransliterationRuleData* theData,
143	UnicodeFilter* adoptedFilter) :
144	Transliterator (id, adoptedFilter),
145	fData((TransliterationRuleData)theData), // cast away const*
146	isDataOwned(FALSE) {
147	setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
148	}
149
150	/**
151	* Internal constructor.
152	*/
153	RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
154	TransliterationRuleData* theData,
155	UBool isDataAdopted) :
156	Transliterator (id, `0`),
157	fData(theData),
158	isDataOwned(isDataAdopted) {
159	setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
160	}
161
162	/**
163	* Copy constructor.
164	*/
165	RuleBasedTransliterator::RuleBasedTransliterator(
166	const RuleBasedTransliterator& other) :
167	Transliterator (other), fData(other.fData),
168	isDataOwned(other.isDataOwned) {
169
170	// The data object may or may not be owned. If it is not owned we
171	// share it; it is invariant. If it is owned, it's still
172	// invariant, but we need to copy it to prevent double-deletion.
173	// If this becomes a performance issue (if people do a lot of RBT
174	// copying -- unlikely) we can reference count the data object.
175
176	// Only do a deep copy if this is owned data, that is, data that
177	// will be later deleted. System transliterators contain
178	// non-owned data.
179	if (isDataOwned) {
180	fData = new TransliterationRuleData (*other.fData);
181	}
182	}
183
184	/**
185	* Destructor.
186	*/
187	RuleBasedTransliterator::~RuleBasedTransliterator() {
188	// Delete the data object only if we own it.
189	if (isDataOwned) {
190	delete fData;
191	}
192	}
193
194	RuleBasedTransliterator*
195	RuleBasedTransliterator::clone() const {
196	return new RuleBasedTransliterator (*this);
197	}
198
199	/**
200	* Implements {@link Transliterator#handleTransliterate}.
201	*/
202	void
203	RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
204	UBool isIncremental) const {
205	/ We keep contextStart and contextLimit fixed the entire time,*
206	* relative to the text -- contextLimit may move numerically if
207	* text is inserted or removed. The start offset moves toward
208	* limit, with replacements happening under it.
209	*
210	* Example: rules 1. ab>x\|y
211	* 2. yc>z
212	*
213	* \|eabcd begin - no match, advance start
214	* e\|abcd match rule 1 - change text & adjust start
215	* ex\|ycd match rule 2 - change text & adjust start
216	* exz\|d no match, advance start
217	* exzd\| done
218	*/
219
220	/ A rule like*
221	* a>b\|a
222	* creates an infinite loop. To prevent that, we put an arbitrary
223	* limit on the number of iterations that we take, one that is
224	* high enough that any reasonable rules are ok, but low enough to
225	* prevent a server from hanging. The limit is 16 times the
226	* number of characters n, unless n is so large that 16n exceeds a
227	* uint32_t.
228	*/
229	uint32_t loopCount = `0`;
230	uint32_t loopLimit = index.limit - index.start;
231	if (loopLimit >= `0x10000000`) {
232	loopLimit = `0xFFFFFFFF`;
233	} else {
234	loopLimit <<= `4`;
235	}
236
237	// Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
238	// operations must be prevented.
239	// A Complication: compound transliterators can result in recursive entries to this
240	// function, sometimes with different "This" objects, always with the same text.
241	// Double-locking must be prevented in these cases.
242	//
243
244	UBool lockedMutexAtThisLevel = FALSE;
245
246	// Test whether this request is operating on the same text string as
247	// some other transliteration that is still in progress and holding the
248	// transliteration mutex. If so, do not lock the transliteration
249	// mutex again.
250	//
251	// gLockedText variable is protected by the global ICU mutex.
252	// Shared RBT data protected by transliteratorDataMutex.
253	//
254	// TODO(andy): Need a better scheme for handling this.
255
256	static UMutex transliteratorDataMutex;
257	UBool needToLock;
258	{
259	Mutex m;
260	needToLock = (&text != gLockedText);
261	}
262	if (needToLock) {
263	umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here.
264	Mutex m;
265	gLockedText = &text;
266	lockedMutexAtThisLevel = TRUE;
267	}
268
269	// Check to make sure we don't dereference a null pointer.
270	if (fData != NULL) {
271	while (index.start < index.limit &&
272	loopCount <= loopLimit &&
273	fData->ruleSet.transliterate(text, index, isIncremental)) {
274	++loopCount;
275	}
276	}
277	if (lockedMutexAtThisLevel) {
278	{
279	Mutex m;
280	gLockedText = NULL;
281	}
282	umtx_unlock(&transliteratorDataMutex);
283	}
284	}
285
286	UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
287	UBool escapeUnprintable) const {
288	return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
289	}
290
291	/**
292	* Implement Transliterator framework
293	*/
294	void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
295	fData->ruleSet.getSourceTargetSet(result, FALSE);
296	}
297
298	/**
299	* Override Transliterator framework
300	*/
301	UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
302	return fData->ruleSet.getSourceTargetSet(result, TRUE);
303	}
304
305	U_NAMESPACE_END
306
307	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
308

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/rbt.cpp