strrepl.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/strrepl.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (c) 2002-2012, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 01/21/2002 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/uniset.h"
18	#include "unicode/utf16.h"
19	#include "strrepl.h"
20	#include "rbt_data.h"
21	#include "util.h"
22
23	U_NAMESPACE_BEGIN
24
25	UnicodeReplacer::~UnicodeReplacer() {}
26	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
27
28	/**
29	* Construct a StringReplacer that sets the emits the given output
30	* text and sets the cursor to the given position.
31	* @param theOutput text that will replace input text when the
32	* replace() method is called. May contain stand-in characters
33	* that represent nested replacers.
34	* @param theCursorPos cursor position that will be returned by
35	* the replace() method
36	* @param theData transliterator context object that translates
37	* stand-in characters to UnicodeReplacer objects
38	*/
39	StringReplacer::StringReplacer(const UnicodeString& theOutput,
40	int32_t theCursorPos,
41	const TransliterationRuleData* theData) {
42	output = theOutput;
43	cursorPos = theCursorPos;
44	hasCursor = TRUE;
45	data = theData;
46	isComplex = TRUE;
47	}
48
49	/**
50	* Construct a StringReplacer that sets the emits the given output
51	* text and does not modify the cursor.
52	* @param theOutput text that will replace input text when the
53	* replace() method is called. May contain stand-in characters
54	* that represent nested replacers.
55	* @param theData transliterator context object that translates
56	* stand-in characters to UnicodeReplacer objects
57	*/
58	StringReplacer::StringReplacer(const UnicodeString& theOutput,
59	const TransliterationRuleData* theData) {
60	output = theOutput;
61	cursorPos = `0`;
62	hasCursor = FALSE;
63	data = theData;
64	isComplex = TRUE;
65	}
66
67	/**
68	* Copy constructor.
69	*/
70	StringReplacer::StringReplacer(const StringReplacer& other) :
71	UnicodeFunctor (other),
72	UnicodeReplacer (other)
73	{
74	output = other.output;
75	cursorPos = other.cursorPos;
76	hasCursor = other.hasCursor;
77	data = other.data;
78	isComplex = other.isComplex;
79	}
80
81	/**
82	* Destructor
83	*/
84	StringReplacer::~StringReplacer() {
85	}
86
87	/**
88	* Implement UnicodeFunctor
89	*/
90	StringReplacer* StringReplacer::clone() const {
91	return new StringReplacer (*this);
92	}
93
94	/**
95	* Implement UnicodeFunctor
96	*/
97	UnicodeReplacer* StringReplacer::toReplacer() const {
98	return const_cast<StringReplacer >(this*);
99	}
100
101	/**
102	* UnicodeReplacer API
103	*/
104	int32_t StringReplacer::replace(Replaceable& text,
105	int32_t start,
106	int32_t limit,
107	int32_t& cursor) {
108	int32_t outLen;
109	int32_t newStart = `0`;
110
111	// NOTE: It should be possible to _always_ run the complex
112	// processing code; just slower. If not, then there is a bug
113	// in the complex processing code.
114
115	// Simple (no nested replacers) Processing Code :
116	if (!isComplex) {
117	text.handleReplaceBetween(start, limit, output);
118	outLen = output.length();
119
120	// Setup default cursor position (for cursorPos within output)
121	newStart = cursorPos;
122	}
123
124	// Complex (nested replacers) Processing Code :
125	else {
126	/ When there are segments to be copied, use the Replaceable.copy()*
127	* API in order to retain out-of-band data. Copy everything to the
128	* end of the string, then copy them back over the key. This preserves
129	* the integrity of indices into the key and surrounding context while
130	* generating the output text.
131	*/
132	UnicodeString buf;
133	int32_t oOutput; // offset into 'output'
134	isComplex = FALSE;
135
136	// The temporary buffer starts at tempStart, and extends
137	// to destLimit. The start of the buffer has a single
138	// character from before the key. This provides style
139	// data when addition characters are filled into the
140	// temporary buffer. If there is nothing to the left, use
141	// the non-character U+FFFF, which Replaceable subclasses
142	// should treat specially as a "no-style character."
143	// destStart points to the point after the style context
144	// character, so it is tempStart+1 or tempStart+2.
145	int32_t tempStart = text.length(); // start of temp buffer
146	int32_t destStart = tempStart; // copy new text to here
147	if (start > `0`) {
148	int32_t len = U16_LENGTH(text.char32At(start-`1`));
149	text.copy(start-len, start, tempStart);
150	destStart += len;
151	} else {
152	UnicodeString str((UChar) `0xFFFF`);
153	text.handleReplaceBetween(tempStart, tempStart, str);
154	destStart++;
155	}
156	int32_t destLimit = destStart;
157
158	for (oOutput=`0`; oOutput<output.length(); ) {
159	if (oOutput == cursorPos) {
160	// Record the position of the cursor
161	newStart = destLimit - destStart; // relative to start
162	}
163	UChar32 c = output.char32At(oOutput);
164	UnicodeReplacer* r = data->lookupReplacer(c);
165	if (r == NULL) {
166	// Accumulate straight (non-segment) text.
167	buf.append(c);
168	} else {
169	isComplex = TRUE;
170
171	// Insert any accumulated straight text.
172	if (buf.length() > `0`) {
173	text.handleReplaceBetween(destLimit, destLimit, buf);
174	destLimit += buf.length();
175	buf.truncate(`0`);
176	}
177
178	// Delegate output generation to replacer object
179	int32_t len = r->replace(text, destLimit, destLimit, cursor);
180	destLimit += len;
181	}
182	oOutput += U16_LENGTH(c);
183	}
184	// Insert any accumulated straight text.
185	if (buf.length() > `0`) {
186	text.handleReplaceBetween(destLimit, destLimit, buf);
187	destLimit += buf.length();
188	}
189	if (oOutput == cursorPos) {
190	// Record the position of the cursor
191	newStart = destLimit - destStart; // relative to start
192	}
193
194	outLen = destLimit - destStart;
195
196	// Copy new text to start, and delete it
197	text.copy(destStart, destLimit, start);
198	text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString ());
199
200	// Delete the old text (the key)
201	text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString ());
202	}
203
204	if (hasCursor) {
205	// Adjust the cursor for positions outside the key. These
206	// refer to code points rather than code units. If cursorPos
207	// is within the output string, then use newStart, which has
208	// already been set above.
209	if (cursorPos < `0`) {
210	newStart = start;
211	int32_t n = cursorPos;
212	// Outside the output string, cursorPos counts code points
213	while (n < `0` && newStart > `0`) {
214	newStart -= U16_LENGTH(text.char32At(newStart-`1`));
215	++n;
216	}
217	newStart += n;
218	} else if (cursorPos > output.length()) {
219	newStart = start + outLen;
220	int32_t n = cursorPos - output.length();
221	// Outside the output string, cursorPos counts code points
222	while (n > `0` && newStart < text.length()) {
223	newStart += U16_LENGTH(text.char32At(newStart));
224	--n;
225	}
226	newStart += n;
227	} else {
228	// Cursor is within output string. It has been set up above
229	// to be relative to start.
230	newStart += start;
231	}
232
233	cursor = newStart;
234	}
235
236	return outLen;
237	}
238
239	/**
240	* UnicodeReplacer API
241	*/
242	UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
243	UBool escapeUnprintable) const {
244	rule.truncate(`0`);
245	UnicodeString quoteBuf;
246
247	int32_t cursor = cursorPos;
248
249	// Handle a cursor preceding the output
250	if (hasCursor && cursor < `0`) {
251	while (cursor++ < `0`) {
252	ICU_Utility::appendToRule(rule, (UChar)`0x0040` /@/, TRUE, escapeUnprintable, quoteBuf);
253	}
254	// Fall through and append '\|' below
255	}
256
257	for (int32_t i=`0`; i<output.length(); ++i) {
258	if (hasCursor && i == cursor) {
259	ICU_Utility::appendToRule(rule, (UChar)`0x007C` /\|/, TRUE, escapeUnprintable, quoteBuf);
260	}
261	UChar c = output.charAt(i); // Ok to use 16-bits here
262
263	UnicodeReplacer* r = data->lookupReplacer(c);
264	if (r == NULL) {
265	ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
266	} else {
267	UnicodeString buf;
268	r->toReplacerPattern(buf, escapeUnprintable);
269	buf.insert(`0`, (UChar)`0x20`);
270	buf.append((UChar)`0x20`);
271	ICU_Utility::appendToRule(rule, buf,
272	TRUE, escapeUnprintable, quoteBuf);
273	}
274	}
275
276	// Handle a cursor after the output. Use > rather than >= because
277	// if cursor == output.length() it is at the end of the output,
278	// which is the default position, so we need not emit it.
279	if (hasCursor && cursor > output.length()) {
280	cursor -= output.length();
281	while (cursor-- > `0`) {
282	ICU_Utility::appendToRule(rule, (UChar)`0x0040` /@/, TRUE, escapeUnprintable, quoteBuf);
283	}
284	ICU_Utility::appendToRule(rule, (UChar)`0x007C` /\|/, TRUE, escapeUnprintable, quoteBuf);
285	}
286	// Flush quoteBuf out to result
287	ICU_Utility::appendToRule(rule, -`1`,
288	TRUE, escapeUnprintable, quoteBuf);
289
290	return rule;
291	}
292
293	/**
294	* Implement UnicodeReplacer
295	*/
296	void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
297	UChar32 ch;
298	for (int32_t i=`0`; i<output.length(); i+=U16_LENGTH(ch)) {
299	ch = output.char32At(i);
300	UnicodeReplacer* r = data->lookupReplacer(ch);
301	if (r == NULL) {
302	toUnionTo.add(ch);
303	} else {
304	r->addReplacementSetTo(toUnionTo);
305	}
306	}
307	}
308
309	/**
310	* UnicodeFunctor API
311	*/
312	void StringReplacer::setData(const TransliterationRuleData* d) {
313	data = d;
314	int32_t i = `0`;
315	while (i<output.length()) {
316	UChar32 c = output.char32At(i);
317	UnicodeFunctor* f = data->lookup(c);
318	if (f != NULL) {
319	f->setData(data);
320	}
321	i += U16_LENGTH(c);
322	}
323	}
324
325	U_NAMESPACE_END
326
327	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
328
329	//eof
330

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/strrepl.cpp