unesctrn.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/unesctrn.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (c) 2001-2011, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/19/2001 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/uchar.h"
18	#include "unicode/utf16.h"
19	#include "unesctrn.h"
20	#include "util.h"
21
22	#include "cmemory.h"
23
24	U_NAMESPACE_BEGIN
25
26	/**
27	* Special character marking the end of the spec[] array.
28	*/
29	static const UChar END = `0xFFFF`;
30
31	// Unicode: "U+10FFFF" hex, min=4, max=6
32	static const UChar SPEC_Unicode[] = {
33	`2`, `0`, `16`, `4`, `6`, `85`/U/, `43`/+/,
34	END
35	};
36
37	// Java: "\\uFFFF" hex, min=4, max=4
38	static const UChar SPEC_Java[] = {
39	`2`, `0`, `16`, `4`, `4`, `92`/\/, `117`/u/,
40	END
41	};
42
43	// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
44	static const UChar SPEC_C[] = {
45	`2`, `0`, `16`, `4`, `4`, `92`/\/, `117`/u/,
46	`2`, `0`, `16`, `8`, `8`, `92`/\/, `85`/U/,
47	END
48	};
49
50	// XML: "􏿿" hex, min=1, max=6
51	static const UChar SPEC_XML[] = {
52	`3`, `1`, `16`, `1`, `6`, `38`/&/, `35`/#/, `120`/x/, `59`/;/,
53	END
54	};
55
56	// XML10: "􏿿" dec, min=1, max=7 (not really "Hex-Any")
57	static const UChar SPEC_XML10[] = {
58	`2`, `1`, `10`, `1`, `7`, `38`/&/, `35`/#/, `59`/;/,
59	END
60	};
61
62	// Perl: "\\x{263A}" hex, min=1, max=6
63	static const UChar SPEC_Perl[] = {
64	`3`, `1`, `16`, `1`, `6`, `92`/\/, `120`/x/, `123`/{/, `125`/}/,
65	END
66	};
67
68	// All: Java, C, Perl, XML, XML10, Unicode
69	static const UChar SPEC_Any[] = {
70	`2`, `0`, `16`, `4`, `6`, `85`/U/, `43`/+/, // Unicode
71	`2`, `0`, `16`, `4`, `4`, `92`/\/, `117`/u/, // Java
72	`2`, `0`, `16`, `8`, `8`, `92`/\/, `85`/U/, // C (surrogates)
73	`3`, `1`, `16`, `1`, `6`, `38`/&/, `35`/#/, `120`/x/, `59`/;/, // XML
74	`2`, `1`, `10`, `1`, `7`, `38`/&/, `35`/#/, `59`/;/, // XML10
75	`3`, `1`, `16`, `1`, `6`, `92`/\/, `120`/x/, `123`/{/, `125`/}/, // Perl
76	END
77	};
78
// Judging by its name, this macro (defined outside this file) supplies the
// RTTI implementation for UnescapeTransliterator.
79	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
80
81	static UChar* copySpec(const UChar* spec) {
82	int32_t len = `0`;
83	while (spec[len] != END) {
84	++len;
85	}
86	++len;
87	UChar result = (UChar )uprv_malloc(len*sizeof(UChar));
88	// Check for memory allocation error.
89	if (result != NULL) {
90	uprv_memcpy(result, spec, (size_t)len*sizeof(result[`0`]));
91	}
92	return result;
93	}
94
95	/**
96	* Factory methods. Ignore the context.
97	*/
98	static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /context/) {
99	return new UnescapeTransliterator (ID, SPEC_Unicode);
100	}
101	static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /context/) {
102	return new UnescapeTransliterator (ID, SPEC_Java);
103	}
104	static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /context/) {
105	return new UnescapeTransliterator (ID, SPEC_C);
106	}
107	static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /context/) {
108	return new UnescapeTransliterator (ID, SPEC_XML);
109	}
110	static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /context/) {
111	return new UnescapeTransliterator (ID, SPEC_XML10);
112	}
113	static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /context/) {
114	return new UnescapeTransliterator (ID, SPEC_Perl);
115	}
116	static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /context/) {
117	return new UnescapeTransliterator (ID, SPEC_Any);
118	}
119
120	/**
121	* Registers standard variants with the system. Called by
122	* Transliterator during initialization.
123	*/
124	void UnescapeTransliterator::registerIDs() {
125	Token t = integerToken(`0`);
126
127	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
128
129	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
130
131	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
132
133	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
134
135	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
136
137	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
138
139	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
140	}
141
142	/**
143	* Constructor. Takes the encoded spec array.
144	*/
145	UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
146	const UChar *newSpec) :
147	Transliterator (newID, NULL)
148	{
149	this->spec = copySpec(newSpec);
150	}
151
152	/**
153	* Copy constructor.
154	*/
155	UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
156	Transliterator (o) {
157	this->spec = copySpec(o.spec);
158	}
159
160	UnescapeTransliterator::~UnescapeTransliterator() {
161	uprv_free(spec);
162	}
163
164	/**
165	* Transliterator API.
166	*/
167	UnescapeTransliterator* UnescapeTransliterator::clone() const {
168	return new UnescapeTransliterator (*this);
169	}
170
171	/**
172	* Implements {@link Transliterator#handleTransliterate}.
173	*/
174	void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
175	UBool isIncremental) const {
176	int32_t start = pos.start;
177	int32_t limit = pos.limit;
178	int32_t i, j, ipat;
179
180	while (start < limit) {
181	// Loop over the forms in spec[]. Exit this loop when we
182	// match one of the specs. Exit the outer loop if a
183	// partial match is detected and isIncremental is true.
184	for (j=`0`, ipat=`0`; spec[ipat] != END; ++j) {
185
186	// Read the header
187	int32_t prefixLen = spec[ipat++];
188	int32_t suffixLen = spec[ipat++];
189	int8_t radix = (int8_t) spec[ipat++];
190	int32_t minDigits = spec[ipat++];
191	int32_t maxDigits = spec[ipat++];
192
193	// s is a copy of start that is advanced over the
194	// characters as we parse them.
195	int32_t s = start;
196	UBool match = TRUE;
197
198	for (i=`0`; i<prefixLen; ++i) {
199	if (s >= limit) {
200	if (i > `0`) {
201	// We've already matched a character. This is
202	// a partial match, so we return if in
203	// incremental mode. In non-incremental mode,
204	// go to the next spec.
205	if (isIncremental) {
206	goto exit;
207	}
208	match = FALSE;
209	break;
210	}
211	}
212	UChar c = text.charAt(s++);
213	if (c != spec[ipat + i]) {
214	match = FALSE;
215	break;
216	}
217	}
218
219	if (match) {
220	UChar32 u = `0`;
221	int32_t digitCount = `0`;
222	for (;;) {
223	if (s >= limit) {
224	// Check for partial match in incremental mode.
225	if (s > start && isIncremental) {
226	goto exit;
227	}
228	break;
229	}
230	UChar32 ch = text.char32At(s);
231	int32_t digit = u_digit(ch, radix);
232	if (digit < `0`) {
233	break;
234	}
235	s += U16_LENGTH(ch);
236	u = (u * radix) + digit;
237	if (++digitCount == maxDigits) {
238	break;
239	}
240	}
241
242	match = (digitCount >= minDigits);
243
244	if (match) {
245	for (i=`0`; i<suffixLen; ++i) {
246	if (s >= limit) {
247	// Check for partial match in incremental mode.
248	if (s > start && isIncremental) {
249	goto exit;
250	}
251	match = FALSE;
252	break;
253	}
254	UChar c = text.charAt(s++);
255	if (c != spec[ipat + prefixLen + i]) {
256	match = FALSE;
257	break;
258	}
259	}
260
261	if (match) {
262	// At this point, we have a match
263	UnicodeString str(u);
264	text.handleReplaceBetween(start, s, str);
265	limit -= s - start - str.length();
266	// The following break statement leaves the
267	// loop that is traversing the forms in
268	// spec[]. We then parse the next input
269	// character.
270	break;
271	}
272	}
273	}
274
275	ipat += prefixLen + suffixLen;
276	}
277
278	if (start < limit) {
279	start += U16_LENGTH(text.char32At(start));
280	}
281	}
282
283	exit:
284	pos.contextLimit += limit - pos.limit;
285	pos.limit = limit;
286	pos.start = start;
287	}
288
289	U_NAMESPACE_END
290
291	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
292
293	//eof
294

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/unesctrn.cpp