name2uni.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/name2uni.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2001-2011, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 06/07/01 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/unifilt.h"
18	#include "unicode/uchar.h"
19	#include "unicode/uniset.h"
20	#include "unicode/utf16.h"
21	#include "cmemory.h"
22	#include "name2uni.h"
23	#include "patternprops.h"
24	#include "uprops.h"
25	#include "uinvchar.h"
26	#include "util.h"
27
28	U_NAMESPACE_BEGIN
29
30	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
31
32	static const UChar OPEN[] = {`92`,`78`,`126`,`123`,`126`,`0`}; // "\N~{~"
33	static const UChar OPEN_DELIM = `92`; // '\\' first char of OPEN
34	static const UChar CLOSE_DELIM = `125`; // '}'
35	static const UChar SPACE = `32`; // ' '
36
37	U_CDECL_BEGIN
38
39	// USetAdder implementation
40	// Does not use uset.h to reduce code dependencies
41	static void U_CALLCONV
42	_set_add(USet *set, UChar32 c) {
43	uset_add(set, c);
44	}
45
46	// These functions aren't used.
/*static void U_CALLCONV
_set_addRange(USet *set, UChar32 start, UChar32 end) {
    ((UnicodeSet *)set)->add(start, end);
}

static void U_CALLCONV
_set_addString(USet *set, const UChar *str, int32_t length) {
    ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
}*/
56
57	U_CDECL_END
58
59	/**
60	* Constructs a transliterator with the default delimiters '{' and
61	* '}'.
62	*/
63	NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
64	Transliterator (UNICODE_STRING("Name-Any", `8`), adoptedFilter) {
65
66	UnicodeSet *legalPtr = &legal;
67	// Get the legal character set
68	USetAdder sa = {
69	(USet )legalPtr, // USet == UnicodeSet*
70	_set_add,
71	NULL, // Don't need _set_addRange
72	NULL, // Don't need _set_addString
73	NULL, // Don't need remove()
74	NULL
75	};
76	uprv_getCharNameCharacters(&sa);
77	}
78
79	/**
80	* Destructor.
81	*/
82	NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
83
84	/**
85	* Copy constructor.
86	*/
87	NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
88	Transliterator (o), legal (o.legal) {}
89
90	/**
91	* Assignment operator.
92	*/
/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
                             const NameUnicodeTransliterator& o) {
    Transliterator::operator=(o);
    // not necessary: the legal sets should all be the same -- legal=o.legal;
    return *this;
}*/
99
100	/**
101	* Transliterator API.
102	*/
103	NameUnicodeTransliterator* NameUnicodeTransliterator::clone() const {
104	return new NameUnicodeTransliterator (*this);
105	}
106
107	/**
108	* Implements {@link Transliterator#handleTransliterate}.
109	*/
110	void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
111	UBool isIncremental) const {
112	// The failure mode, here and below, is to behave like Any-Null,
113	// if either there is no name data (max len == 0) or there is no
114	// memory (malloc() => NULL).
115
116	int32_t maxLen = uprv_getMaxCharNameLength();
117	if (maxLen == `0`) {
118	offsets.start = offsets.limit;
119	return;
120	}
121
122	// Accomodate the longest possible name
123	++maxLen; // allow for temporary trailing space
124	char* cbuf = (char*) uprv_malloc(maxLen);
125	if (cbuf == NULL) {
126	offsets.start = offsets.limit;
127	return;
128	}
129
130	UnicodeString openPat(TRUE, OPEN, -`1`);
131	UnicodeString str, name;
132
133	int32_t cursor = offsets.start;
134	int32_t limit = offsets.limit;
135
136	// Modes:
137	// 0 - looking for open delimiter
138	// 1 - after open delimiter
139	int32_t mode = `0`;
140	int32_t openPos = -`1`; // open delim candidate pos
141
142	UChar32 c;
143	while (cursor < limit) {
144	c = text.char32At(cursor);
145
146	switch (mode) {
147	case `0`: // looking for open delimiter
148	if (c == OPEN_DELIM) { // quick check first
149	openPos = cursor;
150	int32_t i =
151	ICU_Utility::parsePattern(openPat, text, cursor, limit);
152	if (i >= `0` && i < limit) {
153	mode = `1`;
154	name.truncate(`0`);
155	cursor = i;
156	continue; // ** reprocess char32At(cursor)*
157	}
158	}
159	break;
160
161	case `1`: // after open delimiter
162	// Look for legal chars. If \s+ is found, convert it
163	// to a single space. If closeDelimiter is found, exit
164	// the loop. If any other character is found, exit the
165	// loop. If the limit is reached, exit the loop.
166
167	// Convert \s+ => SPACE. This assumes there are no
168	// runs of >1 space characters in names.
169	if (PatternProps::isWhiteSpace(c)) {
170	// Ignore leading whitespace
171	if (name.length() > `0` &&
172	name.charAt(name.length()-`1`) != SPACE) {
173	name.append(SPACE);
174	// If we are too long then abort. maxLen includes
175	// temporary trailing space, so use '>'.
176	if (name.length() > maxLen) {
177	mode = `0`;
178	}
179	}
180	break;
181	}
182
183	if (c == CLOSE_DELIM) {
184	int32_t len = name.length();
185
186	// Delete trailing space, if any
187	if (len > `0` &&
188	name.charAt(len-`1`) == SPACE) {
189	--len;
190	}
191
192	if (uprv_isInvariantUString(name.getBuffer(), len)) {
193	name.extract(`0`, len, cbuf, maxLen, US_INV);
194
195	UErrorCode status = U_ZERO_ERROR;
196	c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
197	if (U_SUCCESS(status)) {
198	// Lookup succeeded
199
200	// assert(U16_LENGTH(CLOSE_DELIM) == 1);
201	cursor++; // advance over CLOSE_DELIM
202
203	str.truncate(`0`);
204	str.append(c);
205	text.handleReplaceBetween(openPos, cursor, str);
206
207	// Adjust indices for the change in the length of
208	// the string. Do not assume that str.length() ==
209	// 1, in case of surrogates.
210	int32_t delta = cursor - openPos - str.length();
211	cursor -= delta;
212	limit -= delta;
213	// assert(cursor == openPos + str.length());
214	}
215	}
216	// If the lookup failed, we leave things as-is and
217	// still switch to mode 0 and continue.
218	mode = `0`;
219	openPos = -`1`; // close off candidate
220	continue; // ** reprocess char32At(cursor)*
221	}
222
223	// Check if c is a legal char. We assume here that
224	// legal.contains(OPEN_DELIM) is FALSE, so when we abort a
225	// name, we don't have to go back to openPos+1.
226	if (legal.contains(c)) {
227	name.append(c);
228	// If we go past the longest possible name then abort.
229	// maxLen includes temporary trailing space, so use '>='.
230	if (name.length() >= maxLen) {
231	mode = `0`;
232	}
233	}
234
235	// Invalid character
236	else {
237	--cursor; // Backup and reprocess this character
238	mode = `0`;
239	}
240
241	break;
242	}
243
244	cursor += U16_LENGTH(c);
245	}
246
247	offsets.contextLimit += limit - offsets.limit;
248	offsets.limit = limit;
249	// In incremental mode, only advance the cursor up to the last
250	// open delimiter candidate.
251	offsets.start = (isIncremental && openPos >= `0`) ? openPos : cursor;
252
253	uprv_free(cbuf);
254	}
255
256	U_NAMESPACE_END
257
258	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
259

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/name2uni.cpp