unorm.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/unorm.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	* Copyright (c) 1996-2014, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	******************************************************************************
8	* File unorm.cpp
9	*
10	* Created by: Vladimir Weinstein 12052000
11	*
12	* Modification history :
13	*
14	* Date Name Description
15	* 02/01/01 synwee Added normalization quickcheck enum and method.
16	* 02/12/01 synwee Commented out quickcheck util api has been approved
17	* Added private method for doing FCD checks
18	* 02/23/01 synwee Modified quickcheck and checkFCE to run through
19	* string for codepoints < 0x300 for the normalization
20	* mode NFC.
21	* 05/25/01+ Markus Scherer total rewrite, implement all normalization here
22	* instead of just wrappers around normlzr.cpp,
23	* load unorm.dat, support Unicode 3.1 with
24	* supplementary code points, etc.
25	* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
26	*/
27
28	#include "unicode/utypes.h"
29
30	#if !UCONFIG_NO_NORMALIZATION
31
32	#include "unicode/udata.h"
33	#include "unicode/ustring.h"
34	#include "unicode/uiter.h"
35	#include "unicode/unorm.h"
36	#include "unicode/unorm2.h"
37	#include "normalizer2impl.h"
38	#include "unormimp.h"
39	#include "uprops.h"
40	#include "ustr_imp.h"
41
42	U_NAMESPACE_USE
43
/* quick check functions ---------------------------------------------------- */
45
46	U_CAPI UNormalizationCheckResult U_EXPORT2
47	unorm_quickCheck(const UChar *src,
48	int32_t srcLength,
49	UNormalizationMode mode,
50	UErrorCode *pErrorCode) {
51	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
52	return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53	}
54
55	U_CAPI UNormalizationCheckResult U_EXPORT2
56	unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57	UNormalizationMode mode, int32_t options,
58	UErrorCode *pErrorCode) {
59	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
60	if(options&UNORM_UNICODE_3_2) {
61	FilteredNormalizer2 fn2(n2, uniset_getUnicode32Instance(*pErrorCode));
62	return unorm2_quickCheck(
63	reinterpret_cast<const UNormalizer2 >(static_cast<Normalizer2 >(&fn2)),
64	src, srcLength, pErrorCode);
65	} else {
66	return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67	}
68	}
69
70	U_CAPI UBool U_EXPORT2
71	unorm_isNormalized(const UChar *src, int32_t srcLength,
72	UNormalizationMode mode,
73	UErrorCode *pErrorCode) {
74	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
75	return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76	}
77
78	U_CAPI UBool U_EXPORT2
79	unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80	UNormalizationMode mode, int32_t options,
81	UErrorCode *pErrorCode) {
82	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
83	if(options&UNORM_UNICODE_3_2) {
84	FilteredNormalizer2 fn2(n2, uniset_getUnicode32Instance(*pErrorCode));
85	return unorm2_isNormalized(
86	reinterpret_cast<const UNormalizer2 >(static_cast<Normalizer2 >(&fn2)),
87	src, srcLength, pErrorCode);
88	} else {
89	return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90	}
91	}
92
/* normalize() API ---------------------------------------------------------- */
94
95	/* Public API for normalizing. /
96	U_CAPI int32_t U_EXPORT2
97	unorm_normalize(const UChar *src, int32_t srcLength,
98	UNormalizationMode mode, int32_t options,
99	UChar *dest, int32_t destCapacity,
100	UErrorCode *pErrorCode) {
101	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
102	if(options&UNORM_UNICODE_3_2) {
103	FilteredNormalizer2 fn2(n2, uniset_getUnicode32Instance(*pErrorCode));
104	return unorm2_normalize(
105	reinterpret_cast<const UNormalizer2 >(static_cast<Normalizer2 >(&fn2)),
106	src, srcLength, dest, destCapacity, pErrorCode);
107	} else {
108	return unorm2_normalize((const UNormalizer2 *)n2,
109	src, srcLength, dest, destCapacity, pErrorCode);
110	}
111	}
112
113
/* iteration functions ------------------------------------------------------ */
115
116	static int32_t
117	_iterate(UCharIterator *src, UBool forward,
118	UChar *dest, int32_t destCapacity,
119	const Normalizer2 *n2,
120	UBool doNormalize, UBool *pNeededToNormalize,
121	UErrorCode *pErrorCode) {
122	if(U_FAILURE(*pErrorCode)) {
123	return `0`;
124	}
125	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`) \|\| src==NULL) {
126	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
127	return `0`;
128	}
129
130	if(pNeededToNormalize!=NULL) {
131	*pNeededToNormalize=FALSE;
132	}
133	if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
134	return u_terminateUChars(dest, destCapacity, `0`, pErrorCode);
135	}
136
137	UnicodeString buffer;
138	UChar32 c;
139	if(forward) {
140	/ get one character and ignore its properties /
141	buffer.append(uiter_next32(src));
142	/ get all following characters until we see a boundary /
143	while((c=uiter_next32(src))>=`0`) {
144	if(n2->hasBoundaryBefore(c)) {
145	/ back out the latest movement to stop at the boundary /
146	src->move(src, -U16_LENGTH(c), UITER_CURRENT);
147	break;
148	} else {
149	buffer.append(c);
150	}
151	}
152	} else {
153	while((c=uiter_previous32(src))>=`0`) {
154	/ always write this character to the front of the buffer /
155	buffer.insert(`0`, c);
156	/ stop if this just-copied character is a boundary /
157	if(n2->hasBoundaryBefore(c)) {
158	break;
159	}
160	}
161	}
162
163	UnicodeString destString(dest, `0`, destCapacity);
164	if(buffer.length()>`0` && doNormalize) {
165	n2->normalize(buffer, destString, pErrorCode).extract(dest, destCapacity, pErrorCode);
166	if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
167	*pNeededToNormalize= destString !=buffer;
168	}
169	return destString.length();
170	} else {
171	/ just copy the source characters /
172	return buffer.extract(dest, destCapacity, *pErrorCode);
173	}
174	}
175
176	static int32_t
177	unorm_iterate(UCharIterator *src, UBool forward,
178	UChar *dest, int32_t destCapacity,
179	UNormalizationMode mode, int32_t options,
180	UBool doNormalize, UBool *pNeededToNormalize,
181	UErrorCode *pErrorCode) {
182	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
183	if(options&UNORM_UNICODE_3_2) {
184	const UnicodeSet uni32 = uniset_getUnicode32Instance(pErrorCode);
185	if(U_FAILURE(*pErrorCode)) {
186	return `0`;
187	}
188	FilteredNormalizer2 fn2(n2, uni32);
189	return _iterate(src, forward, dest, destCapacity,
190	&fn2, doNormalize, pNeededToNormalize, pErrorCode);
191	}
192	return _iterate(src, forward, dest, destCapacity,
193	n2, doNormalize, pNeededToNormalize, pErrorCode);
194	}
195
196	U_CAPI int32_t U_EXPORT2
197	unorm_previous(UCharIterator *src,
198	UChar *dest, int32_t destCapacity,
199	UNormalizationMode mode, int32_t options,
200	UBool doNormalize, UBool *pNeededToNormalize,
201	UErrorCode *pErrorCode) {
202	return unorm_iterate(src, FALSE,
203	dest, destCapacity,
204	mode, options,
205	doNormalize, pNeededToNormalize,
206	pErrorCode);
207	}
208
209	U_CAPI int32_t U_EXPORT2
210	unorm_next(UCharIterator *src,
211	UChar *dest, int32_t destCapacity,
212	UNormalizationMode mode, int32_t options,
213	UBool doNormalize, UBool *pNeededToNormalize,
214	UErrorCode *pErrorCode) {
215	return unorm_iterate(src, TRUE,
216	dest, destCapacity,
217	mode, options,
218	doNormalize, pNeededToNormalize,
219	pErrorCode);
220	}
221
/* Concatenation of normalized strings -------------------------------------- */
223
224	static int32_t
225	_concatenate(const UChar *left, int32_t leftLength,
226	const UChar *right, int32_t rightLength,
227	UChar *dest, int32_t destCapacity,
228	const Normalizer2 *n2,
229	UErrorCode *pErrorCode) {
230	if(U_FAILURE(*pErrorCode)) {
231	return `0`;
232	}
233	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`) \|\|
234	left==NULL \|\| leftLength<-`1` \|\| right==NULL \|\| rightLength<-`1`) {
235	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
236	return `0`;
237	}
238
239	/ check for overlapping right and destination /
240	if( dest!=NULL &&
241	((right>=dest && right<(dest+destCapacity)) \|\|
242	(rightLength>`0` && dest>=right && dest<(right+rightLength)))
243	) {
244	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
245	return `0`;
246	}
247
248	/ allow left==dest /
249	UnicodeString destString;
250	if(left==dest) {
251	destString.setTo(dest, leftLength, destCapacity);
252	} else {
253	destString.setTo(dest, `0`, destCapacity);
254	destString.append(left, leftLength);
255	}
256	return n2->append(destString, UnicodeString (rightLength<`0`, right, rightLength), *pErrorCode).
257	extract(dest, destCapacity, *pErrorCode);
258	}
259
260	U_CAPI int32_t U_EXPORT2
261	unorm_concatenate(const UChar *left, int32_t leftLength,
262	const UChar *right, int32_t rightLength,
263	UChar *dest, int32_t destCapacity,
264	UNormalizationMode mode, int32_t options,
265	UErrorCode *pErrorCode) {
266	const Normalizer2 n2=Normalizer2Factory::getInstance(mode, pErrorCode);
267	if(options&UNORM_UNICODE_3_2) {
268	const UnicodeSet uni32 = uniset_getUnicode32Instance(pErrorCode);
269	if(U_FAILURE(*pErrorCode)) {
270	return `0`;
271	}
272	FilteredNormalizer2 fn2(n2, uni32);
273	return _concatenate(left, leftLength, right, rightLength,
274	dest, destCapacity, &fn2, pErrorCode);
275	}
276	return _concatenate(left, leftLength, right, rightLength,
277	dest, destCapacity, n2, pErrorCode);
278	}
279
280	#endif /* #if !UCONFIG_NO_NORMALIZATION */
281

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/unorm.cpp