filterednormalizer2.cpp source code [Godot/thirdparty/icu4c/common/filterednormalizer2.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2009-2012, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: filterednormalizer2.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2009dec10
16	* created by: Markus W. Scherer
17	*/
18
19	#include "unicode/utypes.h"
20
21	#if !UCONFIG_NO_NORMALIZATION
22
23	#include "unicode/edits.h"
24	#include "unicode/normalizer2.h"
25	#include "unicode/stringoptions.h"
26	#include "unicode/uniset.h"
27	#include "unicode/unistr.h"
28	#include "unicode/unorm.h"
29	#include "cpputils.h"
30
31	U_NAMESPACE_BEGIN
32
33	FilteredNormalizer2::~FilteredNormalizer2() {}
34
35	UnicodeString &
36	FilteredNormalizer2::normalize(const UnicodeString &src,
37	UnicodeString &dest,
38	UErrorCode &errorCode) const {
39	uprv_checkCanGetBuffer(src, errorCode);
40	if(U_FAILURE(errorCode)) {
41	dest.setToBogus();
42	return dest;
43	}
44	if(&dest==&src) {
45	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
46	return dest;
47	}
48	dest.remove();
49	return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
50	}
51
52	// Internal: No argument checking, and appends to dest.
53	// Pass as input spanCondition the one that is likely to yield a non-zero
54	// span length at the start of src.
55	// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
56	// USET_SPAN_SIMPLE should be passed in for the start of src
57	// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
58	// an in-filter prefix.
59	UnicodeString &
60	FilteredNormalizer2::normalize(const UnicodeString &src,
61	UnicodeString &dest,
62	USetSpanCondition spanCondition,
63	UErrorCode &errorCode) const {
64	UnicodeString tempDest; // Don't throw away destination buffer between iterations.
65	for(int32_t prevSpanLimit=`0`; prevSpanLimit<src.length();) {
66	int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
67	int32_t spanLength=spanLimit-prevSpanLimit;
68	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
69	if(spanLength!=`0`) {
70	dest.append(src, prevSpanLimit, spanLength);
71	}
72	spanCondition=USET_SPAN_SIMPLE;
73	} else {
74	if(spanLength!=`0`) {
75	// Not norm2.normalizeSecondAndAppend() because we do not want
76	// to modify the non-filter part of dest.
77	dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
78	tempDest, errorCode));
79	if(U_FAILURE(errorCode)) {
80	break;
81	}
82	}
83	spanCondition=USET_SPAN_NOT_CONTAINED;
84	}
85	prevSpanLimit=spanLimit;
86	}
87	return dest;
88	}
89
90	void
91	FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
92	Edits edits, UErrorCode &errorCode) const* {
93	if (U_FAILURE(errorCode)) {
94	return;
95	}
96	if (edits != nullptr && (options & U_EDITS_NO_RESET) == `0`) {
97	edits->reset();
98	}
99	options \|= U_EDITS_NO_RESET; // Do not reset for each span.
100	normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
101	}
102
103	void
104	FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
105	ByteSink &sink, Edits *edits,
106	USetSpanCondition spanCondition,
107	UErrorCode &errorCode) const {
108	while (length > `0`) {
109	int32_t spanLength = set.spanUTF8(src, length, spanCondition);
110	if (spanCondition == USET_SPAN_NOT_CONTAINED) {
111	if (spanLength != `0`) {
112	if (edits != nullptr) {
113	edits->addUnchanged(spanLength);
114	}
115	if ((options & U_OMIT_UNCHANGED_TEXT) == `0`) {
116	sink.Append(src, spanLength);
117	}
118	}
119	spanCondition = USET_SPAN_SIMPLE;
120	} else {
121	if (spanLength != `0`) {
122	// Not norm2.normalizeSecondAndAppend() because we do not want
123	// to modify the non-filter part of dest.
124	norm2.normalizeUTF8(options, StringPiece (src, spanLength), sink, edits, errorCode);
125	if (U_FAILURE(errorCode)) {
126	break;
127	}
128	}
129	spanCondition = USET_SPAN_NOT_CONTAINED;
130	}
131	src += spanLength;
132	length -= spanLength;
133	}
134	}
135
136	UnicodeString &
137	FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
138	const UnicodeString &second,
139	UErrorCode &errorCode) const {
140	return normalizeSecondAndAppend(first, second, true, errorCode);
141	}
142
143	UnicodeString &
144	FilteredNormalizer2::append(UnicodeString &first,
145	const UnicodeString &second,
146	UErrorCode &errorCode) const {
147	return normalizeSecondAndAppend(first, second, false, errorCode);
148	}
149
150	UnicodeString &
151	FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
152	const UnicodeString &second,
153	UBool doNormalize,
154	UErrorCode &errorCode) const {
155	uprv_checkCanGetBuffer(first, errorCode);
156	uprv_checkCanGetBuffer(second, errorCode);
157	if(U_FAILURE(errorCode)) {
158	return first;
159	}
160	if(&first==&second) {
161	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
162	return first;
163	}
164	if(first.isEmpty()) {
165	if(doNormalize) {
166	return normalize(second, first, errorCode);
167	} else {
168	return first =second;
169	}
170	}
171	// merge the in-filter suffix of the first string with the in-filter prefix of the second
172	int32_t prefixLimit=set.span(second, `0`, USET_SPAN_SIMPLE);
173	if(prefixLimit!=`0`) {
174	UnicodeString prefix(second.tempSubString(`0`, prefixLimit));
175	int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
176	if(suffixStart==`0`) {
177	if(doNormalize) {
178	norm2.normalizeSecondAndAppend(first, prefix, errorCode);
179	} else {
180	norm2.append(first, prefix, errorCode);
181	}
182	} else {
183	UnicodeString middle(first, suffixStart, INT32_MAX);
184	if(doNormalize) {
185	norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
186	} else {
187	norm2.append(middle, prefix, errorCode);
188	}
189	first.replace(suffixStart, INT32_MAX, middle);
190	}
191	}
192	if(prefixLimit<second.length()) {
193	UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
194	if(doNormalize) {
195	normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
196	} else {
197	first.append(rest);
198	}
199	}
200	return first;
201	}
202
203	UBool
204	FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
205	return set.contains(c) && norm2.getDecomposition(c, decomposition);
206	}
207
208	UBool
209	FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
210	return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
211	}
212
213	UChar32
214	FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
215	return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
216	}
217
218	uint8_t
219	FilteredNormalizer2::getCombiningClass(UChar32 c) const {
220	return set.contains(c) ? norm2.getCombiningClass(c) : `0`;
221	}
222
223	UBool
224	FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
225	uprv_checkCanGetBuffer(s, errorCode);
226	if(U_FAILURE(errorCode)) {
227	return false;
228	}
229	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
230	for(int32_t prevSpanLimit=`0`; prevSpanLimit<s.length();) {
231	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
232	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
233	spanCondition=USET_SPAN_SIMPLE;
234	} else {
235	if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) \|\|
236	U_FAILURE(errorCode)
237	) {
238	return false;
239	}
240	spanCondition=USET_SPAN_NOT_CONTAINED;
241	}
242	prevSpanLimit=spanLimit;
243	}
244	return true;
245	}
246
247	UBool
248	FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
249	if(U_FAILURE(errorCode)) {
250	return false;
251	}
252	const char *s = sp.data();
253	int32_t length = sp.length();
254	USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
255	while (length > `0`) {
256	int32_t spanLength = set.spanUTF8(s, length, spanCondition);
257	if (spanCondition == USET_SPAN_NOT_CONTAINED) {
258	spanCondition = USET_SPAN_SIMPLE;
259	} else {
260	if (!norm2.isNormalizedUTF8(StringPiece (s, spanLength), errorCode) \|\|
261	U_FAILURE(errorCode)) {
262	return false;
263	}
264	spanCondition = USET_SPAN_NOT_CONTAINED;
265	}
266	s += spanLength;
267	length -= spanLength;
268	}
269	return true;
270	}
271
272	UNormalizationCheckResult
273	FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
274	uprv_checkCanGetBuffer(s, errorCode);
275	if(U_FAILURE(errorCode)) {
276	return UNORM_MAYBE;
277	}
278	UNormalizationCheckResult result=UNORM_YES;
279	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
280	for(int32_t prevSpanLimit=`0`; prevSpanLimit<s.length();) {
281	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
282	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
283	spanCondition=USET_SPAN_SIMPLE;
284	} else {
285	UNormalizationCheckResult qcResult=
286	norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
287	if(U_FAILURE(errorCode) \|\| qcResult==UNORM_NO) {
288	return qcResult;
289	} else if(qcResult==UNORM_MAYBE) {
290	result=qcResult;
291	}
292	spanCondition=USET_SPAN_NOT_CONTAINED;
293	}
294	prevSpanLimit=spanLimit;
295	}
296	return result;
297	}
298
299	int32_t
300	FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
301	uprv_checkCanGetBuffer(s, errorCode);
302	if(U_FAILURE(errorCode)) {
303	return `0`;
304	}
305	USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
306	for(int32_t prevSpanLimit=`0`; prevSpanLimit<s.length();) {
307	int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
308	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
309	spanCondition=USET_SPAN_SIMPLE;
310	} else {
311	int32_t yesLimit=
312	prevSpanLimit+
313	norm2.spanQuickCheckYes(
314	s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
315	if(U_FAILURE(errorCode) \|\| yesLimit<spanLimit) {
316	return yesLimit;
317	}
318	spanCondition=USET_SPAN_NOT_CONTAINED;
319	}
320	prevSpanLimit=spanLimit;
321	}
322	return s.length();
323	}
324
325	UBool
326	FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
327	return !set.contains(c) \|\| norm2.hasBoundaryBefore(c);
328	}
329
330	UBool
331	FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
332	return !set.contains(c) \|\| norm2.hasBoundaryAfter(c);
333	}
334
335	UBool
336	FilteredNormalizer2::isInert(UChar32 c) const {
337	return !set.contains(c) \|\| norm2.isInert(c);
338	}
339
340	U_NAMESPACE_END
341
342	// C API ------------------------------------------------------------------- ***
343
344	U_NAMESPACE_USE
345
346	U_CAPI UNormalizer2 * U_EXPORT2
347	unorm2_openFiltered(const UNormalizer2 norm2, const* USet filterSet, UErrorCode pErrorCode) {
348	if(U_FAILURE(*pErrorCode)) {
349	return nullptr;
350	}
351	if(filterSet==nullptr) {
352	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
353	return nullptr;
354	}
355	Normalizer2 fn2=new* FilteredNormalizer2 ((Normalizer2 )norm2,
356	*UnicodeSet::fromUSet(filterSet));
357	if(fn2==nullptr) {
358	*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
359	}
360	return (UNormalizer2 *)fn2;
361	}
362
363	#endif // !UCONFIG_NO_NORMALIZATION
364

Browse the source code of Godot/thirdparty/icu4c/common/filterednormalizer2.cpp