1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2012, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: filterednormalizer2.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009dec10
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_NORMALIZATION
22
23#include "unicode/edits.h"
24#include "unicode/normalizer2.h"
25#include "unicode/stringoptions.h"
26#include "unicode/uniset.h"
27#include "unicode/unistr.h"
28#include "unicode/unorm.h"
29#include "cpputils.h"
30
31U_NAMESPACE_BEGIN
32
33FilteredNormalizer2::~FilteredNormalizer2() {}
34
35UnicodeString &
36FilteredNormalizer2::normalize(const UnicodeString &src,
37 UnicodeString &dest,
38 UErrorCode &errorCode) const {
39 uprv_checkCanGetBuffer(src, errorCode);
40 if(U_FAILURE(errorCode)) {
41 dest.setToBogus();
42 return dest;
43 }
44 if(&dest==&src) {
45 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
46 return dest;
47 }
48 dest.remove();
49 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
50}
51
52// Internal: No argument checking, and appends to dest.
53// Pass as input spanCondition the one that is likely to yield a non-zero
54// span length at the start of src.
55// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
56// USET_SPAN_SIMPLE should be passed in for the start of src
57// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
58// an in-filter prefix.
59UnicodeString &
60FilteredNormalizer2::normalize(const UnicodeString &src,
61 UnicodeString &dest,
62 USetSpanCondition spanCondition,
63 UErrorCode &errorCode) const {
64 UnicodeString tempDest; // Don't throw away destination buffer between iterations.
65 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
66 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
67 int32_t spanLength=spanLimit-prevSpanLimit;
68 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
69 if(spanLength!=0) {
70 dest.append(src, prevSpanLimit, spanLength);
71 }
72 spanCondition=USET_SPAN_SIMPLE;
73 } else {
74 if(spanLength!=0) {
75 // Not norm2.normalizeSecondAndAppend() because we do not want
76 // to modify the non-filter part of dest.
77 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
78 tempDest, errorCode));
79 if(U_FAILURE(errorCode)) {
80 break;
81 }
82 }
83 spanCondition=USET_SPAN_NOT_CONTAINED;
84 }
85 prevSpanLimit=spanLimit;
86 }
87 return dest;
88}
89
90void
91FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
92 Edits *edits, UErrorCode &errorCode) const {
93 if (U_FAILURE(errorCode)) {
94 return;
95 }
96 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
97 edits->reset();
98 }
99 options |= U_EDITS_NO_RESET; // Do not reset for each span.
100 normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
101}
102
103void
104FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
105 ByteSink &sink, Edits *edits,
106 USetSpanCondition spanCondition,
107 UErrorCode &errorCode) const {
108 while (length > 0) {
109 int32_t spanLength = set.spanUTF8(src, length, spanCondition);
110 if (spanCondition == USET_SPAN_NOT_CONTAINED) {
111 if (spanLength != 0) {
112 if (edits != nullptr) {
113 edits->addUnchanged(spanLength);
114 }
115 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
116 sink.Append(src, spanLength);
117 }
118 }
119 spanCondition = USET_SPAN_SIMPLE;
120 } else {
121 if (spanLength != 0) {
122 // Not norm2.normalizeSecondAndAppend() because we do not want
123 // to modify the non-filter part of dest.
124 norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
125 if (U_FAILURE(errorCode)) {
126 break;
127 }
128 }
129 spanCondition = USET_SPAN_NOT_CONTAINED;
130 }
131 src += spanLength;
132 length -= spanLength;
133 }
134}
135
136UnicodeString &
137FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
138 const UnicodeString &second,
139 UErrorCode &errorCode) const {
140 return normalizeSecondAndAppend(first, second, true, errorCode);
141}
142
143UnicodeString &
144FilteredNormalizer2::append(UnicodeString &first,
145 const UnicodeString &second,
146 UErrorCode &errorCode) const {
147 return normalizeSecondAndAppend(first, second, false, errorCode);
148}
149
150UnicodeString &
151FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
152 const UnicodeString &second,
153 UBool doNormalize,
154 UErrorCode &errorCode) const {
155 uprv_checkCanGetBuffer(first, errorCode);
156 uprv_checkCanGetBuffer(second, errorCode);
157 if(U_FAILURE(errorCode)) {
158 return first;
159 }
160 if(&first==&second) {
161 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
162 return first;
163 }
164 if(first.isEmpty()) {
165 if(doNormalize) {
166 return normalize(second, first, errorCode);
167 } else {
168 return first=second;
169 }
170 }
171 // merge the in-filter suffix of the first string with the in-filter prefix of the second
172 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
173 if(prefixLimit!=0) {
174 UnicodeString prefix(second.tempSubString(0, prefixLimit));
175 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
176 if(suffixStart==0) {
177 if(doNormalize) {
178 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
179 } else {
180 norm2.append(first, prefix, errorCode);
181 }
182 } else {
183 UnicodeString middle(first, suffixStart, INT32_MAX);
184 if(doNormalize) {
185 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
186 } else {
187 norm2.append(middle, prefix, errorCode);
188 }
189 first.replace(suffixStart, INT32_MAX, middle);
190 }
191 }
192 if(prefixLimit<second.length()) {
193 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
194 if(doNormalize) {
195 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
196 } else {
197 first.append(rest);
198 }
199 }
200 return first;
201}
202
203UBool
204FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
205 return set.contains(c) && norm2.getDecomposition(c, decomposition);
206}
207
208UBool
209FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
210 return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
211}
212
213UChar32
214FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
215 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
216}
217
218uint8_t
219FilteredNormalizer2::getCombiningClass(UChar32 c) const {
220 return set.contains(c) ? norm2.getCombiningClass(c) : 0;
221}
222
223UBool
224FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
225 uprv_checkCanGetBuffer(s, errorCode);
226 if(U_FAILURE(errorCode)) {
227 return false;
228 }
229 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
230 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
231 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
232 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
233 spanCondition=USET_SPAN_SIMPLE;
234 } else {
235 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
236 U_FAILURE(errorCode)
237 ) {
238 return false;
239 }
240 spanCondition=USET_SPAN_NOT_CONTAINED;
241 }
242 prevSpanLimit=spanLimit;
243 }
244 return true;
245}
246
247UBool
248FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
249 if(U_FAILURE(errorCode)) {
250 return false;
251 }
252 const char *s = sp.data();
253 int32_t length = sp.length();
254 USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
255 while (length > 0) {
256 int32_t spanLength = set.spanUTF8(s, length, spanCondition);
257 if (spanCondition == USET_SPAN_NOT_CONTAINED) {
258 spanCondition = USET_SPAN_SIMPLE;
259 } else {
260 if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||
261 U_FAILURE(errorCode)) {
262 return false;
263 }
264 spanCondition = USET_SPAN_NOT_CONTAINED;
265 }
266 s += spanLength;
267 length -= spanLength;
268 }
269 return true;
270}
271
272UNormalizationCheckResult
273FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
274 uprv_checkCanGetBuffer(s, errorCode);
275 if(U_FAILURE(errorCode)) {
276 return UNORM_MAYBE;
277 }
278 UNormalizationCheckResult result=UNORM_YES;
279 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
280 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
281 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
282 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
283 spanCondition=USET_SPAN_SIMPLE;
284 } else {
285 UNormalizationCheckResult qcResult=
286 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
287 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
288 return qcResult;
289 } else if(qcResult==UNORM_MAYBE) {
290 result=qcResult;
291 }
292 spanCondition=USET_SPAN_NOT_CONTAINED;
293 }
294 prevSpanLimit=spanLimit;
295 }
296 return result;
297}
298
299int32_t
300FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
301 uprv_checkCanGetBuffer(s, errorCode);
302 if(U_FAILURE(errorCode)) {
303 return 0;
304 }
305 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
306 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
307 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
308 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
309 spanCondition=USET_SPAN_SIMPLE;
310 } else {
311 int32_t yesLimit=
312 prevSpanLimit+
313 norm2.spanQuickCheckYes(
314 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
315 if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
316 return yesLimit;
317 }
318 spanCondition=USET_SPAN_NOT_CONTAINED;
319 }
320 prevSpanLimit=spanLimit;
321 }
322 return s.length();
323}
324
325UBool
326FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
327 return !set.contains(c) || norm2.hasBoundaryBefore(c);
328}
329
330UBool
331FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
332 return !set.contains(c) || norm2.hasBoundaryAfter(c);
333}
334
335UBool
336FilteredNormalizer2::isInert(UChar32 c) const {
337 return !set.contains(c) || norm2.isInert(c);
338}
339
340U_NAMESPACE_END
341
342// C API ------------------------------------------------------------------- ***
343
344U_NAMESPACE_USE
345
346U_CAPI UNormalizer2 * U_EXPORT2
347unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
348 if(U_FAILURE(*pErrorCode)) {
349 return nullptr;
350 }
351 if(filterSet==nullptr) {
352 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
353 return nullptr;
354 }
355 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
356 *UnicodeSet::fromUSet(filterSet));
357 if(fn2==nullptr) {
358 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
359 }
360 return (UNormalizer2 *)fn2;
361}
362
363#endif // !UCONFIG_NO_NORMALIZATION
364