1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2009-2012, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: filterednormalizer2.cpp |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2009dec10 |
16 | * created by: Markus W. Scherer |
17 | */ |
18 | |
19 | #include "unicode/utypes.h" |
20 | |
21 | #if !UCONFIG_NO_NORMALIZATION |
22 | |
23 | #include "unicode/edits.h" |
24 | #include "unicode/normalizer2.h" |
25 | #include "unicode/stringoptions.h" |
26 | #include "unicode/uniset.h" |
27 | #include "unicode/unistr.h" |
28 | #include "unicode/unorm.h" |
29 | #include "cpputils.h" |
30 | |
31 | U_NAMESPACE_BEGIN |
32 | |
33 | FilteredNormalizer2::~FilteredNormalizer2() {} |
34 | |
35 | UnicodeString & |
36 | FilteredNormalizer2::normalize(const UnicodeString &src, |
37 | UnicodeString &dest, |
38 | UErrorCode &errorCode) const { |
39 | uprv_checkCanGetBuffer(src, errorCode); |
40 | if(U_FAILURE(errorCode)) { |
41 | dest.setToBogus(); |
42 | return dest; |
43 | } |
44 | if(&dest==&src) { |
45 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
46 | return dest; |
47 | } |
48 | dest.remove(); |
49 | return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); |
50 | } |
51 | |
52 | // Internal: No argument checking, and appends to dest. |
53 | // Pass as input spanCondition the one that is likely to yield a non-zero |
54 | // span length at the start of src. |
55 | // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, |
56 | // USET_SPAN_SIMPLE should be passed in for the start of src |
57 | // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after |
58 | // an in-filter prefix. |
59 | UnicodeString & |
60 | FilteredNormalizer2::normalize(const UnicodeString &src, |
61 | UnicodeString &dest, |
62 | USetSpanCondition spanCondition, |
63 | UErrorCode &errorCode) const { |
64 | UnicodeString tempDest; // Don't throw away destination buffer between iterations. |
65 | for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { |
66 | int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); |
67 | int32_t spanLength=spanLimit-prevSpanLimit; |
68 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
69 | if(spanLength!=0) { |
70 | dest.append(src, prevSpanLimit, spanLength); |
71 | } |
72 | spanCondition=USET_SPAN_SIMPLE; |
73 | } else { |
74 | if(spanLength!=0) { |
75 | // Not norm2.normalizeSecondAndAppend() because we do not want |
76 | // to modify the non-filter part of dest. |
77 | dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), |
78 | tempDest, errorCode)); |
79 | if(U_FAILURE(errorCode)) { |
80 | break; |
81 | } |
82 | } |
83 | spanCondition=USET_SPAN_NOT_CONTAINED; |
84 | } |
85 | prevSpanLimit=spanLimit; |
86 | } |
87 | return dest; |
88 | } |
89 | |
90 | void |
91 | FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
92 | Edits *edits, UErrorCode &errorCode) const { |
93 | if (U_FAILURE(errorCode)) { |
94 | return; |
95 | } |
96 | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
97 | edits->reset(); |
98 | } |
99 | options |= U_EDITS_NO_RESET; // Do not reset for each span. |
100 | normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode); |
101 | } |
102 | |
103 | void |
104 | FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length, |
105 | ByteSink &sink, Edits *edits, |
106 | USetSpanCondition spanCondition, |
107 | UErrorCode &errorCode) const { |
108 | while (length > 0) { |
109 | int32_t spanLength = set.spanUTF8(src, length, spanCondition); |
110 | if (spanCondition == USET_SPAN_NOT_CONTAINED) { |
111 | if (spanLength != 0) { |
112 | if (edits != nullptr) { |
113 | edits->addUnchanged(spanLength); |
114 | } |
115 | if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { |
116 | sink.Append(src, spanLength); |
117 | } |
118 | } |
119 | spanCondition = USET_SPAN_SIMPLE; |
120 | } else { |
121 | if (spanLength != 0) { |
122 | // Not norm2.normalizeSecondAndAppend() because we do not want |
123 | // to modify the non-filter part of dest. |
124 | norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode); |
125 | if (U_FAILURE(errorCode)) { |
126 | break; |
127 | } |
128 | } |
129 | spanCondition = USET_SPAN_NOT_CONTAINED; |
130 | } |
131 | src += spanLength; |
132 | length -= spanLength; |
133 | } |
134 | } |
135 | |
136 | UnicodeString & |
137 | FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, |
138 | const UnicodeString &second, |
139 | UErrorCode &errorCode) const { |
140 | return normalizeSecondAndAppend(first, second, true, errorCode); |
141 | } |
142 | |
143 | UnicodeString & |
144 | FilteredNormalizer2::append(UnicodeString &first, |
145 | const UnicodeString &second, |
146 | UErrorCode &errorCode) const { |
147 | return normalizeSecondAndAppend(first, second, false, errorCode); |
148 | } |
149 | |
150 | UnicodeString & |
151 | FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, |
152 | const UnicodeString &second, |
153 | UBool doNormalize, |
154 | UErrorCode &errorCode) const { |
155 | uprv_checkCanGetBuffer(first, errorCode); |
156 | uprv_checkCanGetBuffer(second, errorCode); |
157 | if(U_FAILURE(errorCode)) { |
158 | return first; |
159 | } |
160 | if(&first==&second) { |
161 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
162 | return first; |
163 | } |
164 | if(first.isEmpty()) { |
165 | if(doNormalize) { |
166 | return normalize(second, first, errorCode); |
167 | } else { |
168 | return first=second; |
169 | } |
170 | } |
171 | // merge the in-filter suffix of the first string with the in-filter prefix of the second |
172 | int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); |
173 | if(prefixLimit!=0) { |
174 | UnicodeString prefix(second.tempSubString(0, prefixLimit)); |
175 | int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); |
176 | if(suffixStart==0) { |
177 | if(doNormalize) { |
178 | norm2.normalizeSecondAndAppend(first, prefix, errorCode); |
179 | } else { |
180 | norm2.append(first, prefix, errorCode); |
181 | } |
182 | } else { |
183 | UnicodeString middle(first, suffixStart, INT32_MAX); |
184 | if(doNormalize) { |
185 | norm2.normalizeSecondAndAppend(middle, prefix, errorCode); |
186 | } else { |
187 | norm2.append(middle, prefix, errorCode); |
188 | } |
189 | first.replace(suffixStart, INT32_MAX, middle); |
190 | } |
191 | } |
192 | if(prefixLimit<second.length()) { |
193 | UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); |
194 | if(doNormalize) { |
195 | normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); |
196 | } else { |
197 | first.append(rest); |
198 | } |
199 | } |
200 | return first; |
201 | } |
202 | |
203 | UBool |
204 | FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { |
205 | return set.contains(c) && norm2.getDecomposition(c, decomposition); |
206 | } |
207 | |
208 | UBool |
209 | FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { |
210 | return set.contains(c) && norm2.getRawDecomposition(c, decomposition); |
211 | } |
212 | |
213 | UChar32 |
214 | FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { |
215 | return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; |
216 | } |
217 | |
218 | uint8_t |
219 | FilteredNormalizer2::getCombiningClass(UChar32 c) const { |
220 | return set.contains(c) ? norm2.getCombiningClass(c) : 0; |
221 | } |
222 | |
223 | UBool |
224 | FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { |
225 | uprv_checkCanGetBuffer(s, errorCode); |
226 | if(U_FAILURE(errorCode)) { |
227 | return false; |
228 | } |
229 | USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
230 | for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
231 | int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
232 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
233 | spanCondition=USET_SPAN_SIMPLE; |
234 | } else { |
235 | if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || |
236 | U_FAILURE(errorCode) |
237 | ) { |
238 | return false; |
239 | } |
240 | spanCondition=USET_SPAN_NOT_CONTAINED; |
241 | } |
242 | prevSpanLimit=spanLimit; |
243 | } |
244 | return true; |
245 | } |
246 | |
247 | UBool |
248 | FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const { |
249 | if(U_FAILURE(errorCode)) { |
250 | return false; |
251 | } |
252 | const char *s = sp.data(); |
253 | int32_t length = sp.length(); |
254 | USetSpanCondition spanCondition = USET_SPAN_SIMPLE; |
255 | while (length > 0) { |
256 | int32_t spanLength = set.spanUTF8(s, length, spanCondition); |
257 | if (spanCondition == USET_SPAN_NOT_CONTAINED) { |
258 | spanCondition = USET_SPAN_SIMPLE; |
259 | } else { |
260 | if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) || |
261 | U_FAILURE(errorCode)) { |
262 | return false; |
263 | } |
264 | spanCondition = USET_SPAN_NOT_CONTAINED; |
265 | } |
266 | s += spanLength; |
267 | length -= spanLength; |
268 | } |
269 | return true; |
270 | } |
271 | |
272 | UNormalizationCheckResult |
273 | FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { |
274 | uprv_checkCanGetBuffer(s, errorCode); |
275 | if(U_FAILURE(errorCode)) { |
276 | return UNORM_MAYBE; |
277 | } |
278 | UNormalizationCheckResult result=UNORM_YES; |
279 | USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
280 | for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
281 | int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
282 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
283 | spanCondition=USET_SPAN_SIMPLE; |
284 | } else { |
285 | UNormalizationCheckResult qcResult= |
286 | norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); |
287 | if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { |
288 | return qcResult; |
289 | } else if(qcResult==UNORM_MAYBE) { |
290 | result=qcResult; |
291 | } |
292 | spanCondition=USET_SPAN_NOT_CONTAINED; |
293 | } |
294 | prevSpanLimit=spanLimit; |
295 | } |
296 | return result; |
297 | } |
298 | |
299 | int32_t |
300 | FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { |
301 | uprv_checkCanGetBuffer(s, errorCode); |
302 | if(U_FAILURE(errorCode)) { |
303 | return 0; |
304 | } |
305 | USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
306 | for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
307 | int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
308 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
309 | spanCondition=USET_SPAN_SIMPLE; |
310 | } else { |
311 | int32_t yesLimit= |
312 | prevSpanLimit+ |
313 | norm2.spanQuickCheckYes( |
314 | s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); |
315 | if(U_FAILURE(errorCode) || yesLimit<spanLimit) { |
316 | return yesLimit; |
317 | } |
318 | spanCondition=USET_SPAN_NOT_CONTAINED; |
319 | } |
320 | prevSpanLimit=spanLimit; |
321 | } |
322 | return s.length(); |
323 | } |
324 | |
325 | UBool |
326 | FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { |
327 | return !set.contains(c) || norm2.hasBoundaryBefore(c); |
328 | } |
329 | |
330 | UBool |
331 | FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { |
332 | return !set.contains(c) || norm2.hasBoundaryAfter(c); |
333 | } |
334 | |
335 | UBool |
336 | FilteredNormalizer2::isInert(UChar32 c) const { |
337 | return !set.contains(c) || norm2.isInert(c); |
338 | } |
339 | |
340 | U_NAMESPACE_END |
341 | |
342 | // C API ------------------------------------------------------------------- *** |
343 | |
344 | U_NAMESPACE_USE |
345 | |
346 | U_CAPI UNormalizer2 * U_EXPORT2 |
347 | unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { |
348 | if(U_FAILURE(*pErrorCode)) { |
349 | return nullptr; |
350 | } |
351 | if(filterSet==nullptr) { |
352 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
353 | return nullptr; |
354 | } |
355 | Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, |
356 | *UnicodeSet::fromUSet(filterSet)); |
357 | if(fn2==nullptr) { |
358 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
359 | } |
360 | return (UNormalizer2 *)fn2; |
361 | } |
362 | |
363 | #endif // !UCONFIG_NO_NORMALIZATION |
364 | |