1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ****************************************************************************** |
5 | * Copyright (c) 1996-2014, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ****************************************************************************** |
8 | * File unorm.cpp |
9 | * |
10 | * Created by: Vladimir Weinstein 12052000 |
11 | * |
12 | * Modification history : |
13 | * |
14 | * Date Name Description |
15 | * 02/01/01 synwee Added normalization quickcheck enum and method. |
16 | * 02/12/01 synwee Commented out quickcheck util api has been approved |
17 | * Added private method for doing FCD checks |
18 | * 02/23/01 synwee Modified quickcheck and checkFCE to run through |
19 | * string for codepoints < 0x300 for the normalization |
20 | * mode NFC. |
21 | * 05/25/01+ Markus Scherer total rewrite, implement all normalization here |
22 | * instead of just wrappers around normlzr.cpp, |
23 | * load unorm.dat, support Unicode 3.1 with |
24 | * supplementary code points, etc. |
25 | * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code |
26 | */ |
27 | |
28 | #include "unicode/utypes.h" |
29 | |
30 | #if !UCONFIG_NO_NORMALIZATION |
31 | |
32 | #include "unicode/udata.h" |
33 | #include "unicode/ustring.h" |
34 | #include "unicode/uiter.h" |
35 | #include "unicode/unorm.h" |
36 | #include "unicode/unorm2.h" |
37 | #include "normalizer2impl.h" |
38 | #include "unormimp.h" |
39 | #include "uprops.h" |
40 | #include "ustr_imp.h" |
41 | |
42 | U_NAMESPACE_USE |
43 | |
44 | /* quick check functions ---------------------------------------------------- */ |
45 | |
46 | U_CAPI UNormalizationCheckResult U_EXPORT2 |
47 | unorm_quickCheck(const char16_t *src, |
48 | int32_t srcLength, |
49 | UNormalizationMode mode, |
50 | UErrorCode *pErrorCode) { |
51 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
52 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
53 | } |
54 | |
55 | U_CAPI UNormalizationCheckResult U_EXPORT2 |
56 | unorm_quickCheckWithOptions(const char16_t *src, int32_t srcLength, |
57 | UNormalizationMode mode, int32_t options, |
58 | UErrorCode *pErrorCode) { |
59 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
60 | if(options&UNORM_UNICODE_3_2) { |
61 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
62 | return unorm2_quickCheck( |
63 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
64 | src, srcLength, pErrorCode); |
65 | } else { |
66 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
67 | } |
68 | } |
69 | |
70 | U_CAPI UBool U_EXPORT2 |
71 | unorm_isNormalized(const char16_t *src, int32_t srcLength, |
72 | UNormalizationMode mode, |
73 | UErrorCode *pErrorCode) { |
74 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
75 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
76 | } |
77 | |
78 | U_CAPI UBool U_EXPORT2 |
79 | unorm_isNormalizedWithOptions(const char16_t *src, int32_t srcLength, |
80 | UNormalizationMode mode, int32_t options, |
81 | UErrorCode *pErrorCode) { |
82 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
83 | if(options&UNORM_UNICODE_3_2) { |
84 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
85 | return unorm2_isNormalized( |
86 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
87 | src, srcLength, pErrorCode); |
88 | } else { |
89 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
90 | } |
91 | } |
92 | |
93 | /* normalize() API ---------------------------------------------------------- */ |
94 | |
95 | /** Public API for normalizing. */ |
96 | U_CAPI int32_t U_EXPORT2 |
97 | unorm_normalize(const char16_t *src, int32_t srcLength, |
98 | UNormalizationMode mode, int32_t options, |
99 | char16_t *dest, int32_t destCapacity, |
100 | UErrorCode *pErrorCode) { |
101 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
102 | if(options&UNORM_UNICODE_3_2) { |
103 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
104 | return unorm2_normalize( |
105 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
106 | src, srcLength, dest, destCapacity, pErrorCode); |
107 | } else { |
108 | return unorm2_normalize((const UNormalizer2 *)n2, |
109 | src, srcLength, dest, destCapacity, pErrorCode); |
110 | } |
111 | } |
112 | |
113 | |
114 | /* iteration functions ------------------------------------------------------ */ |
115 | |
116 | static int32_t |
117 | _iterate(UCharIterator *src, UBool forward, |
118 | char16_t *dest, int32_t destCapacity, |
119 | const Normalizer2 *n2, |
120 | UBool doNormalize, UBool *pNeededToNormalize, |
121 | UErrorCode *pErrorCode) { |
122 | if(U_FAILURE(*pErrorCode)) { |
123 | return 0; |
124 | } |
125 | if(destCapacity<0 || (dest==nullptr && destCapacity>0) || src==nullptr) { |
126 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
127 | return 0; |
128 | } |
129 | |
130 | if(pNeededToNormalize!=nullptr) { |
131 | *pNeededToNormalize=false; |
132 | } |
133 | if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { |
134 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); |
135 | } |
136 | |
137 | UnicodeString buffer; |
138 | UChar32 c; |
139 | if(forward) { |
140 | /* get one character and ignore its properties */ |
141 | buffer.append(uiter_next32(src)); |
142 | /* get all following characters until we see a boundary */ |
143 | while((c=uiter_next32(src))>=0) { |
144 | if(n2->hasBoundaryBefore(c)) { |
145 | /* back out the latest movement to stop at the boundary */ |
146 | src->move(src, -U16_LENGTH(c), UITER_CURRENT); |
147 | break; |
148 | } else { |
149 | buffer.append(c); |
150 | } |
151 | } |
152 | } else { |
153 | while((c=uiter_previous32(src))>=0) { |
154 | /* always write this character to the front of the buffer */ |
155 | buffer.insert(0, c); |
156 | /* stop if this just-copied character is a boundary */ |
157 | if(n2->hasBoundaryBefore(c)) { |
158 | break; |
159 | } |
160 | } |
161 | } |
162 | |
163 | UnicodeString destString(dest, 0, destCapacity); |
164 | if(buffer.length()>0 && doNormalize) { |
165 | n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); |
166 | if(pNeededToNormalize!=nullptr && U_SUCCESS(*pErrorCode)) { |
167 | *pNeededToNormalize= destString!=buffer; |
168 | } |
169 | return destString.length(); |
170 | } else { |
171 | /* just copy the source characters */ |
172 | return buffer.extract(dest, destCapacity, *pErrorCode); |
173 | } |
174 | } |
175 | |
176 | static int32_t |
177 | unorm_iterate(UCharIterator *src, UBool forward, |
178 | char16_t *dest, int32_t destCapacity, |
179 | UNormalizationMode mode, int32_t options, |
180 | UBool doNormalize, UBool *pNeededToNormalize, |
181 | UErrorCode *pErrorCode) { |
182 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
183 | if(options&UNORM_UNICODE_3_2) { |
184 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); |
185 | if(U_FAILURE(*pErrorCode)) { |
186 | return 0; |
187 | } |
188 | FilteredNormalizer2 fn2(*n2, *uni32); |
189 | return _iterate(src, forward, dest, destCapacity, |
190 | &fn2, doNormalize, pNeededToNormalize, pErrorCode); |
191 | } |
192 | return _iterate(src, forward, dest, destCapacity, |
193 | n2, doNormalize, pNeededToNormalize, pErrorCode); |
194 | } |
195 | |
196 | U_CAPI int32_t U_EXPORT2 |
197 | unorm_previous(UCharIterator *src, |
198 | char16_t *dest, int32_t destCapacity, |
199 | UNormalizationMode mode, int32_t options, |
200 | UBool doNormalize, UBool *pNeededToNormalize, |
201 | UErrorCode *pErrorCode) { |
202 | return unorm_iterate(src, false, |
203 | dest, destCapacity, |
204 | mode, options, |
205 | doNormalize, pNeededToNormalize, |
206 | pErrorCode); |
207 | } |
208 | |
209 | U_CAPI int32_t U_EXPORT2 |
210 | unorm_next(UCharIterator *src, |
211 | char16_t *dest, int32_t destCapacity, |
212 | UNormalizationMode mode, int32_t options, |
213 | UBool doNormalize, UBool *pNeededToNormalize, |
214 | UErrorCode *pErrorCode) { |
215 | return unorm_iterate(src, true, |
216 | dest, destCapacity, |
217 | mode, options, |
218 | doNormalize, pNeededToNormalize, |
219 | pErrorCode); |
220 | } |
221 | |
222 | /* Concatenation of normalized strings -------------------------------------- */ |
223 | |
224 | static int32_t |
225 | _concatenate(const char16_t *left, int32_t leftLength, |
226 | const char16_t *right, int32_t rightLength, |
227 | char16_t *dest, int32_t destCapacity, |
228 | const Normalizer2 *n2, |
229 | UErrorCode *pErrorCode) { |
230 | if(U_FAILURE(*pErrorCode)) { |
231 | return 0; |
232 | } |
233 | if(destCapacity<0 || (dest==nullptr && destCapacity>0) || |
234 | left==nullptr || leftLength<-1 || right==nullptr || rightLength<-1) { |
235 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
236 | return 0; |
237 | } |
238 | |
239 | /* check for overlapping right and destination */ |
240 | if( dest!=nullptr && |
241 | ((right>=dest && right<(dest+destCapacity)) || |
242 | (rightLength>0 && dest>=right && dest<(right+rightLength))) |
243 | ) { |
244 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
245 | return 0; |
246 | } |
247 | |
248 | /* allow left==dest */ |
249 | UnicodeString destString; |
250 | if(left==dest) { |
251 | destString.setTo(dest, leftLength, destCapacity); |
252 | } else { |
253 | destString.setTo(dest, 0, destCapacity); |
254 | destString.append(left, leftLength); |
255 | } |
256 | return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). |
257 | extract(dest, destCapacity, *pErrorCode); |
258 | } |
259 | |
260 | U_CAPI int32_t U_EXPORT2 |
261 | unorm_concatenate(const char16_t *left, int32_t leftLength, |
262 | const char16_t *right, int32_t rightLength, |
263 | char16_t *dest, int32_t destCapacity, |
264 | UNormalizationMode mode, int32_t options, |
265 | UErrorCode *pErrorCode) { |
266 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
267 | if(options&UNORM_UNICODE_3_2) { |
268 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); |
269 | if(U_FAILURE(*pErrorCode)) { |
270 | return 0; |
271 | } |
272 | FilteredNormalizer2 fn2(*n2, *uni32); |
273 | return _concatenate(left, leftLength, right, rightLength, |
274 | dest, destCapacity, &fn2, pErrorCode); |
275 | } |
276 | return _concatenate(left, leftLength, right, rightLength, |
277 | dest, destCapacity, n2, pErrorCode); |
278 | } |
279 | |
280 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |
281 | |