1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (c) 1996-2014, International Business Machines
6* Corporation and others. All Rights Reserved.
7******************************************************************************
8* File unorm.cpp
9*
10* Created by: Vladimir Weinstein 12052000
11*
12* Modification history :
13*
14* Date Name Description
15* 02/01/01 synwee Added normalization quickcheck enum and method.
* 02/12/01 synwee Commented out quickcheck until the API has been approved
17* Added private method for doing FCD checks
18* 02/23/01 synwee Modified quickcheck and checkFCE to run through
19* string for codepoints < 0x300 for the normalization
20* mode NFC.
21* 05/25/01+ Markus Scherer total rewrite, implement all normalization here
22* instead of just wrappers around normlzr.cpp,
23* load unorm.dat, support Unicode 3.1 with
24* supplementary code points, etc.
25* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
26*/
27
28#include "unicode/utypes.h"
29
30#if !UCONFIG_NO_NORMALIZATION
31
32#include "unicode/udata.h"
33#include "unicode/ustring.h"
34#include "unicode/uiter.h"
35#include "unicode/unorm.h"
36#include "unicode/unorm2.h"
37#include "normalizer2impl.h"
38#include "unormimp.h"
39#include "uprops.h"
40#include "ustr_imp.h"
41
42U_NAMESPACE_USE
43
44/* quick check functions ---------------------------------------------------- */
45
46U_CAPI UNormalizationCheckResult U_EXPORT2
47unorm_quickCheck(const UChar *src,
48 int32_t srcLength,
49 UNormalizationMode mode,
50 UErrorCode *pErrorCode) {
51 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53}
54
55U_CAPI UNormalizationCheckResult U_EXPORT2
56unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57 UNormalizationMode mode, int32_t options,
58 UErrorCode *pErrorCode) {
59 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60 if(options&UNORM_UNICODE_3_2) {
61 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62 return unorm2_quickCheck(
63 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64 src, srcLength, pErrorCode);
65 } else {
66 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67 }
68}
69
70U_CAPI UBool U_EXPORT2
71unorm_isNormalized(const UChar *src, int32_t srcLength,
72 UNormalizationMode mode,
73 UErrorCode *pErrorCode) {
74 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76}
77
78U_CAPI UBool U_EXPORT2
79unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80 UNormalizationMode mode, int32_t options,
81 UErrorCode *pErrorCode) {
82 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83 if(options&UNORM_UNICODE_3_2) {
84 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85 return unorm2_isNormalized(
86 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87 src, srcLength, pErrorCode);
88 } else {
89 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90 }
91}
92
93/* normalize() API ---------------------------------------------------------- */
94
95/** Public API for normalizing. */
96U_CAPI int32_t U_EXPORT2
97unorm_normalize(const UChar *src, int32_t srcLength,
98 UNormalizationMode mode, int32_t options,
99 UChar *dest, int32_t destCapacity,
100 UErrorCode *pErrorCode) {
101 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102 if(options&UNORM_UNICODE_3_2) {
103 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104 return unorm2_normalize(
105 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106 src, srcLength, dest, destCapacity, pErrorCode);
107 } else {
108 return unorm2_normalize((const UNormalizer2 *)n2,
109 src, srcLength, dest, destCapacity, pErrorCode);
110 }
111}
112
113
114/* iteration functions ------------------------------------------------------ */
115
116static int32_t
117_iterate(UCharIterator *src, UBool forward,
118 UChar *dest, int32_t destCapacity,
119 const Normalizer2 *n2,
120 UBool doNormalize, UBool *pNeededToNormalize,
121 UErrorCode *pErrorCode) {
122 if(U_FAILURE(*pErrorCode)) {
123 return 0;
124 }
125 if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) {
126 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
127 return 0;
128 }
129
130 if(pNeededToNormalize!=NULL) {
131 *pNeededToNormalize=FALSE;
132 }
133 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
134 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
135 }
136
137 UnicodeString buffer;
138 UChar32 c;
139 if(forward) {
140 /* get one character and ignore its properties */
141 buffer.append(uiter_next32(src));
142 /* get all following characters until we see a boundary */
143 while((c=uiter_next32(src))>=0) {
144 if(n2->hasBoundaryBefore(c)) {
145 /* back out the latest movement to stop at the boundary */
146 src->move(src, -U16_LENGTH(c), UITER_CURRENT);
147 break;
148 } else {
149 buffer.append(c);
150 }
151 }
152 } else {
153 while((c=uiter_previous32(src))>=0) {
154 /* always write this character to the front of the buffer */
155 buffer.insert(0, c);
156 /* stop if this just-copied character is a boundary */
157 if(n2->hasBoundaryBefore(c)) {
158 break;
159 }
160 }
161 }
162
163 UnicodeString destString(dest, 0, destCapacity);
164 if(buffer.length()>0 && doNormalize) {
165 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
166 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
167 *pNeededToNormalize= destString!=buffer;
168 }
169 return destString.length();
170 } else {
171 /* just copy the source characters */
172 return buffer.extract(dest, destCapacity, *pErrorCode);
173 }
174}
175
176static int32_t
177unorm_iterate(UCharIterator *src, UBool forward,
178 UChar *dest, int32_t destCapacity,
179 UNormalizationMode mode, int32_t options,
180 UBool doNormalize, UBool *pNeededToNormalize,
181 UErrorCode *pErrorCode) {
182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
183 if(options&UNORM_UNICODE_3_2) {
184 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
185 if(U_FAILURE(*pErrorCode)) {
186 return 0;
187 }
188 FilteredNormalizer2 fn2(*n2, *uni32);
189 return _iterate(src, forward, dest, destCapacity,
190 &fn2, doNormalize, pNeededToNormalize, pErrorCode);
191 }
192 return _iterate(src, forward, dest, destCapacity,
193 n2, doNormalize, pNeededToNormalize, pErrorCode);
194}
195
196U_CAPI int32_t U_EXPORT2
197unorm_previous(UCharIterator *src,
198 UChar *dest, int32_t destCapacity,
199 UNormalizationMode mode, int32_t options,
200 UBool doNormalize, UBool *pNeededToNormalize,
201 UErrorCode *pErrorCode) {
202 return unorm_iterate(src, FALSE,
203 dest, destCapacity,
204 mode, options,
205 doNormalize, pNeededToNormalize,
206 pErrorCode);
207}
208
209U_CAPI int32_t U_EXPORT2
210unorm_next(UCharIterator *src,
211 UChar *dest, int32_t destCapacity,
212 UNormalizationMode mode, int32_t options,
213 UBool doNormalize, UBool *pNeededToNormalize,
214 UErrorCode *pErrorCode) {
215 return unorm_iterate(src, TRUE,
216 dest, destCapacity,
217 mode, options,
218 doNormalize, pNeededToNormalize,
219 pErrorCode);
220}
221
222/* Concatenation of normalized strings -------------------------------------- */
223
224static int32_t
225_concatenate(const UChar *left, int32_t leftLength,
226 const UChar *right, int32_t rightLength,
227 UChar *dest, int32_t destCapacity,
228 const Normalizer2 *n2,
229 UErrorCode *pErrorCode) {
230 if(U_FAILURE(*pErrorCode)) {
231 return 0;
232 }
233 if(destCapacity<0 || (dest==NULL && destCapacity>0) ||
234 left==NULL || leftLength<-1 || right==NULL || rightLength<-1) {
235 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
236 return 0;
237 }
238
239 /* check for overlapping right and destination */
240 if( dest!=NULL &&
241 ((right>=dest && right<(dest+destCapacity)) ||
242 (rightLength>0 && dest>=right && dest<(right+rightLength)))
243 ) {
244 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
245 return 0;
246 }
247
248 /* allow left==dest */
249 UnicodeString destString;
250 if(left==dest) {
251 destString.setTo(dest, leftLength, destCapacity);
252 } else {
253 destString.setTo(dest, 0, destCapacity);
254 destString.append(left, leftLength);
255 }
256 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
257 extract(dest, destCapacity, *pErrorCode);
258}
259
260U_CAPI int32_t U_EXPORT2
261unorm_concatenate(const UChar *left, int32_t leftLength,
262 const UChar *right, int32_t rightLength,
263 UChar *dest, int32_t destCapacity,
264 UNormalizationMode mode, int32_t options,
265 UErrorCode *pErrorCode) {
266 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
267 if(options&UNORM_UNICODE_3_2) {
268 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
269 if(U_FAILURE(*pErrorCode)) {
270 return 0;
271 }
272 FilteredNormalizer2 fn2(*n2, *uni32);
273 return _concatenate(left, leftLength, right, rightLength,
274 dest, destCapacity, &fn2, pErrorCode);
275 }
276 return _concatenate(left, leftLength, right, rightLength,
277 dest, destCapacity, n2, pErrorCode);
278}
279
280#endif /* #if !UCONFIG_NO_NORMALIZATION */
281