1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_NORMALIZATION
22
23#include "unicode/edits.h"
24#include "unicode/normalizer2.h"
25#include "unicode/stringoptions.h"
26#include "unicode/unistr.h"
27#include "unicode/unorm.h"
28#include "cstring.h"
29#include "mutex.h"
30#include "norm2allmodes.h"
31#include "normalizer2impl.h"
32#include "uassert.h"
33#include "ucln_cmn.h"
34
35using icu::Normalizer2Impl;
36
37#if NORM2_HARDCODE_NFC_DATA
38// NFC/NFD data machine-generated by gennorm2 --csource
39#define INCLUDED_FROM_NORMALIZER2_CPP
40#include "norm2_nfc_data.h"
41#endif
42
43U_NAMESPACE_BEGIN
44
45// Public API dispatch via Normalizer2 subclasses -------------------------- ***
46
47Normalizer2::~Normalizer2() {}
48
49void
50Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
51 Edits *edits, UErrorCode &errorCode) const {
52 if (U_FAILURE(errorCode)) {
53 return;
54 }
55 if (edits != nullptr) {
56 errorCode = U_UNSUPPORTED_ERROR;
57 return;
58 }
59 UnicodeString src16 = UnicodeString::fromUTF8(src);
60 normalize(src16, errorCode).toUTF8(sink);
61}
62
63UBool
64Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
65 return false;
66}
67
68UChar32
69Normalizer2::composePair(UChar32, UChar32) const {
70 return U_SENTINEL;
71}
72
73uint8_t
74Normalizer2::getCombiningClass(UChar32 /*c*/) const {
75 return 0;
76}
77
78UBool
79Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
80 return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
81}
82
83// Normalizer2 implementation for the old UNORM_NONE.
84class NoopNormalizer2 : public Normalizer2 {
85 virtual ~NoopNormalizer2();
86
87 virtual UnicodeString &
88 normalize(const UnicodeString &src,
89 UnicodeString &dest,
90 UErrorCode &errorCode) const override {
91 if(U_SUCCESS(errorCode)) {
92 if(&dest!=&src) {
93 dest=src;
94 } else {
95 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
96 }
97 }
98 return dest;
99 }
100 virtual void
101 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
102 Edits *edits, UErrorCode &errorCode) const override {
103 if(U_SUCCESS(errorCode)) {
104 if (edits != nullptr) {
105 if ((options & U_EDITS_NO_RESET) == 0) {
106 edits->reset();
107 }
108 edits->addUnchanged(src.length());
109 }
110 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
111 sink.Append(src.data(), src.length());
112 }
113 sink.Flush();
114 }
115 }
116
117 virtual UnicodeString &
118 normalizeSecondAndAppend(UnicodeString &first,
119 const UnicodeString &second,
120 UErrorCode &errorCode) const override {
121 if(U_SUCCESS(errorCode)) {
122 if(&first!=&second) {
123 first.append(second);
124 } else {
125 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
126 }
127 }
128 return first;
129 }
130 virtual UnicodeString &
131 append(UnicodeString &first,
132 const UnicodeString &second,
133 UErrorCode &errorCode) const override {
134 if(U_SUCCESS(errorCode)) {
135 if(&first!=&second) {
136 first.append(second);
137 } else {
138 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
139 }
140 }
141 return first;
142 }
143 virtual UBool
144 getDecomposition(UChar32, UnicodeString &) const override {
145 return false;
146 }
147 // No need to override the default getRawDecomposition().
148 virtual UBool
149 isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
150 return U_SUCCESS(errorCode);
151 }
152 virtual UBool
153 isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
154 return U_SUCCESS(errorCode);
155 }
156 virtual UNormalizationCheckResult
157 quickCheck(const UnicodeString &, UErrorCode &) const override {
158 return UNORM_YES;
159 }
160 virtual int32_t
161 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
162 return s.length();
163 }
164 virtual UBool hasBoundaryBefore(UChar32) const override { return true; }
165 virtual UBool hasBoundaryAfter(UChar32) const override { return true; }
166 virtual UBool isInert(UChar32) const override { return true; }
167};
168
169NoopNormalizer2::~NoopNormalizer2() {}
170
171Normalizer2WithImpl::~Normalizer2WithImpl() {}
172
173DecomposeNormalizer2::~DecomposeNormalizer2() {}
174
175ComposeNormalizer2::~ComposeNormalizer2() {}
176
177FCDNormalizer2::~FCDNormalizer2() {}
178
179// instance cache ---------------------------------------------------------- ***
180
// Forward declaration of the cleanup callback. It is defined near the end of the
// namespace (also inside a U_CDECL block) but is passed to
// ucln_common_registerCleanup() by the init functions before that point.
U_CDECL_BEGIN
static UBool U_CALLCONV uprv_normalizer2_cleanup();
U_CDECL_END

// No-op normalizer singleton: allocated in initNoopSingleton() and deleted
// (then reset to nullptr) in uprv_normalizer2_cleanup().
static Normalizer2 *noopSingleton;
// Init-once state passed to umtx_initOnce() in Normalizer2Factory::getNoopInstance().
static icu::UInitOnce noopInitOnce {};
187
188static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
189 if(U_FAILURE(errorCode)) {
190 return;
191 }
192 noopSingleton=new NoopNormalizer2;
193 if(noopSingleton==nullptr) {
194 errorCode=U_MEMORY_ALLOCATION_ERROR;
195 return;
196 }
197 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
198}
199
200const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
201 if(U_FAILURE(errorCode)) { return nullptr; }
202 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
203 return noopSingleton;
204}
205
206const Normalizer2Impl *
207Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
208 return &((Normalizer2WithImpl *)norm2)->impl;
209}
210
211Norm2AllModes::~Norm2AllModes() {
212 delete impl;
213}
214
215Norm2AllModes *
216Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
217 if(U_FAILURE(errorCode)) {
218 delete impl;
219 return nullptr;
220 }
221 Norm2AllModes *allModes=new Norm2AllModes(impl);
222 if(allModes==nullptr) {
223 errorCode=U_MEMORY_ALLOCATION_ERROR;
224 delete impl;
225 return nullptr;
226 }
227 return allModes;
228}
229
230#if NORM2_HARDCODE_NFC_DATA
231Norm2AllModes *
232Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
233 if(U_FAILURE(errorCode)) {
234 return nullptr;
235 }
236 Normalizer2Impl *impl=new Normalizer2Impl;
237 if(impl==nullptr) {
238 errorCode=U_MEMORY_ALLOCATION_ERROR;
239 return nullptr;
240 }
241 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
242 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
243 return createInstance(impl, errorCode);
244}
245
// NFC data singleton: created in initNFCSingleton() and deleted (then reset to
// nullptr) in uprv_normalizer2_cleanup().
static Norm2AllModes *nfcSingleton;

// Init-once state passed to umtx_initOnce() in Norm2AllModes::getNFCInstance().
static icu::UInitOnce nfcInitOnce {};
249
250static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
251 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
252 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
253}
254
255const Norm2AllModes *
256Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
257 if(U_FAILURE(errorCode)) { return nullptr; }
258 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
259 return nfcSingleton;
260}
261
262const Normalizer2 *
263Normalizer2::getNFCInstance(UErrorCode &errorCode) {
264 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
265 return allModes!=nullptr ? &allModes->comp : nullptr;
266}
267
268const Normalizer2 *
269Normalizer2::getNFDInstance(UErrorCode &errorCode) {
270 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
271 return allModes!=nullptr ? &allModes->decomp : nullptr;
272}
273
274const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
275 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
276 return allModes!=nullptr ? &allModes->fcd : nullptr;
277}
278
279const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
280 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
281 return allModes!=nullptr ? &allModes->fcc : nullptr;
282}
283
284const Normalizer2Impl *
285Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
286 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
287 return allModes!=nullptr ? allModes->impl : nullptr;
288}
289#endif // NORM2_HARDCODE_NFC_DATA
290
291U_CDECL_BEGIN
292
293static UBool U_CALLCONV uprv_normalizer2_cleanup() {
294 delete noopSingleton;
295 noopSingleton = nullptr;
296 noopInitOnce.reset();
297#if NORM2_HARDCODE_NFC_DATA
298 delete nfcSingleton;
299 nfcSingleton = nullptr;
300 nfcInitOnce.reset();
301#endif
302 return true;
303}
304
305U_CDECL_END
306
307U_NAMESPACE_END
308
309// C API ------------------------------------------------------------------- ***
310
311U_NAMESPACE_USE
312
313U_CAPI const UNormalizer2 * U_EXPORT2
314unorm2_getNFCInstance(UErrorCode *pErrorCode) {
315 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
316}
317
318U_CAPI const UNormalizer2 * U_EXPORT2
319unorm2_getNFDInstance(UErrorCode *pErrorCode) {
320 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
321}
322
323U_CAPI void U_EXPORT2
324unorm2_close(UNormalizer2 *norm2) {
325 delete (Normalizer2 *)norm2;
326}
327
328U_CAPI int32_t U_EXPORT2
329unorm2_normalize(const UNormalizer2 *norm2,
330 const char16_t *src, int32_t length,
331 char16_t *dest, int32_t capacity,
332 UErrorCode *pErrorCode) {
333 if(U_FAILURE(*pErrorCode)) {
334 return 0;
335 }
336 if( (src==nullptr ? length!=0 : length<-1) ||
337 (dest==nullptr ? capacity!=0 : capacity<0) ||
338 (src==dest && src!=nullptr)
339 ) {
340 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
341 return 0;
342 }
343 UnicodeString destString(dest, 0, capacity);
344 // length==0: Nothing to do, and n2wi->normalize(nullptr, nullptr, buffer, ...) would crash.
345 if(length!=0) {
346 const Normalizer2 *n2=(const Normalizer2 *)norm2;
347 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
348 if(n2wi!=nullptr) {
349 // Avoid duplicate argument checking and support NUL-terminated src.
350 ReorderingBuffer buffer(n2wi->impl, destString);
351 if(buffer.init(length, *pErrorCode)) {
352 n2wi->normalize(src, length>=0 ? src+length : nullptr, buffer, *pErrorCode);
353 }
354 } else {
355 UnicodeString srcString(length<0, src, length);
356 n2->normalize(srcString, destString, *pErrorCode);
357 }
358 }
359 return destString.extract(dest, capacity, *pErrorCode);
360}
361
362static int32_t
363normalizeSecondAndAppend(const UNormalizer2 *norm2,
364 char16_t *first, int32_t firstLength, int32_t firstCapacity,
365 const char16_t *second, int32_t secondLength,
366 UBool doNormalize,
367 UErrorCode *pErrorCode) {
368 if(U_FAILURE(*pErrorCode)) {
369 return 0;
370 }
371 if( (second==nullptr ? secondLength!=0 : secondLength<-1) ||
372 (first==nullptr ? (firstCapacity!=0 || firstLength!=0) :
373 (firstCapacity<0 || firstLength<-1)) ||
374 (first==second && first!=nullptr)
375 ) {
376 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
377 return 0;
378 }
379 UnicodeString firstString(first, firstLength, firstCapacity);
380 firstLength=firstString.length(); // In case it was -1.
381 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash.
382 if(secondLength!=0) {
383 const Normalizer2 *n2=(const Normalizer2 *)norm2;
384 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
385 if(n2wi!=nullptr) {
386 // Avoid duplicate argument checking and support NUL-terminated src.
387 UnicodeString safeMiddle;
388 {
389 ReorderingBuffer buffer(n2wi->impl, firstString);
390 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
391 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : nullptr,
392 doNormalize, safeMiddle, buffer, *pErrorCode);
393 }
394 } // The ReorderingBuffer destructor finalizes firstString.
395 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
396 // Restore the modified suffix of the first string.
397 // This does not restore first[] array contents between firstLength and firstCapacity.
398 // (That might be uninitialized memory, as far as we know.)
399 if(first!=nullptr) { /* don't dereference nullptr */
400 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
401 if(firstLength<firstCapacity) {
402 first[firstLength]=0; // NUL-terminate in case it was originally.
403 }
404 }
405 }
406 } else {
407 UnicodeString secondString(secondLength<0, second, secondLength);
408 if(doNormalize) {
409 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
410 } else {
411 n2->append(firstString, secondString, *pErrorCode);
412 }
413 }
414 }
415 return firstString.extract(first, firstCapacity, *pErrorCode);
416}
417
418U_CAPI int32_t U_EXPORT2
419unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
420 char16_t *first, int32_t firstLength, int32_t firstCapacity,
421 const char16_t *second, int32_t secondLength,
422 UErrorCode *pErrorCode) {
423 return normalizeSecondAndAppend(norm2,
424 first, firstLength, firstCapacity,
425 second, secondLength,
426 true, pErrorCode);
427}
428
429U_CAPI int32_t U_EXPORT2
430unorm2_append(const UNormalizer2 *norm2,
431 char16_t *first, int32_t firstLength, int32_t firstCapacity,
432 const char16_t *second, int32_t secondLength,
433 UErrorCode *pErrorCode) {
434 return normalizeSecondAndAppend(norm2,
435 first, firstLength, firstCapacity,
436 second, secondLength,
437 false, pErrorCode);
438}
439
440U_CAPI int32_t U_EXPORT2
441unorm2_getDecomposition(const UNormalizer2 *norm2,
442 UChar32 c, char16_t *decomposition, int32_t capacity,
443 UErrorCode *pErrorCode) {
444 if(U_FAILURE(*pErrorCode)) {
445 return 0;
446 }
447 if(decomposition==nullptr ? capacity!=0 : capacity<0) {
448 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
449 return 0;
450 }
451 UnicodeString destString(decomposition, 0, capacity);
452 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
453 return destString.extract(decomposition, capacity, *pErrorCode);
454 } else {
455 return -1;
456 }
457}
458
459U_CAPI int32_t U_EXPORT2
460unorm2_getRawDecomposition(const UNormalizer2 *norm2,
461 UChar32 c, char16_t *decomposition, int32_t capacity,
462 UErrorCode *pErrorCode) {
463 if(U_FAILURE(*pErrorCode)) {
464 return 0;
465 }
466 if(decomposition==nullptr ? capacity!=0 : capacity<0) {
467 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
468 return 0;
469 }
470 UnicodeString destString(decomposition, 0, capacity);
471 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
472 return destString.extract(decomposition, capacity, *pErrorCode);
473 } else {
474 return -1;
475 }
476}
477
478U_CAPI UChar32 U_EXPORT2
479unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
480 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
481}
482
483U_CAPI uint8_t U_EXPORT2
484unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
485 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
486}
487
488U_CAPI UBool U_EXPORT2
489unorm2_isNormalized(const UNormalizer2 *norm2,
490 const char16_t *s, int32_t length,
491 UErrorCode *pErrorCode) {
492 if(U_FAILURE(*pErrorCode)) {
493 return 0;
494 }
495 if((s==nullptr && length!=0) || length<-1) {
496 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
497 return 0;
498 }
499 UnicodeString sString(length<0, s, length);
500 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
501}
502
503U_CAPI UNormalizationCheckResult U_EXPORT2
504unorm2_quickCheck(const UNormalizer2 *norm2,
505 const char16_t *s, int32_t length,
506 UErrorCode *pErrorCode) {
507 if(U_FAILURE(*pErrorCode)) {
508 return UNORM_NO;
509 }
510 if((s==nullptr && length!=0) || length<-1) {
511 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
512 return UNORM_NO;
513 }
514 UnicodeString sString(length<0, s, length);
515 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
516}
517
518U_CAPI int32_t U_EXPORT2
519unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
520 const char16_t *s, int32_t length,
521 UErrorCode *pErrorCode) {
522 if(U_FAILURE(*pErrorCode)) {
523 return 0;
524 }
525 if((s==nullptr && length!=0) || length<-1) {
526 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
527 return 0;
528 }
529 UnicodeString sString(length<0, s, length);
530 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
531}
532
533U_CAPI UBool U_EXPORT2
534unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
535 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
536}
537
538U_CAPI UBool U_EXPORT2
539unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
540 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
541}
542
543U_CAPI UBool U_EXPORT2
544unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
545 return ((const Normalizer2 *)norm2)->isInert(c);
546}
547
548// Some properties APIs ---------------------------------------------------- ***
549
550U_CAPI uint8_t U_EXPORT2
551u_getCombiningClass(UChar32 c) {
552 UErrorCode errorCode=U_ZERO_ERROR;
553 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
554 if(U_SUCCESS(errorCode)) {
555 return nfd->getCombiningClass(c);
556 } else {
557 return 0;
558 }
559}
560
561U_CFUNC uint16_t
562unorm_getFCD16(UChar32 c) {
563 UErrorCode errorCode=U_ZERO_ERROR;
564 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
565 if(U_SUCCESS(errorCode)) {
566 return impl->getFCD16(c);
567 } else {
568 return 0;
569 }
570}
571
572#endif // !UCONFIG_NO_NORMALIZATION
573