1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2001-2011, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 07/03/01 aliu Creation.
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "unicode/normalizer2.h"
18#include "unicode/utf16.h"
19#include "cstring.h"
20#include "nortrans.h"
21
22U_NAMESPACE_BEGIN
23
24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
25
26static inline Transliterator::Token cstrToken(const char *s) {
27 return Transliterator::pointerToken((void *)s);
28}
29
30/**
31 * System registration hook.
32 */
33void NormalizationTransliterator::registerIDs() {
34 // In the Token, the byte after the NUL is the UNormalization2Mode.
35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
36 _create, cstrToken("nfc\0\0"));
37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
38 _create, cstrToken("nfkc\0\0"));
39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
40 _create, cstrToken("nfc\0\1"));
41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
42 _create, cstrToken("nfkc\0\1"));
43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
44 _create, cstrToken("nfc\0\2"));
45 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
46 _create, cstrToken("nfc\0\3"));
47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
48 UNICODE_STRING_SIMPLE("NFD"), TRUE);
49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
50 UNICODE_STRING_SIMPLE("NFKD"), TRUE);
51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
52 UNICODE_STRING_SIMPLE("NFD"), FALSE);
53 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
54 UNICODE_STRING_SIMPLE("FCD"), FALSE);
55}
56
57/**
58 * Factory methods
59 */
60Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
61 Token context) {
62 const char *name = (const char *)context.pointer;
63 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
64 UErrorCode errorCode = U_ZERO_ERROR;
65 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
66 if(U_SUCCESS(errorCode)) {
67 return new NormalizationTransliterator(ID, *norm2);
68 } else {
69 return NULL;
70 }
71}
72
73/**
74 * Constructs a transliterator.
75 */
76NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
77 const Normalizer2 &norm2) :
78 Transliterator(id, 0), fNorm2(norm2) {}
79
80/**
81 * Destructor.
82 */
83NormalizationTransliterator::~NormalizationTransliterator() {
84}
85
86/**
87 * Copy constructor.
88 */
89NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
90 Transliterator(o), fNorm2(o.fNorm2) {}
91
92/**
93 * Transliterator API.
94 */
95NormalizationTransliterator* NormalizationTransliterator::clone() const {
96 return new NormalizationTransliterator(*this);
97}
98
99/**
100 * Implements {@link Transliterator#handleTransliterate}.
101 */
102void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103 UBool isIncremental) const {
104 // start and limit of the input range
105 int32_t start = offsets.start;
106 int32_t limit = offsets.limit;
107 if(start >= limit) {
108 return;
109 }
110
111 /*
112 * Normalize as short chunks at a time as possible even in
113 * bulk mode, so that styled text is minimally disrupted.
114 * In incremental mode, a chunk that ends with offsets.limit
115 * must not be normalized.
116 *
117 * If it was known that the input text is not styled, then
118 * a bulk mode normalization could look like this:
119
120 UnicodeString input, normalized;
121 int32_t length = limit - start;
122 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
123 input.releaseBuffer(length);
124
125 UErrorCode status = U_ZERO_ERROR;
126 fNorm2.normalize(input, normalized, status);
127
128 text.handleReplaceBetween(start, limit, normalized);
129
130 int32_t delta = normalized.length() - length;
131 offsets.contextLimit += delta;
132 offsets.limit += delta;
133 offsets.start = limit + delta;
134
135 */
136 UErrorCode errorCode = U_ZERO_ERROR;
137 UnicodeString segment;
138 UnicodeString normalized;
139 UChar32 c = text.char32At(start);
140 do {
141 int32_t prev = start;
142 // Skip at least one character so we make progress.
143 // c holds the character at start.
144 segment.remove();
145 do {
146 segment.append(c);
147 start += U16_LENGTH(c);
148 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
149 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
150 // stop in incremental mode when we reach the input limit
151 // in case there are additional characters that could change the
152 // normalization result
153 start=prev;
154 break;
155 }
156 fNorm2.normalize(segment, normalized, errorCode);
157 if(U_FAILURE(errorCode)) {
158 break;
159 }
160 if(segment != normalized) {
161 // replace the input chunk with its normalized form
162 text.handleReplaceBetween(prev, start, normalized);
163
164 // update all necessary indexes accordingly
165 int32_t delta = normalized.length() - (start - prev);
166 start += delta;
167 limit += delta;
168 }
169 } while(start < limit);
170
171 offsets.start = start;
172 offsets.contextLimit += limit - offsets.limit;
173 offsets.limit = limit;
174}
175
176U_NAMESPACE_END
177
178#endif /* #if !UCONFIG_NO_TRANSLITERATION */
179