1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 *************************************************************************
5 * COPYRIGHT:
6 * Copyright (c) 1996-2012, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *************************************************************************
9 */
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_NORMALIZATION
14
15#include "unicode/uniset.h"
16#include "unicode/unistr.h"
17#include "unicode/chariter.h"
18#include "unicode/schriter.h"
19#include "unicode/uchriter.h"
20#include "unicode/normlzr.h"
21#include "unicode/utf16.h"
22#include "cmemory.h"
23#include "normalizer2impl.h"
24#include "uprops.h" // for uniset_getUnicode32Instance()
25
26#if defined(move32)
27 // System can define move32 intrinsics, but the char iters define move32 method
28 // using same undef trick in headers, so undef here to re-enable the method.
29#undef move32
30#endif
31
32U_NAMESPACE_BEGIN
33
34UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
35
36//-------------------------------------------------------------------------
37// Constructors and other boilerplate
38//-------------------------------------------------------------------------
39
40Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
41 UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
42 text(new StringCharacterIterator(str)),
43 currentIndex(0), nextIndex(0),
44 buffer(), bufferPos(0)
45{
46 init();
47}
48
49Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
50 UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
51 text(new UCharCharacterIterator(str, length)),
52 currentIndex(0), nextIndex(0),
53 buffer(), bufferPos(0)
54{
55 init();
56}
57
58Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
59 UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
60 text(iter.clone()),
61 currentIndex(0), nextIndex(0),
62 buffer(), bufferPos(0)
63{
64 init();
65}
66
67Normalizer::Normalizer(const Normalizer &copy) :
68 UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions),
69 text(copy.text->clone()),
70 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
71 buffer(copy.buffer), bufferPos(copy.bufferPos)
72{
73 init();
74}
75
76void
77Normalizer::init() {
78 UErrorCode errorCode=U_ZERO_ERROR;
79 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
80 if(fOptions&UNORM_UNICODE_3_2) {
81 delete fFilteredNorm2;
82 fNorm2=fFilteredNorm2=
83 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
84 }
85 if(U_FAILURE(errorCode)) {
86 errorCode=U_ZERO_ERROR;
87 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
88 }
89}
90
91Normalizer::~Normalizer()
92{
93 delete fFilteredNorm2;
94 delete text;
95}
96
97Normalizer*
98Normalizer::clone() const
99{
100 return new Normalizer(*this);
101}
102
103/**
104 * Generates a hash code for this iterator.
105 */
106int32_t Normalizer::hashCode() const
107{
108 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
109}
110
111bool Normalizer::operator==(const Normalizer& that) const
112{
113 return
114 this==&that ||
115 (fUMode==that.fUMode &&
116 fOptions==that.fOptions &&
117 *text==*that.text &&
118 buffer==that.buffer &&
119 bufferPos==that.bufferPos &&
120 nextIndex==that.nextIndex);
121}
122
123//-------------------------------------------------------------------------
124// Static utility methods
125//-------------------------------------------------------------------------
126
127void U_EXPORT2
128Normalizer::normalize(const UnicodeString& source,
129 UNormalizationMode mode, int32_t options,
130 UnicodeString& result,
131 UErrorCode &status) {
132 if(source.isBogus() || U_FAILURE(status)) {
133 result.setToBogus();
134 if(U_SUCCESS(status)) {
135 status=U_ILLEGAL_ARGUMENT_ERROR;
136 }
137 } else {
138 UnicodeString localDest;
139 UnicodeString *dest;
140
141 if(&source!=&result) {
142 dest=&result;
143 } else {
144 // the source and result strings are the same object, use a temporary one
145 dest=&localDest;
146 }
147 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
148 if(U_SUCCESS(status)) {
149 if(options&UNORM_UNICODE_3_2) {
150 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
151 normalize(source, *dest, status);
152 } else {
153 n2->normalize(source, *dest, status);
154 }
155 }
156 if(dest==&localDest && U_SUCCESS(status)) {
157 result=*dest;
158 }
159 }
160}
161
162void U_EXPORT2
163Normalizer::compose(const UnicodeString& source,
164 UBool compat, int32_t options,
165 UnicodeString& result,
166 UErrorCode &status) {
167 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
168}
169
170void U_EXPORT2
171Normalizer::decompose(const UnicodeString& source,
172 UBool compat, int32_t options,
173 UnicodeString& result,
174 UErrorCode &status) {
175 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
176}
177
178UNormalizationCheckResult
179Normalizer::quickCheck(const UnicodeString& source,
180 UNormalizationMode mode, int32_t options,
181 UErrorCode &status) {
182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
183 if(U_SUCCESS(status)) {
184 if(options&UNORM_UNICODE_3_2) {
185 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
186 quickCheck(source, status);
187 } else {
188 return n2->quickCheck(source, status);
189 }
190 } else {
191 return UNORM_MAYBE;
192 }
193}
194
195UBool
196Normalizer::isNormalized(const UnicodeString& source,
197 UNormalizationMode mode, int32_t options,
198 UErrorCode &status) {
199 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
200 if(U_SUCCESS(status)) {
201 if(options&UNORM_UNICODE_3_2) {
202 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
203 isNormalized(source, status);
204 } else {
205 return n2->isNormalized(source, status);
206 }
207 } else {
208 return false;
209 }
210}
211
212UnicodeString & U_EXPORT2
213Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
214 UnicodeString &result,
215 UNormalizationMode mode, int32_t options,
216 UErrorCode &errorCode) {
217 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
218 result.setToBogus();
219 if(U_SUCCESS(errorCode)) {
220 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
221 }
222 } else {
223 UnicodeString localDest;
224 UnicodeString *dest;
225
226 if(&right!=&result) {
227 dest=&result;
228 } else {
229 // the right and result strings are the same object, use a temporary one
230 dest=&localDest;
231 }
232 *dest=left;
233 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
234 if(U_SUCCESS(errorCode)) {
235 if(options&UNORM_UNICODE_3_2) {
236 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
237 append(*dest, right, errorCode);
238 } else {
239 n2->append(*dest, right, errorCode);
240 }
241 }
242 if(dest==&localDest && U_SUCCESS(errorCode)) {
243 result=*dest;
244 }
245 }
246 return result;
247}
248
249//-------------------------------------------------------------------------
250// Iteration API
251//-------------------------------------------------------------------------
252
253/**
254 * Return the current character in the normalized text.
255 */
256UChar32 Normalizer::current() {
257 if(bufferPos<buffer.length() || nextNormalize()) {
258 return buffer.char32At(bufferPos);
259 } else {
260 return DONE;
261 }
262}
263
264/**
265 * Return the next character in the normalized text and advance
266 * the iteration position by one. If the end
267 * of the text has already been reached, {@link #DONE} is returned.
268 */
269UChar32 Normalizer::next() {
270 if(bufferPos<buffer.length() || nextNormalize()) {
271 UChar32 c=buffer.char32At(bufferPos);
272 bufferPos+=U16_LENGTH(c);
273 return c;
274 } else {
275 return DONE;
276 }
277}
278
279/**
280 * Return the previous character in the normalized text and decrement
281 * the iteration position by one. If the beginning
282 * of the text has already been reached, {@link #DONE} is returned.
283 */
284UChar32 Normalizer::previous() {
285 if(bufferPos>0 || previousNormalize()) {
286 UChar32 c=buffer.char32At(bufferPos-1);
287 bufferPos-=U16_LENGTH(c);
288 return c;
289 } else {
290 return DONE;
291 }
292}
293
294void Normalizer::reset() {
295 currentIndex=nextIndex=text->setToStart();
296 clearBuffer();
297}
298
299void
300Normalizer::setIndexOnly(int32_t index) {
301 text->setIndex(index); // pins index
302 currentIndex=nextIndex=text->getIndex();
303 clearBuffer();
304}
305
306/**
307 * Return the first character in the normalized text. This resets
308 * the <tt>Normalizer's</tt> position to the beginning of the text.
309 */
310UChar32 Normalizer::first() {
311 reset();
312 return next();
313}
314
315/**
316 * Return the last character in the normalized text. This resets
317 * the <tt>Normalizer's</tt> position to be just before the
318 * the input text corresponding to that normalized character.
319 */
320UChar32 Normalizer::last() {
321 currentIndex=nextIndex=text->setToEnd();
322 clearBuffer();
323 return previous();
324}
325
326/**
327 * Retrieve the current iteration position in the input text that is
328 * being normalized. This method is useful in applications such as
329 * searching, where you need to be able to determine the position in
330 * the input text that corresponds to a given normalized output character.
331 * <p>
332 * <b>Note:</b> This method sets the position in the <em>input</em>, while
333 * {@link #next} and {@link #previous} iterate through characters in the
334 * <em>output</em>. This means that there is not necessarily a one-to-one
335 * correspondence between characters returned by <tt>next</tt> and
336 * <tt>previous</tt> and the indices passed to and returned from
337 * <tt>setIndex</tt> and {@link #getIndex}.
338 *
339 */
340int32_t Normalizer::getIndex() const {
341 if(bufferPos<buffer.length()) {
342 return currentIndex;
343 } else {
344 return nextIndex;
345 }
346}
347
348/**
349 * Retrieve the index of the start of the input text. This is the begin index
350 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
351 * over which this <tt>Normalizer</tt> is iterating
352 */
353int32_t Normalizer::startIndex() const {
354 return text->startIndex();
355}
356
357/**
358 * Retrieve the index of the end of the input text. This is the end index
359 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
360 * over which this <tt>Normalizer</tt> is iterating
361 */
362int32_t Normalizer::endIndex() const {
363 return text->endIndex();
364}
365
366//-------------------------------------------------------------------------
367// Property access methods
368//-------------------------------------------------------------------------
369
370void
371Normalizer::setMode(UNormalizationMode newMode)
372{
373 fUMode = newMode;
374 init();
375}
376
377UNormalizationMode
378Normalizer::getUMode() const
379{
380 return fUMode;
381}
382
383void
384Normalizer::setOption(int32_t option,
385 UBool value)
386{
387 if (value) {
388 fOptions |= option;
389 } else {
390 fOptions &= (~option);
391 }
392 init();
393}
394
395UBool
396Normalizer::getOption(int32_t option) const
397{
398 return (fOptions & option) != 0;
399}
400
401/**
402 * Set the input text over which this <tt>Normalizer</tt> will iterate.
403 * The iteration position is set to the beginning of the input text.
404 */
405void
406Normalizer::setText(const UnicodeString& newText,
407 UErrorCode &status)
408{
409 if (U_FAILURE(status)) {
410 return;
411 }
412 CharacterIterator *newIter = new StringCharacterIterator(newText);
413 if (newIter == nullptr) {
414 status = U_MEMORY_ALLOCATION_ERROR;
415 return;
416 }
417 delete text;
418 text = newIter;
419 reset();
420}
421
422/**
423 * Set the input text over which this <tt>Normalizer</tt> will iterate.
424 * The iteration position is set to the beginning of the string.
425 */
426void
427Normalizer::setText(const CharacterIterator& newText,
428 UErrorCode &status)
429{
430 if (U_FAILURE(status)) {
431 return;
432 }
433 CharacterIterator *newIter = newText.clone();
434 if (newIter == nullptr) {
435 status = U_MEMORY_ALLOCATION_ERROR;
436 return;
437 }
438 delete text;
439 text = newIter;
440 reset();
441}
442
443void
444Normalizer::setText(ConstChar16Ptr newText,
445 int32_t length,
446 UErrorCode &status)
447{
448 if (U_FAILURE(status)) {
449 return;
450 }
451 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
452 if (newIter == nullptr) {
453 status = U_MEMORY_ALLOCATION_ERROR;
454 return;
455 }
456 delete text;
457 text = newIter;
458 reset();
459}
460
461/**
462 * Copies the text under iteration into the UnicodeString referred to by "result".
463 * @param result Receives a copy of the text under iteration.
464 */
465void
466Normalizer::getText(UnicodeString& result)
467{
468 text->getText(result);
469}
470
471//-------------------------------------------------------------------------
472// Private utility methods
473//-------------------------------------------------------------------------
474
475void Normalizer::clearBuffer() {
476 buffer.remove();
477 bufferPos=0;
478}
479
480UBool
481Normalizer::nextNormalize() {
482 clearBuffer();
483 currentIndex=nextIndex;
484 text->setIndex(nextIndex);
485 if(!text->hasNext()) {
486 return false;
487 }
488 // Skip at least one character so we make progress.
489 UnicodeString segment(text->next32PostInc());
490 while(text->hasNext()) {
491 UChar32 c;
492 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
493 text->move32(-1, CharacterIterator::kCurrent);
494 break;
495 }
496 segment.append(c);
497 }
498 nextIndex=text->getIndex();
499 UErrorCode errorCode=U_ZERO_ERROR;
500 fNorm2->normalize(segment, buffer, errorCode);
501 return U_SUCCESS(errorCode) && !buffer.isEmpty();
502}
503
504UBool
505Normalizer::previousNormalize() {
506 clearBuffer();
507 nextIndex=currentIndex;
508 text->setIndex(currentIndex);
509 if(!text->hasPrevious()) {
510 return false;
511 }
512 UnicodeString segment;
513 while(text->hasPrevious()) {
514 UChar32 c=text->previous32();
515 segment.insert(0, c);
516 if(fNorm2->hasBoundaryBefore(c)) {
517 break;
518 }
519 }
520 currentIndex=text->getIndex();
521 UErrorCode errorCode=U_ZERO_ERROR;
522 fNorm2->normalize(segment, buffer, errorCode);
523 bufferPos=buffer.length();
524 return U_SUCCESS(errorCode) && !buffer.isEmpty();
525}
526
527U_NAMESPACE_END
528
529#endif /* #if !UCONFIG_NO_NORMALIZATION */
530