1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2013, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#ifndef __NORMALIZER2_H__
20#define __NORMALIZER2_H__
21
22/**
23 * \file
24 * \brief C++ API: New API for Unicode Normalization.
25 */
26
27#include "unicode/utypes.h"
28
29#if U_SHOW_CPLUSPLUS_API
30
31#if !UCONFIG_NO_NORMALIZATION
32
33#include "unicode/stringpiece.h"
34#include "unicode/uniset.h"
35#include "unicode/unistr.h"
36#include "unicode/unorm2.h"
37
38U_NAMESPACE_BEGIN
39
// Forward declarations for types that this header only uses by pointer or reference.
class ByteSink;
// NOTE(review): Edits is used as `Edits *` by normalizeUTF8() below but is not visibly
// declared by this header; a redundant forward declaration is harmless if one of the
// included headers already provides it -- confirm against unicode/edits.h.
class Edits;
41
42/**
43 * Unicode normalization functionality for standard Unicode normalization or
44 * for using custom mapping tables.
45 * All instances of this class are unmodifiable/immutable.
46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47 * The Normalizer2 class is not intended for public subclassing.
48 *
49 * The primary functions are to produce a normalized string and to detect whether
50 * a string is already normalized.
51 * The most commonly used normalization forms are those defined in
52 * http://www.unicode.org/unicode/reports/tr15/
53 * However, this API supports additional normalization forms for specialized purposes.
 * For example, NFKC_Casefold is provided via getNFKCCasefoldInstance(errorCode),
 * which is the same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode),
 * and can be used in implementations of UTS #46.
56 *
57 * Not only are the standard compose and decompose modes supplied,
58 * but additional modes are provided as documented in the Mode enum.
59 *
60 * Some of the functions in this class identify normalization boundaries.
61 * At a normalization boundary, the portions of the string
62 * before it and starting from it do not interact and can be handled independently.
63 *
64 * The spanQuickCheckYes() stops at a normalization boundary.
65 * When the goal is a normalized string, then the text before the boundary
66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67 *
68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69 * a character is guaranteed to be at a normalization boundary,
70 * regardless of context.
71 * This is used for moving from one normalization boundary to the next
72 * or preceding boundary, and for performing iterative normalization.
73 *
74 * Iterative normalization is useful when only a small portion of a
75 * longer string needs to be processed.
76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78 * (to process only the substring for which sort key bytes are computed).
79 *
80 * The set of normalization boundaries returned by these functions may not be
81 * complete: There may be more boundaries that could be returned.
82 * Different functions may return different boundaries.
83 * @stable ICU 4.4
84 */
85class U_COMMON_API Normalizer2 : public UObject {
86public:
87 /**
88 * Destructor.
89 * @stable ICU 4.4
90 */
91 ~Normalizer2();
92
93 /**
94 * Returns a Normalizer2 instance for Unicode NFC normalization.
95 * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
96 * Returns an unmodifiable singleton instance. Do not delete it.
97 * @param errorCode Standard ICU error code. Its input value must
98 * pass the U_SUCCESS() test, or else the function returns
99 * immediately. Check for U_FAILURE() on output or use with
100 * function chaining. (See User Guide for details.)
101 * @return the requested Normalizer2, if successful
102 * @stable ICU 49
103 */
104 static const Normalizer2 *
105 getNFCInstance(UErrorCode &errorCode);
106
107 /**
108 * Returns a Normalizer2 instance for Unicode NFD normalization.
109 * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
110 * Returns an unmodifiable singleton instance. Do not delete it.
111 * @param errorCode Standard ICU error code. Its input value must
112 * pass the U_SUCCESS() test, or else the function returns
113 * immediately. Check for U_FAILURE() on output or use with
114 * function chaining. (See User Guide for details.)
115 * @return the requested Normalizer2, if successful
116 * @stable ICU 49
117 */
118 static const Normalizer2 *
119 getNFDInstance(UErrorCode &errorCode);
120
121 /**
122 * Returns a Normalizer2 instance for Unicode NFKC normalization.
123 * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
124 * Returns an unmodifiable singleton instance. Do not delete it.
125 * @param errorCode Standard ICU error code. Its input value must
126 * pass the U_SUCCESS() test, or else the function returns
127 * immediately. Check for U_FAILURE() on output or use with
128 * function chaining. (See User Guide for details.)
129 * @return the requested Normalizer2, if successful
130 * @stable ICU 49
131 */
132 static const Normalizer2 *
133 getNFKCInstance(UErrorCode &errorCode);
134
135 /**
136 * Returns a Normalizer2 instance for Unicode NFKD normalization.
137 * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
138 * Returns an unmodifiable singleton instance. Do not delete it.
139 * @param errorCode Standard ICU error code. Its input value must
140 * pass the U_SUCCESS() test, or else the function returns
141 * immediately. Check for U_FAILURE() on output or use with
142 * function chaining. (See User Guide for details.)
143 * @return the requested Normalizer2, if successful
144 * @stable ICU 49
145 */
146 static const Normalizer2 *
147 getNFKDInstance(UErrorCode &errorCode);
148
149 /**
150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
151 * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
152 * Returns an unmodifiable singleton instance. Do not delete it.
153 * @param errorCode Standard ICU error code. Its input value must
154 * pass the U_SUCCESS() test, or else the function returns
155 * immediately. Check for U_FAILURE() on output or use with
156 * function chaining. (See User Guide for details.)
157 * @return the requested Normalizer2, if successful
158 * @stable ICU 49
159 */
160 static const Normalizer2 *
161 getNFKCCasefoldInstance(UErrorCode &errorCode);
162
163 /**
164 * Returns a Normalizer2 instance which uses the specified data file
165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
166 * and which composes or decomposes text according to the specified mode.
167 * Returns an unmodifiable singleton instance. Do not delete it.
168 *
169 * Use packageName=nullptr for data files that are part of ICU's own data.
170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
173 *
174 * @param packageName nullptr for ICU built-in data, otherwise application data package name
175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
176 * @param mode normalization mode (compose or decompose etc.)
177 * @param errorCode Standard ICU error code. Its input value must
178 * pass the U_SUCCESS() test, or else the function returns
179 * immediately. Check for U_FAILURE() on output or use with
180 * function chaining. (See User Guide for details.)
181 * @return the requested Normalizer2, if successful
182 * @stable ICU 4.4
183 */
184 static const Normalizer2 *
185 getInstance(const char *packageName,
186 const char *name,
187 UNormalization2Mode mode,
188 UErrorCode &errorCode);
189
190 /**
191 * Returns the normalized form of the source string.
192 * @param src source string
193 * @param errorCode Standard ICU error code. Its input value must
194 * pass the U_SUCCESS() test, or else the function returns
195 * immediately. Check for U_FAILURE() on output or use with
196 * function chaining. (See User Guide for details.)
197 * @return normalized src
198 * @stable ICU 4.4
199 */
200 UnicodeString
201 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202 UnicodeString result;
203 normalize(src, result, errorCode);
204 return result;
205 }
206 /**
207 * Writes the normalized form of the source string to the destination string
208 * (replacing its contents) and returns the destination string.
209 * The source and destination strings must be different objects.
210 * @param src source string
211 * @param dest destination string; its contents is replaced with normalized src
212 * @param errorCode Standard ICU error code. Its input value must
213 * pass the U_SUCCESS() test, or else the function returns
214 * immediately. Check for U_FAILURE() on output or use with
215 * function chaining. (See User Guide for details.)
216 * @return dest
217 * @stable ICU 4.4
218 */
219 virtual UnicodeString &
220 normalize(const UnicodeString &src,
221 UnicodeString &dest,
222 UErrorCode &errorCode) const = 0;
223
224 /**
225 * Normalizes a UTF-8 string and optionally records how source substrings
226 * relate to changed and unchanged result substrings.
227 *
228 * Implemented completely for all built-in modes except for FCD.
229 * The base class implementation converts to & from UTF-16 and does not support edits.
230 *
231 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
232 * @param src Source UTF-8 string.
233 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
234 * sink.Flush() is called at the end.
235 * @param edits Records edits for index mapping, working with styled text,
236 * and getting only changes (if any).
237 * The Edits contents is undefined if any error occurs.
238 * This function calls edits->reset() first unless
239 * options includes U_EDITS_NO_RESET. edits can be nullptr.
240 * @param errorCode Standard ICU error code. Its input value must
241 * pass the U_SUCCESS() test, or else the function returns
242 * immediately. Check for U_FAILURE() on output or use with
243 * function chaining. (See User Guide for details.)
244 * @stable ICU 60
245 */
246 virtual void
247 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248 Edits *edits, UErrorCode &errorCode) const;
249
250 /**
251 * Appends the normalized form of the second string to the first string
252 * (merging them at the boundary) and returns the first string.
253 * The result is normalized if the first string was normalized.
254 * The first and second strings must be different objects.
255 * @param first string, should be normalized
256 * @param second string, will be normalized
257 * @param errorCode Standard ICU error code. Its input value must
258 * pass the U_SUCCESS() test, or else the function returns
259 * immediately. Check for U_FAILURE() on output or use with
260 * function chaining. (See User Guide for details.)
261 * @return first
262 * @stable ICU 4.4
263 */
264 virtual UnicodeString &
265 normalizeSecondAndAppend(UnicodeString &first,
266 const UnicodeString &second,
267 UErrorCode &errorCode) const = 0;
268 /**
269 * Appends the second string to the first string
270 * (merging them at the boundary) and returns the first string.
271 * The result is normalized if both the strings were normalized.
272 * The first and second strings must be different objects.
273 * @param first string, should be normalized
274 * @param second string, should be normalized
275 * @param errorCode Standard ICU error code. Its input value must
276 * pass the U_SUCCESS() test, or else the function returns
277 * immediately. Check for U_FAILURE() on output or use with
278 * function chaining. (See User Guide for details.)
279 * @return first
280 * @stable ICU 4.4
281 */
282 virtual UnicodeString &
283 append(UnicodeString &first,
284 const UnicodeString &second,
285 UErrorCode &errorCode) const = 0;
286
287 /**
288 * Gets the decomposition mapping of c.
289 * Roughly equivalent to normalizing the String form of c
290 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
291 * returns false and does not write a string
292 * if c does not have a decomposition mapping in this instance's data.
293 * This function is independent of the mode of the Normalizer2.
294 * @param c code point
295 * @param decomposition String object which will be set to c's
296 * decomposition mapping, if there is one.
297 * @return true if c has a decomposition, otherwise false
298 * @stable ICU 4.6
299 */
300 virtual UBool
301 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302
303 /**
304 * Gets the raw decomposition mapping of c.
305 *
306 * This is similar to the getDecomposition() method but returns the
307 * raw decomposition mapping as specified in UnicodeData.txt or
308 * (for custom data) in the mapping files processed by the gennorm2 tool.
309 * By contrast, getDecomposition() returns the processed,
310 * recursively-decomposed version of this mapping.
311 *
312 * When used on a standard NFKC Normalizer2 instance,
313 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
314 *
315 * When used on a standard NFC Normalizer2 instance,
316 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
317 * in this case, the result contains either one or two code points (=1..4 char16_ts).
318 *
319 * This function is independent of the mode of the Normalizer2.
320 * The default implementation returns false.
321 * @param c code point
322 * @param decomposition String object which will be set to c's
323 * raw decomposition mapping, if there is one.
324 * @return true if c has a decomposition, otherwise false
325 * @stable ICU 49
326 */
327 virtual UBool
328 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329
330 /**
331 * Performs pairwise composition of a & b and returns the composite if there is one.
332 *
333 * Returns a composite code point c only if c has a two-way mapping to a+b.
334 * In standard Unicode normalization, this means that
335 * c has a canonical decomposition to a+b
336 * and c does not have the Full_Composition_Exclusion property.
337 *
338 * This function is independent of the mode of the Normalizer2.
339 * The default implementation returns a negative value.
340 * @param a A (normalization starter) code point.
341 * @param b Another code point.
342 * @return The non-negative composite code point if there is one; otherwise a negative value.
343 * @stable ICU 49
344 */
345 virtual UChar32
346 composePair(UChar32 a, UChar32 b) const;
347
348 /**
349 * Gets the combining class of c.
350 * The default implementation returns 0
351 * but all standard implementations return the Unicode Canonical_Combining_Class value.
352 * @param c code point
353 * @return c's combining class
354 * @stable ICU 49
355 */
356 virtual uint8_t
357 getCombiningClass(UChar32 c) const;
358
359 /**
360 * Tests if the string is normalized.
361 * Internally, in cases where the quickCheck() method would return "maybe"
362 * (which is only possible for the two COMPOSE modes) this method
363 * resolves to "yes" or "no" to provide a definitive result,
364 * at the cost of doing more work in those cases.
365 * @param s input string
366 * @param errorCode Standard ICU error code. Its input value must
367 * pass the U_SUCCESS() test, or else the function returns
368 * immediately. Check for U_FAILURE() on output or use with
369 * function chaining. (See User Guide for details.)
370 * @return true if s is normalized
371 * @stable ICU 4.4
372 */
373 virtual UBool
374 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
375 /**
376 * Tests if the UTF-8 string is normalized.
377 * Internally, in cases where the quickCheck() method would return "maybe"
378 * (which is only possible for the two COMPOSE modes) this method
379 * resolves to "yes" or "no" to provide a definitive result,
380 * at the cost of doing more work in those cases.
381 *
382 * This works for all normalization modes.
383 * It is optimized for UTF-8 for all built-in modes except for FCD.
384 * The base class implementation converts to UTF-16 and calls isNormalized().
385 *
386 * @param s UTF-8 input string
387 * @param errorCode Standard ICU error code. Its input value must
388 * pass the U_SUCCESS() test, or else the function returns
389 * immediately. Check for U_FAILURE() on output or use with
390 * function chaining. (See User Guide for details.)
391 * @return true if s is normalized
392 * @stable ICU 60
393 */
394 virtual UBool
395 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
396
397
398 /**
399 * Tests if the string is normalized.
400 * For the two COMPOSE modes, the result could be "maybe" in cases that
401 * would take a little more work to resolve definitively.
402 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
403 * combination of quick check + normalization, to avoid
404 * re-checking the "yes" prefix.
405 * @param s input string
406 * @param errorCode Standard ICU error code. Its input value must
407 * pass the U_SUCCESS() test, or else the function returns
408 * immediately. Check for U_FAILURE() on output or use with
409 * function chaining. (See User Guide for details.)
410 * @return UNormalizationCheckResult
411 * @stable ICU 4.4
412 */
413 virtual UNormalizationCheckResult
414 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
415
416 /**
417 * Returns the end of the normalized substring of the input string.
418 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
419 * the substring <code>UnicodeString(s, 0, end)</code>
420 * will pass the quick check with a "yes" result.
421 *
422 * The returned end index is usually one or more characters before the
423 * "no" or "maybe" character: The end index is at a normalization boundary.
424 * (See the class documentation for more about normalization boundaries.)
425 *
426 * When the goal is a normalized string and most input strings are expected
427 * to be normalized already, then call this method,
428 * and if it returns a prefix shorter than the input string,
429 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
430 * @param s input string
431 * @param errorCode Standard ICU error code. Its input value must
432 * pass the U_SUCCESS() test, or else the function returns
433 * immediately. Check for U_FAILURE() on output or use with
434 * function chaining. (See User Guide for details.)
435 * @return "yes" span end index
436 * @stable ICU 4.4
437 */
438 virtual int32_t
439 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
440
441 /**
442 * Tests if the character always has a normalization boundary before it,
443 * regardless of context.
444 * If true, then the character does not normalization-interact with
445 * preceding characters.
446 * In other words, a string containing this character can be normalized
447 * by processing portions before this character and starting from this
448 * character independently.
449 * This is used for iterative normalization. See the class documentation for details.
450 * @param c character to test
451 * @return true if c has a normalization boundary before it
452 * @stable ICU 4.4
453 */
454 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
455
456 /**
457 * Tests if the character always has a normalization boundary after it,
458 * regardless of context.
459 * If true, then the character does not normalization-interact with
460 * following characters.
461 * In other words, a string containing this character can be normalized
462 * by processing portions up to this character and after this
463 * character independently.
464 * This is used for iterative normalization. See the class documentation for details.
465 * Note that this operation may be significantly slower than hasBoundaryBefore().
466 * @param c character to test
467 * @return true if c has a normalization boundary after it
468 * @stable ICU 4.4
469 */
470 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
471
472 /**
473 * Tests if the character is normalization-inert.
474 * If true, then the character does not change, nor normalization-interact with
475 * preceding or following characters.
476 * In other words, a string containing this character can be normalized
477 * by processing portions before this character and after this
478 * character independently.
479 * This is used for iterative normalization. See the class documentation for details.
480 * Note that this operation may be significantly slower than hasBoundaryBefore().
481 * @param c character to test
482 * @return true if c is normalization-inert
483 * @stable ICU 4.4
484 */
485 virtual UBool isInert(UChar32 c) const = 0;
486};
487
488/**
489 * Normalization filtered by a UnicodeSet.
490 * Normalizes portions of the text contained in the filter set and leaves
491 * portions not contained in the filter set unchanged.
492 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
493 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
494 * This class implements all of (and only) the Normalizer2 API.
495 * An instance of this class is unmodifiable/immutable but is constructed and
496 * must be destructed by the owner.
497 * @stable ICU 4.4
498 */
499class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
500public:
501 /**
502 * Constructs a filtered normalizer wrapping any Normalizer2 instance
503 * and a filter set.
504 * Both are aliased and must not be modified or deleted while this object
505 * is used.
506 * The filter set should be frozen; otherwise the performance will suffer greatly.
507 * @param n2 wrapped Normalizer2 instance
508 * @param filterSet UnicodeSet which determines the characters to be normalized
509 * @stable ICU 4.4
510 */
511 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
512 norm2(n2), set(filterSet) {}
513
514 /**
515 * Destructor.
516 * @stable ICU 4.4
517 */
518 ~FilteredNormalizer2();
519
520 /**
521 * Writes the normalized form of the source string to the destination string
522 * (replacing its contents) and returns the destination string.
523 * The source and destination strings must be different objects.
524 * @param src source string
525 * @param dest destination string; its contents is replaced with normalized src
526 * @param errorCode Standard ICU error code. Its input value must
527 * pass the U_SUCCESS() test, or else the function returns
528 * immediately. Check for U_FAILURE() on output or use with
529 * function chaining. (See User Guide for details.)
530 * @return dest
531 * @stable ICU 4.4
532 */
533 virtual UnicodeString &
534 normalize(const UnicodeString &src,
535 UnicodeString &dest,
536 UErrorCode &errorCode) const override;
537
538 /**
539 * Normalizes a UTF-8 string and optionally records how source substrings
540 * relate to changed and unchanged result substrings.
541 *
542 * Implemented completely for most built-in modes except for FCD.
543 * The base class implementation converts to & from UTF-16 and does not support edits.
544 *
545 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
546 * @param src Source UTF-8 string.
547 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
548 * sink.Flush() is called at the end.
549 * @param edits Records edits for index mapping, working with styled text,
550 * and getting only changes (if any).
551 * The Edits contents is undefined if any error occurs.
552 * This function calls edits->reset() first unless
553 * options includes U_EDITS_NO_RESET. edits can be nullptr.
554 * @param errorCode Standard ICU error code. Its input value must
555 * pass the U_SUCCESS() test, or else the function returns
556 * immediately. Check for U_FAILURE() on output or use with
557 * function chaining. (See User Guide for details.)
558 * @stable ICU 60
559 */
560 virtual void
561 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
562 Edits *edits, UErrorCode &errorCode) const override;
563
564 /**
565 * Appends the normalized form of the second string to the first string
566 * (merging them at the boundary) and returns the first string.
567 * The result is normalized if the first string was normalized.
568 * The first and second strings must be different objects.
569 * @param first string, should be normalized
570 * @param second string, will be normalized
571 * @param errorCode Standard ICU error code. Its input value must
572 * pass the U_SUCCESS() test, or else the function returns
573 * immediately. Check for U_FAILURE() on output or use with
574 * function chaining. (See User Guide for details.)
575 * @return first
576 * @stable ICU 4.4
577 */
578 virtual UnicodeString &
579 normalizeSecondAndAppend(UnicodeString &first,
580 const UnicodeString &second,
581 UErrorCode &errorCode) const override;
582 /**
583 * Appends the second string to the first string
584 * (merging them at the boundary) and returns the first string.
585 * The result is normalized if both the strings were normalized.
586 * The first and second strings must be different objects.
587 * @param first string, should be normalized
588 * @param second string, should be normalized
589 * @param errorCode Standard ICU error code. Its input value must
590 * pass the U_SUCCESS() test, or else the function returns
591 * immediately. Check for U_FAILURE() on output or use with
592 * function chaining. (See User Guide for details.)
593 * @return first
594 * @stable ICU 4.4
595 */
596 virtual UnicodeString &
597 append(UnicodeString &first,
598 const UnicodeString &second,
599 UErrorCode &errorCode) const override;
600
601 /**
602 * Gets the decomposition mapping of c.
603 * For details see the base class documentation.
604 *
605 * This function is independent of the mode of the Normalizer2.
606 * @param c code point
607 * @param decomposition String object which will be set to c's
608 * decomposition mapping, if there is one.
609 * @return true if c has a decomposition, otherwise false
610 * @stable ICU 4.6
611 */
612 virtual UBool
613 getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
614
615 /**
616 * Gets the raw decomposition mapping of c.
617 * For details see the base class documentation.
618 *
619 * This function is independent of the mode of the Normalizer2.
620 * @param c code point
621 * @param decomposition String object which will be set to c's
622 * raw decomposition mapping, if there is one.
623 * @return true if c has a decomposition, otherwise false
624 * @stable ICU 49
625 */
626 virtual UBool
627 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
628
629 /**
630 * Performs pairwise composition of a & b and returns the composite if there is one.
631 * For details see the base class documentation.
632 *
633 * This function is independent of the mode of the Normalizer2.
634 * @param a A (normalization starter) code point.
635 * @param b Another code point.
636 * @return The non-negative composite code point if there is one; otherwise a negative value.
637 * @stable ICU 49
638 */
639 virtual UChar32
640 composePair(UChar32 a, UChar32 b) const override;
641
642 /**
643 * Gets the combining class of c.
644 * The default implementation returns 0
645 * but all standard implementations return the Unicode Canonical_Combining_Class value.
646 * @param c code point
647 * @return c's combining class
648 * @stable ICU 49
649 */
650 virtual uint8_t
651 getCombiningClass(UChar32 c) const override;
652
653 /**
654 * Tests if the string is normalized.
655 * For details see the Normalizer2 base class documentation.
656 * @param s input string
657 * @param errorCode Standard ICU error code. Its input value must
658 * pass the U_SUCCESS() test, or else the function returns
659 * immediately. Check for U_FAILURE() on output or use with
660 * function chaining. (See User Guide for details.)
661 * @return true if s is normalized
662 * @stable ICU 4.4
663 */
664 virtual UBool
665 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
666 /**
667 * Tests if the UTF-8 string is normalized.
668 * Internally, in cases where the quickCheck() method would return "maybe"
669 * (which is only possible for the two COMPOSE modes) this method
670 * resolves to "yes" or "no" to provide a definitive result,
671 * at the cost of doing more work in those cases.
672 *
673 * This works for all normalization modes.
674 * It is optimized for UTF-8 for all built-in modes except for FCD.
675 * The base class implementation converts to UTF-16 and calls isNormalized().
676 *
677 * @param s UTF-8 input string
678 * @param errorCode Standard ICU error code. Its input value must
679 * pass the U_SUCCESS() test, or else the function returns
680 * immediately. Check for U_FAILURE() on output or use with
681 * function chaining. (See User Guide for details.)
682 * @return true if s is normalized
683 * @stable ICU 60
684 */
685 virtual UBool
686 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
687 /**
688 * Tests if the string is normalized.
689 * For details see the Normalizer2 base class documentation.
690 * @param s input string
691 * @param errorCode Standard ICU error code. Its input value must
692 * pass the U_SUCCESS() test, or else the function returns
693 * immediately. Check for U_FAILURE() on output or use with
694 * function chaining. (See User Guide for details.)
695 * @return UNormalizationCheckResult
696 * @stable ICU 4.4
697 */
698 virtual UNormalizationCheckResult
699 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
700 /**
701 * Returns the end of the normalized substring of the input string.
702 * For details see the Normalizer2 base class documentation.
703 * @param s input string
704 * @param errorCode Standard ICU error code. Its input value must
705 * pass the U_SUCCESS() test, or else the function returns
706 * immediately. Check for U_FAILURE() on output or use with
707 * function chaining. (See User Guide for details.)
708 * @return "yes" span end index
709 * @stable ICU 4.4
710 */
711 virtual int32_t
712 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
713
714 /**
715 * Tests if the character always has a normalization boundary before it,
716 * regardless of context.
717 * For details see the Normalizer2 base class documentation.
718 * @param c character to test
719 * @return true if c has a normalization boundary before it
720 * @stable ICU 4.4
721 */
722 virtual UBool hasBoundaryBefore(UChar32 c) const override;
723
724 /**
725 * Tests if the character always has a normalization boundary after it,
726 * regardless of context.
727 * For details see the Normalizer2 base class documentation.
728 * @param c character to test
729 * @return true if c has a normalization boundary after it
730 * @stable ICU 4.4
731 */
732 virtual UBool hasBoundaryAfter(UChar32 c) const override;
733
734 /**
735 * Tests if the character is normalization-inert.
736 * For details see the Normalizer2 base class documentation.
737 * @param c character to test
738 * @return true if c is normalization-inert
739 * @stable ICU 4.4
740 */
741 virtual UBool isInert(UChar32 c) const override;
742private:
743 UnicodeString &
744 normalize(const UnicodeString &src,
745 UnicodeString &dest,
746 USetSpanCondition spanCondition,
747 UErrorCode &errorCode) const;
748
749 void
750 normalizeUTF8(uint32_t options, const char *src, int32_t length,
751 ByteSink &sink, Edits *edits,
752 USetSpanCondition spanCondition,
753 UErrorCode &errorCode) const;
754
755 UnicodeString &
756 normalizeSecondAndAppend(UnicodeString &first,
757 const UnicodeString &second,
758 UBool doNormalize,
759 UErrorCode &errorCode) const;
760
761 const Normalizer2 &norm2;
762 const UnicodeSet &set;
763};
764
765U_NAMESPACE_END
766
767#endif // !UCONFIG_NO_NORMALIZATION
768
769#endif /* U_SHOW_CPLUSPLUS_API */
770
771#endif // __NORMALIZER2_H__
772