| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * Copyright (C) 2014, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ******************************************************************************* |
| 8 | * norm2allmodes.h |
| 9 | * |
| 10 | * created on: 2014sep07 |
| 11 | * created by: Markus W. Scherer |
| 12 | */ |
| 13 | |
| 14 | #ifndef __NORM2ALLMODES_H__ |
| 15 | #define __NORM2ALLMODES_H__ |
| 16 | |
| 17 | #include "unicode/utypes.h" |
| 18 | |
| 19 | #if !UCONFIG_NO_NORMALIZATION |
| 20 | |
| 21 | #include "unicode/edits.h" |
| 22 | #include "unicode/normalizer2.h" |
| 23 | #include "unicode/stringoptions.h" |
| 24 | #include "unicode/unistr.h" |
| 25 | #include "cpputils.h" |
| 26 | #include "normalizer2impl.h" |
| 27 | |
| 28 | U_NAMESPACE_BEGIN |
| 29 | |
| 30 | // Intermediate class: |
| 31 | // Has Normalizer2Impl and does boilerplate argument checking and setup. |
| 32 | class Normalizer2WithImpl : public Normalizer2 { |
| 33 | public: |
| 34 | Normalizer2WithImpl(const Normalizer2Impl &ni) : impl(ni) {} |
| 35 | virtual ~Normalizer2WithImpl(); |
| 36 | |
| 37 | // normalize |
| 38 | virtual UnicodeString & |
| 39 | normalize(const UnicodeString &src, |
| 40 | UnicodeString &dest, |
| 41 | UErrorCode &errorCode) const override { |
| 42 | if(U_FAILURE(errorCode)) { |
| 43 | dest.setToBogus(); |
| 44 | return dest; |
| 45 | } |
| 46 | const char16_t *sArray=src.getBuffer(); |
| 47 | if(&dest==&src || sArray==nullptr) { |
| 48 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 49 | dest.setToBogus(); |
| 50 | return dest; |
| 51 | } |
| 52 | dest.remove(); |
| 53 | ReorderingBuffer buffer(impl, dest); |
| 54 | if(buffer.init(src.length(), errorCode)) { |
| 55 | normalize(sArray, sArray+src.length(), buffer, errorCode); |
| 56 | } |
| 57 | return dest; |
| 58 | } |
| 59 | virtual void |
| 60 | normalize(const char16_t *src, const char16_t *limit, |
| 61 | ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0; |
| 62 | |
| 63 | // normalize and append |
| 64 | virtual UnicodeString & |
| 65 | normalizeSecondAndAppend(UnicodeString &first, |
| 66 | const UnicodeString &second, |
| 67 | UErrorCode &errorCode) const override { |
| 68 | return normalizeSecondAndAppend(first, second, true, errorCode); |
| 69 | } |
| 70 | virtual UnicodeString & |
| 71 | append(UnicodeString &first, |
| 72 | const UnicodeString &second, |
| 73 | UErrorCode &errorCode) const override { |
| 74 | return normalizeSecondAndAppend(first, second, false, errorCode); |
| 75 | } |
| 76 | UnicodeString & |
| 77 | normalizeSecondAndAppend(UnicodeString &first, |
| 78 | const UnicodeString &second, |
| 79 | UBool doNormalize, |
| 80 | UErrorCode &errorCode) const { |
| 81 | uprv_checkCanGetBuffer(first, errorCode); |
| 82 | if(U_FAILURE(errorCode)) { |
| 83 | return first; |
| 84 | } |
| 85 | const char16_t *secondArray=second.getBuffer(); |
| 86 | if(&first==&second || secondArray==nullptr) { |
| 87 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 88 | return first; |
| 89 | } |
| 90 | int32_t firstLength=first.length(); |
| 91 | UnicodeString safeMiddle; |
| 92 | { |
| 93 | ReorderingBuffer buffer(impl, first); |
| 94 | if(buffer.init(firstLength+second.length(), errorCode)) { |
| 95 | normalizeAndAppend(secondArray, secondArray+second.length(), doNormalize, |
| 96 | safeMiddle, buffer, errorCode); |
| 97 | } |
| 98 | } // The ReorderingBuffer destructor finalizes the first string. |
| 99 | if(U_FAILURE(errorCode)) { |
| 100 | // Restore the modified suffix of the first string. |
| 101 | first.replace(firstLength-safeMiddle.length(), 0x7fffffff, safeMiddle); |
| 102 | } |
| 103 | return first; |
| 104 | } |
| 105 | virtual void |
| 106 | normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize, |
| 107 | UnicodeString &safeMiddle, |
| 108 | ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0; |
| 109 | virtual UBool |
| 110 | getDecomposition(UChar32 c, UnicodeString &decomposition) const override { |
| 111 | char16_t buffer[4]; |
| 112 | int32_t length; |
| 113 | const char16_t *d=impl.getDecomposition(c, buffer, length); |
| 114 | if(d==nullptr) { |
| 115 | return false; |
| 116 | } |
| 117 | if(d==buffer) { |
| 118 | decomposition.setTo(buffer, length); // copy the string (Jamos from Hangul syllable c) |
| 119 | } else { |
| 120 | decomposition.setTo(false, d, length); // read-only alias |
| 121 | } |
| 122 | return true; |
| 123 | } |
| 124 | virtual UBool |
| 125 | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override { |
| 126 | char16_t buffer[30]; |
| 127 | int32_t length; |
| 128 | const char16_t *d=impl.getRawDecomposition(c, buffer, length); |
| 129 | if(d==nullptr) { |
| 130 | return false; |
| 131 | } |
| 132 | if(d==buffer) { |
| 133 | decomposition.setTo(buffer, length); // copy the string (algorithmic decomposition) |
| 134 | } else { |
| 135 | decomposition.setTo(false, d, length); // read-only alias |
| 136 | } |
| 137 | return true; |
| 138 | } |
| 139 | virtual UChar32 |
| 140 | composePair(UChar32 a, UChar32 b) const override { |
| 141 | return impl.composePair(a, b); |
| 142 | } |
| 143 | |
| 144 | virtual uint8_t |
| 145 | getCombiningClass(UChar32 c) const override { |
| 146 | return impl.getCC(impl.getNorm16(c)); |
| 147 | } |
| 148 | |
| 149 | // quick checks |
| 150 | virtual UBool |
| 151 | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override { |
| 152 | if(U_FAILURE(errorCode)) { |
| 153 | return false; |
| 154 | } |
| 155 | const char16_t *sArray=s.getBuffer(); |
| 156 | if(sArray==nullptr) { |
| 157 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 158 | return false; |
| 159 | } |
| 160 | const char16_t *sLimit=sArray+s.length(); |
| 161 | return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode); |
| 162 | } |
| 163 | virtual UNormalizationCheckResult |
| 164 | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override { |
| 165 | return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO; |
| 166 | } |
| 167 | virtual int32_t |
| 168 | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override { |
| 169 | if(U_FAILURE(errorCode)) { |
| 170 | return 0; |
| 171 | } |
| 172 | const char16_t *sArray=s.getBuffer(); |
| 173 | if(sArray==nullptr) { |
| 174 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 175 | return 0; |
| 176 | } |
| 177 | return (int32_t)(spanQuickCheckYes(sArray, sArray+s.length(), errorCode)-sArray); |
| 178 | } |
| 179 | virtual const char16_t * |
| 180 | spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const = 0; |
| 181 | |
| 182 | virtual UNormalizationCheckResult getQuickCheck(UChar32) const { |
| 183 | return UNORM_YES; |
| 184 | } |
| 185 | |
| 186 | const Normalizer2Impl &impl; |
| 187 | }; |
| 188 | |
| 189 | class DecomposeNormalizer2 : public Normalizer2WithImpl { |
| 190 | public: |
| 191 | DecomposeNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {} |
| 192 | virtual ~DecomposeNormalizer2(); |
| 193 | |
| 194 | private: |
| 195 | virtual void |
| 196 | normalize(const char16_t *src, const char16_t *limit, |
| 197 | ReorderingBuffer &buffer, UErrorCode &errorCode) const override { |
| 198 | impl.decompose(src, limit, &buffer, errorCode); |
| 199 | } |
| 200 | using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function. |
| 201 | virtual void |
| 202 | normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize, |
| 203 | UnicodeString &safeMiddle, |
| 204 | ReorderingBuffer &buffer, UErrorCode &errorCode) const override { |
| 205 | impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode); |
| 206 | } |
| 207 | |
| 208 | void |
| 209 | normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
| 210 | Edits *edits, UErrorCode &errorCode) const override { |
| 211 | if (U_FAILURE(errorCode)) { |
| 212 | return; |
| 213 | } |
| 214 | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
| 215 | edits->reset(); |
| 216 | } |
| 217 | const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data()); |
| 218 | impl.decomposeUTF8(options, s, s + src.length(), &sink, edits, errorCode); |
| 219 | sink.Flush(); |
| 220 | } |
| 221 | virtual UBool |
| 222 | isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override { |
| 223 | if(U_FAILURE(errorCode)) { |
| 224 | return false; |
| 225 | } |
| 226 | const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data()); |
| 227 | const uint8_t *sLimit = s + sp.length(); |
| 228 | return sLimit == impl.decomposeUTF8(0, s, sLimit, nullptr, nullptr, errorCode); |
| 229 | } |
| 230 | |
| 231 | virtual const char16_t * |
| 232 | spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override { |
| 233 | return impl.decompose(src, limit, nullptr, errorCode); |
| 234 | } |
| 235 | using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function. |
| 236 | virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override { |
| 237 | return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO; |
| 238 | } |
| 239 | virtual UBool hasBoundaryBefore(UChar32 c) const override { |
| 240 | return impl.hasDecompBoundaryBefore(c); |
| 241 | } |
| 242 | virtual UBool hasBoundaryAfter(UChar32 c) const override { |
| 243 | return impl.hasDecompBoundaryAfter(c); |
| 244 | } |
| 245 | virtual UBool isInert(UChar32 c) const override { |
| 246 | return impl.isDecompInert(c); |
| 247 | } |
| 248 | }; |
| 249 | |
| 250 | class ComposeNormalizer2 : public Normalizer2WithImpl { |
| 251 | public: |
| 252 | ComposeNormalizer2(const Normalizer2Impl &ni, UBool fcc) : |
| 253 | Normalizer2WithImpl(ni), onlyContiguous(fcc) {} |
| 254 | virtual ~ComposeNormalizer2(); |
| 255 | |
| 256 | private: |
| 257 | virtual void |
| 258 | normalize(const char16_t *src, const char16_t *limit, |
| 259 | ReorderingBuffer &buffer, UErrorCode &errorCode) const override { |
| 260 | impl.compose(src, limit, onlyContiguous, true, buffer, errorCode); |
| 261 | } |
| 262 | using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function. |
| 263 | |
| 264 | void |
| 265 | normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
| 266 | Edits *edits, UErrorCode &errorCode) const override { |
| 267 | if (U_FAILURE(errorCode)) { |
| 268 | return; |
| 269 | } |
| 270 | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
| 271 | edits->reset(); |
| 272 | } |
| 273 | const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data()); |
| 274 | impl.composeUTF8(options, onlyContiguous, s, s + src.length(), |
| 275 | &sink, edits, errorCode); |
| 276 | sink.Flush(); |
| 277 | } |
| 278 | |
| 279 | virtual void |
| 280 | normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize, |
| 281 | UnicodeString &safeMiddle, |
| 282 | ReorderingBuffer &buffer, UErrorCode &errorCode) const override { |
| 283 | impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, safeMiddle, buffer, errorCode); |
| 284 | } |
| 285 | |
| 286 | virtual UBool |
| 287 | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override { |
| 288 | if(U_FAILURE(errorCode)) { |
| 289 | return false; |
| 290 | } |
| 291 | const char16_t *sArray=s.getBuffer(); |
| 292 | if(sArray==nullptr) { |
| 293 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 294 | return false; |
| 295 | } |
| 296 | UnicodeString temp; |
| 297 | ReorderingBuffer buffer(impl, temp); |
| 298 | if(!buffer.init(5, errorCode)) { // small destCapacity for substring normalization |
| 299 | return false; |
| 300 | } |
| 301 | return impl.compose(sArray, sArray+s.length(), onlyContiguous, false, buffer, errorCode); |
| 302 | } |
| 303 | virtual UBool |
| 304 | isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override { |
| 305 | if(U_FAILURE(errorCode)) { |
| 306 | return false; |
| 307 | } |
| 308 | const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data()); |
| 309 | return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode); |
| 310 | } |
| 311 | virtual UNormalizationCheckResult |
| 312 | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override { |
| 313 | if(U_FAILURE(errorCode)) { |
| 314 | return UNORM_MAYBE; |
| 315 | } |
| 316 | const char16_t *sArray=s.getBuffer(); |
| 317 | if(sArray==nullptr) { |
| 318 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 319 | return UNORM_MAYBE; |
| 320 | } |
| 321 | UNormalizationCheckResult qcResult=UNORM_YES; |
| 322 | impl.composeQuickCheck(sArray, sArray+s.length(), onlyContiguous, &qcResult); |
| 323 | return qcResult; |
| 324 | } |
| 325 | virtual const char16_t * |
| 326 | spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &) const override { |
| 327 | return impl.composeQuickCheck(src, limit, onlyContiguous, nullptr); |
| 328 | } |
| 329 | using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function. |
| 330 | virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override { |
| 331 | return impl.getCompQuickCheck(impl.getNorm16(c)); |
| 332 | } |
| 333 | virtual UBool hasBoundaryBefore(UChar32 c) const override { |
| 334 | return impl.hasCompBoundaryBefore(c); |
| 335 | } |
| 336 | virtual UBool hasBoundaryAfter(UChar32 c) const override { |
| 337 | return impl.hasCompBoundaryAfter(c, onlyContiguous); |
| 338 | } |
| 339 | virtual UBool isInert(UChar32 c) const override { |
| 340 | return impl.isCompInert(c, onlyContiguous); |
| 341 | } |
| 342 | |
| 343 | const UBool onlyContiguous; |
| 344 | }; |
| 345 | |
| 346 | class FCDNormalizer2 : public Normalizer2WithImpl { |
| 347 | public: |
| 348 | FCDNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {} |
| 349 | virtual ~FCDNormalizer2(); |
| 350 | |
| 351 | private: |
| 352 | virtual void |
| 353 | normalize(const char16_t *src, const char16_t *limit, |
| 354 | ReorderingBuffer &buffer, UErrorCode &errorCode) const override { |
| 355 | impl.makeFCD(src, limit, &buffer, errorCode); |
| 356 | } |
| 357 | using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function. |
| 358 | virtual void |
| 359 | normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize, |
| 360 | UnicodeString &safeMiddle, |
| 361 | ReorderingBuffer &buffer, UErrorCode &errorCode) const override { |
| 362 | impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode); |
| 363 | } |
| 364 | virtual const char16_t * |
| 365 | spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override { |
| 366 | return impl.makeFCD(src, limit, nullptr, errorCode); |
| 367 | } |
| 368 | using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function. |
| 369 | virtual UBool hasBoundaryBefore(UChar32 c) const override { |
| 370 | return impl.hasFCDBoundaryBefore(c); |
| 371 | } |
| 372 | virtual UBool hasBoundaryAfter(UChar32 c) const override { |
| 373 | return impl.hasFCDBoundaryAfter(c); |
| 374 | } |
| 375 | virtual UBool isInert(UChar32 c) const override { |
| 376 | return impl.isFCDInert(c); |
| 377 | } |
| 378 | }; |
| 379 | |
| 380 | struct Norm2AllModes : public UMemory { |
| 381 | Norm2AllModes(Normalizer2Impl *i) |
| 382 | : impl(i), comp(*i, false), decomp(*i), fcd(*i), fcc(*i, true) {} |
| 383 | ~Norm2AllModes(); |
| 384 | |
| 385 | static Norm2AllModes *createInstance(Normalizer2Impl *impl, UErrorCode &errorCode); |
| 386 | static Norm2AllModes *createNFCInstance(UErrorCode &errorCode); |
| 387 | static Norm2AllModes *createInstance(const char *packageName, |
| 388 | const char *name, |
| 389 | UErrorCode &errorCode); |
| 390 | |
| 391 | static const Norm2AllModes *getNFCInstance(UErrorCode &errorCode); |
| 392 | static const Norm2AllModes *getNFKCInstance(UErrorCode &errorCode); |
| 393 | static const Norm2AllModes *getNFKC_CFInstance(UErrorCode &errorCode); |
| 394 | |
| 395 | Normalizer2Impl *impl; |
| 396 | ComposeNormalizer2 comp; |
| 397 | DecomposeNormalizer2 decomp; |
| 398 | FCDNormalizer2 fcd; |
| 399 | ComposeNormalizer2 fcc; |
| 400 | }; |
| 401 | |
| 402 | U_NAMESPACE_END |
| 403 | |
| 404 | #endif // !UCONFIG_NO_NORMALIZATION |
| 405 | #endif // __NORM2ALLMODES_H__ |
| 406 | |