| 1 | // Licensed to the .NET Foundation under one or more agreements. |
| 2 | // The .NET Foundation licenses this file to you under the MIT license. |
| 3 | // See the LICENSE file in the project root for more information. |
| 4 | |
| 5 | /*++ |
| 6 | |
| 7 | |
| 8 | |
| 9 | Module Name: |
| 10 | |
| 11 | unicode/utf8.c |
| 12 | |
| 13 | Abstract: |
| 14 | Functions to encode and decode UTF-8 strings. This is a port of the C# version from mscorlib. |
| 15 | |
| 16 | Revision History: |
| 17 | |
| 18 | |
| 19 | |
| 20 | --*/ |
| 21 | |
| 22 | #include "pal/utf8.h" |
| 23 | #include "pal/malloc.hpp" |
| 24 | |
| 25 | using namespace CorUnix; |
| 26 | |
| 27 | #define FASTLOOP |
| 28 | |
| 29 | struct CharUnicodeInfo |
| 30 | { |
| 31 | static const WCHAR HIGH_SURROGATE_START = 0xd800; |
| 32 | static const WCHAR HIGH_SURROGATE_END = 0xdbff; |
| 33 | static const WCHAR LOW_SURROGATE_START = 0xdc00; |
| 34 | static const WCHAR LOW_SURROGATE_END = 0xdfff; |
| 35 | }; |
| 36 | |
| 37 | struct Char |
| 38 | { |
| 39 | // Test if the wide character is a high surrogate |
| 40 | static bool IsHighSurrogate(const WCHAR c) |
| 41 | { |
| 42 | return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START; |
| 43 | } |
| 44 | |
| 45 | // Test if the wide character is a low surrogate |
| 46 | static bool IsLowSurrogate(const WCHAR c) |
| 47 | { |
| 48 | return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START; |
| 49 | } |
| 50 | |
| 51 | // Test if the wide character is a low surrogate |
| 52 | static bool IsSurrogate(const WCHAR c) |
| 53 | { |
| 54 | return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START; |
| 55 | } |
| 56 | |
| 57 | // Test if the wide character is a high surrogate |
| 58 | static bool IsHighSurrogate(const WCHAR* s, int index) |
| 59 | { |
| 60 | return IsHighSurrogate(s[index]); |
| 61 | } |
| 62 | |
| 63 | // Test if the wide character is a low surrogate |
| 64 | static bool IsLowSurrogate(const WCHAR* s, int index) |
| 65 | { |
| 66 | return IsLowSurrogate(s[index]); |
| 67 | } |
| 68 | |
| 69 | // Test if the wide character is a low surrogate |
| 70 | static bool IsSurrogate(const WCHAR* s, int index) |
| 71 | { |
| 72 | return IsSurrogate(s[index]); |
| 73 | } |
| 74 | }; |
| 75 | |
| 76 | class ArgumentException |
| 77 | { |
| 78 | |
| 79 | public: |
| 80 | ArgumentException(LPCSTR message) |
| 81 | { |
| 82 | } |
| 83 | |
| 84 | ArgumentException(LPCSTR message, LPCSTR argName) |
| 85 | { |
| 86 | } |
| 87 | }; |
| 88 | |
| 89 | class ArgumentNullException : public ArgumentException |
| 90 | { |
| 91 | public: |
| 92 | ArgumentNullException(LPCSTR argName) |
| 93 | : ArgumentException("Argument is NULL" , argName) |
| 94 | { |
| 95 | |
| 96 | } |
| 97 | }; |
| 98 | |
| 99 | class ArgumentOutOfRangeException : public ArgumentException |
| 100 | { |
| 101 | public: |
| 102 | ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message) |
| 103 | : ArgumentException(message, argName) |
| 104 | { |
| 105 | |
| 106 | } |
| 107 | }; |
| 108 | |
| 109 | class InsufficientBufferException : public ArgumentException |
| 110 | { |
| 111 | public: |
| 112 | InsufficientBufferException(LPCSTR message, LPCSTR argName) |
| 113 | : ArgumentException(message, argName) |
| 114 | { |
| 115 | |
| 116 | } |
| 117 | }; |
| 118 | |
| 119 | class Contract |
| 120 | { |
| 121 | public: |
| 122 | static void Assert(bool cond, LPCSTR str) |
| 123 | { |
| 124 | if (!cond) |
| 125 | { |
| 126 | throw ArgumentException(str); |
| 127 | } |
| 128 | } |
| 129 | |
| 130 | static void EndContractBlock() |
| 131 | { |
| 132 | } |
| 133 | }; |
| 134 | |
| 135 | class DecoderFallbackException : public ArgumentException |
| 136 | { |
| 137 | BYTE *bytesUnknown; |
| 138 | int index; |
| 139 | |
| 140 | public: |
| 141 | DecoderFallbackException( |
| 142 | LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message) |
| 143 | { |
| 144 | this->bytesUnknown = bytesUnknown; |
| 145 | this->index = index; |
| 146 | } |
| 147 | |
| 148 | BYTE *BytesUnknown() |
| 149 | { |
| 150 | return (bytesUnknown); |
| 151 | } |
| 152 | |
| 153 | int GetIndex() |
| 154 | { |
| 155 | return index; |
| 156 | } |
| 157 | }; |
| 158 | |
class DecoderFallbackBuffer;

// Abstract factory describing how undecodable bytes are handled.
// Concrete fallbacks create the buffer object that produces the replacement
// characters (or throws).
class DecoderFallback
{
public:
    // Polymorphic base: a virtual destructor makes destruction through a
    // DecoderFallback* well defined, matching DecoderFallbackBuffer.
    virtual ~DecoderFallback() = default;

    // Creates the buffer object that carries out this fallback.
    virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0;

    // Maximum number of characters that this instance of this fallback could return.
    virtual int GetMaxCharCount() = 0;
};
| 175 | |
| 176 | class DecoderReplacementFallback : public DecoderFallback |
| 177 | { |
| 178 | // Our variables |
| 179 | WCHAR strDefault[2]; |
| 180 | int strDefaultLength; |
| 181 | |
| 182 | public: |
| 183 | // Construction. Default replacement fallback uses no best fit and ? replacement string |
| 184 | DecoderReplacementFallback() : DecoderReplacementFallback(W("?" )) |
| 185 | { |
| 186 | } |
| 187 | |
| 188 | DecoderReplacementFallback(const WCHAR* replacement) |
| 189 | { |
| 190 | // Must not be null |
| 191 | if (replacement == nullptr) |
| 192 | throw ArgumentNullException("replacement" ); |
| 193 | Contract::EndContractBlock(); |
| 194 | |
| 195 | // Make sure it doesn't have bad surrogate pairs |
| 196 | bool bFoundHigh = false; |
| 197 | int replacementLength = PAL_wcslen((const WCHAR *)replacement); |
| 198 | for (int i = 0; i < replacementLength; i++) |
| 199 | { |
| 200 | // Found a surrogate? |
| 201 | if (Char::IsSurrogate(replacement, i)) |
| 202 | { |
| 203 | // High or Low? |
| 204 | if (Char::IsHighSurrogate(replacement, i)) |
| 205 | { |
| 206 | // if already had a high one, stop |
| 207 | if (bFoundHigh) |
| 208 | break; // break & throw at the bFoundHIgh below |
| 209 | bFoundHigh = true; |
| 210 | } |
| 211 | else |
| 212 | { |
| 213 | // Low, did we have a high? |
| 214 | if (!bFoundHigh) |
| 215 | { |
| 216 | // Didn't have one, make if fail when we stop |
| 217 | bFoundHigh = true; |
| 218 | break; |
| 219 | } |
| 220 | |
| 221 | // Clear flag |
| 222 | bFoundHigh = false; |
| 223 | } |
| 224 | } |
| 225 | // If last was high we're in trouble (not surrogate so not low surrogate, so break) |
| 226 | else if (bFoundHigh) |
| 227 | break; |
| 228 | } |
| 229 | if (bFoundHigh) |
| 230 | throw ArgumentException("String 'replacement' contains invalid Unicode code points." , "replacement" ); |
| 231 | |
| 232 | wcscpy_s(strDefault, sizeof(strDefault), replacement); |
| 233 | strDefaultLength = replacementLength; |
| 234 | } |
| 235 | |
| 236 | WCHAR* GetDefaultString() |
| 237 | { |
| 238 | return strDefault; |
| 239 | } |
| 240 | |
| 241 | virtual DecoderFallbackBuffer* CreateFallbackBuffer(); |
| 242 | |
| 243 | // Maximum number of characters that this instance of this fallback could return |
| 244 | virtual int GetMaxCharCount() |
| 245 | { |
| 246 | return strDefaultLength; |
| 247 | } |
| 248 | }; |
| 249 | |
| 250 | class DecoderFallbackBuffer |
| 251 | { |
| 252 | friend class UTF8Encoding; |
| 253 | // Most implimentations will probably need an implimenation-specific constructor |
| 254 | |
| 255 | // internal methods that cannot be overriden that let us do our fallback thing |
| 256 | // These wrap the internal methods so that we can check for people doing stuff that's incorrect |
| 257 | |
| 258 | public: |
| 259 | virtual ~DecoderFallbackBuffer() = default; |
| 260 | |
| 261 | virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0; |
| 262 | |
| 263 | // Get next character |
| 264 | virtual WCHAR GetNextChar() = 0; |
| 265 | |
| 266 | //Back up a character |
| 267 | virtual bool MovePrevious() = 0; |
| 268 | |
| 269 | // How many chars left in this fallback? |
| 270 | virtual int GetRemaining() = 0; |
| 271 | |
| 272 | // Clear the buffer |
| 273 | virtual void Reset() |
| 274 | { |
| 275 | while (GetNextChar() != (WCHAR)0); |
| 276 | } |
| 277 | |
| 278 | // Internal items to help us figure out what we're doing as far as error messages, etc. |
| 279 | // These help us with our performance and messages internally |
| 280 | protected: |
| 281 | BYTE* byteStart; |
| 282 | WCHAR* charEnd; |
| 283 | |
| 284 | // Internal reset |
| 285 | void InternalReset() |
| 286 | { |
| 287 | byteStart = nullptr; |
| 288 | Reset(); |
| 289 | } |
| 290 | |
| 291 | // Set the above values |
| 292 | // This can't be part of the constructor because EncoderFallbacks would have to know how to impliment these. |
| 293 | void InternalInitialize(BYTE* byteStart, WCHAR* charEnd) |
| 294 | { |
| 295 | this->byteStart = byteStart; |
| 296 | this->charEnd = charEnd; |
| 297 | } |
| 298 | |
| 299 | // Fallback the current byte by sticking it into the remaining char buffer. |
| 300 | // This can only be called by our encodings (other have to use the public fallback methods), so |
| 301 | // we can use our DecoderNLS here too (except we don't). |
| 302 | // Returns true if we are successful, false if we can't fallback the character (no buffer space) |
| 303 | // So caller needs to throw buffer space if return false. |
| 304 | // Right now this has both bytes and bytes[], since we might have extra bytes, hence the |
| 305 | // array, and we might need the index, hence the byte* |
| 306 | // Don't touch ref chars unless we succeed |
| 307 | virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size) |
| 308 | { |
| 309 | |
| 310 | Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize" ); |
| 311 | |
| 312 | // See if there's a fallback character and we have an output buffer then copy our string. |
| 313 | if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) |
| 314 | { |
| 315 | // Copy the chars to our output |
| 316 | WCHAR ch; |
| 317 | WCHAR* charTemp = *chars; |
| 318 | bool bHighSurrogate = false; |
| 319 | while ((ch = GetNextChar()) != 0) |
| 320 | { |
| 321 | // Make sure no mixed up surrogates |
| 322 | if (Char::IsSurrogate(ch)) |
| 323 | { |
| 324 | if (Char::IsHighSurrogate(ch)) |
| 325 | { |
| 326 | // High Surrogate |
| 327 | if (bHighSurrogate) |
| 328 | throw ArgumentException("String 'chars' contains invalid Unicode code points." ); |
| 329 | bHighSurrogate = true; |
| 330 | } |
| 331 | else |
| 332 | { |
| 333 | // Low surrogate |
| 334 | if (bHighSurrogate == false) |
| 335 | throw ArgumentException("String 'chars' contains invalid Unicode code points." ); |
| 336 | bHighSurrogate = false; |
| 337 | } |
| 338 | } |
| 339 | |
| 340 | if (charTemp >= charEnd) |
| 341 | { |
| 342 | // No buffer space |
| 343 | return false; |
| 344 | } |
| 345 | |
| 346 | *(charTemp++) = ch; |
| 347 | } |
| 348 | |
| 349 | // Need to make sure that bHighSurrogate isn't true |
| 350 | if (bHighSurrogate) |
| 351 | throw ArgumentException("String 'chars' contains invalid Unicode code points." ); |
| 352 | |
| 353 | // Now we aren't going to be false, so its OK to update chars |
| 354 | *chars = charTemp; |
| 355 | } |
| 356 | |
| 357 | return true; |
| 358 | } |
| 359 | |
| 360 | // This version just counts the fallback and doesn't actually copy anything. |
| 361 | virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) |
| 362 | // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the |
| 363 | // array, and we might need the index, hence the byte* |
| 364 | { |
| 365 | |
| 366 | Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize" ); |
| 367 | |
| 368 | // See if there's a fallback character and we have an output buffer then copy our string. |
| 369 | if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) |
| 370 | { |
| 371 | int count = 0; |
| 372 | |
| 373 | WCHAR ch; |
| 374 | bool bHighSurrogate = false; |
| 375 | while ((ch = GetNextChar()) != 0) |
| 376 | { |
| 377 | // Make sure no mixed up surrogates |
| 378 | if (Char::IsSurrogate(ch)) |
| 379 | { |
| 380 | if (Char::IsHighSurrogate(ch)) |
| 381 | { |
| 382 | // High Surrogate |
| 383 | if (bHighSurrogate) |
| 384 | throw ArgumentException("String 'chars' contains invalid Unicode code points." ); |
| 385 | bHighSurrogate = true; |
| 386 | } |
| 387 | else |
| 388 | { |
| 389 | // Low surrogate |
| 390 | if (bHighSurrogate == false) |
| 391 | throw ArgumentException("String 'chars' contains invalid Unicode code points." ); |
| 392 | bHighSurrogate = false; |
| 393 | } |
| 394 | } |
| 395 | |
| 396 | count++; |
| 397 | } |
| 398 | |
| 399 | // Need to make sure that bHighSurrogate isn't true |
| 400 | if (bHighSurrogate) |
| 401 | throw ArgumentException("String 'chars' contains invalid Unicode code points." ); |
| 402 | |
| 403 | return count; |
| 404 | } |
| 405 | |
| 406 | // If no fallback return 0 |
| 407 | return 0; |
| 408 | } |
| 409 | |
| 410 | // private helper methods |
| 411 | void ThrowLastBytesRecursive(BYTE bytesUnknown[]) |
| 412 | { |
| 413 | throw ArgumentException("Recursive fallback not allowed" ); |
| 414 | } |
| 415 | }; |
| 416 | |
| 417 | class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer |
| 418 | { |
| 419 | // Store our default string |
| 420 | WCHAR strDefault[2]; |
| 421 | int strDefaultLength; |
| 422 | int fallbackCount = -1; |
| 423 | int fallbackIndex = -1; |
| 424 | |
| 425 | public: |
| 426 | // Construction |
| 427 | DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback) |
| 428 | { |
| 429 | wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString()); |
| 430 | strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); |
| 431 | } |
| 432 | |
| 433 | // Fallback Methods |
| 434 | virtual bool Fallback(BYTE bytesUnknown[], int index, int size) |
| 435 | { |
| 436 | // We expect no previous fallback in our buffer |
| 437 | // We can't call recursively but others might (note, we don't test on last char!!!) |
| 438 | if (fallbackCount >= 1) |
| 439 | { |
| 440 | ThrowLastBytesRecursive(bytesUnknown); |
| 441 | } |
| 442 | |
| 443 | // Go ahead and get our fallback |
| 444 | if (strDefaultLength == 0) |
| 445 | return false; |
| 446 | |
| 447 | fallbackCount = strDefaultLength; |
| 448 | fallbackIndex = -1; |
| 449 | |
| 450 | return true; |
| 451 | } |
| 452 | |
| 453 | virtual WCHAR GetNextChar() |
| 454 | { |
| 455 | // We want it to get < 0 because == 0 means that the current/last character is a fallback |
| 456 | // and we need to detect recursion. We could have a flag but we already have this counter. |
| 457 | fallbackCount--; |
| 458 | fallbackIndex++; |
| 459 | |
| 460 | // Do we have anything left? 0 is now last fallback char, negative is nothing left |
| 461 | if (fallbackCount < 0) |
| 462 | return '\0'; |
| 463 | |
| 464 | // Need to get it out of the buffer. |
| 465 | // Make sure it didn't wrap from the fast count-- path |
| 466 | if (fallbackCount == INT_MAX) |
| 467 | { |
| 468 | fallbackCount = -1; |
| 469 | return '\0'; |
| 470 | } |
| 471 | |
| 472 | // Now make sure its in the expected range |
| 473 | Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, |
| 474 | "Index exceeds buffer range" ); |
| 475 | |
| 476 | return strDefault[fallbackIndex]; |
| 477 | } |
| 478 | |
| 479 | virtual bool MovePrevious() |
| 480 | { |
| 481 | // Back up one, only if we just processed the last character (or earlier) |
| 482 | if (fallbackCount >= -1 && fallbackIndex >= 0) |
| 483 | { |
| 484 | fallbackIndex--; |
| 485 | fallbackCount++; |
| 486 | return true; |
| 487 | } |
| 488 | |
| 489 | // Return false 'cause we couldn't do it. |
| 490 | return false; |
| 491 | } |
| 492 | |
| 493 | // How many characters left to output? |
| 494 | virtual int GetRemaining() |
| 495 | { |
| 496 | // Our count is 0 for 1 character left. |
| 497 | return (fallbackCount < 0) ? 0 : fallbackCount; |
| 498 | } |
| 499 | |
| 500 | // Clear the buffer |
| 501 | virtual void Reset() |
| 502 | { |
| 503 | fallbackCount = -1; |
| 504 | fallbackIndex = -1; |
| 505 | byteStart = nullptr; |
| 506 | } |
| 507 | |
| 508 | // This version just counts the fallback and doesn't actually copy anything. |
| 509 | virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) |
| 510 | // Right now this has both bytes and bytes[], since we might have extra bytes, hence the |
| 511 | // array, and we might need the index, hence the byte* |
| 512 | { |
| 513 | // return our replacement string Length |
| 514 | return strDefaultLength; |
| 515 | } |
| 516 | }; |
| 517 | |
| 518 | class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer |
| 519 | { |
| 520 | public: |
| 521 | DecoderExceptionFallbackBuffer() |
| 522 | { |
| 523 | } |
| 524 | |
| 525 | virtual bool Fallback(BYTE bytesUnknown[], int index, int size) |
| 526 | { |
| 527 | throw DecoderFallbackException( |
| 528 | "Unable to translate UTF-8 character to Unicode" , bytesUnknown, index); |
| 529 | } |
| 530 | |
| 531 | virtual WCHAR GetNextChar() |
| 532 | { |
| 533 | return 0; |
| 534 | } |
| 535 | |
| 536 | virtual bool MovePrevious() |
| 537 | { |
| 538 | // Exception fallback doesn't have anywhere to back up to. |
| 539 | return false; |
| 540 | } |
| 541 | |
| 542 | // Exceptions are always empty |
| 543 | virtual int GetRemaining() |
| 544 | { |
| 545 | return 0; |
| 546 | } |
| 547 | |
| 548 | }; |
| 549 | |
| 550 | class DecoderExceptionFallback : public DecoderFallback |
| 551 | { |
| 552 | // Construction |
| 553 | public: |
| 554 | DecoderExceptionFallback() |
| 555 | { |
| 556 | } |
| 557 | |
| 558 | virtual DecoderFallbackBuffer* CreateFallbackBuffer() |
| 559 | { |
| 560 | return InternalNew<DecoderExceptionFallbackBuffer>(); |
| 561 | } |
| 562 | |
| 563 | // Maximum number of characters that this instance of this fallback could return |
| 564 | virtual int GetMaxCharCount() |
| 565 | { |
| 566 | return 0; |
| 567 | } |
| 568 | }; |
| 569 | |
| 570 | DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer() |
| 571 | { |
| 572 | return InternalNew<DecoderReplacementFallbackBuffer>(this); |
| 573 | } |
| 574 | |
| 575 | class EncoderFallbackException : public ArgumentException |
| 576 | { |
| 577 | WCHAR charUnknown; |
| 578 | WCHAR charUnknownHigh; |
| 579 | WCHAR charUnknownLow; |
| 580 | int index; |
| 581 | |
| 582 | public: |
| 583 | EncoderFallbackException( |
| 584 | LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message) |
| 585 | { |
| 586 | this->charUnknown = charUnknown; |
| 587 | this->index = index; |
| 588 | } |
| 589 | |
| 590 | EncoderFallbackException( |
| 591 | LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message) |
| 592 | { |
| 593 | if (!Char::IsHighSurrogate(charUnknownHigh)) |
| 594 | { |
| 595 | throw ArgumentOutOfRangeException("charUnknownHigh" , |
| 596 | "Argument out of range 0xD800..0xDBFF" ); |
| 597 | } |
| 598 | if (!Char::IsLowSurrogate(charUnknownLow)) |
| 599 | { |
| 600 | throw ArgumentOutOfRangeException("charUnknownLow" , |
| 601 | "Argument out of range 0xDC00..0xDFFF" ); |
| 602 | } |
| 603 | Contract::EndContractBlock(); |
| 604 | |
| 605 | this->charUnknownHigh = charUnknownHigh; |
| 606 | this->charUnknownLow = charUnknownLow; |
| 607 | this->index = index; |
| 608 | } |
| 609 | |
| 610 | WCHAR GetCharUnknown() |
| 611 | { |
| 612 | return (charUnknown); |
| 613 | } |
| 614 | |
| 615 | WCHAR GetCharUnknownHigh() |
| 616 | { |
| 617 | return (charUnknownHigh); |
| 618 | } |
| 619 | |
| 620 | WCHAR GetCharUnknownLow() |
| 621 | { |
| 622 | return (charUnknownLow); |
| 623 | } |
| 624 | |
| 625 | int GetIndex() |
| 626 | { |
| 627 | return index; |
| 628 | } |
| 629 | |
| 630 | // Return true if the unknown character is a surrogate pair. |
| 631 | bool IsUnknownSurrogate() |
| 632 | { |
| 633 | return (charUnknownHigh != '\0'); |
| 634 | } |
| 635 | }; |
| 636 | |
class EncoderFallbackBuffer;

// Abstract factory describing how unencodable characters are handled.
// Concrete fallbacks create the buffer object that produces the replacement
// characters (or throws).
class EncoderFallback
{
public:
    // Polymorphic base: a virtual destructor makes destruction through an
    // EncoderFallback* well defined, matching EncoderFallbackBuffer.
    virtual ~EncoderFallback() = default;

    // Creates the buffer object that carries out this fallback.
    virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0;

    // Maximum number of characters that this instance of this fallback could return
    virtual int GetMaxCharCount() = 0;
};
| 652 | |
| 653 | class EncoderReplacementFallback : public EncoderFallback |
| 654 | { |
| 655 | // Our variables |
| 656 | WCHAR strDefault[2]; |
| 657 | int strDefaultLength; |
| 658 | |
| 659 | public: |
| 660 | // Construction. Default replacement fallback uses no best fit and ? replacement string |
| 661 | EncoderReplacementFallback() : EncoderReplacementFallback(W("?" )) |
| 662 | { |
| 663 | } |
| 664 | |
| 665 | EncoderReplacementFallback(const WCHAR* replacement) |
| 666 | { |
| 667 | // Must not be null |
| 668 | if (replacement == nullptr) |
| 669 | throw ArgumentNullException("replacement" ); |
| 670 | Contract::EndContractBlock(); |
| 671 | |
| 672 | // Make sure it doesn't have bad surrogate pairs |
| 673 | bool bFoundHigh = false; |
| 674 | int replacementLength = PAL_wcslen((const WCHAR *)replacement); |
| 675 | for (int i = 0; i < replacementLength; i++) |
| 676 | { |
| 677 | // Found a surrogate? |
| 678 | if (Char::IsSurrogate(replacement, i)) |
| 679 | { |
| 680 | // High or Low? |
| 681 | if (Char::IsHighSurrogate(replacement, i)) |
| 682 | { |
| 683 | // if already had a high one, stop |
| 684 | if (bFoundHigh) |
| 685 | break; // break & throw at the bFoundHIgh below |
| 686 | bFoundHigh = true; |
| 687 | } |
| 688 | else |
| 689 | { |
| 690 | // Low, did we have a high? |
| 691 | if (!bFoundHigh) |
| 692 | { |
| 693 | // Didn't have one, make if fail when we stop |
| 694 | bFoundHigh = true; |
| 695 | break; |
| 696 | } |
| 697 | |
| 698 | // Clear flag |
| 699 | bFoundHigh = false; |
| 700 | } |
| 701 | } |
| 702 | // If last was high we're in trouble (not surrogate so not low surrogate, so break) |
| 703 | else if (bFoundHigh) |
| 704 | break; |
| 705 | } |
| 706 | if (bFoundHigh) |
| 707 | throw ArgumentException("String 'replacement' contains invalid Unicode code points." , "replacement" ); |
| 708 | |
| 709 | wcscpy_s(strDefault, sizeof(strDefault), replacement); |
| 710 | strDefaultLength = replacementLength; |
| 711 | } |
| 712 | |
| 713 | WCHAR* GetDefaultString() |
| 714 | { |
| 715 | return strDefault; |
| 716 | } |
| 717 | |
| 718 | virtual EncoderFallbackBuffer* CreateFallbackBuffer(); |
| 719 | |
| 720 | // Maximum number of characters that this instance of this fallback could return |
| 721 | virtual int GetMaxCharCount() |
| 722 | { |
| 723 | return strDefaultLength; |
| 724 | } |
| 725 | }; |
| 726 | |
| 727 | class EncoderFallbackBuffer |
| 728 | { |
| 729 | friend class UTF8Encoding; |
| 730 | // Most implementations will probably need an implemenation-specific constructor |
| 731 | |
| 732 | // Public methods that cannot be overriden that let us do our fallback thing |
| 733 | // These wrap the internal methods so that we can check for people doing stuff that is incorrect |
| 734 | |
| 735 | public: |
| 736 | virtual ~EncoderFallbackBuffer() = default; |
| 737 | |
| 738 | virtual bool Fallback(WCHAR charUnknown, int index) = 0; |
| 739 | |
| 740 | virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0; |
| 741 | |
| 742 | // Get next character |
| 743 | virtual WCHAR GetNextChar() = 0; |
| 744 | |
| 745 | // Back up a character |
| 746 | virtual bool MovePrevious() = 0; |
| 747 | |
| 748 | // How many chars left in this fallback? |
| 749 | virtual int GetRemaining() = 0; |
| 750 | |
| 751 | // Not sure if this should be public or not. |
| 752 | // Clear the buffer |
| 753 | virtual void Reset() |
| 754 | { |
| 755 | while (GetNextChar() != (WCHAR)0); |
| 756 | } |
| 757 | |
| 758 | // Internal items to help us figure out what we're doing as far as error messages, etc. |
| 759 | // These help us with our performance and messages internally |
| 760 | protected: |
| 761 | WCHAR* charStart; |
| 762 | WCHAR* charEnd; |
| 763 | bool setEncoder; |
| 764 | bool bUsedEncoder; |
| 765 | bool bFallingBack = false; |
| 766 | int iRecursionCount = 0; |
| 767 | static const int iMaxRecursion = 250; |
| 768 | |
| 769 | // Internal Reset |
| 770 | // For example, what if someone fails a conversion and wants to reset one of our fallback buffers? |
| 771 | void InternalReset() |
| 772 | { |
| 773 | charStart = nullptr; |
| 774 | bFallingBack = false; |
| 775 | iRecursionCount = 0; |
| 776 | Reset(); |
| 777 | } |
| 778 | |
| 779 | // Set the above values |
| 780 | // This can't be part of the constructor because EncoderFallbacks would have to know how to impliment these. |
| 781 | void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder) |
| 782 | { |
| 783 | this->charStart = charStart; |
| 784 | this->charEnd = charEnd; |
| 785 | this->setEncoder = setEncoder; |
| 786 | this->bUsedEncoder = false; |
| 787 | this->bFallingBack = false; |
| 788 | this->iRecursionCount = 0; |
| 789 | } |
| 790 | |
| 791 | WCHAR InternalGetNextChar() |
| 792 | { |
| 793 | WCHAR ch = GetNextChar(); |
| 794 | bFallingBack = (ch != 0); |
| 795 | if (ch == 0) iRecursionCount = 0; |
| 796 | return ch; |
| 797 | } |
| 798 | |
| 799 | // Fallback the current character using the remaining buffer and encoder if necessary |
| 800 | // This can only be called by our encodings (other have to use the public fallback methods), so |
| 801 | // we can use our EncoderNLS here too. |
| 802 | // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount |
| 803 | // |
| 804 | // Note that this could also change the contents of this->encoder, which is the same |
| 805 | // object that the caller is using, so the caller could mess up the encoder for us |
| 806 | // if they aren't careful. |
| 807 | virtual bool InternalFallback(WCHAR ch, WCHAR** chars) |
| 808 | { |
| 809 | // Shouldn't have null charStart |
| 810 | Contract::Assert(charStart != nullptr, |
| 811 | "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized" ); |
| 812 | |
| 813 | // Get our index, remember chars was preincremented to point at next char, so have to -1 |
| 814 | int index = (int)(*chars - charStart) - 1; |
| 815 | |
| 816 | // See if it was a high surrogate |
| 817 | if (Char::IsHighSurrogate(ch)) |
| 818 | { |
| 819 | // See if there's a low surrogate to go with it |
| 820 | if (*chars >= this->charEnd) |
| 821 | { |
| 822 | // Nothing left in input buffer |
| 823 | // No input, return 0 |
| 824 | } |
| 825 | else |
| 826 | { |
| 827 | // Might have a low surrogate |
| 828 | WCHAR cNext = **chars; |
| 829 | if (Char::IsLowSurrogate(cNext)) |
| 830 | { |
| 831 | // If already falling back then fail |
| 832 | if (bFallingBack && iRecursionCount++ > iMaxRecursion) |
| 833 | ThrowLastCharRecursive(ch, cNext); |
| 834 | |
| 835 | // Next is a surrogate, add it as surrogate pair, and increment chars |
| 836 | (*chars)++; |
| 837 | bFallingBack = Fallback(ch, cNext, index); |
| 838 | return bFallingBack; |
| 839 | } |
| 840 | |
| 841 | // Next isn't a low surrogate, just fallback the high surrogate |
| 842 | } |
| 843 | } |
| 844 | |
| 845 | // If already falling back then fail |
| 846 | if (bFallingBack && iRecursionCount++ > iMaxRecursion) |
| 847 | ThrowLastCharRecursive((int)ch); |
| 848 | |
| 849 | // Fall back our char |
| 850 | bFallingBack = Fallback(ch, index); |
| 851 | |
| 852 | return bFallingBack; |
| 853 | } |
| 854 | |
| 855 | // private helper methods |
| 856 | void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate) |
| 857 | { |
| 858 | // Throw it, using our complete character |
| 859 | throw ArgumentException("Recursive fallback not allowed" , "chars" ); |
| 860 | } |
| 861 | |
| 862 | void ThrowLastCharRecursive(int utf32Char) |
| 863 | { |
| 864 | throw ArgumentException("Recursive fallback not allowed" , "chars" ); |
| 865 | } |
| 866 | |
| 867 | }; |
| 868 | |
| 869 | class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer |
| 870 | { |
| 871 | // Store our default string |
| 872 | WCHAR strDefault[4]; |
| 873 | int strDefaultLength; |
| 874 | int fallbackCount = -1; |
| 875 | int fallbackIndex = -1; |
| 876 | public: |
| 877 | // Construction |
| 878 | EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback) |
| 879 | { |
| 880 | // 2X in case we're a surrogate pair |
| 881 | wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString()); |
| 882 | wcscat_s(strDefault, sizeof(strDefault), fallback->GetDefaultString()); |
| 883 | strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); |
| 884 | |
| 885 | } |
| 886 | |
| 887 | // Fallback Methods |
| 888 | virtual bool Fallback(WCHAR charUnknown, int index) |
| 889 | { |
| 890 | // If we had a buffer already we're being recursive, throw, it's probably at the suspect |
| 891 | // character in our array. |
| 892 | if (fallbackCount >= 1) |
| 893 | { |
| 894 | // If we're recursive we may still have something in our buffer that makes this a surrogate |
| 895 | if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 && |
| 896 | Char::IsLowSurrogate(strDefault[fallbackIndex + 1])) |
| 897 | ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]); |
| 898 | |
| 899 | // Nope, just one character |
| 900 | ThrowLastCharRecursive((int)charUnknown); |
| 901 | } |
| 902 | |
| 903 | // Go ahead and get our fallback |
| 904 | // Divide by 2 because we aren't a surrogate pair |
| 905 | fallbackCount = strDefaultLength / 2; |
| 906 | fallbackIndex = -1; |
| 907 | |
| 908 | return fallbackCount != 0; |
| 909 | } |
| 910 | |
| 911 | virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) |
| 912 | { |
| 913 | // Double check input surrogate pair |
| 914 | if (!Char::IsHighSurrogate(charUnknownHigh)) |
| 915 | throw ArgumentOutOfRangeException("charUnknownHigh" , |
| 916 | "Argument out of range 0xD800..0xDBFF" ); |
| 917 | |
| 918 | if (!Char::IsLowSurrogate(charUnknownLow)) |
| 919 | throw ArgumentOutOfRangeException("charUnknownLow" , |
| 920 | "Argument out of range 0xDC00..0xDFFF" ); |
| 921 | Contract::EndContractBlock(); |
| 922 | |
| 923 | // If we had a buffer already we're being recursive, throw, it's probably at the suspect |
| 924 | // character in our array. |
| 925 | if (fallbackCount >= 1) |
| 926 | ThrowLastCharRecursive(charUnknownHigh, charUnknownLow); |
| 927 | |
| 928 | // Go ahead and get our fallback |
| 929 | fallbackCount = strDefaultLength; |
| 930 | fallbackIndex = -1; |
| 931 | |
| 932 | return fallbackCount != 0; |
| 933 | } |
| 934 | |
| 935 | virtual WCHAR GetNextChar() |
| 936 | { |
| 937 | // We want it to get < 0 because == 0 means that the current/last character is a fallback |
| 938 | // and we need to detect recursion. We could have a flag but we already have this counter. |
| 939 | fallbackCount--; |
| 940 | fallbackIndex++; |
| 941 | |
| 942 | // Do we have anything left? 0 is now last fallback char, negative is nothing left |
| 943 | if (fallbackCount < 0) |
| 944 | return '\0'; |
| 945 | |
| 946 | // Need to get it out of the buffer. |
| 947 | // Make sure it didn't wrap from the fast count-- path |
| 948 | if (fallbackCount == INT_MAX) |
| 949 | { |
| 950 | fallbackCount = -1; |
| 951 | return '\0'; |
| 952 | } |
| 953 | |
| 954 | // Now make sure its in the expected range |
| 955 | Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, |
| 956 | "Index exceeds buffer range" ); |
| 957 | |
| 958 | return strDefault[fallbackIndex]; |
| 959 | } |
| 960 | |
| 961 | virtual bool MovePrevious() |
| 962 | { |
| 963 | // Back up one, only if we just processed the last character (or earlier) |
| 964 | if (fallbackCount >= -1 && fallbackIndex >= 0) |
| 965 | { |
| 966 | fallbackIndex--; |
| 967 | fallbackCount++; |
| 968 | return true; |
| 969 | } |
| 970 | |
| 971 | // Return false 'cause we couldn't do it. |
| 972 | return false; |
| 973 | } |
| 974 | |
| 975 | // How many characters left to output? |
| 976 | virtual int GetRemaining() |
| 977 | { |
| 978 | // Our count is 0 for 1 character left. |
| 979 | return (fallbackCount < 0) ? 0 : fallbackCount; |
| 980 | } |
| 981 | |
| 982 | // Clear the buffer |
| 983 | virtual void Reset() |
| 984 | { |
| 985 | fallbackCount = -1; |
| 986 | fallbackIndex = 0; |
| 987 | charStart = nullptr; |
| 988 | bFallingBack = false; |
| 989 | } |
| 990 | }; |
| 991 | |
| 992 | class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer |
| 993 | { |
| 994 | public: |
| 995 | EncoderExceptionFallbackBuffer() |
| 996 | { |
| 997 | } |
| 998 | |
| 999 | virtual bool Fallback(WCHAR charUnknown, int index) |
| 1000 | { |
| 1001 | // Fall back our char |
| 1002 | throw EncoderFallbackException("Unable to translate Unicode character to UTF-8" , charUnknown, index); |
| 1003 | } |
| 1004 | |
| 1005 | virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) |
| 1006 | { |
| 1007 | if (!Char::IsHighSurrogate(charUnknownHigh)) |
| 1008 | { |
| 1009 | throw ArgumentOutOfRangeException("charUnknownHigh" , |
| 1010 | "Argument out of range 0xD800..0xDBFF" ); |
| 1011 | } |
| 1012 | if (!Char::IsLowSurrogate(charUnknownLow)) |
| 1013 | { |
| 1014 | throw ArgumentOutOfRangeException("charUnknownLow" , |
| 1015 | "Argument out of range 0xDC00..0xDFFF" ); |
| 1016 | } |
| 1017 | Contract::EndContractBlock(); |
| 1018 | |
| 1019 | //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow); |
| 1020 | |
| 1021 | // Fall back our char |
| 1022 | throw EncoderFallbackException( |
| 1023 | "Unable to translate Unicode character to UTF-8" , charUnknownHigh, charUnknownLow, index); |
| 1024 | } |
| 1025 | |
| 1026 | virtual WCHAR GetNextChar() |
| 1027 | { |
| 1028 | return 0; |
| 1029 | } |
| 1030 | |
| 1031 | virtual bool MovePrevious() |
| 1032 | { |
| 1033 | // Exception fallback doesn't have anywhere to back up to. |
| 1034 | return false; |
| 1035 | } |
| 1036 | |
| 1037 | // Exceptions are always empty |
| 1038 | virtual int GetRemaining() |
| 1039 | { |
| 1040 | return 0; |
| 1041 | } |
| 1042 | }; |
| 1043 | |
| 1044 | class EncoderExceptionFallback : public EncoderFallback |
| 1045 | { |
| 1046 | // Construction |
| 1047 | public: |
| 1048 | EncoderExceptionFallback() |
| 1049 | { |
| 1050 | } |
| 1051 | |
| 1052 | virtual EncoderFallbackBuffer* CreateFallbackBuffer() |
| 1053 | { |
| 1054 | return InternalNew<EncoderExceptionFallbackBuffer>(); |
| 1055 | } |
| 1056 | |
| 1057 | // Maximum number of characters that this instance of this fallback could return |
| 1058 | virtual int GetMaxCharCount() |
| 1059 | { |
| 1060 | return 0; |
| 1061 | } |
| 1062 | }; |
| 1063 | |
| 1064 | EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer() |
| 1065 | { |
| 1066 | return InternalNew<EncoderReplacementFallbackBuffer>(this); |
| 1067 | } |
| 1068 | |
| 1069 | class UTF8Encoding |
| 1070 | { |
| 1071 | EncoderFallback* encoderFallback; |
| 1072 | // Instances of the two possible fallbacks. The constructor parameter |
| 1073 | // determines which one to use. |
| 1074 | EncoderReplacementFallback encoderReplacementFallback; |
| 1075 | EncoderExceptionFallback encoderExceptionFallback; |
| 1076 | |
| 1077 | DecoderFallback* decoderFallback; |
| 1078 | // Instances of the two possible fallbacks. The constructor parameter |
| 1079 | // determines which one to use. |
| 1080 | DecoderReplacementFallback decoderReplacementFallback; |
| 1081 | DecoderExceptionFallback decoderExceptionFallback; |
| 1082 | |
| 1083 | bool InRange(WCHAR c, WCHAR begin, WCHAR end) |
| 1084 | { |
| 1085 | return begin <= c && c <= end; |
| 1086 | } |
| 1087 | |
| 1088 | size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2) |
| 1089 | { |
| 1090 | return ptr1 - ptr2; |
| 1091 | } |
| 1092 | |
| 1093 | size_t PtrDiff(BYTE* ptr1, BYTE* ptr2) |
| 1094 | { |
| 1095 | return ptr1 - ptr2; |
| 1096 | } |
| 1097 | |
| 1098 | void ThrowBytesOverflow() |
| 1099 | { |
| 1100 | // Special message to include fallback type in case fallback's GetMaxCharCount is broken |
| 1101 | // This happens if user has implimented an encoder fallback with a broken GetMaxCharCount |
| 1102 | throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data" , "bytes" ); |
| 1103 | } |
| 1104 | |
| 1105 | void ThrowBytesOverflow(bool nothingEncoded) |
| 1106 | { |
| 1107 | // Special message to include fallback type in case fallback's GetMaxCharCount is broken |
| 1108 | // This happens if user has implimented an encoder fallback with a broken GetMaxCharCount |
| 1109 | if (nothingEncoded){ |
| 1110 | ThrowBytesOverflow(); |
| 1111 | } |
| 1112 | } |
| 1113 | |
| 1114 | void ThrowCharsOverflow() |
| 1115 | { |
| 1116 | // Special message to include fallback type in case fallback's GetMaxCharCount is broken |
| 1117 | // This happens if user has implimented a decoder fallback with a broken GetMaxCharCount |
| 1118 | throw InsufficientBufferException("The output char buffer is too small to contain the encoded data" , "chars" ); |
| 1119 | } |
| 1120 | |
| 1121 | void ThrowCharsOverflow(bool nothingEncoded) |
| 1122 | { |
| 1123 | // Special message to include fallback type in case fallback's GetMaxCharCount is broken |
| 1124 | // This happens if user has implimented an decoder fallback with a broken GetMaxCharCount |
| 1125 | if (nothingEncoded){ |
| 1126 | ThrowCharsOverflow(); |
| 1127 | } |
| 1128 | } |
| 1129 | |
| 1130 | // During GetChars we had an invalid byte sequence |
| 1131 | // pSrc is backed up to the start of the bad sequence if we didn't have room to |
| 1132 | // fall it back. Otherwise pSrc remains where it is. |
| 1133 | bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget) |
| 1134 | { |
| 1135 | // Get our byte[] |
| 1136 | BYTE* pStart = *pSrc; |
| 1137 | BYTE* bytesUnknown; |
| 1138 | int size = GetBytesUnknown(pStart, ch, &bytesUnknown); |
| 1139 | |
| 1140 | // Do the actual fallback |
| 1141 | if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size)) |
| 1142 | { |
| 1143 | // Oops, it failed, back up to pStart |
| 1144 | *pSrc = pStart; |
| 1145 | return false; |
| 1146 | } |
| 1147 | |
| 1148 | // It worked |
| 1149 | return true; |
| 1150 | } |
| 1151 | |
| 1152 | int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback) |
| 1153 | { |
| 1154 | // Get our byte[] |
| 1155 | BYTE *bytesUnknown; |
| 1156 | int size = GetBytesUnknown(pSrc, ch, &bytesUnknown); |
| 1157 | |
| 1158 | // Do the actual fallback |
| 1159 | int count = fallback->InternalFallback(bytesUnknown, pSrc, size); |
| 1160 | |
| 1161 | // # of fallback chars expected. |
| 1162 | // Note that we only get here for "long" sequences, and have already unreserved |
| 1163 | // the count that we prereserved for the input bytes |
| 1164 | return count; |
| 1165 | } |
| 1166 | |
| 1167 | int GetBytesUnknown(BYTE* pSrc, int ch, BYTE **bytesUnknown) |
| 1168 | { |
| 1169 | int size; |
| 1170 | BYTE bytes[3]; |
| 1171 | |
| 1172 | // See if it was a plain char |
| 1173 | // (have to check >= 0 because we have all sorts of wierd bit flags) |
| 1174 | if (ch < 0x100 && ch >= 0) |
| 1175 | { |
| 1176 | pSrc--; |
| 1177 | bytes[0] = (BYTE)ch; |
| 1178 | size = 1; |
| 1179 | } |
| 1180 | // See if its an unfinished 2 byte sequence |
| 1181 | else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) |
| 1182 | { |
| 1183 | pSrc--; |
| 1184 | bytes[0] = (BYTE)((ch & 0x1F) | 0xc0); |
| 1185 | size = 1; |
| 1186 | } |
| 1187 | // So now we're either 2nd byte of 3 or 4 byte sequence or |
| 1188 | // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence |
| 1189 | // 1st check if its a 4 byte sequence |
| 1190 | else if ((ch & SupplimentarySeq) != 0) |
| 1191 | { |
| 1192 | // 3rd byte of 4 byte sequence? |
| 1193 | if ((ch & (FinalByte >> 6)) != 0) |
| 1194 | { |
| 1195 | // 3rd byte of 4 byte sequence |
| 1196 | pSrc -= 3; |
| 1197 | bytes[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0); |
| 1198 | bytes[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80); |
| 1199 | bytes[2] = (BYTE)(((ch)& 0x3F) | 0x80); |
| 1200 | size = 3; |
| 1201 | } |
| 1202 | else if ((ch & (FinalByte >> 12)) != 0) |
| 1203 | { |
| 1204 | // 2nd byte of a 4 byte sequence |
| 1205 | pSrc -= 2; |
| 1206 | bytes[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0); |
| 1207 | bytes[1] = (BYTE)(((ch)& 0x3F) | 0x80); |
| 1208 | size = 2; |
| 1209 | } |
| 1210 | else |
| 1211 | { |
| 1212 | // 4th byte of a 4 byte sequence |
| 1213 | pSrc--; |
| 1214 | bytes[0] = (BYTE)(((ch)& 0x07) | 0xF0); |
| 1215 | size = 1; |
| 1216 | } |
| 1217 | } |
| 1218 | else |
| 1219 | { |
| 1220 | // 2nd byte of 3 byte sequence? |
| 1221 | if ((ch & (FinalByte >> 6)) != 0) |
| 1222 | { |
| 1223 | // So its 2nd byte of a 3 byte sequence |
| 1224 | pSrc -= 2; |
| 1225 | bytes[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0); |
| 1226 | bytes[1] = (BYTE)(((ch)& 0x3F) | 0x80); |
| 1227 | size = 2; |
| 1228 | } |
| 1229 | else |
| 1230 | { |
| 1231 | // 1st byte of a 3 byte sequence |
| 1232 | pSrc--; |
| 1233 | bytes[0] = (BYTE)(((ch)& 0x0F) | 0xE0); |
| 1234 | size = 1; |
| 1235 | } |
| 1236 | } |
| 1237 | |
| 1238 | *bytesUnknown = bytes; |
| 1239 | return size; |
| 1240 | } |
| 1241 | |
| 1242 | public: |
| 1243 | |
| 1244 | UTF8Encoding(bool isThrowException) |
| 1245 | : encoderReplacementFallback(W("\xFFFD" )), decoderReplacementFallback(W("\xFFFD" )) |
| 1246 | { |
| 1247 | if (isThrowException) |
| 1248 | { |
| 1249 | encoderFallback = &encoderExceptionFallback; |
| 1250 | decoderFallback = &decoderExceptionFallback; |
| 1251 | } |
| 1252 | else |
| 1253 | { |
| 1254 | encoderFallback = &encoderReplacementFallback; |
| 1255 | decoderFallback = &decoderReplacementFallback; |
| 1256 | } |
| 1257 | } |
| 1258 | |
| 1259 | // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits |
| 1260 | // while the actual character is being built in the lower bits. They are shifted together |
| 1261 | // with the actual bits of the character. |
| 1262 | |
| 1263 | // bits 30 & 31 are used for pending bits fixup |
| 1264 | const int FinalByte = 1 << 29; |
| 1265 | const int SupplimentarySeq = 1 << 28; |
| 1266 | const int ThreeByteSeq = 1 << 27; |
| 1267 | |
| 1268 | int GetCharCount(BYTE* bytes, int count) |
| 1269 | { |
| 1270 | Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr" ); |
| 1271 | Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0" ); |
| 1272 | |
| 1273 | // Initialize stuff |
| 1274 | BYTE *pSrc = bytes; |
| 1275 | BYTE *pEnd = pSrc + count; |
| 1276 | |
| 1277 | // Start by assuming we have as many as count, charCount always includes the adjustment |
| 1278 | // for the character being decoded |
| 1279 | int charCount = count; |
| 1280 | int ch = 0; |
| 1281 | DecoderFallbackBuffer *fallback = nullptr; |
| 1282 | |
| 1283 | for (;;) |
| 1284 | { |
| 1285 | // SLOWLOOP: does all range checks, handles all special cases, but it is slow |
| 1286 | if (pSrc >= pEnd) { |
| 1287 | break; |
| 1288 | } |
| 1289 | |
| 1290 | // read next byte. The JIT optimization seems to be getting confused when |
| 1291 | // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead |
| 1292 | int cha = *pSrc; |
| 1293 | |
| 1294 | if (ch == 0) { |
| 1295 | // no pending bits |
| 1296 | goto ReadChar; |
| 1297 | } |
| 1298 | |
| 1299 | pSrc++; |
| 1300 | |
| 1301 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1302 | if ((cha & 0xC0) != 0x80) { |
| 1303 | // This can be a valid starting byte for another UTF8 byte sequence, so let's put |
| 1304 | // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence |
| 1305 | pSrc--; |
| 1306 | charCount += (ch >> 30); |
| 1307 | goto InvalidByteSequence; |
| 1308 | } |
| 1309 | |
| 1310 | // fold in the new byte |
| 1311 | ch = (ch << 6) | (cha & 0x3F); |
| 1312 | |
| 1313 | if ((ch & FinalByte) == 0) { |
| 1314 | Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, |
| 1315 | "[UTF8Encoding.GetChars]Invariant volation" ); |
| 1316 | |
| 1317 | if ((ch & SupplimentarySeq) != 0) { |
| 1318 | if ((ch & (FinalByte >> 6)) != 0) { |
| 1319 | // this is 3rd byte (of 4 byte supplimentary) - nothing to do |
| 1320 | continue; |
| 1321 | } |
| 1322 | |
| 1323 | // 2nd byte, check for non-shortest form of supplimentary char and the valid |
| 1324 | // supplimentary characters in range 0x010000 - 0x10FFFF at the same time |
| 1325 | if (!InRange(ch & 0x1F0, 0x10, 0x100)) { |
| 1326 | goto InvalidByteSequence; |
| 1327 | } |
| 1328 | } |
| 1329 | else { |
| 1330 | // Must be 2nd byte of a 3-byte sequence |
| 1331 | // check for non-shortest form of 3 byte seq |
| 1332 | if ((ch & (0x1F << 5)) == 0 || // non-shortest form |
| 1333 | (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate |
| 1334 | { |
| 1335 | goto InvalidByteSequence; |
| 1336 | } |
| 1337 | } |
| 1338 | continue; |
| 1339 | } |
| 1340 | |
| 1341 | // ready to punch |
| 1342 | |
| 1343 | // adjust for surrogates in non-shortest form |
| 1344 | if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) { |
| 1345 | charCount--; |
| 1346 | } |
| 1347 | goto EncodeChar; |
| 1348 | |
| 1349 | InvalidByteSequence: |
| 1350 | // this code fragment should be close to the gotos referencing it |
| 1351 | // Have to do fallback for invalid bytes |
| 1352 | if (fallback == nullptr) |
| 1353 | { |
| 1354 | fallback = decoderFallback->CreateFallbackBuffer(); |
| 1355 | fallback->InternalInitialize(bytes, nullptr); |
| 1356 | } |
| 1357 | charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); |
| 1358 | |
| 1359 | ch = 0; |
| 1360 | continue; |
| 1361 | |
| 1362 | ReadChar: |
| 1363 | ch = *pSrc; |
| 1364 | pSrc++; |
| 1365 | |
| 1366 | ProcessChar: |
| 1367 | if (ch > 0x7F) { |
| 1368 | // If its > 0x7F, its start of a new multi-byte sequence |
| 1369 | |
| 1370 | // Long sequence, so unreserve our char. |
| 1371 | charCount--; |
| 1372 | |
| 1373 | // bit 6 has to be non-zero for start of multibyte chars. |
| 1374 | if ((ch & 0x40) == 0) { |
| 1375 | // Unexpected trail byte |
| 1376 | goto InvalidByteSequence; |
| 1377 | } |
| 1378 | |
| 1379 | // start a new long code |
| 1380 | if ((ch & 0x20) != 0) { |
| 1381 | if ((ch & 0x10) != 0) { |
| 1382 | // 4 byte encoding - supplimentary character (2 surrogates) |
| 1383 | |
| 1384 | ch &= 0x0F; |
| 1385 | |
| 1386 | // check that bit 4 is zero and the valid supplimentary character |
| 1387 | // range 0x000000 - 0x10FFFF at the same time |
| 1388 | if (ch > 0x04) { |
| 1389 | ch |= 0xf0; |
| 1390 | goto InvalidByteSequence; |
| 1391 | } |
| 1392 | |
| 1393 | // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. |
| 1394 | // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. |
| 1395 | ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now |
| 1396 | (1 << 30) | // If it dies on next byte we'll need an extra char |
| 1397 | (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char |
| 1398 | (SupplimentarySeq) | (SupplimentarySeq >> 6) | |
| 1399 | (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); |
| 1400 | |
| 1401 | // Our character count will be 2 characters for these 4 bytes, so subtract another char |
| 1402 | charCount--; |
| 1403 | } |
| 1404 | else { |
| 1405 | // 3 byte encoding |
| 1406 | // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. |
| 1407 | ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | |
| 1408 | (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); |
| 1409 | |
| 1410 | // We'll expect 1 character for these 3 bytes, so subtract another char. |
| 1411 | charCount--; |
| 1412 | } |
| 1413 | } |
| 1414 | else { |
| 1415 | // 2 byte encoding |
| 1416 | |
| 1417 | ch &= 0x1F; |
| 1418 | |
| 1419 | // check for non-shortest form |
| 1420 | if (ch <= 1) { |
| 1421 | ch |= 0xc0; |
| 1422 | goto InvalidByteSequence; |
| 1423 | } |
| 1424 | |
| 1425 | // Add bit flags so we'll be flagged correctly |
| 1426 | ch |= (FinalByte >> 6); |
| 1427 | } |
| 1428 | continue; |
| 1429 | } |
| 1430 | |
| 1431 | EncodeChar: |
| 1432 | |
| 1433 | #ifdef FASTLOOP |
| 1434 | int availableBytes = PtrDiff(pEnd, pSrc); |
| 1435 | |
| 1436 | // don't fall into the fast decoding loop if we don't have enough bytes |
| 1437 | if (availableBytes <= 13) { |
| 1438 | // try to get over the remainder of the ascii characters fast though |
| 1439 | BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered |
| 1440 | while (pSrc < pLocalEnd) { |
| 1441 | ch = *pSrc; |
| 1442 | pSrc++; |
| 1443 | |
| 1444 | if (ch > 0x7F) |
| 1445 | goto ProcessChar; |
| 1446 | } |
| 1447 | // we are done |
| 1448 | ch = 0; |
| 1449 | break; |
| 1450 | } |
| 1451 | |
| 1452 | // To compute the upper bound, assume that all characters are ASCII characters at this point, |
| 1453 | // the boundary will be decreased for every non-ASCII character we encounter |
| 1454 | // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences |
| 1455 | BYTE *pStop = pSrc + availableBytes - 7; |
| 1456 | |
| 1457 | while (pSrc < pStop) { |
| 1458 | ch = *pSrc; |
| 1459 | pSrc++; |
| 1460 | |
| 1461 | if (ch > 0x7F) { |
| 1462 | goto LongCode; |
| 1463 | } |
| 1464 | |
| 1465 | // get pSrc 2-byte aligned |
| 1466 | if (((int)pSrc & 0x1) != 0) { |
| 1467 | ch = *pSrc; |
| 1468 | pSrc++; |
| 1469 | if (ch > 0x7F) { |
| 1470 | goto LongCode; |
| 1471 | } |
| 1472 | } |
| 1473 | |
| 1474 | // get pSrc 4-byte aligned |
| 1475 | if (((int)pSrc & 0x2) != 0) { |
| 1476 | ch = *(USHORT*)pSrc; |
| 1477 | if ((ch & 0x8080) != 0) { |
| 1478 | goto LongCodeWithMask16; |
| 1479 | } |
| 1480 | pSrc += 2; |
| 1481 | } |
| 1482 | |
| 1483 | |
| 1484 | // Run 8 + 8 characters at a time! |
| 1485 | while (pSrc < pStop) { |
| 1486 | ch = *(int*)pSrc; |
| 1487 | int chb = *(int*)(pSrc + 4); |
| 1488 | if (((ch | chb) & (int)0x80808080) != 0) { |
| 1489 | goto LongCodeWithMask32; |
| 1490 | } |
| 1491 | pSrc += 8; |
| 1492 | |
| 1493 | // This is a really small loop - unroll it |
| 1494 | if (pSrc >= pStop) |
| 1495 | break; |
| 1496 | |
| 1497 | ch = *(int*)pSrc; |
| 1498 | chb = *(int*)(pSrc + 4); |
| 1499 | if (((ch | chb) & (int)0x80808080) != 0) { |
| 1500 | goto LongCodeWithMask32; |
| 1501 | } |
| 1502 | pSrc += 8; |
| 1503 | } |
| 1504 | break; |
| 1505 | |
| 1506 | #if BIGENDIAN |
| 1507 | LongCodeWithMask32 : |
| 1508 | // be careful about the sign extension |
| 1509 | ch = (int)(((uint)ch) >> 16); |
| 1510 | LongCodeWithMask16: |
| 1511 | ch = (int)(((uint)ch) >> 8); |
| 1512 | #else // BIGENDIAN |
| 1513 | LongCodeWithMask32: |
| 1514 | LongCodeWithMask16: |
| 1515 | ch &= 0xFF; |
| 1516 | #endif // BIGENDIAN |
| 1517 | pSrc++; |
| 1518 | if (ch <= 0x7F) { |
| 1519 | continue; |
| 1520 | } |
| 1521 | |
| 1522 | LongCode: |
| 1523 | int chc = *pSrc; |
| 1524 | pSrc++; |
| 1525 | |
| 1526 | if ( |
| 1527 | // bit 6 has to be zero |
| 1528 | (ch & 0x40) == 0 || |
| 1529 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1530 | (chc & 0xC0) != 0x80) |
| 1531 | { |
| 1532 | goto BadLongCode; |
| 1533 | } |
| 1534 | |
| 1535 | chc &= 0x3F; |
| 1536 | |
| 1537 | // start a new long code |
| 1538 | if ((ch & 0x20) != 0) { |
| 1539 | |
| 1540 | // fold the first two bytes together |
| 1541 | chc |= (ch & 0x0F) << 6; |
| 1542 | |
| 1543 | if ((ch & 0x10) != 0) { |
| 1544 | // 4 byte encoding - surrogate |
| 1545 | ch = *pSrc; |
| 1546 | if ( |
| 1547 | // check that bit 4 is zero, the non-shortest form of surrogate |
| 1548 | // and the valid surrogate range 0x000000 - 0x10FFFF at the same time |
| 1549 | !InRange(chc >> 4, 0x01, 0x10) || |
| 1550 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1551 | (ch & 0xC0) != 0x80) |
| 1552 | { |
| 1553 | goto BadLongCode; |
| 1554 | } |
| 1555 | |
| 1556 | chc = (chc << 6) | (ch & 0x3F); |
| 1557 | |
| 1558 | ch = *(pSrc + 1); |
| 1559 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1560 | if ((ch & 0xC0) != 0x80) { |
| 1561 | goto BadLongCode; |
| 1562 | } |
| 1563 | pSrc += 2; |
| 1564 | |
| 1565 | // extra byte |
| 1566 | charCount--; |
| 1567 | } |
| 1568 | else { |
| 1569 | // 3 byte encoding |
| 1570 | ch = *pSrc; |
| 1571 | if ( |
| 1572 | // check for non-shortest form of 3 byte seq |
| 1573 | (chc & (0x1F << 5)) == 0 || |
| 1574 | // Can't have surrogates here. |
| 1575 | (chc & (0xF800 >> 6)) == (0xD800 >> 6) || |
| 1576 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1577 | (ch & 0xC0) != 0x80) |
| 1578 | { |
| 1579 | goto BadLongCode; |
| 1580 | } |
| 1581 | pSrc++; |
| 1582 | |
| 1583 | // extra byte |
| 1584 | charCount--; |
| 1585 | } |
| 1586 | } |
| 1587 | else { |
| 1588 | // 2 byte encoding |
| 1589 | |
| 1590 | // check for non-shortest form |
| 1591 | if ((ch & 0x1E) == 0) { |
| 1592 | goto BadLongCode; |
| 1593 | } |
| 1594 | } |
| 1595 | |
| 1596 | // extra byte |
| 1597 | charCount--; |
| 1598 | } |
| 1599 | #endif // FASTLOOP |
| 1600 | |
| 1601 | // no pending bits at this point |
| 1602 | ch = 0; |
| 1603 | continue; |
| 1604 | |
| 1605 | BadLongCode: |
| 1606 | pSrc -= 2; |
| 1607 | ch = 0; |
| 1608 | continue; |
| 1609 | } |
| 1610 | |
| 1611 | // May have a problem if we have to flush |
| 1612 | if (ch != 0) |
| 1613 | { |
| 1614 | // We were already adjusting for these, so need to unadjust |
| 1615 | charCount += (ch >> 30); |
| 1616 | // Have to do fallback for invalid bytes |
| 1617 | if (fallback == nullptr) |
| 1618 | { |
| 1619 | fallback = decoderFallback->CreateFallbackBuffer(); |
| 1620 | fallback->InternalInitialize(bytes, nullptr); |
| 1621 | } |
| 1622 | charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); |
| 1623 | } |
| 1624 | |
| 1625 | // Shouldn't have anything in fallback buffer for GetCharCount |
| 1626 | // (don't have to check m_throwOnOverflow for count) |
| 1627 | Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, |
| 1628 | "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end" ); |
| 1629 | |
| 1630 | InternalDelete(fallback); |
| 1631 | |
| 1632 | return charCount; |
| 1633 | |
| 1634 | } |
| 1635 | |
| 1636 | int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount) |
| 1637 | { |
| 1638 | Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr" ); |
| 1639 | Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0" ); |
| 1640 | Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0" ); |
| 1641 | Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr" ); |
| 1642 | |
| 1643 | BYTE *pSrc = bytes; |
| 1644 | WCHAR *pTarget = chars; |
| 1645 | |
| 1646 | BYTE *pEnd = pSrc + byteCount; |
| 1647 | WCHAR *pAllocatedBufferEnd = pTarget + charCount; |
| 1648 | |
| 1649 | int ch = 0; |
| 1650 | |
| 1651 | DecoderFallbackBuffer *fallback = nullptr; |
| 1652 | |
| 1653 | for (;;) |
| 1654 | { |
| 1655 | // SLOWLOOP: does all range checks, handles all special cases, but it is slow |
| 1656 | |
| 1657 | if (pSrc >= pEnd) { |
| 1658 | break; |
| 1659 | } |
| 1660 | |
| 1661 | // read next byte. The JIT optimization seems to be getting confused when |
| 1662 | // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead |
| 1663 | int cha = *pSrc; |
| 1664 | |
| 1665 | if (ch == 0) { |
| 1666 | // no pending bits |
| 1667 | goto ReadChar; |
| 1668 | } |
| 1669 | |
| 1670 | pSrc++; |
| 1671 | |
| 1672 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1673 | if ((cha & 0xC0) != 0x80) { |
| 1674 | // This can be a valid starting byte for another UTF8 byte sequence, so let's put |
| 1675 | // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence |
| 1676 | pSrc--; |
| 1677 | goto InvalidByteSequence; |
| 1678 | } |
| 1679 | |
| 1680 | // fold in the new byte |
| 1681 | ch = (ch << 6) | (cha & 0x3F); |
| 1682 | |
| 1683 | if ((ch & FinalByte) == 0) { |
| 1684 | // Not at last byte yet |
| 1685 | Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, |
| 1686 | "[UTF8Encoding.GetChars]Invariant volation" ); |
| 1687 | |
| 1688 | if ((ch & SupplimentarySeq) != 0) { |
| 1689 | // Its a 4-byte supplimentary sequence |
| 1690 | if ((ch & (FinalByte >> 6)) != 0) { |
| 1691 | // this is 3rd byte of 4 byte sequence - nothing to do |
| 1692 | continue; |
| 1693 | } |
| 1694 | |
| 1695 | // 2nd byte of 4 bytes |
| 1696 | // check for non-shortest form of surrogate and the valid surrogate |
| 1697 | // range 0x000000 - 0x10FFFF at the same time |
| 1698 | if (!InRange(ch & 0x1F0, 0x10, 0x100)) { |
| 1699 | goto InvalidByteSequence; |
| 1700 | } |
| 1701 | } |
| 1702 | else { |
| 1703 | // Must be 2nd byte of a 3-byte sequence |
| 1704 | // check for non-shortest form of 3 byte seq |
| 1705 | if ((ch & (0x1F << 5)) == 0 || // non-shortest form |
| 1706 | (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate |
| 1707 | { |
| 1708 | goto InvalidByteSequence; |
| 1709 | } |
| 1710 | } |
| 1711 | continue; |
| 1712 | } |
| 1713 | |
| 1714 | // ready to punch |
| 1715 | |
| 1716 | // surrogate in shortest form? |
| 1717 | // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? |
| 1718 | if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) { |
| 1719 | // let the range check for the second char throw the exception |
| 1720 | if (pTarget < pAllocatedBufferEnd) { |
| 1721 | *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + |
| 1722 | (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); |
| 1723 | pTarget++; |
| 1724 | |
| 1725 | ch = (ch & 0x3FF) + |
| 1726 | (int)(CharUnicodeInfo::LOW_SURROGATE_START); |
| 1727 | } |
| 1728 | } |
| 1729 | |
| 1730 | goto EncodeChar; |
| 1731 | |
| 1732 | InvalidByteSequence: |
| 1733 | // this code fragment should be close to the gotos referencing it |
| 1734 | // Have to do fallback for invalid bytes |
| 1735 | if (fallback == nullptr) |
| 1736 | { |
| 1737 | fallback = decoderFallback->CreateFallbackBuffer(); |
| 1738 | fallback->InternalInitialize(bytes, pAllocatedBufferEnd); |
| 1739 | } |
| 1740 | |
| 1741 | // That'll back us up the appropriate # of bytes if we didn't get anywhere |
| 1742 | if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget)) |
| 1743 | { |
| 1744 | // Ran out of buffer space |
| 1745 | // Need to throw an exception? |
| 1746 | Contract::Assert(pSrc >= bytes || pTarget == chars, |
| 1747 | "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback" ); |
| 1748 | fallback->InternalReset(); |
| 1749 | ThrowCharsOverflow(pTarget == chars); |
| 1750 | ch = 0; |
| 1751 | break; |
| 1752 | } |
| 1753 | Contract::Assert(pSrc >= bytes, |
| 1754 | "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array" ); |
| 1755 | ch = 0; |
| 1756 | continue; |
| 1757 | |
| 1758 | ReadChar: |
| 1759 | ch = *pSrc; |
| 1760 | pSrc++; |
| 1761 | |
| 1762 | ProcessChar: |
| 1763 | if (ch > 0x7F) { |
| 1764 | // If its > 0x7F, its start of a new multi-byte sequence |
| 1765 | |
| 1766 | // bit 6 has to be non-zero |
| 1767 | if ((ch & 0x40) == 0) { |
| 1768 | goto InvalidByteSequence; |
| 1769 | } |
| 1770 | |
| 1771 | // start a new long code |
| 1772 | if ((ch & 0x20) != 0) { |
| 1773 | if ((ch & 0x10) != 0) { |
| 1774 | // 4 byte encoding - supplimentary character (2 surrogates) |
| 1775 | |
| 1776 | ch &= 0x0F; |
| 1777 | |
| 1778 | // check that bit 4 is zero and the valid supplimentary character |
| 1779 | // range 0x000000 - 0x10FFFF at the same time |
| 1780 | if (ch > 0x04) { |
| 1781 | ch |= 0xf0; |
| 1782 | goto InvalidByteSequence; |
| 1783 | } |
| 1784 | |
| 1785 | ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | |
| 1786 | (SupplimentarySeq) | (SupplimentarySeq >> 6) | |
| 1787 | (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); |
| 1788 | } |
| 1789 | else { |
| 1790 | // 3 byte encoding |
| 1791 | ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | |
| 1792 | (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); |
| 1793 | } |
| 1794 | } |
| 1795 | else { |
| 1796 | // 2 byte encoding |
| 1797 | |
| 1798 | ch &= 0x1F; |
| 1799 | |
| 1800 | // check for non-shortest form |
| 1801 | if (ch <= 1) { |
| 1802 | ch |= 0xc0; |
| 1803 | goto InvalidByteSequence; |
| 1804 | } |
| 1805 | |
| 1806 | ch |= (FinalByte >> 6); |
| 1807 | } |
| 1808 | continue; |
| 1809 | } |
| 1810 | |
| 1811 | EncodeChar: |
| 1812 | // write the pending character |
| 1813 | if (pTarget >= pAllocatedBufferEnd) |
| 1814 | { |
| 1815 | // Fix chars so we make sure to throw if we didn't output anything |
| 1816 | ch &= 0x1fffff; |
| 1817 | if (ch > 0x7f) |
| 1818 | { |
| 1819 | if (ch > 0x7ff) |
| 1820 | { |
| 1821 | if (ch >= CharUnicodeInfo::LOW_SURROGATE_START && |
| 1822 | ch <= CharUnicodeInfo::LOW_SURROGATE_END) |
| 1823 | { |
| 1824 | pSrc--; // It was 4 bytes |
| 1825 | pTarget--; // 1 was stored already, but we can't remember 1/2, so back up |
| 1826 | } |
| 1827 | else if (ch > 0xffff) |
| 1828 | { |
| 1829 | pSrc--; // It was 4 bytes, nothing was stored |
| 1830 | } |
| 1831 | pSrc--; // It was at least 3 bytes |
| 1832 | } |
| 1833 | pSrc--; // It was at least 2 bytes |
| 1834 | } |
| 1835 | pSrc--; |
| 1836 | |
| 1837 | // Throw that we don't have enough room (pSrc could be < chars if we had started to process |
| 1838 | // a 4 byte sequence alredy) |
| 1839 | Contract::Assert(pSrc >= bytes || pTarget == chars, |
| 1840 | "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]" ); |
| 1841 | ThrowCharsOverflow(pTarget == chars); |
| 1842 | |
| 1843 | // Don't store ch in decoder, we already backed up to its start |
| 1844 | ch = 0; |
| 1845 | |
| 1846 | // Didn't throw, just use this buffer size. |
| 1847 | break; |
| 1848 | } |
| 1849 | *pTarget = (WCHAR)ch; |
| 1850 | pTarget++; |
| 1851 | |
| 1852 | #ifdef FASTLOOP |
| 1853 | int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); |
| 1854 | int availableBytes = PtrDiff(pEnd, pSrc); |
| 1855 | |
| 1856 | // don't fall into the fast decoding loop if we don't have enough bytes |
| 1857 | // Test for availableChars is done because pStop would be <= pTarget. |
| 1858 | if (availableBytes <= 13) { |
| 1859 | // we may need as many as 1 character per byte |
| 1860 | if (availableChars < availableBytes) { |
| 1861 | // not enough output room. no pending bits at this point |
| 1862 | ch = 0; |
| 1863 | continue; |
| 1864 | } |
| 1865 | |
| 1866 | // try to get over the remainder of the ascii characters fast though |
| 1867 | BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered |
| 1868 | while (pSrc < pLocalEnd) { |
| 1869 | ch = *pSrc; |
| 1870 | pSrc++; |
| 1871 | |
| 1872 | if (ch > 0x7F) |
| 1873 | goto ProcessChar; |
| 1874 | |
| 1875 | *pTarget = (WCHAR)ch; |
| 1876 | pTarget++; |
| 1877 | } |
| 1878 | // we are done |
| 1879 | ch = 0; |
| 1880 | break; |
| 1881 | } |
| 1882 | |
| 1883 | // we may need as many as 1 character per byte, so reduce the byte count if necessary. |
| 1884 | // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. |
| 1885 | if (availableChars < availableBytes) { |
| 1886 | availableBytes = availableChars; |
| 1887 | } |
| 1888 | |
| 1889 | // To compute the upper bound, assume that all characters are ASCII characters at this point, |
| 1890 | // the boundary will be decreased for every non-ASCII character we encounter |
| 1891 | // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences |
| 1892 | WCHAR *pStop = pTarget + availableBytes - 7; |
| 1893 | |
| 1894 | while (pTarget < pStop) { |
| 1895 | ch = *pSrc; |
| 1896 | pSrc++; |
| 1897 | |
| 1898 | if (ch > 0x7F) { |
| 1899 | goto LongCode; |
| 1900 | } |
| 1901 | *pTarget = (WCHAR)ch; |
| 1902 | pTarget++; |
| 1903 | |
| 1904 | // get pSrc to be 2-byte aligned |
| 1905 | if ((((int)pSrc) & 0x1) != 0) { |
| 1906 | ch = *pSrc; |
| 1907 | pSrc++; |
| 1908 | if (ch > 0x7F) { |
| 1909 | goto LongCode; |
| 1910 | } |
| 1911 | *pTarget = (WCHAR)ch; |
| 1912 | pTarget++; |
| 1913 | } |
| 1914 | |
| 1915 | // get pSrc to be 4-byte aligned |
| 1916 | if ((((int)pSrc) & 0x2) != 0) { |
| 1917 | ch = *(USHORT*)pSrc; |
| 1918 | if ((ch & 0x8080) != 0) { |
| 1919 | goto LongCodeWithMask16; |
| 1920 | } |
| 1921 | |
| 1922 | // Unfortunately, this is endianess sensitive |
| 1923 | #if BIGENDIAN |
| 1924 | *pTarget = (WCHAR)((ch >> 8) & 0x7F); |
| 1925 | pSrc += 2; |
| 1926 | *(pTarget + 1) = (WCHAR)(ch & 0x7F); |
| 1927 | pTarget += 2; |
| 1928 | #else // BIGENDIAN |
| 1929 | *pTarget = (WCHAR)(ch & 0x7F); |
| 1930 | pSrc += 2; |
| 1931 | *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); |
| 1932 | pTarget += 2; |
| 1933 | #endif // BIGENDIAN |
| 1934 | } |
| 1935 | |
| 1936 | // Run 8 characters at a time! |
| 1937 | while (pTarget < pStop) { |
| 1938 | ch = *(int*)pSrc; |
| 1939 | int chb = *(int*)(pSrc + 4); |
| 1940 | if (((ch | chb) & (int)0x80808080) != 0) { |
| 1941 | goto LongCodeWithMask32; |
| 1942 | } |
| 1943 | |
| 1944 | // Unfortunately, this is endianess sensitive |
| 1945 | #if BIGENDIAN |
| 1946 | *pTarget = (WCHAR)((ch >> 24) & 0x7F); |
| 1947 | *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F); |
| 1948 | *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F); |
| 1949 | *(pTarget + 3) = (WCHAR)(ch & 0x7F); |
| 1950 | pSrc += 8; |
| 1951 | *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F); |
| 1952 | *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F); |
| 1953 | *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F); |
| 1954 | *(pTarget + 7) = (WCHAR)(chb & 0x7F); |
| 1955 | pTarget += 8; |
| 1956 | #else // BIGENDIAN |
| 1957 | *pTarget = (WCHAR)(ch & 0x7F); |
| 1958 | *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); |
| 1959 | *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F); |
| 1960 | *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F); |
| 1961 | pSrc += 8; |
| 1962 | *(pTarget + 4) = (WCHAR)(chb & 0x7F); |
| 1963 | *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F); |
| 1964 | *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F); |
| 1965 | *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F); |
| 1966 | pTarget += 8; |
| 1967 | #endif // BIGENDIAN |
| 1968 | } |
| 1969 | break; |
| 1970 | |
| 1971 | #if BIGENDIAN |
| 1972 | LongCodeWithMask32 : |
| 1973 | // be careful about the sign extension |
| 1974 | ch = (int)(((uint)ch) >> 16); |
| 1975 | LongCodeWithMask16: |
| 1976 | ch = (int)(((uint)ch) >> 8); |
| 1977 | #else // BIGENDIAN |
| 1978 | LongCodeWithMask32: |
| 1979 | LongCodeWithMask16: |
| 1980 | ch &= 0xFF; |
| 1981 | #endif // BIGENDIAN |
| 1982 | pSrc++; |
| 1983 | if (ch <= 0x7F) { |
| 1984 | *pTarget = (WCHAR)ch; |
| 1985 | pTarget++; |
| 1986 | continue; |
| 1987 | } |
| 1988 | |
| 1989 | LongCode: |
| 1990 | int chc = *pSrc; |
| 1991 | pSrc++; |
| 1992 | |
| 1993 | if ( |
| 1994 | // bit 6 has to be zero |
| 1995 | (ch & 0x40) == 0 || |
| 1996 | // we are expecting to see trailing bytes like 10vvvvvv |
| 1997 | (chc & 0xC0) != 0x80) |
| 1998 | { |
| 1999 | goto BadLongCode; |
| 2000 | } |
| 2001 | |
| 2002 | chc &= 0x3F; |
| 2003 | |
| 2004 | // start a new long code |
| 2005 | if ((ch & 0x20) != 0) { |
| 2006 | |
| 2007 | // fold the first two bytes together |
| 2008 | chc |= (ch & 0x0F) << 6; |
| 2009 | |
| 2010 | if ((ch & 0x10) != 0) { |
| 2011 | // 4 byte encoding - surrogate |
| 2012 | ch = *pSrc; |
| 2013 | if ( |
| 2014 | // check that bit 4 is zero, the non-shortest form of surrogate |
| 2015 | // and the valid surrogate range 0x000000 - 0x10FFFF at the same time |
| 2016 | !InRange(chc >> 4, 0x01, 0x10) || |
| 2017 | // we are expecting to see trailing bytes like 10vvvvvv |
| 2018 | (ch & 0xC0) != 0x80) |
| 2019 | { |
| 2020 | goto BadLongCode; |
| 2021 | } |
| 2022 | |
| 2023 | chc = (chc << 6) | (ch & 0x3F); |
| 2024 | |
| 2025 | ch = *(pSrc + 1); |
| 2026 | // we are expecting to see trailing bytes like 10vvvvvv |
| 2027 | if ((ch & 0xC0) != 0x80) { |
| 2028 | goto BadLongCode; |
| 2029 | } |
| 2030 | pSrc += 2; |
| 2031 | |
| 2032 | ch = (chc << 6) | (ch & 0x3F); |
| 2033 | |
| 2034 | *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + |
| 2035 | (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); |
| 2036 | pTarget++; |
| 2037 | |
| 2038 | ch = (ch & 0x3FF) + |
| 2039 | (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START); |
| 2040 | |
| 2041 | // extra byte, we're already planning 2 chars for 2 of these bytes, |
| 2042 | // but the big loop is testing the target against pStop, so we need |
| 2043 | // to subtract 2 more or we risk overrunning the input. Subtract |
| 2044 | // one here and one below. |
| 2045 | pStop--; |
| 2046 | } |
| 2047 | else { |
| 2048 | // 3 byte encoding |
| 2049 | ch = *pSrc; |
| 2050 | if ( |
| 2051 | // check for non-shortest form of 3 byte seq |
| 2052 | (chc & (0x1F << 5)) == 0 || |
| 2053 | // Can't have surrogates here. |
| 2054 | (chc & (0xF800 >> 6)) == (0xD800 >> 6) || |
| 2055 | // we are expecting to see trailing bytes like 10vvvvvv |
| 2056 | (ch & 0xC0) != 0x80) |
| 2057 | { |
| 2058 | goto BadLongCode; |
| 2059 | } |
| 2060 | pSrc++; |
| 2061 | |
| 2062 | ch = (chc << 6) | (ch & 0x3F); |
| 2063 | |
| 2064 | // extra byte, we're only expecting 1 char for each of these 3 bytes, |
| 2065 | // but the loop is testing the target (not source) against pStop, so |
| 2066 | // we need to subtract 2 more or we risk overrunning the input. |
| 2067 | // Subtract 1 here and one more below |
| 2068 | pStop--; |
| 2069 | } |
| 2070 | } |
| 2071 | else { |
| 2072 | // 2 byte encoding |
| 2073 | |
| 2074 | ch &= 0x1F; |
| 2075 | |
| 2076 | // check for non-shortest form |
| 2077 | if (ch <= 1) { |
| 2078 | goto BadLongCode; |
| 2079 | } |
| 2080 | ch = (ch << 6) | chc; |
| 2081 | } |
| 2082 | |
| 2083 | *pTarget = (WCHAR)ch; |
| 2084 | pTarget++; |
| 2085 | |
| 2086 | // extra byte, we're only expecting 1 char for each of these 2 bytes, |
| 2087 | // but the loop is testing the target (not source) against pStop. |
| 2088 | // subtract an extra count from pStop so that we don't overrun the input. |
| 2089 | pStop--; |
| 2090 | } |
| 2091 | #endif // FASTLOOP |
| 2092 | |
| 2093 | Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd" ); |
| 2094 | |
| 2095 | // no pending bits at this point |
| 2096 | ch = 0; |
| 2097 | continue; |
| 2098 | |
| 2099 | BadLongCode: |
| 2100 | pSrc -= 2; |
| 2101 | ch = 0; |
| 2102 | continue; |
| 2103 | } |
| 2104 | |
| 2105 | if (ch != 0) |
| 2106 | { |
| 2107 | // Have to do fallback for invalid bytes |
| 2108 | if (fallback == nullptr) |
| 2109 | { |
| 2110 | fallback = decoderFallback->CreateFallbackBuffer(); |
| 2111 | fallback->InternalInitialize(bytes, pAllocatedBufferEnd); |
| 2112 | } |
| 2113 | |
| 2114 | // This'll back us up the appropriate # of bytes if we didn't get anywhere |
| 2115 | if (!FallbackInvalidByteSequence(pSrc, ch, fallback)) |
| 2116 | { |
| 2117 | Contract::Assert(pSrc >= bytes || pTarget == chars, |
| 2118 | "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing" ); |
| 2119 | |
| 2120 | // Ran out of buffer space |
| 2121 | // Need to throw an exception? |
| 2122 | fallback->InternalReset(); |
| 2123 | ThrowCharsOverflow(pTarget == chars); |
| 2124 | } |
| 2125 | Contract::Assert(pSrc >= bytes, |
| 2126 | "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array" ); |
| 2127 | ch = 0; |
| 2128 | } |
| 2129 | |
| 2130 | // Shouldn't have anything in fallback buffer for GetChars |
| 2131 | // (don't have to check m_throwOnOverflow for chars) |
| 2132 | Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, |
| 2133 | "[UTF8Encoding.GetChars]Expected empty fallback buffer at end" ); |
| 2134 | |
| 2135 | InternalDelete(fallback); |
| 2136 | |
| 2137 | return PtrDiff(pTarget, chars); |
| 2138 | } |
| 2139 | |
| 2140 | int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount) |
| 2141 | { |
| 2142 | Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr" ); |
| 2143 | Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0" ); |
| 2144 | Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0" ); |
| 2145 | Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr" ); |
| 2146 | |
| 2147 | // For fallback we may need a fallback buffer. |
| 2148 | // We wait to initialize it though in case we don't have any broken input unicode |
| 2149 | EncoderFallbackBuffer* fallbackBuffer = nullptr; |
| 2150 | WCHAR *pSrc = chars; |
| 2151 | BYTE *pTarget = bytes; |
| 2152 | |
| 2153 | WCHAR *pEnd = pSrc + charCount; |
| 2154 | BYTE *pAllocatedBufferEnd = pTarget + byteCount; |
| 2155 | |
| 2156 | int ch = 0; |
| 2157 | |
| 2158 | // assume that JIT will enregister pSrc, pTarget and ch |
| 2159 | |
| 2160 | for (;;) { |
| 2161 | // SLOWLOOP: does all range checks, handles all special cases, but it is slow |
| 2162 | |
| 2163 | if (pSrc >= pEnd) { |
| 2164 | |
| 2165 | if (ch == 0) { |
| 2166 | // Check if there's anything left to get out of the fallback buffer |
| 2167 | ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; |
| 2168 | if (ch > 0) { |
| 2169 | goto ProcessChar; |
| 2170 | } |
| 2171 | } |
| 2172 | else { |
| 2173 | // Case of leftover surrogates in the fallback buffer |
| 2174 | if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { |
| 2175 | Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, |
| 2176 | "[UTF8Encoding.GetBytes]expected high surrogate" ); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); |
| 2177 | |
| 2178 | int cha = ch; |
| 2179 | |
| 2180 | ch = fallbackBuffer->InternalGetNextChar(); |
| 2181 | |
| 2182 | if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2183 | ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); |
| 2184 | goto EncodeChar; |
| 2185 | } |
| 2186 | else if (ch > 0){ |
| 2187 | goto ProcessChar; |
| 2188 | } |
| 2189 | else { |
| 2190 | break; |
| 2191 | } |
| 2192 | } |
| 2193 | } |
| 2194 | |
| 2195 | // attempt to encode the partial surrogate (will fail or ignore) |
| 2196 | if (ch > 0) |
| 2197 | goto EncodeChar; |
| 2198 | |
| 2199 | // We're done |
| 2200 | break; |
| 2201 | } |
| 2202 | |
| 2203 | if (ch > 0) { |
| 2204 | // We have a high surrogate left over from a previous loop. |
| 2205 | Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, |
| 2206 | "[UTF8Encoding.GetBytes]expected high surrogate" );//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); |
| 2207 | |
| 2208 | // use separate helper variables for local contexts so that the jit optimizations |
| 2209 | // won't get confused about the variable lifetimes |
| 2210 | int cha = *pSrc; |
| 2211 | |
| 2212 | // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. |
| 2213 | // if (IsLowSurrogate(cha)) { |
| 2214 | if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2215 | ch = cha + (ch << 10) + |
| 2216 | (0x10000 |
| 2217 | - CharUnicodeInfo::LOW_SURROGATE_START |
| 2218 | - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); |
| 2219 | |
| 2220 | pSrc++; |
| 2221 | } |
| 2222 | // else ch is still high surrogate and encoding will fail |
| 2223 | |
| 2224 | // attempt to encode the surrogate or partial surrogate |
| 2225 | goto EncodeChar; |
| 2226 | } |
| 2227 | |
| 2228 | // If we've used a fallback, then we have to check for it |
| 2229 | if (fallbackBuffer != nullptr) |
| 2230 | { |
| 2231 | ch = fallbackBuffer->InternalGetNextChar(); |
| 2232 | if (ch > 0) goto ProcessChar; |
| 2233 | } |
| 2234 | |
| 2235 | // read next char. The JIT optimization seems to be getting confused when |
| 2236 | // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead |
| 2237 | ch = *pSrc; |
| 2238 | pSrc++; |
| 2239 | |
| 2240 | ProcessChar: |
| 2241 | if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { |
| 2242 | continue; |
| 2243 | } |
| 2244 | // either good char or partial surrogate |
| 2245 | |
| 2246 | EncodeChar: |
| 2247 | // throw exception on partial surrogate if necessary |
| 2248 | if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) |
| 2249 | { |
| 2250 | // Lone surrogates aren't allowed, we have to do fallback for them |
| 2251 | // Have to make a fallback buffer if we don't have one |
| 2252 | if (fallbackBuffer == nullptr) |
| 2253 | { |
| 2254 | // wait on fallbacks if we can |
| 2255 | // For fallback we may need a fallback buffer |
| 2256 | fallbackBuffer = encoderFallback->CreateFallbackBuffer(); |
| 2257 | |
| 2258 | // Set our internal fallback interesting things. |
| 2259 | fallbackBuffer->InternalInitialize(chars, pEnd, true); |
| 2260 | } |
| 2261 | |
| 2262 | // Do our fallback. Actually we already know its a mixed up surrogate, |
| 2263 | // so the ref pSrc isn't gonna do anything. |
| 2264 | fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); |
| 2265 | |
| 2266 | // Ignore it if we don't throw |
| 2267 | ch = 0; |
| 2268 | continue; |
| 2269 | } |
| 2270 | |
| 2271 | // Count bytes needed |
| 2272 | int bytesNeeded = 1; |
| 2273 | if (ch > 0x7F) { |
| 2274 | if (ch > 0x7FF) { |
| 2275 | if (ch > 0xFFFF) { |
| 2276 | bytesNeeded++; // 4 bytes (surrogate pair) |
| 2277 | } |
| 2278 | bytesNeeded++; // 3 bytes (800-FFFF) |
| 2279 | } |
| 2280 | bytesNeeded++; // 2 bytes (80-7FF) |
| 2281 | } |
| 2282 | |
| 2283 | if (pTarget > pAllocatedBufferEnd - bytesNeeded) { |
| 2284 | // Left over surrogate from last time will cause pSrc == chars, so we'll throw |
| 2285 | if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) |
| 2286 | { |
| 2287 | fallbackBuffer->MovePrevious(); // Didn't use this fallback char |
| 2288 | if (ch > 0xFFFF) |
| 2289 | fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either |
| 2290 | } |
| 2291 | else |
| 2292 | { |
| 2293 | pSrc--; // Didn't use this char |
| 2294 | if (ch > 0xFFFF) |
| 2295 | pSrc--; // Was surrogate, didn't use 2nd part either |
| 2296 | } |
| 2297 | Contract::Assert(pSrc >= chars || pTarget == bytes, |
| 2298 | "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room." ); |
| 2299 | ThrowBytesOverflow(pTarget == bytes); // Throw if we must |
| 2300 | ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) |
| 2301 | break; |
| 2302 | } |
| 2303 | |
| 2304 | if (ch <= 0x7F) { |
| 2305 | *pTarget = (BYTE)ch; |
| 2306 | } |
| 2307 | else { |
| 2308 | // use separate helper variables for local contexts so that the jit optimizations |
| 2309 | // won't get confused about the variable lifetimes |
| 2310 | int chb; |
| 2311 | if (ch <= 0x7FF) { |
| 2312 | // 2 BYTE encoding |
| 2313 | chb = (BYTE)(0xC0 | (ch >> 6)); |
| 2314 | } |
| 2315 | else |
| 2316 | { |
| 2317 | if (ch <= 0xFFFF) { |
| 2318 | chb = (BYTE)(0xE0 | (ch >> 12)); |
| 2319 | } |
| 2320 | else |
| 2321 | { |
| 2322 | *pTarget = (BYTE)(0xF0 | (ch >> 18)); |
| 2323 | pTarget++; |
| 2324 | |
| 2325 | chb = 0x80 | ((ch >> 12) & 0x3F); |
| 2326 | } |
| 2327 | *pTarget = (BYTE)chb; |
| 2328 | pTarget++; |
| 2329 | |
| 2330 | chb = 0x80 | ((ch >> 6) & 0x3F); |
| 2331 | } |
| 2332 | *pTarget = (BYTE)chb; |
| 2333 | pTarget++; |
| 2334 | |
| 2335 | *pTarget = (BYTE)0x80 | (ch & 0x3F); |
| 2336 | } |
| 2337 | pTarget++; |
| 2338 | |
| 2339 | |
| 2340 | #ifdef FASTLOOP |
| 2341 | // If still have fallback don't do fast loop |
| 2342 | if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) |
| 2343 | goto ProcessChar; |
| 2344 | |
| 2345 | int availableChars = PtrDiff(pEnd, pSrc); |
| 2346 | int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); |
| 2347 | |
| 2348 | // don't fall into the fast decoding loop if we don't have enough characters |
| 2349 | // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. |
| 2350 | if (availableChars <= 13) { |
| 2351 | // we are hoping for 1 BYTE per char |
| 2352 | if (availableBytes < availableChars) { |
| 2353 | // not enough output room. no pending bits at this point |
| 2354 | ch = 0; |
| 2355 | continue; |
| 2356 | } |
| 2357 | |
| 2358 | // try to get over the remainder of the ascii characters fast though |
| 2359 | WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered |
| 2360 | while (pSrc < pLocalEnd) { |
| 2361 | ch = *pSrc; |
| 2362 | pSrc++; |
| 2363 | |
| 2364 | // Not ASCII, need more than 1 BYTE per char |
| 2365 | if (ch > 0x7F) |
| 2366 | goto ProcessChar; |
| 2367 | |
| 2368 | *pTarget = (BYTE)ch; |
| 2369 | pTarget++; |
| 2370 | } |
| 2371 | // we are done, let ch be 0 to clear encoder |
| 2372 | ch = 0; |
| 2373 | break; |
| 2374 | } |
| 2375 | |
| 2376 | // we need at least 1 BYTE per character, but Convert might allow us to convert |
| 2377 | // only part of the input, so try as much as we can. Reduce charCount if necessary |
| 2378 | if (availableBytes < availableChars) |
| 2379 | { |
| 2380 | availableChars = availableBytes; |
| 2381 | } |
| 2382 | |
| 2383 | // FASTLOOP: |
| 2384 | // - optimistic range checks |
| 2385 | // - fallbacks to the slow loop for all special cases, exception throwing, etc. |
| 2386 | |
| 2387 | // To compute the upper bound, assume that all characters are ASCII characters at this point, |
| 2388 | // the boundary will be decreased for every non-ASCII character we encounter |
| 2389 | // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates |
| 2390 | // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. |
| 2391 | WCHAR *pStop = pSrc + availableChars - 5; |
| 2392 | |
| 2393 | while (pSrc < pStop) { |
| 2394 | ch = *pSrc; |
| 2395 | pSrc++; |
| 2396 | |
| 2397 | if (ch > 0x7F) { |
| 2398 | goto LongCode; |
| 2399 | } |
| 2400 | *pTarget = (BYTE)ch; |
| 2401 | pTarget++; |
| 2402 | |
| 2403 | // get pSrc aligned |
| 2404 | if (((size_t)pSrc & 0x2) != 0) { |
| 2405 | ch = *pSrc; |
| 2406 | pSrc++; |
| 2407 | if (ch > 0x7F) { |
| 2408 | goto LongCode; |
| 2409 | } |
| 2410 | *pTarget = (BYTE)ch; |
| 2411 | pTarget++; |
| 2412 | } |
| 2413 | |
| 2414 | // Run 4 characters at a time! |
| 2415 | while (pSrc < pStop) { |
| 2416 | ch = *(int*)pSrc; |
| 2417 | int chc = *(int*)(pSrc + 2); |
| 2418 | if (((ch | chc) & (int)0xFF80FF80) != 0) { |
| 2419 | goto LongCodeWithMask; |
| 2420 | } |
| 2421 | |
| 2422 | // Unfortunately, this is endianess sensitive |
| 2423 | #if BIGENDIAN |
| 2424 | *pTarget = (BYTE)(ch >> 16); |
| 2425 | *(pTarget + 1) = (BYTE)ch; |
| 2426 | pSrc += 4; |
| 2427 | *(pTarget + 2) = (BYTE)(chc >> 16); |
| 2428 | *(pTarget + 3) = (BYTE)chc; |
| 2429 | pTarget += 4; |
| 2430 | #else // BIGENDIAN |
| 2431 | *pTarget = (BYTE)ch; |
| 2432 | *(pTarget + 1) = (BYTE)(ch >> 16); |
| 2433 | pSrc += 4; |
| 2434 | *(pTarget + 2) = (BYTE)chc; |
| 2435 | *(pTarget + 3) = (BYTE)(chc >> 16); |
| 2436 | pTarget += 4; |
| 2437 | #endif // BIGENDIAN |
| 2438 | } |
| 2439 | continue; |
| 2440 | |
| 2441 | LongCodeWithMask: |
| 2442 | #if BIGENDIAN |
| 2443 | // be careful about the sign extension |
| 2444 | ch = (int)(((uint)ch) >> 16); |
| 2445 | #else // BIGENDIAN |
| 2446 | ch = (WCHAR)ch; |
| 2447 | #endif // BIGENDIAN |
| 2448 | pSrc++; |
| 2449 | |
| 2450 | if (ch > 0x7F) { |
| 2451 | goto LongCode; |
| 2452 | } |
| 2453 | *pTarget = (BYTE)ch; |
| 2454 | pTarget++; |
| 2455 | continue; |
| 2456 | |
| 2457 | LongCode: |
| 2458 | // use separate helper variables for slow and fast loop so that the jit optimizations |
| 2459 | // won't get confused about the variable lifetimes |
| 2460 | int chd; |
| 2461 | if (ch <= 0x7FF) { |
| 2462 | // 2 BYTE encoding |
| 2463 | chd = 0xC0 | (ch >> 6); |
| 2464 | } |
| 2465 | else { |
| 2466 | if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2467 | // 3 BYTE encoding |
| 2468 | chd = 0xE0 | (ch >> 12); |
| 2469 | } |
| 2470 | else |
| 2471 | { |
| 2472 | // 4 BYTE encoding - high surrogate + low surrogate |
| 2473 | if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) { |
| 2474 | // low without high -> bad, try again in slow loop |
| 2475 | pSrc -= 1; |
| 2476 | break; |
| 2477 | } |
| 2478 | |
| 2479 | chd = *pSrc; |
| 2480 | pSrc++; |
| 2481 | |
| 2482 | // if (!IsLowSurrogate(chd)) { |
| 2483 | if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2484 | // high not followed by low -> bad, try again in slow loop |
| 2485 | pSrc -= 2; |
| 2486 | break; |
| 2487 | } |
| 2488 | |
| 2489 | ch = chd + (ch << 10) + |
| 2490 | (0x10000 |
| 2491 | - CharUnicodeInfo::LOW_SURROGATE_START |
| 2492 | - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); |
| 2493 | |
| 2494 | *pTarget = (BYTE)(0xF0 | (ch >> 18)); |
| 2495 | // pStop - this BYTE is compensated by the second surrogate character |
| 2496 | // 2 input chars require 4 output bytes. 2 have been anticipated already |
| 2497 | // and 2 more will be accounted for by the 2 pStop-- calls below. |
| 2498 | pTarget++; |
| 2499 | |
| 2500 | chd = 0x80 | ((ch >> 12) & 0x3F); |
| 2501 | } |
| 2502 | *pTarget = (BYTE)chd; |
| 2503 | pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too. |
| 2504 | pTarget++; |
| 2505 | |
| 2506 | chd = 0x80 | ((ch >> 6) & 0x3F); |
| 2507 | } |
| 2508 | *pTarget = (BYTE)chd; |
| 2509 | pStop--; // 2 BYTE sequence for 1 char so need pStop--. |
| 2510 | pTarget++; |
| 2511 | |
| 2512 | *pTarget = (BYTE)(0x80 | (ch & 0x3F)); |
| 2513 | // pStop - this BYTE is already included |
| 2514 | pTarget++; |
| 2515 | } |
| 2516 | |
| 2517 | Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd" ); |
| 2518 | |
| 2519 | #endif // FASTLOOP |
| 2520 | |
| 2521 | // no pending char at this point |
| 2522 | ch = 0; |
| 2523 | } |
| 2524 | |
| 2525 | InternalDelete(fallbackBuffer); |
| 2526 | |
| 2527 | return (int)(pTarget - bytes); |
| 2528 | } |
| 2529 | |
| 2530 | int GetByteCount(WCHAR *chars, int count) |
| 2531 | { |
| 2532 | // For fallback we may need a fallback buffer. |
| 2533 | // We wait to initialize it though in case we don't have any broken input unicode |
| 2534 | EncoderFallbackBuffer* fallbackBuffer = nullptr; |
| 2535 | WCHAR *pSrc = chars; |
| 2536 | WCHAR *pEnd = pSrc + count; |
| 2537 | |
| 2538 | // Start by assuming we have as many as count |
| 2539 | int byteCount = count; |
| 2540 | |
| 2541 | int ch = 0; |
| 2542 | |
| 2543 | for (;;) { |
| 2544 | // SLOWLOOP: does all range checks, handles all special cases, but it is slow |
| 2545 | if (pSrc >= pEnd) { |
| 2546 | |
| 2547 | if (ch == 0) { |
| 2548 | // Unroll any fallback that happens at the end |
| 2549 | ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; |
| 2550 | if (ch > 0) { |
| 2551 | byteCount++; |
| 2552 | goto ProcessChar; |
| 2553 | } |
| 2554 | } |
| 2555 | else { |
| 2556 | // Case of surrogates in the fallback. |
| 2557 | if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { |
| 2558 | Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, |
| 2559 | "[UTF8Encoding.GetBytes]expected high surrogate" );// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); |
| 2560 | |
| 2561 | ch = fallbackBuffer->InternalGetNextChar(); |
| 2562 | byteCount++; |
| 2563 | |
| 2564 | if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2565 | ch = 0xfffd; |
| 2566 | byteCount++; |
| 2567 | goto EncodeChar; |
| 2568 | } |
| 2569 | else if (ch > 0){ |
| 2570 | goto ProcessChar; |
| 2571 | } |
| 2572 | else { |
| 2573 | byteCount--; // ignore last one. |
| 2574 | break; |
| 2575 | } |
| 2576 | } |
| 2577 | } |
| 2578 | |
| 2579 | if (ch <= 0) { |
| 2580 | break; |
| 2581 | } |
| 2582 | |
| 2583 | // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. |
| 2584 | byteCount++; |
| 2585 | goto EncodeChar; |
| 2586 | } |
| 2587 | |
| 2588 | if (ch > 0) { |
| 2589 | Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, |
| 2590 | "[UTF8Encoding.GetBytes]expected high surrogate" ); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); |
| 2591 | |
| 2592 | // use separate helper variables for local contexts so that the jit optimizations |
| 2593 | // won't get confused about the variable lifetimes |
| 2594 | int cha = *pSrc; |
| 2595 | |
| 2596 | // count the pending surrogate |
| 2597 | byteCount++; |
| 2598 | |
| 2599 | // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. |
| 2600 | // if (IsLowSurrogate(cha)) { |
| 2601 | if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2602 | // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. |
| 2603 | ch = 0xfffd; |
| 2604 | // ch = cha + (ch << 10) + |
| 2605 | // (0x10000 |
| 2606 | // - CharUnicodeInfo::LOW_SURROGATE_START |
| 2607 | // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) ); |
| 2608 | |
| 2609 | // Use this next char |
| 2610 | pSrc++; |
| 2611 | } |
| 2612 | // else ch is still high surrogate and encoding will fail (so don't add count) |
| 2613 | |
| 2614 | // attempt to encode the surrogate or partial surrogate |
| 2615 | goto EncodeChar; |
| 2616 | } |
| 2617 | |
| 2618 | // If we've used a fallback, then we have to check for it |
| 2619 | if (fallbackBuffer != nullptr) |
| 2620 | { |
| 2621 | ch = fallbackBuffer->InternalGetNextChar(); |
| 2622 | if (ch > 0) |
| 2623 | { |
| 2624 | // We have an extra byte we weren't expecting. |
| 2625 | byteCount++; |
| 2626 | goto ProcessChar; |
| 2627 | } |
| 2628 | } |
| 2629 | |
| 2630 | // read next char. The JIT optimization seems to be getting confused when |
| 2631 | // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead |
| 2632 | ch = *pSrc; |
| 2633 | pSrc++; |
| 2634 | |
| 2635 | ProcessChar: |
| 2636 | if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { |
| 2637 | // we will count this surrogate next time around |
| 2638 | byteCount--; |
| 2639 | continue; |
| 2640 | } |
| 2641 | // either good char or partial surrogate |
| 2642 | |
| 2643 | EncodeChar: |
| 2644 | // throw exception on partial surrogate if necessary |
| 2645 | if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) |
| 2646 | { |
| 2647 | // Lone surrogates aren't allowed |
| 2648 | // Have to make a fallback buffer if we don't have one |
| 2649 | if (fallbackBuffer == nullptr) |
| 2650 | { |
| 2651 | // wait on fallbacks if we can |
| 2652 | // For fallback we may need a fallback buffer |
| 2653 | fallbackBuffer = encoderFallback->CreateFallbackBuffer(); |
| 2654 | |
| 2655 | // Set our internal fallback interesting things. |
| 2656 | fallbackBuffer->InternalInitialize(chars, chars + count, false); |
| 2657 | } |
| 2658 | |
| 2659 | // Do our fallback. Actually we already know its a mixed up surrogate, |
| 2660 | // so the ref pSrc isn't gonna do anything. |
| 2661 | fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); |
| 2662 | |
| 2663 | // Ignore it if we don't throw (we had preallocated this ch) |
| 2664 | byteCount--; |
| 2665 | ch = 0; |
| 2666 | continue; |
| 2667 | } |
| 2668 | |
| 2669 | // Count them |
| 2670 | if (ch > 0x7F) { |
| 2671 | if (ch > 0x7FF) { |
| 2672 | // the extra surrogate byte was compensated by the second surrogate character |
| 2673 | // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) |
| 2674 | byteCount++; |
| 2675 | } |
| 2676 | byteCount++; |
| 2677 | } |
| 2678 | |
| 2679 | #if WIN64 |
| 2680 | // check for overflow |
| 2681 | if (byteCount < 0) { |
| 2682 | break; |
| 2683 | } |
| 2684 | #endif |
| 2685 | |
| 2686 | #ifdef FASTLOOP |
| 2687 | // If still have fallback don't do fast loop |
| 2688 | if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) |
| 2689 | { |
| 2690 | // We're reserving 1 byte for each char by default |
| 2691 | byteCount++; |
| 2692 | goto ProcessChar; |
| 2693 | } |
| 2694 | |
| 2695 | int availableChars = PtrDiff(pEnd, pSrc); |
| 2696 | |
| 2697 | // don't fall into the fast decoding loop if we don't have enough characters |
| 2698 | if (availableChars <= 13) { |
| 2699 | // try to get over the remainder of the ascii characters fast though |
| 2700 | WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered |
| 2701 | while (pSrc < pLocalEnd) { |
| 2702 | ch = *pSrc; |
| 2703 | pSrc++; |
| 2704 | if (ch > 0x7F) |
| 2705 | goto ProcessChar; |
| 2706 | } |
| 2707 | |
| 2708 | // we are done |
| 2709 | break; |
| 2710 | } |
| 2711 | |
| 2712 | #if WIN64 |
| 2713 | // make sure that we won't get a silent overflow inside the fast loop |
| 2714 | // (Fall out to slow loop if we have this many characters) |
| 2715 | availableChars &= 0x0FFFFFFF; |
| 2716 | #endif |
| 2717 | |
| 2718 | // To compute the upper bound, assume that all characters are ASCII characters at this point, |
| 2719 | // the boundary will be decreased for every non-ASCII character we encounter |
| 2720 | // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates |
| 2721 | WCHAR *pStop = pSrc + availableChars - (3 + 4); |
| 2722 | |
| 2723 | while (pSrc < pStop) { |
| 2724 | ch = *pSrc; |
| 2725 | pSrc++; |
| 2726 | |
| 2727 | if (ch > 0x7F) // Not ASCII |
| 2728 | { |
| 2729 | if (ch > 0x7FF) // Not 2 Byte |
| 2730 | { |
| 2731 | if ((ch & 0xF800) == 0xD800) // See if its a Surrogate |
| 2732 | goto LongCode; |
| 2733 | byteCount++; |
| 2734 | } |
| 2735 | byteCount++; |
| 2736 | } |
| 2737 | |
| 2738 | // get pSrc aligned |
| 2739 | if (((int)pSrc & 0x2) != 0) { |
| 2740 | ch = *pSrc; |
| 2741 | pSrc++; |
| 2742 | if (ch > 0x7F) // Not ASCII |
| 2743 | { |
| 2744 | if (ch > 0x7FF) // Not 2 Byte |
| 2745 | { |
| 2746 | if ((ch & 0xF800) == 0xD800) // See if its a Surrogate |
| 2747 | goto LongCode; |
| 2748 | byteCount++; |
| 2749 | } |
| 2750 | byteCount++; |
| 2751 | } |
| 2752 | } |
| 2753 | |
| 2754 | // Run 2 * 4 characters at a time! |
| 2755 | while (pSrc < pStop) { |
| 2756 | ch = *(int*)pSrc; |
| 2757 | int chc = *(int*)(pSrc + 2); |
| 2758 | if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII |
| 2759 | { |
| 2760 | if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte |
| 2761 | { |
| 2762 | goto LongCodeWithMask; |
| 2763 | } |
| 2764 | |
| 2765 | |
| 2766 | if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits) |
| 2767 | byteCount++; |
| 2768 | if ((ch & (int)0xFF80) != 0) |
| 2769 | byteCount++; |
| 2770 | if ((chc & (int)0xFF800000) != 0) |
| 2771 | byteCount++; |
| 2772 | if ((chc & (int)0xFF80) != 0) |
| 2773 | byteCount++; |
| 2774 | } |
| 2775 | pSrc += 4; |
| 2776 | |
| 2777 | ch = *(int*)pSrc; |
| 2778 | chc = *(int*)(pSrc + 2); |
| 2779 | if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII |
| 2780 | { |
| 2781 | if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte |
| 2782 | { |
| 2783 | goto LongCodeWithMask; |
| 2784 | } |
| 2785 | |
| 2786 | if ((ch & (int)0xFF800000) != 0) |
| 2787 | byteCount++; |
| 2788 | if ((ch & (int)0xFF80) != 0) |
| 2789 | byteCount++; |
| 2790 | if ((chc & (int)0xFF800000) != 0) |
| 2791 | byteCount++; |
| 2792 | if ((chc & (int)0xFF80) != 0) |
| 2793 | byteCount++; |
| 2794 | } |
| 2795 | pSrc += 4; |
| 2796 | } |
| 2797 | break; |
| 2798 | |
| 2799 | LongCodeWithMask: |
| 2800 | #if BIGENDIAN |
| 2801 | // be careful about the sign extension |
| 2802 | ch = (int)(((uint)ch) >> 16); |
| 2803 | #else // BIGENDIAN |
| 2804 | ch = (WCHAR)ch; |
| 2805 | #endif // BIGENDIAN |
| 2806 | pSrc++; |
| 2807 | |
| 2808 | if (ch <= 0x7F) { |
| 2809 | continue; |
| 2810 | } |
| 2811 | |
| 2812 | LongCode: |
| 2813 | // use separate helper variables for slow and fast loop so that the jit optimizations |
| 2814 | // won't get confused about the variable lifetimes |
| 2815 | if (ch > 0x7FF) { |
| 2816 | if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { |
| 2817 | // 4 byte encoding - high surrogate + low surrogate |
| 2818 | |
| 2819 | int chd = *pSrc; |
| 2820 | if ( |
| 2821 | ch > CharUnicodeInfo::HIGH_SURROGATE_END || |
| 2822 | !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) |
| 2823 | { |
| 2824 | // Back up and drop out to slow loop to figure out error |
| 2825 | pSrc--; |
| 2826 | break; |
| 2827 | } |
| 2828 | pSrc++; |
| 2829 | |
| 2830 | // byteCount - this byte is compensated by the second surrogate character |
| 2831 | } |
| 2832 | byteCount++; |
| 2833 | } |
| 2834 | byteCount++; |
| 2835 | |
| 2836 | // byteCount - the last byte is already included |
| 2837 | } |
| 2838 | #endif // FASTLOOP |
| 2839 | |
| 2840 | // no pending char at this point |
| 2841 | ch = 0; |
| 2842 | } |
| 2843 | |
| 2844 | #if WIN64 |
| 2845 | // check for overflow |
| 2846 | if (byteCount < 0) { |
| 2847 | throw ArgumentException("Conversion buffer overflow." ); |
| 2848 | } |
| 2849 | #endif |
| 2850 | |
| 2851 | Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0, |
| 2852 | "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer" ); |
| 2853 | |
| 2854 | InternalDelete(fallbackBuffer); |
| 2855 | |
| 2856 | return byteCount; |
| 2857 | } |
| 2858 | |
| 2859 | }; |
| 2860 | |
| 2861 | |
| 2862 | //////////////////////////////////////////////////////////////////////////// |
| 2863 | // |
| 2864 | // UTF8ToUnicode |
| 2865 | // |
| 2866 | // Maps a UTF-8 character string to its wide character string counterpart. |
| 2867 | // |
| 2868 | //////////////////////////////////////////////////////////////////////////// |
| 2869 | |
| 2870 | int UTF8ToUnicode( |
| 2871 | LPCSTR lpSrcStr, |
| 2872 | int cchSrc, |
| 2873 | LPWSTR lpDestStr, |
| 2874 | int cchDest, |
| 2875 | DWORD dwFlags |
| 2876 | ) |
| 2877 | { |
| 2878 | int ret; |
| 2879 | UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS); |
| 2880 | try { |
| 2881 | ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc); |
| 2882 | if (cchDest){ |
| 2883 | if (ret > cchDest){ |
| 2884 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| 2885 | ret = 0; |
| 2886 | } |
| 2887 | enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret); |
| 2888 | } |
| 2889 | } |
| 2890 | catch (const InsufficientBufferException& e){ |
| 2891 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| 2892 | return 0; |
| 2893 | } |
| 2894 | catch (const DecoderFallbackException& e){ |
| 2895 | SetLastError(ERROR_NO_UNICODE_TRANSLATION); |
| 2896 | return 0; |
| 2897 | } |
| 2898 | catch (const ArgumentException& e){ |
| 2899 | SetLastError(ERROR_INVALID_PARAMETER); |
| 2900 | return 0; |
| 2901 | } |
| 2902 | return ret; |
| 2903 | } |
| 2904 | |
| 2905 | //////////////////////////////////////////////////////////////////////////// |
| 2906 | // |
| 2907 | // UnicodeToUTF8 |
| 2908 | // |
| 2909 | // Maps a Unicode character string to its UTF-8 string counterpart. |
| 2910 | // |
| 2911 | //////////////////////////////////////////////////////////////////////////// |
| 2912 | |
| 2913 | int UnicodeToUTF8( |
| 2914 | LPCWSTR lpSrcStr, |
| 2915 | int cchSrc, |
| 2916 | LPSTR lpDestStr, |
| 2917 | int cchDest) |
| 2918 | { |
| 2919 | int ret; |
| 2920 | UTF8Encoding enc(false); |
| 2921 | try{ |
| 2922 | ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc); |
| 2923 | if (cchDest){ |
| 2924 | if (ret > cchDest){ |
| 2925 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| 2926 | ret = 0; |
| 2927 | } |
| 2928 | enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret); |
| 2929 | } |
| 2930 | } |
| 2931 | catch (const InsufficientBufferException& e){ |
| 2932 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| 2933 | return 0; |
| 2934 | } |
| 2935 | catch (const EncoderFallbackException& e){ |
| 2936 | SetLastError(ERROR_NO_UNICODE_TRANSLATION); |
| 2937 | return 0; |
| 2938 | } |
| 2939 | catch (const ArgumentException& e){ |
| 2940 | SetLastError(ERROR_INVALID_PARAMETER); |
| 2941 | return 0; |
| 2942 | } |
| 2943 | return ret; |
| 2944 | } |
| 2945 | |