1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4
5/*++
6
7
8
9Module Name:
10
11 unicode/utf8.c
12
13Abstract:
14 Functions to encode and decode UTF-8 strings. This is a port of the C# version from mscorlib.
15
16Revision History:
17
18
19
20--*/
21
22#include "pal/utf8.h"
23#include "pal/malloc.hpp"
24
// Make the names in the CorUnix namespace visible without qualification for the rest of this file.
using namespace CorUnix;

// NOTE(review): FASTLOOP is not referenced in the portion of the file reviewed here;
// presumably it enables an optimized code path further down -- confirm before removing.
#define FASTLOOP
28
29struct CharUnicodeInfo
30{
31 static const WCHAR HIGH_SURROGATE_START = 0xd800;
32 static const WCHAR HIGH_SURROGATE_END = 0xdbff;
33 static const WCHAR LOW_SURROGATE_START = 0xdc00;
34 static const WCHAR LOW_SURROGATE_END = 0xdfff;
35};
36
37struct Char
38{
39 // Test if the wide character is a high surrogate
40 static bool IsHighSurrogate(const WCHAR c)
41 {
42 return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START;
43 }
44
45 // Test if the wide character is a low surrogate
46 static bool IsLowSurrogate(const WCHAR c)
47 {
48 return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START;
49 }
50
51 // Test if the wide character is a low surrogate
52 static bool IsSurrogate(const WCHAR c)
53 {
54 return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START;
55 }
56
57 // Test if the wide character is a high surrogate
58 static bool IsHighSurrogate(const WCHAR* s, int index)
59 {
60 return IsHighSurrogate(s[index]);
61 }
62
63 // Test if the wide character is a low surrogate
64 static bool IsLowSurrogate(const WCHAR* s, int index)
65 {
66 return IsLowSurrogate(s[index]);
67 }
68
69 // Test if the wide character is a low surrogate
70 static bool IsSurrogate(const WCHAR* s, int index)
71 {
72 return IsSurrogate(s[index]);
73 }
74};
75
76class ArgumentException
77{
78
79public:
80 ArgumentException(LPCSTR message)
81 {
82 }
83
84 ArgumentException(LPCSTR message, LPCSTR argName)
85 {
86 }
87};
88
89class ArgumentNullException : public ArgumentException
90{
91public:
92 ArgumentNullException(LPCSTR argName)
93 : ArgumentException("Argument is NULL", argName)
94 {
95
96 }
97};
98
99class ArgumentOutOfRangeException : public ArgumentException
100{
101public:
102 ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message)
103 : ArgumentException(message, argName)
104 {
105
106 }
107};
108
109class InsufficientBufferException : public ArgumentException
110{
111public:
112 InsufficientBufferException(LPCSTR message, LPCSTR argName)
113 : ArgumentException(message, argName)
114 {
115
116 }
117};
118
119class Contract
120{
121public:
122 static void Assert(bool cond, LPCSTR str)
123 {
124 if (!cond)
125 {
126 throw ArgumentException(str);
127 }
128 }
129
130 static void EndContractBlock()
131 {
132 }
133};
134
135class DecoderFallbackException : public ArgumentException
136{
137 BYTE *bytesUnknown;
138 int index;
139
140public:
141 DecoderFallbackException(
142 LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message)
143 {
144 this->bytesUnknown = bytesUnknown;
145 this->index = index;
146 }
147
148 BYTE *BytesUnknown()
149 {
150 return (bytesUnknown);
151 }
152
153 int GetIndex()
154 {
155 return index;
156 }
157};
158
class DecoderFallbackBuffer;

// Abstract description of what to do when input bytes cannot be decoded.
// A DecoderFallback is a factory for the DecoderFallbackBuffer objects that
// carry the per-conversion fallback state.
class DecoderFallback
{
public:
    // Polymorphic base: a virtual destructor keeps destruction through a
    // DecoderFallback* well defined, matching DecoderFallbackBuffer.
    virtual ~DecoderFallback() = default;

    // Creates a new fallback buffer for this fallback.
    // The concrete implementations in this file allocate it with InternalNew.
    virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0;

    // Maximum number of characters that this instance of this fallback could return.
    virtual int GetMaxCharCount() = 0;
};
175
176class DecoderReplacementFallback : public DecoderFallback
177{
178 // Our variables
179 WCHAR strDefault[2];
180 int strDefaultLength;
181
182public:
183 // Construction. Default replacement fallback uses no best fit and ? replacement string
184 DecoderReplacementFallback() : DecoderReplacementFallback(W("?"))
185 {
186 }
187
188 DecoderReplacementFallback(const WCHAR* replacement)
189 {
190 // Must not be null
191 if (replacement == nullptr)
192 throw ArgumentNullException("replacement");
193 Contract::EndContractBlock();
194
195 // Make sure it doesn't have bad surrogate pairs
196 bool bFoundHigh = false;
197 int replacementLength = PAL_wcslen((const WCHAR *)replacement);
198 for (int i = 0; i < replacementLength; i++)
199 {
200 // Found a surrogate?
201 if (Char::IsSurrogate(replacement, i))
202 {
203 // High or Low?
204 if (Char::IsHighSurrogate(replacement, i))
205 {
206 // if already had a high one, stop
207 if (bFoundHigh)
208 break; // break & throw at the bFoundHIgh below
209 bFoundHigh = true;
210 }
211 else
212 {
213 // Low, did we have a high?
214 if (!bFoundHigh)
215 {
216 // Didn't have one, make if fail when we stop
217 bFoundHigh = true;
218 break;
219 }
220
221 // Clear flag
222 bFoundHigh = false;
223 }
224 }
225 // If last was high we're in trouble (not surrogate so not low surrogate, so break)
226 else if (bFoundHigh)
227 break;
228 }
229 if (bFoundHigh)
230 throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
231
232 wcscpy_s(strDefault, sizeof(strDefault), replacement);
233 strDefaultLength = replacementLength;
234 }
235
236 WCHAR* GetDefaultString()
237 {
238 return strDefault;
239 }
240
241 virtual DecoderFallbackBuffer* CreateFallbackBuffer();
242
243 // Maximum number of characters that this instance of this fallback could return
244 virtual int GetMaxCharCount()
245 {
246 return strDefaultLength;
247 }
248};
249
250class DecoderFallbackBuffer
251{
252 friend class UTF8Encoding;
253 // Most implimentations will probably need an implimenation-specific constructor
254
255 // internal methods that cannot be overriden that let us do our fallback thing
256 // These wrap the internal methods so that we can check for people doing stuff that's incorrect
257
258public:
259 virtual ~DecoderFallbackBuffer() = default;
260
261 virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0;
262
263 // Get next character
264 virtual WCHAR GetNextChar() = 0;
265
266 //Back up a character
267 virtual bool MovePrevious() = 0;
268
269 // How many chars left in this fallback?
270 virtual int GetRemaining() = 0;
271
272 // Clear the buffer
273 virtual void Reset()
274 {
275 while (GetNextChar() != (WCHAR)0);
276 }
277
278 // Internal items to help us figure out what we're doing as far as error messages, etc.
279 // These help us with our performance and messages internally
280protected:
281 BYTE* byteStart;
282 WCHAR* charEnd;
283
284 // Internal reset
285 void InternalReset()
286 {
287 byteStart = nullptr;
288 Reset();
289 }
290
291 // Set the above values
292 // This can't be part of the constructor because EncoderFallbacks would have to know how to impliment these.
293 void InternalInitialize(BYTE* byteStart, WCHAR* charEnd)
294 {
295 this->byteStart = byteStart;
296 this->charEnd = charEnd;
297 }
298
299 // Fallback the current byte by sticking it into the remaining char buffer.
300 // This can only be called by our encodings (other have to use the public fallback methods), so
301 // we can use our DecoderNLS here too (except we don't).
302 // Returns true if we are successful, false if we can't fallback the character (no buffer space)
303 // So caller needs to throw buffer space if return false.
304 // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
305 // array, and we might need the index, hence the byte*
306 // Don't touch ref chars unless we succeed
307 virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size)
308 {
309
310 Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
311
312 // See if there's a fallback character and we have an output buffer then copy our string.
313 if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
314 {
315 // Copy the chars to our output
316 WCHAR ch;
317 WCHAR* charTemp = *chars;
318 bool bHighSurrogate = false;
319 while ((ch = GetNextChar()) != 0)
320 {
321 // Make sure no mixed up surrogates
322 if (Char::IsSurrogate(ch))
323 {
324 if (Char::IsHighSurrogate(ch))
325 {
326 // High Surrogate
327 if (bHighSurrogate)
328 throw ArgumentException("String 'chars' contains invalid Unicode code points.");
329 bHighSurrogate = true;
330 }
331 else
332 {
333 // Low surrogate
334 if (bHighSurrogate == false)
335 throw ArgumentException("String 'chars' contains invalid Unicode code points.");
336 bHighSurrogate = false;
337 }
338 }
339
340 if (charTemp >= charEnd)
341 {
342 // No buffer space
343 return false;
344 }
345
346 *(charTemp++) = ch;
347 }
348
349 // Need to make sure that bHighSurrogate isn't true
350 if (bHighSurrogate)
351 throw ArgumentException("String 'chars' contains invalid Unicode code points.");
352
353 // Now we aren't going to be false, so its OK to update chars
354 *chars = charTemp;
355 }
356
357 return true;
358 }
359
360 // This version just counts the fallback and doesn't actually copy anything.
361 virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
362 // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the
363 // array, and we might need the index, hence the byte*
364 {
365
366 Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
367
368 // See if there's a fallback character and we have an output buffer then copy our string.
369 if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
370 {
371 int count = 0;
372
373 WCHAR ch;
374 bool bHighSurrogate = false;
375 while ((ch = GetNextChar()) != 0)
376 {
377 // Make sure no mixed up surrogates
378 if (Char::IsSurrogate(ch))
379 {
380 if (Char::IsHighSurrogate(ch))
381 {
382 // High Surrogate
383 if (bHighSurrogate)
384 throw ArgumentException("String 'chars' contains invalid Unicode code points.");
385 bHighSurrogate = true;
386 }
387 else
388 {
389 // Low surrogate
390 if (bHighSurrogate == false)
391 throw ArgumentException("String 'chars' contains invalid Unicode code points.");
392 bHighSurrogate = false;
393 }
394 }
395
396 count++;
397 }
398
399 // Need to make sure that bHighSurrogate isn't true
400 if (bHighSurrogate)
401 throw ArgumentException("String 'chars' contains invalid Unicode code points.");
402
403 return count;
404 }
405
406 // If no fallback return 0
407 return 0;
408 }
409
410 // private helper methods
411 void ThrowLastBytesRecursive(BYTE bytesUnknown[])
412 {
413 throw ArgumentException("Recursive fallback not allowed");
414 }
415};
416
417class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer
418{
419 // Store our default string
420 WCHAR strDefault[2];
421 int strDefaultLength;
422 int fallbackCount = -1;
423 int fallbackIndex = -1;
424
425public:
426 // Construction
427 DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback)
428 {
429 wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
430 strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
431 }
432
433 // Fallback Methods
434 virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
435 {
436 // We expect no previous fallback in our buffer
437 // We can't call recursively but others might (note, we don't test on last char!!!)
438 if (fallbackCount >= 1)
439 {
440 ThrowLastBytesRecursive(bytesUnknown);
441 }
442
443 // Go ahead and get our fallback
444 if (strDefaultLength == 0)
445 return false;
446
447 fallbackCount = strDefaultLength;
448 fallbackIndex = -1;
449
450 return true;
451 }
452
453 virtual WCHAR GetNextChar()
454 {
455 // We want it to get < 0 because == 0 means that the current/last character is a fallback
456 // and we need to detect recursion. We could have a flag but we already have this counter.
457 fallbackCount--;
458 fallbackIndex++;
459
460 // Do we have anything left? 0 is now last fallback char, negative is nothing left
461 if (fallbackCount < 0)
462 return '\0';
463
464 // Need to get it out of the buffer.
465 // Make sure it didn't wrap from the fast count-- path
466 if (fallbackCount == INT_MAX)
467 {
468 fallbackCount = -1;
469 return '\0';
470 }
471
472 // Now make sure its in the expected range
473 Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0,
474 "Index exceeds buffer range");
475
476 return strDefault[fallbackIndex];
477 }
478
479 virtual bool MovePrevious()
480 {
481 // Back up one, only if we just processed the last character (or earlier)
482 if (fallbackCount >= -1 && fallbackIndex >= 0)
483 {
484 fallbackIndex--;
485 fallbackCount++;
486 return true;
487 }
488
489 // Return false 'cause we couldn't do it.
490 return false;
491 }
492
493 // How many characters left to output?
494 virtual int GetRemaining()
495 {
496 // Our count is 0 for 1 character left.
497 return (fallbackCount < 0) ? 0 : fallbackCount;
498 }
499
500 // Clear the buffer
501 virtual void Reset()
502 {
503 fallbackCount = -1;
504 fallbackIndex = -1;
505 byteStart = nullptr;
506 }
507
508 // This version just counts the fallback and doesn't actually copy anything.
509 virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
510 // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
511 // array, and we might need the index, hence the byte*
512 {
513 // return our replacement string Length
514 return strDefaultLength;
515 }
516};
517
518class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer
519{
520public:
521 DecoderExceptionFallbackBuffer()
522 {
523 }
524
525 virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
526 {
527 throw DecoderFallbackException(
528 "Unable to translate UTF-8 character to Unicode", bytesUnknown, index);
529 }
530
531 virtual WCHAR GetNextChar()
532 {
533 return 0;
534 }
535
536 virtual bool MovePrevious()
537 {
538 // Exception fallback doesn't have anywhere to back up to.
539 return false;
540 }
541
542 // Exceptions are always empty
543 virtual int GetRemaining()
544 {
545 return 0;
546 }
547
548};
549
550class DecoderExceptionFallback : public DecoderFallback
551{
552 // Construction
553public:
554 DecoderExceptionFallback()
555 {
556 }
557
558 virtual DecoderFallbackBuffer* CreateFallbackBuffer()
559 {
560 return InternalNew<DecoderExceptionFallbackBuffer>();
561 }
562
563 // Maximum number of characters that this instance of this fallback could return
564 virtual int GetMaxCharCount()
565 {
566 return 0;
567 }
568};
569
570DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer()
571{
572 return InternalNew<DecoderReplacementFallbackBuffer>(this);
573}
574
575class EncoderFallbackException : public ArgumentException
576{
577 WCHAR charUnknown;
578 WCHAR charUnknownHigh;
579 WCHAR charUnknownLow;
580 int index;
581
582public:
583 EncoderFallbackException(
584 LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message)
585 {
586 this->charUnknown = charUnknown;
587 this->index = index;
588 }
589
590 EncoderFallbackException(
591 LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message)
592 {
593 if (!Char::IsHighSurrogate(charUnknownHigh))
594 {
595 throw ArgumentOutOfRangeException("charUnknownHigh",
596 "Argument out of range 0xD800..0xDBFF");
597 }
598 if (!Char::IsLowSurrogate(charUnknownLow))
599 {
600 throw ArgumentOutOfRangeException("charUnknownLow",
601 "Argument out of range 0xDC00..0xDFFF");
602 }
603 Contract::EndContractBlock();
604
605 this->charUnknownHigh = charUnknownHigh;
606 this->charUnknownLow = charUnknownLow;
607 this->index = index;
608 }
609
610 WCHAR GetCharUnknown()
611 {
612 return (charUnknown);
613 }
614
615 WCHAR GetCharUnknownHigh()
616 {
617 return (charUnknownHigh);
618 }
619
620 WCHAR GetCharUnknownLow()
621 {
622 return (charUnknownLow);
623 }
624
625 int GetIndex()
626 {
627 return index;
628 }
629
630 // Return true if the unknown character is a surrogate pair.
631 bool IsUnknownSurrogate()
632 {
633 return (charUnknownHigh != '\0');
634 }
635};
636
class EncoderFallbackBuffer;

// Abstract description of what to do when a character cannot be encoded.
// An EncoderFallback is a factory for the EncoderFallbackBuffer objects that
// carry the per-conversion fallback state.
class EncoderFallback
{
public:
    // Polymorphic base: a virtual destructor keeps destruction through an
    // EncoderFallback* well defined, matching EncoderFallbackBuffer.
    virtual ~EncoderFallback() = default;

    // Creates a new fallback buffer for this fallback.
    // The concrete implementations in this file allocate it with InternalNew.
    virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0;

    // Maximum number of characters that this instance of this fallback could return
    virtual int GetMaxCharCount() = 0;
};
652
653class EncoderReplacementFallback : public EncoderFallback
654{
655 // Our variables
656 WCHAR strDefault[2];
657 int strDefaultLength;
658
659public:
660 // Construction. Default replacement fallback uses no best fit and ? replacement string
661 EncoderReplacementFallback() : EncoderReplacementFallback(W("?"))
662 {
663 }
664
665 EncoderReplacementFallback(const WCHAR* replacement)
666 {
667 // Must not be null
668 if (replacement == nullptr)
669 throw ArgumentNullException("replacement");
670 Contract::EndContractBlock();
671
672 // Make sure it doesn't have bad surrogate pairs
673 bool bFoundHigh = false;
674 int replacementLength = PAL_wcslen((const WCHAR *)replacement);
675 for (int i = 0; i < replacementLength; i++)
676 {
677 // Found a surrogate?
678 if (Char::IsSurrogate(replacement, i))
679 {
680 // High or Low?
681 if (Char::IsHighSurrogate(replacement, i))
682 {
683 // if already had a high one, stop
684 if (bFoundHigh)
685 break; // break & throw at the bFoundHIgh below
686 bFoundHigh = true;
687 }
688 else
689 {
690 // Low, did we have a high?
691 if (!bFoundHigh)
692 {
693 // Didn't have one, make if fail when we stop
694 bFoundHigh = true;
695 break;
696 }
697
698 // Clear flag
699 bFoundHigh = false;
700 }
701 }
702 // If last was high we're in trouble (not surrogate so not low surrogate, so break)
703 else if (bFoundHigh)
704 break;
705 }
706 if (bFoundHigh)
707 throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
708
709 wcscpy_s(strDefault, sizeof(strDefault), replacement);
710 strDefaultLength = replacementLength;
711 }
712
713 WCHAR* GetDefaultString()
714 {
715 return strDefault;
716 }
717
718 virtual EncoderFallbackBuffer* CreateFallbackBuffer();
719
720 // Maximum number of characters that this instance of this fallback could return
721 virtual int GetMaxCharCount()
722 {
723 return strDefaultLength;
724 }
725};
726
727class EncoderFallbackBuffer
728{
729 friend class UTF8Encoding;
730 // Most implementations will probably need an implemenation-specific constructor
731
732 // Public methods that cannot be overriden that let us do our fallback thing
733 // These wrap the internal methods so that we can check for people doing stuff that is incorrect
734
735public:
736 virtual ~EncoderFallbackBuffer() = default;
737
738 virtual bool Fallback(WCHAR charUnknown, int index) = 0;
739
740 virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0;
741
742 // Get next character
743 virtual WCHAR GetNextChar() = 0;
744
745 // Back up a character
746 virtual bool MovePrevious() = 0;
747
748 // How many chars left in this fallback?
749 virtual int GetRemaining() = 0;
750
751 // Not sure if this should be public or not.
752 // Clear the buffer
753 virtual void Reset()
754 {
755 while (GetNextChar() != (WCHAR)0);
756 }
757
758 // Internal items to help us figure out what we're doing as far as error messages, etc.
759 // These help us with our performance and messages internally
760protected:
761 WCHAR* charStart;
762 WCHAR* charEnd;
763 bool setEncoder;
764 bool bUsedEncoder;
765 bool bFallingBack = false;
766 int iRecursionCount = 0;
767 static const int iMaxRecursion = 250;
768
769 // Internal Reset
770 // For example, what if someone fails a conversion and wants to reset one of our fallback buffers?
771 void InternalReset()
772 {
773 charStart = nullptr;
774 bFallingBack = false;
775 iRecursionCount = 0;
776 Reset();
777 }
778
779 // Set the above values
780 // This can't be part of the constructor because EncoderFallbacks would have to know how to impliment these.
781 void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder)
782 {
783 this->charStart = charStart;
784 this->charEnd = charEnd;
785 this->setEncoder = setEncoder;
786 this->bUsedEncoder = false;
787 this->bFallingBack = false;
788 this->iRecursionCount = 0;
789 }
790
791 WCHAR InternalGetNextChar()
792 {
793 WCHAR ch = GetNextChar();
794 bFallingBack = (ch != 0);
795 if (ch == 0) iRecursionCount = 0;
796 return ch;
797 }
798
799 // Fallback the current character using the remaining buffer and encoder if necessary
800 // This can only be called by our encodings (other have to use the public fallback methods), so
801 // we can use our EncoderNLS here too.
802 // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount
803 //
804 // Note that this could also change the contents of this->encoder, which is the same
805 // object that the caller is using, so the caller could mess up the encoder for us
806 // if they aren't careful.
807 virtual bool InternalFallback(WCHAR ch, WCHAR** chars)
808 {
809 // Shouldn't have null charStart
810 Contract::Assert(charStart != nullptr,
811 "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized");
812
813 // Get our index, remember chars was preincremented to point at next char, so have to -1
814 int index = (int)(*chars - charStart) - 1;
815
816 // See if it was a high surrogate
817 if (Char::IsHighSurrogate(ch))
818 {
819 // See if there's a low surrogate to go with it
820 if (*chars >= this->charEnd)
821 {
822 // Nothing left in input buffer
823 // No input, return 0
824 }
825 else
826 {
827 // Might have a low surrogate
828 WCHAR cNext = **chars;
829 if (Char::IsLowSurrogate(cNext))
830 {
831 // If already falling back then fail
832 if (bFallingBack && iRecursionCount++ > iMaxRecursion)
833 ThrowLastCharRecursive(ch, cNext);
834
835 // Next is a surrogate, add it as surrogate pair, and increment chars
836 (*chars)++;
837 bFallingBack = Fallback(ch, cNext, index);
838 return bFallingBack;
839 }
840
841 // Next isn't a low surrogate, just fallback the high surrogate
842 }
843 }
844
845 // If already falling back then fail
846 if (bFallingBack && iRecursionCount++ > iMaxRecursion)
847 ThrowLastCharRecursive((int)ch);
848
849 // Fall back our char
850 bFallingBack = Fallback(ch, index);
851
852 return bFallingBack;
853 }
854
855 // private helper methods
856 void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate)
857 {
858 // Throw it, using our complete character
859 throw ArgumentException("Recursive fallback not allowed", "chars");
860 }
861
862 void ThrowLastCharRecursive(int utf32Char)
863 {
864 throw ArgumentException("Recursive fallback not allowed", "chars");
865 }
866
867};
868
869class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer
870{
871 // Store our default string
872 WCHAR strDefault[4];
873 int strDefaultLength;
874 int fallbackCount = -1;
875 int fallbackIndex = -1;
876public:
877 // Construction
878 EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback)
879 {
880 // 2X in case we're a surrogate pair
881 wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
882 wcscat_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
883 strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
884
885 }
886
887 // Fallback Methods
888 virtual bool Fallback(WCHAR charUnknown, int index)
889 {
890 // If we had a buffer already we're being recursive, throw, it's probably at the suspect
891 // character in our array.
892 if (fallbackCount >= 1)
893 {
894 // If we're recursive we may still have something in our buffer that makes this a surrogate
895 if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 &&
896 Char::IsLowSurrogate(strDefault[fallbackIndex + 1]))
897 ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]);
898
899 // Nope, just one character
900 ThrowLastCharRecursive((int)charUnknown);
901 }
902
903 // Go ahead and get our fallback
904 // Divide by 2 because we aren't a surrogate pair
905 fallbackCount = strDefaultLength / 2;
906 fallbackIndex = -1;
907
908 return fallbackCount != 0;
909 }
910
911 virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
912 {
913 // Double check input surrogate pair
914 if (!Char::IsHighSurrogate(charUnknownHigh))
915 throw ArgumentOutOfRangeException("charUnknownHigh",
916 "Argument out of range 0xD800..0xDBFF");
917
918 if (!Char::IsLowSurrogate(charUnknownLow))
919 throw ArgumentOutOfRangeException("charUnknownLow",
920 "Argument out of range 0xDC00..0xDFFF");
921 Contract::EndContractBlock();
922
923 // If we had a buffer already we're being recursive, throw, it's probably at the suspect
924 // character in our array.
925 if (fallbackCount >= 1)
926 ThrowLastCharRecursive(charUnknownHigh, charUnknownLow);
927
928 // Go ahead and get our fallback
929 fallbackCount = strDefaultLength;
930 fallbackIndex = -1;
931
932 return fallbackCount != 0;
933 }
934
935 virtual WCHAR GetNextChar()
936 {
937 // We want it to get < 0 because == 0 means that the current/last character is a fallback
938 // and we need to detect recursion. We could have a flag but we already have this counter.
939 fallbackCount--;
940 fallbackIndex++;
941
942 // Do we have anything left? 0 is now last fallback char, negative is nothing left
943 if (fallbackCount < 0)
944 return '\0';
945
946 // Need to get it out of the buffer.
947 // Make sure it didn't wrap from the fast count-- path
948 if (fallbackCount == INT_MAX)
949 {
950 fallbackCount = -1;
951 return '\0';
952 }
953
954 // Now make sure its in the expected range
955 Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0,
956 "Index exceeds buffer range");
957
958 return strDefault[fallbackIndex];
959 }
960
961 virtual bool MovePrevious()
962 {
963 // Back up one, only if we just processed the last character (or earlier)
964 if (fallbackCount >= -1 && fallbackIndex >= 0)
965 {
966 fallbackIndex--;
967 fallbackCount++;
968 return true;
969 }
970
971 // Return false 'cause we couldn't do it.
972 return false;
973 }
974
975 // How many characters left to output?
976 virtual int GetRemaining()
977 {
978 // Our count is 0 for 1 character left.
979 return (fallbackCount < 0) ? 0 : fallbackCount;
980 }
981
982 // Clear the buffer
983 virtual void Reset()
984 {
985 fallbackCount = -1;
986 fallbackIndex = 0;
987 charStart = nullptr;
988 bFallingBack = false;
989 }
990};
991
992class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer
993{
994public:
995 EncoderExceptionFallbackBuffer()
996 {
997 }
998
999 virtual bool Fallback(WCHAR charUnknown, int index)
1000 {
1001 // Fall back our char
1002 throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index);
1003 }
1004
1005 virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
1006 {
1007 if (!Char::IsHighSurrogate(charUnknownHigh))
1008 {
1009 throw ArgumentOutOfRangeException("charUnknownHigh",
1010 "Argument out of range 0xD800..0xDBFF");
1011 }
1012 if (!Char::IsLowSurrogate(charUnknownLow))
1013 {
1014 throw ArgumentOutOfRangeException("charUnknownLow",
1015 "Argument out of range 0xDC00..0xDFFF");
1016 }
1017 Contract::EndContractBlock();
1018
1019 //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow);
1020
1021 // Fall back our char
1022 throw EncoderFallbackException(
1023 "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index);
1024 }
1025
1026 virtual WCHAR GetNextChar()
1027 {
1028 return 0;
1029 }
1030
1031 virtual bool MovePrevious()
1032 {
1033 // Exception fallback doesn't have anywhere to back up to.
1034 return false;
1035 }
1036
1037 // Exceptions are always empty
1038 virtual int GetRemaining()
1039 {
1040 return 0;
1041 }
1042};
1043
1044class EncoderExceptionFallback : public EncoderFallback
1045{
1046 // Construction
1047public:
1048 EncoderExceptionFallback()
1049 {
1050 }
1051
1052 virtual EncoderFallbackBuffer* CreateFallbackBuffer()
1053 {
1054 return InternalNew<EncoderExceptionFallbackBuffer>();
1055 }
1056
1057 // Maximum number of characters that this instance of this fallback could return
1058 virtual int GetMaxCharCount()
1059 {
1060 return 0;
1061 }
1062};
1063
1064EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer()
1065{
1066 return InternalNew<EncoderReplacementFallbackBuffer>(this);
1067}
1068
1069class UTF8Encoding
1070{
1071 EncoderFallback* encoderFallback;
1072 // Instances of the two possible fallbacks. The constructor parameter
1073 // determines which one to use.
1074 EncoderReplacementFallback encoderReplacementFallback;
1075 EncoderExceptionFallback encoderExceptionFallback;
1076
1077 DecoderFallback* decoderFallback;
1078 // Instances of the two possible fallbacks. The constructor parameter
1079 // determines which one to use.
1080 DecoderReplacementFallback decoderReplacementFallback;
1081 DecoderExceptionFallback decoderExceptionFallback;
1082
1083 bool InRange(WCHAR c, WCHAR begin, WCHAR end)
1084 {
1085 return begin <= c && c <= end;
1086 }
1087
1088 size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2)
1089 {
1090 return ptr1 - ptr2;
1091 }
1092
1093 size_t PtrDiff(BYTE* ptr1, BYTE* ptr2)
1094 {
1095 return ptr1 - ptr2;
1096 }
1097
1098 void ThrowBytesOverflow()
1099 {
1100 // Special message to include fallback type in case fallback's GetMaxCharCount is broken
1101 // This happens if user has implimented an encoder fallback with a broken GetMaxCharCount
1102 throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes");
1103 }
1104
1105 void ThrowBytesOverflow(bool nothingEncoded)
1106 {
1107 // Special message to include fallback type in case fallback's GetMaxCharCount is broken
1108 // This happens if user has implimented an encoder fallback with a broken GetMaxCharCount
1109 if (nothingEncoded){
1110 ThrowBytesOverflow();
1111 }
1112 }
1113
1114 void ThrowCharsOverflow()
1115 {
1116 // Special message to include fallback type in case fallback's GetMaxCharCount is broken
1117 // This happens if user has implimented a decoder fallback with a broken GetMaxCharCount
1118 throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars");
1119 }
1120
1121 void ThrowCharsOverflow(bool nothingEncoded)
1122 {
1123 // Special message to include fallback type in case fallback's GetMaxCharCount is broken
1124 // This happens if user has implimented an decoder fallback with a broken GetMaxCharCount
1125 if (nothingEncoded){
1126 ThrowCharsOverflow();
1127 }
1128 }
1129
1130 // During GetChars we had an invalid byte sequence
1131 // pSrc is backed up to the start of the bad sequence if we didn't have room to
1132 // fall it back. Otherwise pSrc remains where it is.
1133 bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget)
1134 {
1135 // Get our byte[]
1136 BYTE* pStart = *pSrc;
1137 BYTE* bytesUnknown;
1138 int size = GetBytesUnknown(pStart, ch, &bytesUnknown);
1139
1140 // Do the actual fallback
1141 if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size))
1142 {
1143 // Oops, it failed, back up to pStart
1144 *pSrc = pStart;
1145 return false;
1146 }
1147
1148 // It worked
1149 return true;
1150 }
1151
1152 int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback)
1153 {
1154 // Get our byte[]
1155 BYTE *bytesUnknown;
1156 int size = GetBytesUnknown(pSrc, ch, &bytesUnknown);
1157
1158 // Do the actual fallback
1159 int count = fallback->InternalFallback(bytesUnknown, pSrc, size);
1160
1161 // # of fallback chars expected.
1162 // Note that we only get here for "long" sequences, and have already unreserved
1163 // the count that we prereserved for the input bytes
1164 return count;
1165 }
1166
1167 int GetBytesUnknown(BYTE* pSrc, int ch, BYTE **bytesUnknown)
1168 {
1169 int size;
1170 BYTE bytes[3];
1171
1172 // See if it was a plain char
1173 // (have to check >= 0 because we have all sorts of wierd bit flags)
1174 if (ch < 0x100 && ch >= 0)
1175 {
1176 pSrc--;
1177 bytes[0] = (BYTE)ch;
1178 size = 1;
1179 }
1180 // See if its an unfinished 2 byte sequence
1181 else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
1182 {
1183 pSrc--;
1184 bytes[0] = (BYTE)((ch & 0x1F) | 0xc0);
1185 size = 1;
1186 }
1187 // So now we're either 2nd byte of 3 or 4 byte sequence or
1188 // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
1189 // 1st check if its a 4 byte sequence
1190 else if ((ch & SupplimentarySeq) != 0)
1191 {
1192 // 3rd byte of 4 byte sequence?
1193 if ((ch & (FinalByte >> 6)) != 0)
1194 {
1195 // 3rd byte of 4 byte sequence
1196 pSrc -= 3;
1197 bytes[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0);
1198 bytes[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80);
1199 bytes[2] = (BYTE)(((ch)& 0x3F) | 0x80);
1200 size = 3;
1201 }
1202 else if ((ch & (FinalByte >> 12)) != 0)
1203 {
1204 // 2nd byte of a 4 byte sequence
1205 pSrc -= 2;
1206 bytes[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0);
1207 bytes[1] = (BYTE)(((ch)& 0x3F) | 0x80);
1208 size = 2;
1209 }
1210 else
1211 {
1212 // 4th byte of a 4 byte sequence
1213 pSrc--;
1214 bytes[0] = (BYTE)(((ch)& 0x07) | 0xF0);
1215 size = 1;
1216 }
1217 }
1218 else
1219 {
1220 // 2nd byte of 3 byte sequence?
1221 if ((ch & (FinalByte >> 6)) != 0)
1222 {
1223 // So its 2nd byte of a 3 byte sequence
1224 pSrc -= 2;
1225 bytes[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0);
1226 bytes[1] = (BYTE)(((ch)& 0x3F) | 0x80);
1227 size = 2;
1228 }
1229 else
1230 {
1231 // 1st byte of a 3 byte sequence
1232 pSrc--;
1233 bytes[0] = (BYTE)(((ch)& 0x0F) | 0xE0);
1234 size = 1;
1235 }
1236 }
1237
1238 *bytesUnknown = bytes;
1239 return size;
1240 }
1241
1242public:
1243
1244 UTF8Encoding(bool isThrowException)
1245 : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD"))
1246 {
1247 if (isThrowException)
1248 {
1249 encoderFallback = &encoderExceptionFallback;
1250 decoderFallback = &decoderExceptionFallback;
1251 }
1252 else
1253 {
1254 encoderFallback = &encoderReplacementFallback;
1255 decoderFallback = &decoderReplacementFallback;
1256 }
1257 }
1258
// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
// while the actual character is being built in the lower bits. They are shifted together
// with the actual bits of the character (each trail byte is folded in with "ch << 6").

// bits 30 & 31 are used for pending bits fixup (GetCharCount adds "ch >> 30" to its count
// when a sequence is abandoned part-way)

// Reaches bit 29 once the last trail byte has been folded in: lead-byte handling ORs in
// FinalByte shifted right by 6 for every trail byte still expected.
const int FinalByte = 1 << 29;
// Marks a 4-byte (supplementary character) sequence; pre-shifted copies are ORed in with the
// lead byte so the marker is still at this position after each fold.
const int SupplimentarySeq = 1 << 28;
// Marks a 3-byte sequence; pre-shifted copies are ORed in the same way.
const int ThreeByteSeq = 1 << 27;
1267
1268 int GetCharCount(BYTE* bytes, int count)
1269 {
1270 Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr");
1271 Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
1272
1273 // Initialize stuff
1274 BYTE *pSrc = bytes;
1275 BYTE *pEnd = pSrc + count;
1276
1277 // Start by assuming we have as many as count, charCount always includes the adjustment
1278 // for the character being decoded
1279 int charCount = count;
1280 int ch = 0;
1281 DecoderFallbackBuffer *fallback = nullptr;
1282
1283 for (;;)
1284 {
1285 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1286 if (pSrc >= pEnd) {
1287 break;
1288 }
1289
1290 // read next byte. The JIT optimization seems to be getting confused when
1291 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1292 int cha = *pSrc;
1293
1294 if (ch == 0) {
1295 // no pending bits
1296 goto ReadChar;
1297 }
1298
1299 pSrc++;
1300
1301 // we are expecting to see trailing bytes like 10vvvvvv
1302 if ((cha & 0xC0) != 0x80) {
1303 // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1304 // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1305 pSrc--;
1306 charCount += (ch >> 30);
1307 goto InvalidByteSequence;
1308 }
1309
1310 // fold in the new byte
1311 ch = (ch << 6) | (cha & 0x3F);
1312
1313 if ((ch & FinalByte) == 0) {
1314 Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1315 "[UTF8Encoding.GetChars]Invariant volation");
1316
1317 if ((ch & SupplimentarySeq) != 0) {
1318 if ((ch & (FinalByte >> 6)) != 0) {
1319 // this is 3rd byte (of 4 byte supplimentary) - nothing to do
1320 continue;
1321 }
1322
1323 // 2nd byte, check for non-shortest form of supplimentary char and the valid
1324 // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
1325 if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
1326 goto InvalidByteSequence;
1327 }
1328 }
1329 else {
1330 // Must be 2nd byte of a 3-byte sequence
1331 // check for non-shortest form of 3 byte seq
1332 if ((ch & (0x1F << 5)) == 0 || // non-shortest form
1333 (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
1334 {
1335 goto InvalidByteSequence;
1336 }
1337 }
1338 continue;
1339 }
1340
1341 // ready to punch
1342
1343 // adjust for surrogates in non-shortest form
1344 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
1345 charCount--;
1346 }
1347 goto EncodeChar;
1348
1349 InvalidByteSequence:
1350 // this code fragment should be close to the gotos referencing it
1351 // Have to do fallback for invalid bytes
1352 if (fallback == nullptr)
1353 {
1354 fallback = decoderFallback->CreateFallbackBuffer();
1355 fallback->InternalInitialize(bytes, nullptr);
1356 }
1357 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1358
1359 ch = 0;
1360 continue;
1361
1362 ReadChar:
1363 ch = *pSrc;
1364 pSrc++;
1365
1366 ProcessChar:
1367 if (ch > 0x7F) {
1368 // If its > 0x7F, its start of a new multi-byte sequence
1369
1370 // Long sequence, so unreserve our char.
1371 charCount--;
1372
1373 // bit 6 has to be non-zero for start of multibyte chars.
1374 if ((ch & 0x40) == 0) {
1375 // Unexpected trail byte
1376 goto InvalidByteSequence;
1377 }
1378
1379 // start a new long code
1380 if ((ch & 0x20) != 0) {
1381 if ((ch & 0x10) != 0) {
1382 // 4 byte encoding - supplimentary character (2 surrogates)
1383
1384 ch &= 0x0F;
1385
1386 // check that bit 4 is zero and the valid supplimentary character
1387 // range 0x000000 - 0x10FFFF at the same time
1388 if (ch > 0x04) {
1389 ch |= 0xf0;
1390 goto InvalidByteSequence;
1391 }
1392
1393 // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1394 // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
1395 ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
1396 (1 << 30) | // If it dies on next byte we'll need an extra char
1397 (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
1398 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1399 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1400
1401 // Our character count will be 2 characters for these 4 bytes, so subtract another char
1402 charCount--;
1403 }
1404 else {
1405 // 3 byte encoding
1406 // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1407 ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1408 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1409
1410 // We'll expect 1 character for these 3 bytes, so subtract another char.
1411 charCount--;
1412 }
1413 }
1414 else {
1415 // 2 byte encoding
1416
1417 ch &= 0x1F;
1418
1419 // check for non-shortest form
1420 if (ch <= 1) {
1421 ch |= 0xc0;
1422 goto InvalidByteSequence;
1423 }
1424
1425 // Add bit flags so we'll be flagged correctly
1426 ch |= (FinalByte >> 6);
1427 }
1428 continue;
1429 }
1430
1431 EncodeChar:
1432
1433#ifdef FASTLOOP
1434 int availableBytes = PtrDiff(pEnd, pSrc);
1435
1436 // don't fall into the fast decoding loop if we don't have enough bytes
1437 if (availableBytes <= 13) {
1438 // try to get over the remainder of the ascii characters fast though
1439 BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1440 while (pSrc < pLocalEnd) {
1441 ch = *pSrc;
1442 pSrc++;
1443
1444 if (ch > 0x7F)
1445 goto ProcessChar;
1446 }
1447 // we are done
1448 ch = 0;
1449 break;
1450 }
1451
1452 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1453 // the boundary will be decreased for every non-ASCII character we encounter
1454 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1455 BYTE *pStop = pSrc + availableBytes - 7;
1456
1457 while (pSrc < pStop) {
1458 ch = *pSrc;
1459 pSrc++;
1460
1461 if (ch > 0x7F) {
1462 goto LongCode;
1463 }
1464
1465 // get pSrc 2-byte aligned
1466 if (((int)pSrc & 0x1) != 0) {
1467 ch = *pSrc;
1468 pSrc++;
1469 if (ch > 0x7F) {
1470 goto LongCode;
1471 }
1472 }
1473
1474 // get pSrc 4-byte aligned
1475 if (((int)pSrc & 0x2) != 0) {
1476 ch = *(USHORT*)pSrc;
1477 if ((ch & 0x8080) != 0) {
1478 goto LongCodeWithMask16;
1479 }
1480 pSrc += 2;
1481 }
1482
1483
1484 // Run 8 + 8 characters at a time!
1485 while (pSrc < pStop) {
1486 ch = *(int*)pSrc;
1487 int chb = *(int*)(pSrc + 4);
1488 if (((ch | chb) & (int)0x80808080) != 0) {
1489 goto LongCodeWithMask32;
1490 }
1491 pSrc += 8;
1492
1493 // This is a really small loop - unroll it
1494 if (pSrc >= pStop)
1495 break;
1496
1497 ch = *(int*)pSrc;
1498 chb = *(int*)(pSrc + 4);
1499 if (((ch | chb) & (int)0x80808080) != 0) {
1500 goto LongCodeWithMask32;
1501 }
1502 pSrc += 8;
1503 }
1504 break;
1505
1506#if BIGENDIAN
1507 LongCodeWithMask32 :
1508 // be careful about the sign extension
1509 ch = (int)(((uint)ch) >> 16);
1510 LongCodeWithMask16:
1511 ch = (int)(((uint)ch) >> 8);
1512#else // BIGENDIAN
1513 LongCodeWithMask32:
1514 LongCodeWithMask16:
1515 ch &= 0xFF;
1516#endif // BIGENDIAN
1517 pSrc++;
1518 if (ch <= 0x7F) {
1519 continue;
1520 }
1521
1522 LongCode:
1523 int chc = *pSrc;
1524 pSrc++;
1525
1526 if (
1527 // bit 6 has to be zero
1528 (ch & 0x40) == 0 ||
1529 // we are expecting to see trailing bytes like 10vvvvvv
1530 (chc & 0xC0) != 0x80)
1531 {
1532 goto BadLongCode;
1533 }
1534
1535 chc &= 0x3F;
1536
1537 // start a new long code
1538 if ((ch & 0x20) != 0) {
1539
1540 // fold the first two bytes together
1541 chc |= (ch & 0x0F) << 6;
1542
1543 if ((ch & 0x10) != 0) {
1544 // 4 byte encoding - surrogate
1545 ch = *pSrc;
1546 if (
1547 // check that bit 4 is zero, the non-shortest form of surrogate
1548 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1549 !InRange(chc >> 4, 0x01, 0x10) ||
1550 // we are expecting to see trailing bytes like 10vvvvvv
1551 (ch & 0xC0) != 0x80)
1552 {
1553 goto BadLongCode;
1554 }
1555
1556 chc = (chc << 6) | (ch & 0x3F);
1557
1558 ch = *(pSrc + 1);
1559 // we are expecting to see trailing bytes like 10vvvvvv
1560 if ((ch & 0xC0) != 0x80) {
1561 goto BadLongCode;
1562 }
1563 pSrc += 2;
1564
1565 // extra byte
1566 charCount--;
1567 }
1568 else {
1569 // 3 byte encoding
1570 ch = *pSrc;
1571 if (
1572 // check for non-shortest form of 3 byte seq
1573 (chc & (0x1F << 5)) == 0 ||
1574 // Can't have surrogates here.
1575 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
1576 // we are expecting to see trailing bytes like 10vvvvvv
1577 (ch & 0xC0) != 0x80)
1578 {
1579 goto BadLongCode;
1580 }
1581 pSrc++;
1582
1583 // extra byte
1584 charCount--;
1585 }
1586 }
1587 else {
1588 // 2 byte encoding
1589
1590 // check for non-shortest form
1591 if ((ch & 0x1E) == 0) {
1592 goto BadLongCode;
1593 }
1594 }
1595
1596 // extra byte
1597 charCount--;
1598 }
1599#endif // FASTLOOP
1600
1601 // no pending bits at this point
1602 ch = 0;
1603 continue;
1604
1605 BadLongCode:
1606 pSrc -= 2;
1607 ch = 0;
1608 continue;
1609 }
1610
1611 // May have a problem if we have to flush
1612 if (ch != 0)
1613 {
1614 // We were already adjusting for these, so need to unadjust
1615 charCount += (ch >> 30);
1616 // Have to do fallback for invalid bytes
1617 if (fallback == nullptr)
1618 {
1619 fallback = decoderFallback->CreateFallbackBuffer();
1620 fallback->InternalInitialize(bytes, nullptr);
1621 }
1622 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1623 }
1624
1625 // Shouldn't have anything in fallback buffer for GetCharCount
1626 // (don't have to check m_throwOnOverflow for count)
1627 Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
1628 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1629
1630 InternalDelete(fallback);
1631
1632 return charCount;
1633
1634 }
1635
1636 int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount)
1637 {
1638 Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr");
1639 Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0");
1640 Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
1641 Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr");
1642
1643 BYTE *pSrc = bytes;
1644 WCHAR *pTarget = chars;
1645
1646 BYTE *pEnd = pSrc + byteCount;
1647 WCHAR *pAllocatedBufferEnd = pTarget + charCount;
1648
1649 int ch = 0;
1650
1651 DecoderFallbackBuffer *fallback = nullptr;
1652
1653 for (;;)
1654 {
1655 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1656
1657 if (pSrc >= pEnd) {
1658 break;
1659 }
1660
1661 // read next byte. The JIT optimization seems to be getting confused when
1662 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1663 int cha = *pSrc;
1664
1665 if (ch == 0) {
1666 // no pending bits
1667 goto ReadChar;
1668 }
1669
1670 pSrc++;
1671
1672 // we are expecting to see trailing bytes like 10vvvvvv
1673 if ((cha & 0xC0) != 0x80) {
1674 // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1675 // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1676 pSrc--;
1677 goto InvalidByteSequence;
1678 }
1679
1680 // fold in the new byte
1681 ch = (ch << 6) | (cha & 0x3F);
1682
1683 if ((ch & FinalByte) == 0) {
1684 // Not at last byte yet
1685 Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1686 "[UTF8Encoding.GetChars]Invariant volation");
1687
1688 if ((ch & SupplimentarySeq) != 0) {
1689 // Its a 4-byte supplimentary sequence
1690 if ((ch & (FinalByte >> 6)) != 0) {
1691 // this is 3rd byte of 4 byte sequence - nothing to do
1692 continue;
1693 }
1694
1695 // 2nd byte of 4 bytes
1696 // check for non-shortest form of surrogate and the valid surrogate
1697 // range 0x000000 - 0x10FFFF at the same time
1698 if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
1699 goto InvalidByteSequence;
1700 }
1701 }
1702 else {
1703 // Must be 2nd byte of a 3-byte sequence
1704 // check for non-shortest form of 3 byte seq
1705 if ((ch & (0x1F << 5)) == 0 || // non-shortest form
1706 (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
1707 {
1708 goto InvalidByteSequence;
1709 }
1710 }
1711 continue;
1712 }
1713
1714 // ready to punch
1715
1716 // surrogate in shortest form?
1717 // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1718 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
1719 // let the range check for the second char throw the exception
1720 if (pTarget < pAllocatedBufferEnd) {
1721 *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
1722 (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))));
1723 pTarget++;
1724
1725 ch = (ch & 0x3FF) +
1726 (int)(CharUnicodeInfo::LOW_SURROGATE_START);
1727 }
1728 }
1729
1730 goto EncodeChar;
1731
1732 InvalidByteSequence:
1733 // this code fragment should be close to the gotos referencing it
1734 // Have to do fallback for invalid bytes
1735 if (fallback == nullptr)
1736 {
1737 fallback = decoderFallback->CreateFallbackBuffer();
1738 fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
1739 }
1740
1741 // That'll back us up the appropriate # of bytes if we didn't get anywhere
1742 if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
1743 {
1744 // Ran out of buffer space
1745 // Need to throw an exception?
1746 Contract::Assert(pSrc >= bytes || pTarget == chars,
1747 "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1748 fallback->InternalReset();
1749 ThrowCharsOverflow(pTarget == chars);
1750 ch = 0;
1751 break;
1752 }
1753 Contract::Assert(pSrc >= bytes,
1754 "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1755 ch = 0;
1756 continue;
1757
1758 ReadChar:
1759 ch = *pSrc;
1760 pSrc++;
1761
1762 ProcessChar:
1763 if (ch > 0x7F) {
1764 // If its > 0x7F, its start of a new multi-byte sequence
1765
1766 // bit 6 has to be non-zero
1767 if ((ch & 0x40) == 0) {
1768 goto InvalidByteSequence;
1769 }
1770
1771 // start a new long code
1772 if ((ch & 0x20) != 0) {
1773 if ((ch & 0x10) != 0) {
1774 // 4 byte encoding - supplimentary character (2 surrogates)
1775
1776 ch &= 0x0F;
1777
1778 // check that bit 4 is zero and the valid supplimentary character
1779 // range 0x000000 - 0x10FFFF at the same time
1780 if (ch > 0x04) {
1781 ch |= 0xf0;
1782 goto InvalidByteSequence;
1783 }
1784
1785 ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
1786 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1787 (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
1788 }
1789 else {
1790 // 3 byte encoding
1791 ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
1792 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
1793 }
1794 }
1795 else {
1796 // 2 byte encoding
1797
1798 ch &= 0x1F;
1799
1800 // check for non-shortest form
1801 if (ch <= 1) {
1802 ch |= 0xc0;
1803 goto InvalidByteSequence;
1804 }
1805
1806 ch |= (FinalByte >> 6);
1807 }
1808 continue;
1809 }
1810
1811 EncodeChar:
1812 // write the pending character
1813 if (pTarget >= pAllocatedBufferEnd)
1814 {
1815 // Fix chars so we make sure to throw if we didn't output anything
1816 ch &= 0x1fffff;
1817 if (ch > 0x7f)
1818 {
1819 if (ch > 0x7ff)
1820 {
1821 if (ch >= CharUnicodeInfo::LOW_SURROGATE_START &&
1822 ch <= CharUnicodeInfo::LOW_SURROGATE_END)
1823 {
1824 pSrc--; // It was 4 bytes
1825 pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
1826 }
1827 else if (ch > 0xffff)
1828 {
1829 pSrc--; // It was 4 bytes, nothing was stored
1830 }
1831 pSrc--; // It was at least 3 bytes
1832 }
1833 pSrc--; // It was at least 2 bytes
1834 }
1835 pSrc--;
1836
1837 // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1838 // a 4 byte sequence alredy)
1839 Contract::Assert(pSrc >= bytes || pTarget == chars,
1840 "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1841 ThrowCharsOverflow(pTarget == chars);
1842
1843 // Don't store ch in decoder, we already backed up to its start
1844 ch = 0;
1845
1846 // Didn't throw, just use this buffer size.
1847 break;
1848 }
1849 *pTarget = (WCHAR)ch;
1850 pTarget++;
1851
1852#ifdef FASTLOOP
1853 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1854 int availableBytes = PtrDiff(pEnd, pSrc);
1855
1856 // don't fall into the fast decoding loop if we don't have enough bytes
1857 // Test for availableChars is done because pStop would be <= pTarget.
1858 if (availableBytes <= 13) {
1859 // we may need as many as 1 character per byte
1860 if (availableChars < availableBytes) {
1861 // not enough output room. no pending bits at this point
1862 ch = 0;
1863 continue;
1864 }
1865
1866 // try to get over the remainder of the ascii characters fast though
1867 BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1868 while (pSrc < pLocalEnd) {
1869 ch = *pSrc;
1870 pSrc++;
1871
1872 if (ch > 0x7F)
1873 goto ProcessChar;
1874
1875 *pTarget = (WCHAR)ch;
1876 pTarget++;
1877 }
1878 // we are done
1879 ch = 0;
1880 break;
1881 }
1882
1883 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
1884 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
1885 if (availableChars < availableBytes) {
1886 availableBytes = availableChars;
1887 }
1888
1889 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1890 // the boundary will be decreased for every non-ASCII character we encounter
1891 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1892 WCHAR *pStop = pTarget + availableBytes - 7;
1893
1894 while (pTarget < pStop) {
1895 ch = *pSrc;
1896 pSrc++;
1897
1898 if (ch > 0x7F) {
1899 goto LongCode;
1900 }
1901 *pTarget = (WCHAR)ch;
1902 pTarget++;
1903
1904 // get pSrc to be 2-byte aligned
1905 if ((((int)pSrc) & 0x1) != 0) {
1906 ch = *pSrc;
1907 pSrc++;
1908 if (ch > 0x7F) {
1909 goto LongCode;
1910 }
1911 *pTarget = (WCHAR)ch;
1912 pTarget++;
1913 }
1914
1915 // get pSrc to be 4-byte aligned
1916 if ((((int)pSrc) & 0x2) != 0) {
1917 ch = *(USHORT*)pSrc;
1918 if ((ch & 0x8080) != 0) {
1919 goto LongCodeWithMask16;
1920 }
1921
1922 // Unfortunately, this is endianess sensitive
1923#if BIGENDIAN
1924 *pTarget = (WCHAR)((ch >> 8) & 0x7F);
1925 pSrc += 2;
1926 *(pTarget + 1) = (WCHAR)(ch & 0x7F);
1927 pTarget += 2;
1928#else // BIGENDIAN
1929 *pTarget = (WCHAR)(ch & 0x7F);
1930 pSrc += 2;
1931 *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
1932 pTarget += 2;
1933#endif // BIGENDIAN
1934 }
1935
1936 // Run 8 characters at a time!
1937 while (pTarget < pStop) {
1938 ch = *(int*)pSrc;
1939 int chb = *(int*)(pSrc + 4);
1940 if (((ch | chb) & (int)0x80808080) != 0) {
1941 goto LongCodeWithMask32;
1942 }
1943
1944 // Unfortunately, this is endianess sensitive
1945#if BIGENDIAN
1946 *pTarget = (WCHAR)((ch >> 24) & 0x7F);
1947 *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F);
1948 *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F);
1949 *(pTarget + 3) = (WCHAR)(ch & 0x7F);
1950 pSrc += 8;
1951 *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F);
1952 *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F);
1953 *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F);
1954 *(pTarget + 7) = (WCHAR)(chb & 0x7F);
1955 pTarget += 8;
1956#else // BIGENDIAN
1957 *pTarget = (WCHAR)(ch & 0x7F);
1958 *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
1959 *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F);
1960 *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F);
1961 pSrc += 8;
1962 *(pTarget + 4) = (WCHAR)(chb & 0x7F);
1963 *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F);
1964 *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F);
1965 *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F);
1966 pTarget += 8;
1967#endif // BIGENDIAN
1968 }
1969 break;
1970
1971#if BIGENDIAN
1972 LongCodeWithMask32 :
1973 // be careful about the sign extension
1974 ch = (int)(((uint)ch) >> 16);
1975 LongCodeWithMask16:
1976 ch = (int)(((uint)ch) >> 8);
1977#else // BIGENDIAN
1978 LongCodeWithMask32:
1979 LongCodeWithMask16:
1980 ch &= 0xFF;
1981#endif // BIGENDIAN
1982 pSrc++;
1983 if (ch <= 0x7F) {
1984 *pTarget = (WCHAR)ch;
1985 pTarget++;
1986 continue;
1987 }
1988
1989 LongCode:
1990 int chc = *pSrc;
1991 pSrc++;
1992
1993 if (
1994 // bit 6 has to be zero
1995 (ch & 0x40) == 0 ||
1996 // we are expecting to see trailing bytes like 10vvvvvv
1997 (chc & 0xC0) != 0x80)
1998 {
1999 goto BadLongCode;
2000 }
2001
2002 chc &= 0x3F;
2003
2004 // start a new long code
2005 if ((ch & 0x20) != 0) {
2006
2007 // fold the first two bytes together
2008 chc |= (ch & 0x0F) << 6;
2009
2010 if ((ch & 0x10) != 0) {
2011 // 4 byte encoding - surrogate
2012 ch = *pSrc;
2013 if (
2014 // check that bit 4 is zero, the non-shortest form of surrogate
2015 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2016 !InRange(chc >> 4, 0x01, 0x10) ||
2017 // we are expecting to see trailing bytes like 10vvvvvv
2018 (ch & 0xC0) != 0x80)
2019 {
2020 goto BadLongCode;
2021 }
2022
2023 chc = (chc << 6) | (ch & 0x3F);
2024
2025 ch = *(pSrc + 1);
2026 // we are expecting to see trailing bytes like 10vvvvvv
2027 if ((ch & 0xC0) != 0x80) {
2028 goto BadLongCode;
2029 }
2030 pSrc += 2;
2031
2032 ch = (chc << 6) | (ch & 0x3F);
2033
2034 *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
2035 (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)));
2036 pTarget++;
2037
2038 ch = (ch & 0x3FF) +
2039 (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START);
2040
2041 // extra byte, we're already planning 2 chars for 2 of these bytes,
2042 // but the big loop is testing the target against pStop, so we need
2043 // to subtract 2 more or we risk overrunning the input. Subtract
2044 // one here and one below.
2045 pStop--;
2046 }
2047 else {
2048 // 3 byte encoding
2049 ch = *pSrc;
2050 if (
2051 // check for non-shortest form of 3 byte seq
2052 (chc & (0x1F << 5)) == 0 ||
2053 // Can't have surrogates here.
2054 (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
2055 // we are expecting to see trailing bytes like 10vvvvvv
2056 (ch & 0xC0) != 0x80)
2057 {
2058 goto BadLongCode;
2059 }
2060 pSrc++;
2061
2062 ch = (chc << 6) | (ch & 0x3F);
2063
2064 // extra byte, we're only expecting 1 char for each of these 3 bytes,
2065 // but the loop is testing the target (not source) against pStop, so
2066 // we need to subtract 2 more or we risk overrunning the input.
2067 // Subtract 1 here and one more below
2068 pStop--;
2069 }
2070 }
2071 else {
2072 // 2 byte encoding
2073
2074 ch &= 0x1F;
2075
2076 // check for non-shortest form
2077 if (ch <= 1) {
2078 goto BadLongCode;
2079 }
2080 ch = (ch << 6) | chc;
2081 }
2082
2083 *pTarget = (WCHAR)ch;
2084 pTarget++;
2085
2086 // extra byte, we're only expecting 1 char for each of these 2 bytes,
2087 // but the loop is testing the target (not source) against pStop.
2088 // subtract an extra count from pStop so that we don't overrun the input.
2089 pStop--;
2090 }
2091#endif // FASTLOOP
2092
2093 Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2094
2095 // no pending bits at this point
2096 ch = 0;
2097 continue;
2098
2099 BadLongCode:
2100 pSrc -= 2;
2101 ch = 0;
2102 continue;
2103 }
2104
2105 if (ch != 0)
2106 {
2107 // Have to do fallback for invalid bytes
2108 if (fallback == nullptr)
2109 {
2110 fallback = decoderFallback->CreateFallbackBuffer();
2111 fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
2112 }
2113
2114 // This'll back us up the appropriate # of bytes if we didn't get anywhere
2115 if (!FallbackInvalidByteSequence(pSrc, ch, fallback))
2116 {
2117 Contract::Assert(pSrc >= bytes || pTarget == chars,
2118 "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2119
2120 // Ran out of buffer space
2121 // Need to throw an exception?
2122 fallback->InternalReset();
2123 ThrowCharsOverflow(pTarget == chars);
2124 }
2125 Contract::Assert(pSrc >= bytes,
2126 "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2127 ch = 0;
2128 }
2129
2130 // Shouldn't have anything in fallback buffer for GetChars
2131 // (don't have to check m_throwOnOverflow for chars)
2132 Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
2133 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2134
2135 InternalDelete(fallback);
2136
2137 return PtrDiff(pTarget, chars);
2138 }
2139
2140 int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount)
2141 {
2142 Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr");
2143 Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
2144 Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
2145 Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr");
2146
2147 // For fallback we may need a fallback buffer.
2148 // We wait to initialize it though in case we don't have any broken input unicode
2149 EncoderFallbackBuffer* fallbackBuffer = nullptr;
2150 WCHAR *pSrc = chars;
2151 BYTE *pTarget = bytes;
2152
2153 WCHAR *pEnd = pSrc + charCount;
2154 BYTE *pAllocatedBufferEnd = pTarget + byteCount;
2155
2156 int ch = 0;
2157
2158 // assume that JIT will enregister pSrc, pTarget and ch
2159
2160 for (;;) {
2161 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
2162
2163 if (pSrc >= pEnd) {
2164
2165 if (ch == 0) {
2166 // Check if there's anything left to get out of the fallback buffer
2167 ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
2168 if (ch > 0) {
2169 goto ProcessChar;
2170 }
2171 }
2172 else {
2173 // Case of leftover surrogates in the fallback buffer
2174 if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
2175 Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
2176 "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2177
2178 int cha = ch;
2179
2180 ch = fallbackBuffer->InternalGetNextChar();
2181
2182 if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2183 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
2184 goto EncodeChar;
2185 }
2186 else if (ch > 0){
2187 goto ProcessChar;
2188 }
2189 else {
2190 break;
2191 }
2192 }
2193 }
2194
2195 // attempt to encode the partial surrogate (will fail or ignore)
2196 if (ch > 0)
2197 goto EncodeChar;
2198
2199 // We're done
2200 break;
2201 }
2202
2203 if (ch > 0) {
2204 // We have a high surrogate left over from a previous loop.
2205 Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
2206 "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2207
2208 // use separate helper variables for local contexts so that the jit optimizations
2209 // won't get confused about the variable lifetimes
2210 int cha = *pSrc;
2211
2212 // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
2213 // if (IsLowSurrogate(cha)) {
2214 if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2215 ch = cha + (ch << 10) +
2216 (0x10000
2217 - CharUnicodeInfo::LOW_SURROGATE_START
2218 - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
2219
2220 pSrc++;
2221 }
2222 // else ch is still high surrogate and encoding will fail
2223
2224 // attempt to encode the surrogate or partial surrogate
2225 goto EncodeChar;
2226 }
2227
2228 // If we've used a fallback, then we have to check for it
2229 if (fallbackBuffer != nullptr)
2230 {
2231 ch = fallbackBuffer->InternalGetNextChar();
2232 if (ch > 0) goto ProcessChar;
2233 }
2234
2235 // read next char. The JIT optimization seems to be getting confused when
2236 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
2237 ch = *pSrc;
2238 pSrc++;
2239
2240 ProcessChar:
2241 if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
2242 continue;
2243 }
2244 // either good char or partial surrogate
2245
2246 EncodeChar:
2247 // throw exception on partial surrogate if necessary
2248 if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
2249 {
2250 // Lone surrogates aren't allowed, we have to do fallback for them
2251 // Have to make a fallback buffer if we don't have one
2252 if (fallbackBuffer == nullptr)
2253 {
2254 // wait on fallbacks if we can
2255 // For fallback we may need a fallback buffer
2256 fallbackBuffer = encoderFallback->CreateFallbackBuffer();
2257
2258 // Set our internal fallback interesting things.
2259 fallbackBuffer->InternalInitialize(chars, pEnd, true);
2260 }
2261
2262 // Do our fallback. Actually we already know its a mixed up surrogate,
2263 // so the ref pSrc isn't gonna do anything.
2264 fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
2265
2266 // Ignore it if we don't throw
2267 ch = 0;
2268 continue;
2269 }
2270
2271 // Count bytes needed
2272 int bytesNeeded = 1;
2273 if (ch > 0x7F) {
2274 if (ch > 0x7FF) {
2275 if (ch > 0xFFFF) {
2276 bytesNeeded++; // 4 bytes (surrogate pair)
2277 }
2278 bytesNeeded++; // 3 bytes (800-FFFF)
2279 }
2280 bytesNeeded++; // 2 bytes (80-7FF)
2281 }
2282
2283 if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
2284 // Left over surrogate from last time will cause pSrc == chars, so we'll throw
2285 if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack)
2286 {
2287 fallbackBuffer->MovePrevious(); // Didn't use this fallback char
2288 if (ch > 0xFFFF)
2289 fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either
2290 }
2291 else
2292 {
2293 pSrc--; // Didn't use this char
2294 if (ch > 0xFFFF)
2295 pSrc--; // Was surrogate, didn't use 2nd part either
2296 }
2297 Contract::Assert(pSrc >= chars || pTarget == bytes,
2298 "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
2299 ThrowBytesOverflow(pTarget == bytes); // Throw if we must
2300 ch = 0; // Nothing left over (we backed up to start of pair if supplimentary)
2301 break;
2302 }
2303
2304 if (ch <= 0x7F) {
2305 *pTarget = (BYTE)ch;
2306 }
2307 else {
2308 // use separate helper variables for local contexts so that the jit optimizations
2309 // won't get confused about the variable lifetimes
2310 int chb;
2311 if (ch <= 0x7FF) {
2312 // 2 BYTE encoding
2313 chb = (BYTE)(0xC0 | (ch >> 6));
2314 }
2315 else
2316 {
2317 if (ch <= 0xFFFF) {
2318 chb = (BYTE)(0xE0 | (ch >> 12));
2319 }
2320 else
2321 {
2322 *pTarget = (BYTE)(0xF0 | (ch >> 18));
2323 pTarget++;
2324
2325 chb = 0x80 | ((ch >> 12) & 0x3F);
2326 }
2327 *pTarget = (BYTE)chb;
2328 pTarget++;
2329
2330 chb = 0x80 | ((ch >> 6) & 0x3F);
2331 }
2332 *pTarget = (BYTE)chb;
2333 pTarget++;
2334
2335 *pTarget = (BYTE)0x80 | (ch & 0x3F);
2336 }
2337 pTarget++;
2338
2339
2340#ifdef FASTLOOP
2341 // If still have fallback don't do fast loop
2342 if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
2343 goto ProcessChar;
2344
2345 int availableChars = PtrDiff(pEnd, pSrc);
2346 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
2347
2348 // don't fall into the fast decoding loop if we don't have enough characters
2349 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
2350 if (availableChars <= 13) {
2351 // we are hoping for 1 BYTE per char
2352 if (availableBytes < availableChars) {
2353 // not enough output room. no pending bits at this point
2354 ch = 0;
2355 continue;
2356 }
2357
2358 // try to get over the remainder of the ascii characters fast though
2359 WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2360 while (pSrc < pLocalEnd) {
2361 ch = *pSrc;
2362 pSrc++;
2363
2364 // Not ASCII, need more than 1 BYTE per char
2365 if (ch > 0x7F)
2366 goto ProcessChar;
2367
2368 *pTarget = (BYTE)ch;
2369 pTarget++;
2370 }
2371 // we are done, let ch be 0 to clear encoder
2372 ch = 0;
2373 break;
2374 }
2375
2376 // we need at least 1 BYTE per character, but Convert might allow us to convert
2377 // only part of the input, so try as much as we can. Reduce charCount if necessary
2378 if (availableBytes < availableChars)
2379 {
2380 availableChars = availableBytes;
2381 }
2382
2383 // FASTLOOP:
2384 // - optimistic range checks
2385 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
2386
2387 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2388 // the boundary will be decreased for every non-ASCII character we encounter
2389 // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
2390 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
2391 WCHAR *pStop = pSrc + availableChars - 5;
2392
2393 while (pSrc < pStop) {
2394 ch = *pSrc;
2395 pSrc++;
2396
2397 if (ch > 0x7F) {
2398 goto LongCode;
2399 }
2400 *pTarget = (BYTE)ch;
2401 pTarget++;
2402
2403 // get pSrc aligned
2404 if (((size_t)pSrc & 0x2) != 0) {
2405 ch = *pSrc;
2406 pSrc++;
2407 if (ch > 0x7F) {
2408 goto LongCode;
2409 }
2410 *pTarget = (BYTE)ch;
2411 pTarget++;
2412 }
2413
2414 // Run 4 characters at a time!
2415 while (pSrc < pStop) {
2416 ch = *(int*)pSrc;
2417 int chc = *(int*)(pSrc + 2);
2418 if (((ch | chc) & (int)0xFF80FF80) != 0) {
2419 goto LongCodeWithMask;
2420 }
2421
2422 // Unfortunately, this is endianess sensitive
2423#if BIGENDIAN
2424 *pTarget = (BYTE)(ch >> 16);
2425 *(pTarget + 1) = (BYTE)ch;
2426 pSrc += 4;
2427 *(pTarget + 2) = (BYTE)(chc >> 16);
2428 *(pTarget + 3) = (BYTE)chc;
2429 pTarget += 4;
2430#else // BIGENDIAN
2431 *pTarget = (BYTE)ch;
2432 *(pTarget + 1) = (BYTE)(ch >> 16);
2433 pSrc += 4;
2434 *(pTarget + 2) = (BYTE)chc;
2435 *(pTarget + 3) = (BYTE)(chc >> 16);
2436 pTarget += 4;
2437#endif // BIGENDIAN
2438 }
2439 continue;
2440
2441 LongCodeWithMask:
2442#if BIGENDIAN
2443 // be careful about the sign extension
2444 ch = (int)(((uint)ch) >> 16);
2445#else // BIGENDIAN
2446 ch = (WCHAR)ch;
2447#endif // BIGENDIAN
2448 pSrc++;
2449
2450 if (ch > 0x7F) {
2451 goto LongCode;
2452 }
2453 *pTarget = (BYTE)ch;
2454 pTarget++;
2455 continue;
2456
2457 LongCode:
2458 // use separate helper variables for slow and fast loop so that the jit optimizations
2459 // won't get confused about the variable lifetimes
2460 int chd;
2461 if (ch <= 0x7FF) {
2462 // 2 BYTE encoding
2463 chd = 0xC0 | (ch >> 6);
2464 }
2465 else {
2466 if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2467 // 3 BYTE encoding
2468 chd = 0xE0 | (ch >> 12);
2469 }
2470 else
2471 {
2472 // 4 BYTE encoding - high surrogate + low surrogate
2473 if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) {
2474 // low without high -> bad, try again in slow loop
2475 pSrc -= 1;
2476 break;
2477 }
2478
2479 chd = *pSrc;
2480 pSrc++;
2481
2482 // if (!IsLowSurrogate(chd)) {
2483 if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2484 // high not followed by low -> bad, try again in slow loop
2485 pSrc -= 2;
2486 break;
2487 }
2488
2489 ch = chd + (ch << 10) +
2490 (0x10000
2491 - CharUnicodeInfo::LOW_SURROGATE_START
2492 - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
2493
2494 *pTarget = (BYTE)(0xF0 | (ch >> 18));
2495 // pStop - this BYTE is compensated by the second surrogate character
2496 // 2 input chars require 4 output bytes. 2 have been anticipated already
2497 // and 2 more will be accounted for by the 2 pStop-- calls below.
2498 pTarget++;
2499
2500 chd = 0x80 | ((ch >> 12) & 0x3F);
2501 }
2502 *pTarget = (BYTE)chd;
2503 pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too.
2504 pTarget++;
2505
2506 chd = 0x80 | ((ch >> 6) & 0x3F);
2507 }
2508 *pTarget = (BYTE)chd;
2509 pStop--; // 2 BYTE sequence for 1 char so need pStop--.
2510 pTarget++;
2511
2512 *pTarget = (BYTE)(0x80 | (ch & 0x3F));
2513 // pStop - this BYTE is already included
2514 pTarget++;
2515 }
2516
2517 Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
2518
2519#endif // FASTLOOP
2520
2521 // no pending char at this point
2522 ch = 0;
2523 }
2524
2525 InternalDelete(fallbackBuffer);
2526
2527 return (int)(pTarget - bytes);
2528 }
2529
2530 int GetByteCount(WCHAR *chars, int count)
2531 {
2532 // For fallback we may need a fallback buffer.
2533 // We wait to initialize it though in case we don't have any broken input unicode
2534 EncoderFallbackBuffer* fallbackBuffer = nullptr;
2535 WCHAR *pSrc = chars;
2536 WCHAR *pEnd = pSrc + count;
2537
2538 // Start by assuming we have as many as count
2539 int byteCount = count;
2540
2541 int ch = 0;
2542
2543 for (;;) {
2544 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
2545 if (pSrc >= pEnd) {
2546
2547 if (ch == 0) {
2548 // Unroll any fallback that happens at the end
2549 ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
2550 if (ch > 0) {
2551 byteCount++;
2552 goto ProcessChar;
2553 }
2554 }
2555 else {
2556 // Case of surrogates in the fallback.
2557 if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
2558 Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
2559 "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2560
2561 ch = fallbackBuffer->InternalGetNextChar();
2562 byteCount++;
2563
2564 if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2565 ch = 0xfffd;
2566 byteCount++;
2567 goto EncodeChar;
2568 }
2569 else if (ch > 0){
2570 goto ProcessChar;
2571 }
2572 else {
2573 byteCount--; // ignore last one.
2574 break;
2575 }
2576 }
2577 }
2578
2579 if (ch <= 0) {
2580 break;
2581 }
2582
2583 // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
2584 byteCount++;
2585 goto EncodeChar;
2586 }
2587
2588 if (ch > 0) {
2589 Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
2590 "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2591
2592 // use separate helper variables for local contexts so that the jit optimizations
2593 // won't get confused about the variable lifetimes
2594 int cha = *pSrc;
2595
2596 // count the pending surrogate
2597 byteCount++;
2598
2599 // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
2600 // if (IsLowSurrogate(cha)) {
2601 if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2602 // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
2603 ch = 0xfffd;
2604 // ch = cha + (ch << 10) +
2605 // (0x10000
2606 // - CharUnicodeInfo::LOW_SURROGATE_START
2607 // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) );
2608
2609 // Use this next char
2610 pSrc++;
2611 }
2612 // else ch is still high surrogate and encoding will fail (so don't add count)
2613
2614 // attempt to encode the surrogate or partial surrogate
2615 goto EncodeChar;
2616 }
2617
2618 // If we've used a fallback, then we have to check for it
2619 if (fallbackBuffer != nullptr)
2620 {
2621 ch = fallbackBuffer->InternalGetNextChar();
2622 if (ch > 0)
2623 {
2624 // We have an extra byte we weren't expecting.
2625 byteCount++;
2626 goto ProcessChar;
2627 }
2628 }
2629
2630 // read next char. The JIT optimization seems to be getting confused when
2631 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
2632 ch = *pSrc;
2633 pSrc++;
2634
2635 ProcessChar:
2636 if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
2637 // we will count this surrogate next time around
2638 byteCount--;
2639 continue;
2640 }
2641 // either good char or partial surrogate
2642
2643 EncodeChar:
2644 // throw exception on partial surrogate if necessary
2645 if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
2646 {
2647 // Lone surrogates aren't allowed
2648 // Have to make a fallback buffer if we don't have one
2649 if (fallbackBuffer == nullptr)
2650 {
2651 // wait on fallbacks if we can
2652 // For fallback we may need a fallback buffer
2653 fallbackBuffer = encoderFallback->CreateFallbackBuffer();
2654
2655 // Set our internal fallback interesting things.
2656 fallbackBuffer->InternalInitialize(chars, chars + count, false);
2657 }
2658
2659 // Do our fallback. Actually we already know its a mixed up surrogate,
2660 // so the ref pSrc isn't gonna do anything.
2661 fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
2662
2663 // Ignore it if we don't throw (we had preallocated this ch)
2664 byteCount--;
2665 ch = 0;
2666 continue;
2667 }
2668
2669 // Count them
2670 if (ch > 0x7F) {
2671 if (ch > 0x7FF) {
2672 // the extra surrogate byte was compensated by the second surrogate character
2673 // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
2674 byteCount++;
2675 }
2676 byteCount++;
2677 }
2678
2679#if WIN64
2680 // check for overflow
2681 if (byteCount < 0) {
2682 break;
2683 }
2684#endif
2685
2686#ifdef FASTLOOP
2687 // If still have fallback don't do fast loop
2688 if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
2689 {
2690 // We're reserving 1 byte for each char by default
2691 byteCount++;
2692 goto ProcessChar;
2693 }
2694
2695 int availableChars = PtrDiff(pEnd, pSrc);
2696
2697 // don't fall into the fast decoding loop if we don't have enough characters
2698 if (availableChars <= 13) {
2699 // try to get over the remainder of the ascii characters fast though
2700 WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2701 while (pSrc < pLocalEnd) {
2702 ch = *pSrc;
2703 pSrc++;
2704 if (ch > 0x7F)
2705 goto ProcessChar;
2706 }
2707
2708 // we are done
2709 break;
2710 }
2711
2712#if WIN64
2713 // make sure that we won't get a silent overflow inside the fast loop
2714 // (Fall out to slow loop if we have this many characters)
2715 availableChars &= 0x0FFFFFFF;
2716#endif
2717
2718 // To compute the upper bound, assume that all characters are ASCII characters at this point,
2719 // the boundary will be decreased for every non-ASCII character we encounter
2720 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
2721 WCHAR *pStop = pSrc + availableChars - (3 + 4);
2722
2723 while (pSrc < pStop) {
2724 ch = *pSrc;
2725 pSrc++;
2726
2727 if (ch > 0x7F) // Not ASCII
2728 {
2729 if (ch > 0x7FF) // Not 2 Byte
2730 {
2731 if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
2732 goto LongCode;
2733 byteCount++;
2734 }
2735 byteCount++;
2736 }
2737
2738 // get pSrc aligned
2739 if (((int)pSrc & 0x2) != 0) {
2740 ch = *pSrc;
2741 pSrc++;
2742 if (ch > 0x7F) // Not ASCII
2743 {
2744 if (ch > 0x7FF) // Not 2 Byte
2745 {
2746 if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
2747 goto LongCode;
2748 byteCount++;
2749 }
2750 byteCount++;
2751 }
2752 }
2753
2754 // Run 2 * 4 characters at a time!
2755 while (pSrc < pStop) {
2756 ch = *(int*)pSrc;
2757 int chc = *(int*)(pSrc + 2);
2758 if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
2759 {
2760 if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
2761 {
2762 goto LongCodeWithMask;
2763 }
2764
2765
2766 if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits)
2767 byteCount++;
2768 if ((ch & (int)0xFF80) != 0)
2769 byteCount++;
2770 if ((chc & (int)0xFF800000) != 0)
2771 byteCount++;
2772 if ((chc & (int)0xFF80) != 0)
2773 byteCount++;
2774 }
2775 pSrc += 4;
2776
2777 ch = *(int*)pSrc;
2778 chc = *(int*)(pSrc + 2);
2779 if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
2780 {
2781 if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
2782 {
2783 goto LongCodeWithMask;
2784 }
2785
2786 if ((ch & (int)0xFF800000) != 0)
2787 byteCount++;
2788 if ((ch & (int)0xFF80) != 0)
2789 byteCount++;
2790 if ((chc & (int)0xFF800000) != 0)
2791 byteCount++;
2792 if ((chc & (int)0xFF80) != 0)
2793 byteCount++;
2794 }
2795 pSrc += 4;
2796 }
2797 break;
2798
2799 LongCodeWithMask:
2800#if BIGENDIAN
2801 // be careful about the sign extension
2802 ch = (int)(((uint)ch) >> 16);
2803#else // BIGENDIAN
2804 ch = (WCHAR)ch;
2805#endif // BIGENDIAN
2806 pSrc++;
2807
2808 if (ch <= 0x7F) {
2809 continue;
2810 }
2811
2812 LongCode:
2813 // use separate helper variables for slow and fast loop so that the jit optimizations
2814 // won't get confused about the variable lifetimes
2815 if (ch > 0x7FF) {
2816 if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2817 // 4 byte encoding - high surrogate + low surrogate
2818
2819 int chd = *pSrc;
2820 if (
2821 ch > CharUnicodeInfo::HIGH_SURROGATE_END ||
2822 !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
2823 {
2824 // Back up and drop out to slow loop to figure out error
2825 pSrc--;
2826 break;
2827 }
2828 pSrc++;
2829
2830 // byteCount - this byte is compensated by the second surrogate character
2831 }
2832 byteCount++;
2833 }
2834 byteCount++;
2835
2836 // byteCount - the last byte is already included
2837 }
2838#endif // FASTLOOP
2839
2840 // no pending char at this point
2841 ch = 0;
2842 }
2843
2844#if WIN64
2845 // check for overflow
2846 if (byteCount < 0) {
2847 throw ArgumentException("Conversion buffer overflow.");
2848 }
2849#endif
2850
2851 Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0,
2852 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
2853
2854 InternalDelete(fallbackBuffer);
2855
2856 return byteCount;
2857 }
2858
2859};
2860
2861
2862////////////////////////////////////////////////////////////////////////////
2863//
2864// UTF8ToUnicode
2865//
2866// Maps a UTF-8 character string to its wide character string counterpart.
2867//
2868////////////////////////////////////////////////////////////////////////////
2869
2870int UTF8ToUnicode(
2871 LPCSTR lpSrcStr,
2872 int cchSrc,
2873 LPWSTR lpDestStr,
2874 int cchDest,
2875 DWORD dwFlags
2876 )
2877{
2878 int ret;
2879 UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS);
2880 try {
2881 ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc);
2882 if (cchDest){
2883 if (ret > cchDest){
2884 SetLastError(ERROR_INSUFFICIENT_BUFFER);
2885 ret = 0;
2886 }
2887 enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret);
2888 }
2889 }
2890 catch (const InsufficientBufferException& e){
2891 SetLastError(ERROR_INSUFFICIENT_BUFFER);
2892 return 0;
2893 }
2894 catch (const DecoderFallbackException& e){
2895 SetLastError(ERROR_NO_UNICODE_TRANSLATION);
2896 return 0;
2897 }
2898 catch (const ArgumentException& e){
2899 SetLastError(ERROR_INVALID_PARAMETER);
2900 return 0;
2901 }
2902 return ret;
2903}
2904
2905////////////////////////////////////////////////////////////////////////////
2906//
2907// UnicodeToUTF8
2908//
2909// Maps a Unicode character string to its UTF-8 string counterpart.
2910//
2911////////////////////////////////////////////////////////////////////////////
2912
2913int UnicodeToUTF8(
2914 LPCWSTR lpSrcStr,
2915 int cchSrc,
2916 LPSTR lpDestStr,
2917 int cchDest)
2918{
2919 int ret;
2920 UTF8Encoding enc(false);
2921 try{
2922 ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc);
2923 if (cchDest){
2924 if (ret > cchDest){
2925 SetLastError(ERROR_INSUFFICIENT_BUFFER);
2926 ret = 0;
2927 }
2928 enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret);
2929 }
2930 }
2931 catch (const InsufficientBufferException& e){
2932 SetLastError(ERROR_INSUFFICIENT_BUFFER);
2933 return 0;
2934 }
2935 catch (const EncoderFallbackException& e){
2936 SetLastError(ERROR_NO_UNICODE_TRANSLATION);
2937 return 0;
2938 }
2939 catch (const ArgumentException& e){
2940 SetLastError(ERROR_INVALID_PARAMETER);
2941 return 0;
2942 }
2943 return ret;
2944}
2945