utf8.cpp source code [CoreCLR/pal/src/locale/utf8.cpp]

1	// Licensed to the .NET Foundation under one or more agreements.
2	// The .NET Foundation licenses this file to you under the MIT license.
3	// See the LICENSE file in the project root for more information.
4
/*++



Module Name:

    unicode/utf8.c

Abstract:
    Functions to encode and decode UTF-8 strings. This is a port of the C# version from mscorlib.

Revision History:



--*/
21
22	#include "pal/utf8.h"
23	#include "pal/malloc.hpp"
24
25	using namespace CorUnix;
26
27	#define FASTLOOP
28
29	struct CharUnicodeInfo
30	{
31	static const WCHAR HIGH_SURROGATE_START = `0xd800`;
32	static const WCHAR HIGH_SURROGATE_END = `0xdbff`;
33	static const WCHAR LOW_SURROGATE_START = `0xdc00`;
34	static const WCHAR LOW_SURROGATE_END = `0xdfff`;
35	};
36
37	struct Char
38	{
39	// Test if the wide character is a high surrogate
40	static bool IsHighSurrogate(const WCHAR c)
41	{
42	return (c & `0xFC00`) == CharUnicodeInfo::HIGH_SURROGATE_START;
43	}
44
45	// Test if the wide character is a low surrogate
46	static bool IsLowSurrogate(const WCHAR c)
47	{
48	return (c & `0xFC00`) == CharUnicodeInfo::LOW_SURROGATE_START;
49	}
50
51	// Test if the wide character is a low surrogate
52	static bool IsSurrogate(const WCHAR c)
53	{
54	return (c & `0xF800`) == CharUnicodeInfo::HIGH_SURROGATE_START;
55	}
56
57	// Test if the wide character is a high surrogate
58	static bool IsHighSurrogate(const WCHAR* s, int index)
59	{
60	return IsHighSurrogate(s[index]);
61	}
62
63	// Test if the wide character is a low surrogate
64	static bool IsLowSurrogate(const WCHAR* s, int index)
65	{
66	return IsLowSurrogate(s[index]);
67	}
68
69	// Test if the wide character is a low surrogate
70	static bool IsSurrogate(const WCHAR* s, int index)
71	{
72	return IsSurrogate(s[index]);
73	}
74	};
75
76	class ArgumentException
77	{
78
79	public:
80	ArgumentException(LPCSTR message)
81	{
82	}
83
84	ArgumentException(LPCSTR message, LPCSTR argName)
85	{
86	}
87	};
88
89	class ArgumentNullException : public ArgumentException
90	{
91	public:
92	ArgumentNullException(LPCSTR argName)
93	: ArgumentException ("Argument is NULL", argName)
94	{
95
96	}
97	};
98
99	class ArgumentOutOfRangeException : public ArgumentException
100	{
101	public:
102	ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message)
103	: ArgumentException (message, argName)
104	{
105
106	}
107	};
108
109	class InsufficientBufferException : public ArgumentException
110	{
111	public:
112	InsufficientBufferException(LPCSTR message, LPCSTR argName)
113	: ArgumentException (message, argName)
114	{
115
116	}
117	};
118
119	class Contract
120	{
121	public:
122	static void Assert(bool cond, LPCSTR str)
123	{
124	if (!cond)
125	{
126	throw ArgumentException (str);
127	}
128	}
129
130	static void EndContractBlock()
131	{
132	}
133	};
134
135	class DecoderFallbackException : public ArgumentException
136	{
137	BYTE *bytesUnknown;
138	int index;
139
140	public:
141	DecoderFallbackException(
142	LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException (message)
143	{
144	this->bytesUnknown = bytesUnknown;
145	this->index = index;
146	}
147
148	BYTE *BytesUnknown()
149	{
150	return (bytesUnknown);
151	}
152
153	int GetIndex()
154	{
155	return index;
156	}
157	};
158
159	class DecoderFallbackBuffer;
160
161	class DecoderFallback
162	{
163	public:
164
165	// Fallback
166	//
167	// Return the appropriate unicode string alternative to the character that need to fall back.
168
169	virtual DecoderFallbackBuffer* CreateFallbackBuffer() = `0`;
170
171	// Maximum number of characters that this instance of this fallback could return
172
173	virtual int GetMaxCharCount() = `0`;
174	};
175
176	class DecoderReplacementFallback : public DecoderFallback
177	{
178	// Our variables
179	WCHAR strDefault[`2`];
180	int strDefaultLength;
181
182	public:
183	// Construction. Default replacement fallback uses no best fit and ? replacement string
184	DecoderReplacementFallback() : DecoderReplacementFallback (W("?"))
185	{
186	}
187
188	DecoderReplacementFallback(const WCHAR* replacement)
189	{
190	// Must not be null
191	if (replacement == nullptr)
192	throw ArgumentNullException ("replacement");
193	Contract::EndContractBlock();
194
195	// Make sure it doesn't have bad surrogate pairs
196	bool bFoundHigh = false;
197	int replacementLength = PAL_wcslen((const WCHAR *)replacement);
198	for (int i = `0`; i < replacementLength; i++)
199	{
200	// Found a surrogate?
201	if (Char::IsSurrogate(replacement, i))
202	{
203	// High or Low?
204	if (Char::IsHighSurrogate(replacement, i))
205	{
206	// if already had a high one, stop
207	if (bFoundHigh)
208	break; // break & throw at the bFoundHIgh below
209	bFoundHigh = true;
210	}
211	else
212	{
213	// Low, did we have a high?
214	if (!bFoundHigh)
215	{
216	// Didn't have one, make if fail when we stop
217	bFoundHigh = true;
218	break;
219	}
220
221	// Clear flag
222	bFoundHigh = false;
223	}
224	}
225	// If last was high we're in trouble (not surrogate so not low surrogate, so break)
226	else if (bFoundHigh)
227	break;
228	}
229	if (bFoundHigh)
230	throw ArgumentException ("String 'replacement' contains invalid Unicode code points.", "replacement");
231
232	wcscpy_s(strDefault, sizeof(strDefault), replacement);
233	strDefaultLength = replacementLength;
234	}
235
236	WCHAR* GetDefaultString()
237	{
238	return strDefault;
239	}
240
241	virtual DecoderFallbackBuffer* CreateFallbackBuffer();
242
243	// Maximum number of characters that this instance of this fallback could return
244	virtual int GetMaxCharCount()
245	{
246	return strDefaultLength;
247	}
248	};
249
250	class DecoderFallbackBuffer
251	{
252	friend class UTF8Encoding;
253	// Most implimentations will probably need an implimenation-specific constructor
254
255	// internal methods that cannot be overriden that let us do our fallback thing
256	// These wrap the internal methods so that we can check for people doing stuff that's incorrect
257
258	public:
259	virtual ~DecoderFallbackBuffer() = default;
260
261	virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = `0`;
262
263	// Get next character
264	virtual WCHAR GetNextChar() = `0`;
265
266	//Back up a character
267	virtual bool MovePrevious() = `0`;
268
269	// How many chars left in this fallback?
270	virtual int GetRemaining() = `0`;
271
272	// Clear the buffer
273	virtual void Reset()
274	{
275	while (GetNextChar() != (WCHAR)`0`);
276	}
277
278	// Internal items to help us figure out what we're doing as far as error messages, etc.
279	// These help us with our performance and messages internally
280	protected:
281	BYTE* byteStart;
282	WCHAR* charEnd;
283
284	// Internal reset
285	void InternalReset()
286	{
287	byteStart = nullptr;
288	Reset();
289	}
290
291	// Set the above values
292	// This can't be part of the constructor because EncoderFallbacks would have to know how to impliment these.
293	void InternalInitialize(BYTE* byteStart, WCHAR* charEnd)
294	{
295	this->byteStart = byteStart;
296	this->charEnd = charEnd;
297	}
298
299	// Fallback the current byte by sticking it into the remaining char buffer.
300	// This can only be called by our encodings (other have to use the public fallback methods), so
301	// we can use our DecoderNLS here too (except we don't).
302	// Returns true if we are successful, false if we can't fallback the character (no buffer space)
303	// So caller needs to throw buffer space if return false.
304	// Right now this has both bytes and bytes[], since we might have extra bytes, hence the
305	// array, and we might need the index, hence the byte*
306	// Don't touch ref chars unless we succeed
307	virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size)
308	{
309
310	Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
311
312	// See if there's a fallback character and we have an output buffer then copy our string.
313	if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
314	{
315	// Copy the chars to our output
316	WCHAR ch;
317	WCHAR* charTemp = *chars;
318	bool bHighSurrogate = false;
319	while ((ch = GetNextChar()) != `0`)
320	{
321	// Make sure no mixed up surrogates
322	if (Char::IsSurrogate(ch))
323	{
324	if (Char::IsHighSurrogate(ch))
325	{
326	// High Surrogate
327	if (bHighSurrogate)
328	throw ArgumentException ("String 'chars' contains invalid Unicode code points.");
329	bHighSurrogate = true;
330	}
331	else
332	{
333	// Low surrogate
334	if (bHighSurrogate == false)
335	throw ArgumentException ("String 'chars' contains invalid Unicode code points.");
336	bHighSurrogate = false;
337	}
338	}
339
340	if (charTemp >= charEnd)
341	{
342	// No buffer space
343	return false;
344	}
345
346	*(charTemp++) = ch;
347	}
348
349	// Need to make sure that bHighSurrogate isn't true
350	if (bHighSurrogate)
351	throw ArgumentException ("String 'chars' contains invalid Unicode code points.");
352
353	// Now we aren't going to be false, so its OK to update chars
354	*chars = charTemp;
355	}
356
357	return true;
358	}
359
360	// This version just counts the fallback and doesn't actually copy anything.
361	virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
362	// Right now this has both bytes[] and BYTE bytes, since we might have extra bytes, hence the*
363	// array, and we might need the index, hence the byte*
364	{
365
366	Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
367
368	// See if there's a fallback character and we have an output buffer then copy our string.
369	if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
370	{
371	int count = `0`;
372
373	WCHAR ch;
374	bool bHighSurrogate = false;
375	while ((ch = GetNextChar()) != `0`)
376	{
377	// Make sure no mixed up surrogates
378	if (Char::IsSurrogate(ch))
379	{
380	if (Char::IsHighSurrogate(ch))
381	{
382	// High Surrogate
383	if (bHighSurrogate)
384	throw ArgumentException ("String 'chars' contains invalid Unicode code points.");
385	bHighSurrogate = true;
386	}
387	else
388	{
389	// Low surrogate
390	if (bHighSurrogate == false)
391	throw ArgumentException ("String 'chars' contains invalid Unicode code points.");
392	bHighSurrogate = false;
393	}
394	}
395
396	count++;
397	}
398
399	// Need to make sure that bHighSurrogate isn't true
400	if (bHighSurrogate)
401	throw ArgumentException ("String 'chars' contains invalid Unicode code points.");
402
403	return count;
404	}
405
406	// If no fallback return 0
407	return `0`;
408	}
409
410	// private helper methods
411	void ThrowLastBytesRecursive(BYTE bytesUnknown[])
412	{
413	throw ArgumentException ("Recursive fallback not allowed");
414	}
415	};
416
417	class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer
418	{
419	// Store our default string
420	WCHAR strDefault[`2`];
421	int strDefaultLength;
422	int fallbackCount = -`1`;
423	int fallbackIndex = -`1`;
424
425	public:
426	// Construction
427	DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback)
428	{
429	wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
430	strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
431	}
432
433	// Fallback Methods
434	virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
435	{
436	// We expect no previous fallback in our buffer
437	// We can't call recursively but others might (note, we don't test on last char!!!)
438	if (fallbackCount >= `1`)
439	{
440	ThrowLastBytesRecursive(bytesUnknown);
441	}
442
443	// Go ahead and get our fallback
444	if (strDefaultLength == `0`)
445	return false;
446
447	fallbackCount = strDefaultLength;
448	fallbackIndex = -`1`;
449
450	return true;
451	}
452
453	virtual WCHAR GetNextChar()
454	{
455	// We want it to get < 0 because == 0 means that the current/last character is a fallback
456	// and we need to detect recursion. We could have a flag but we already have this counter.
457	fallbackCount--;
458	fallbackIndex++;
459
460	// Do we have anything left? 0 is now last fallback char, negative is nothing left
461	if (fallbackCount < `0`)
462	return `'\0'`;
463
464	// Need to get it out of the buffer.
465	// Make sure it didn't wrap from the fast count-- path
466	if (fallbackCount == INT_MAX)
467	{
468	fallbackCount = -`1`;
469	return `'\0'`;
470	}
471
472	// Now make sure its in the expected range
473	Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= `0`,
474	"Index exceeds buffer range");
475
476	return strDefault[fallbackIndex];
477	}
478
479	virtual bool MovePrevious()
480	{
481	// Back up one, only if we just processed the last character (or earlier)
482	if (fallbackCount >= -`1` && fallbackIndex >= `0`)
483	{
484	fallbackIndex--;
485	fallbackCount++;
486	return true;
487	}
488
489	// Return false 'cause we couldn't do it.
490	return false;
491	}
492
493	// How many characters left to output?
494	virtual int GetRemaining()
495	{
496	// Our count is 0 for 1 character left.
497	return (fallbackCount < `0`) ? `0` : fallbackCount;
498	}
499
500	// Clear the buffer
501	virtual void Reset()
502	{
503	fallbackCount = -`1`;
504	fallbackIndex = -`1`;
505	byteStart = nullptr;
506	}
507
508	// This version just counts the fallback and doesn't actually copy anything.
509	virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
510	// Right now this has both bytes and bytes[], since we might have extra bytes, hence the
511	// array, and we might need the index, hence the byte*
512	{
513	// return our replacement string Length
514	return strDefaultLength;
515	}
516	};
517
518	class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer
519	{
520	public:
521	DecoderExceptionFallbackBuffer()
522	{
523	}
524
525	virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
526	{
527	throw DecoderFallbackException (
528	"Unable to translate UTF-8 character to Unicode", bytesUnknown, index);
529	}
530
531	virtual WCHAR GetNextChar()
532	{
533	return `0`;
534	}
535
536	virtual bool MovePrevious()
537	{
538	// Exception fallback doesn't have anywhere to back up to.
539	return false;
540	}
541
542	// Exceptions are always empty
543	virtual int GetRemaining()
544	{
545	return `0`;
546	}
547
548	};
549
550	class DecoderExceptionFallback : public DecoderFallback
551	{
552	// Construction
553	public:
554	DecoderExceptionFallback()
555	{
556	}
557
558	virtual DecoderFallbackBuffer* CreateFallbackBuffer()
559	{
560	return InternalNew<DecoderExceptionFallbackBuffer>();
561	}
562
563	// Maximum number of characters that this instance of this fallback could return
564	virtual int GetMaxCharCount()
565	{
566	return `0`;
567	}
568	};
569
570	DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer()
571	{
572	return InternalNew<DecoderReplacementFallbackBuffer>(this);
573	}
574
575	class EncoderFallbackException : public ArgumentException
576	{
577	WCHAR charUnknown;
578	WCHAR charUnknownHigh;
579	WCHAR charUnknownLow;
580	int index;
581
582	public:
583	EncoderFallbackException(
584	LPCSTR message, WCHAR charUnknown, int index) : ArgumentException (message)
585	{
586	this->charUnknown = charUnknown;
587	this->index = index;
588	}
589
590	EncoderFallbackException(
591	LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException (message)
592	{
593	if (!Char::IsHighSurrogate(charUnknownHigh))
594	{
595	throw ArgumentOutOfRangeException ("charUnknownHigh",
596	"Argument out of range 0xD800..0xDBFF");
597	}
598	if (!Char::IsLowSurrogate(charUnknownLow))
599	{
600	throw ArgumentOutOfRangeException ("charUnknownLow",
601	"Argument out of range 0xDC00..0xDFFF");
602	}
603	Contract::EndContractBlock();
604
605	this->charUnknownHigh = charUnknownHigh;
606	this->charUnknownLow = charUnknownLow;
607	this->index = index;
608	}
609
610	WCHAR GetCharUnknown()
611	{
612	return (charUnknown);
613	}
614
615	WCHAR GetCharUnknownHigh()
616	{
617	return (charUnknownHigh);
618	}
619
620	WCHAR GetCharUnknownLow()
621	{
622	return (charUnknownLow);
623	}
624
625	int GetIndex()
626	{
627	return index;
628	}
629
630	// Return true if the unknown character is a surrogate pair.
631	bool IsUnknownSurrogate()
632	{
633	return (charUnknownHigh != `'\0'`);
634	}
635	};
636
637	class EncoderFallbackBuffer;
638
639	class EncoderFallback
640	{
641	public:
642
643	// Fallback
644	//
645	// Return the appropriate unicode string alternative to the character that need to fall back.
646
647	virtual EncoderFallbackBuffer* CreateFallbackBuffer() = `0`;
648
649	// Maximum number of characters that this instance of this fallback could return
650	virtual int GetMaxCharCount() = `0`;
651	};
652
653	class EncoderReplacementFallback : public EncoderFallback
654	{
655	// Our variables
656	WCHAR strDefault[`2`];
657	int strDefaultLength;
658
659	public:
660	// Construction. Default replacement fallback uses no best fit and ? replacement string
661	EncoderReplacementFallback() : EncoderReplacementFallback (W("?"))
662	{
663	}
664
665	EncoderReplacementFallback(const WCHAR* replacement)
666	{
667	// Must not be null
668	if (replacement == nullptr)
669	throw ArgumentNullException ("replacement");
670	Contract::EndContractBlock();
671
672	// Make sure it doesn't have bad surrogate pairs
673	bool bFoundHigh = false;
674	int replacementLength = PAL_wcslen((const WCHAR *)replacement);
675	for (int i = `0`; i < replacementLength; i++)
676	{
677	// Found a surrogate?
678	if (Char::IsSurrogate(replacement, i))
679	{
680	// High or Low?
681	if (Char::IsHighSurrogate(replacement, i))
682	{
683	// if already had a high one, stop
684	if (bFoundHigh)
685	break; // break & throw at the bFoundHIgh below
686	bFoundHigh = true;
687	}
688	else
689	{
690	// Low, did we have a high?
691	if (!bFoundHigh)
692	{
693	// Didn't have one, make if fail when we stop
694	bFoundHigh = true;
695	break;
696	}
697
698	// Clear flag
699	bFoundHigh = false;
700	}
701	}
702	// If last was high we're in trouble (not surrogate so not low surrogate, so break)
703	else if (bFoundHigh)
704	break;
705	}
706	if (bFoundHigh)
707	throw ArgumentException ("String 'replacement' contains invalid Unicode code points.", "replacement");
708
709	wcscpy_s(strDefault, sizeof(strDefault), replacement);
710	strDefaultLength = replacementLength;
711	}
712
713	WCHAR* GetDefaultString()
714	{
715	return strDefault;
716	}
717
718	virtual EncoderFallbackBuffer* CreateFallbackBuffer();
719
720	// Maximum number of characters that this instance of this fallback could return
721	virtual int GetMaxCharCount()
722	{
723	return strDefaultLength;
724	}
725	};
726
727	class EncoderFallbackBuffer
728	{
729	friend class UTF8Encoding;
730	// Most implementations will probably need an implemenation-specific constructor
731
732	// Public methods that cannot be overriden that let us do our fallback thing
733	// These wrap the internal methods so that we can check for people doing stuff that is incorrect
734
735	public:
736	virtual ~EncoderFallbackBuffer() = default;
737
738	virtual bool Fallback(WCHAR charUnknown, int index) = `0`;
739
740	virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = `0`;
741
742	// Get next character
743	virtual WCHAR GetNextChar() = `0`;
744
745	// Back up a character
746	virtual bool MovePrevious() = `0`;
747
748	// How many chars left in this fallback?
749	virtual int GetRemaining() = `0`;
750
751	// Not sure if this should be public or not.
752	// Clear the buffer
753	virtual void Reset()
754	{
755	while (GetNextChar() != (WCHAR)`0`);
756	}
757
758	// Internal items to help us figure out what we're doing as far as error messages, etc.
759	// These help us with our performance and messages internally
760	protected:
761	WCHAR* charStart;
762	WCHAR* charEnd;
763	bool setEncoder;
764	bool bUsedEncoder;
765	bool bFallingBack = false;
766	int iRecursionCount = `0`;
767	static const int iMaxRecursion = `250`;
768
769	// Internal Reset
770	// For example, what if someone fails a conversion and wants to reset one of our fallback buffers?
771	void InternalReset()
772	{
773	charStart = nullptr;
774	bFallingBack = false;
775	iRecursionCount = `0`;
776	Reset();
777	}
778
779	// Set the above values
780	// This can't be part of the constructor because EncoderFallbacks would have to know how to impliment these.
781	void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder)
782	{
783	this->charStart = charStart;
784	this->charEnd = charEnd;
785	this->setEncoder = setEncoder;
786	this->bUsedEncoder = false;
787	this->bFallingBack = false;
788	this->iRecursionCount = `0`;
789	}
790
791	WCHAR InternalGetNextChar()
792	{
793	WCHAR ch = GetNextChar();
794	bFallingBack = (ch != `0`);
795	if (ch == `0`) iRecursionCount = `0`;
796	return ch;
797	}
798
799	// Fallback the current character using the remaining buffer and encoder if necessary
800	// This can only be called by our encodings (other have to use the public fallback methods), so
801	// we can use our EncoderNLS here too.
802	// setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount
803	//
804	// Note that this could also change the contents of this->encoder, which is the same
805	// object that the caller is using, so the caller could mess up the encoder for us
806	// if they aren't careful.
807	virtual bool InternalFallback(WCHAR ch, WCHAR** chars)
808	{
809	// Shouldn't have null charStart
810	Contract::Assert(charStart != nullptr,
811	"[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized");
812
813	// Get our index, remember chars was preincremented to point at next char, so have to -1
814	int index = (int)(*chars - charStart) - `1`;
815
816	// See if it was a high surrogate
817	if (Char::IsHighSurrogate(ch))
818	{
819	// See if there's a low surrogate to go with it
820	if (chars >= this*->charEnd)
821	{
822	// Nothing left in input buffer
823	// No input, return 0
824	}
825	else
826	{
827	// Might have a low surrogate
828	WCHAR cNext = **chars;
829	if (Char::IsLowSurrogate(cNext))
830	{
831	// If already falling back then fail
832	if (bFallingBack && iRecursionCount++ > iMaxRecursion)
833	ThrowLastCharRecursive(ch, cNext);
834
835	// Next is a surrogate, add it as surrogate pair, and increment chars
836	(*chars)++;
837	bFallingBack = Fallback(ch, cNext, index);
838	return bFallingBack;
839	}
840
841	// Next isn't a low surrogate, just fallback the high surrogate
842	}
843	}
844
845	// If already falling back then fail
846	if (bFallingBack && iRecursionCount++ > iMaxRecursion)
847	ThrowLastCharRecursive((int)ch);
848
849	// Fall back our char
850	bFallingBack = Fallback(ch, index);
851
852	return bFallingBack;
853	}
854
855	// private helper methods
856	void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate)
857	{
858	// Throw it, using our complete character
859	throw ArgumentException ("Recursive fallback not allowed", "chars");
860	}
861
862	void ThrowLastCharRecursive(int utf32Char)
863	{
864	throw ArgumentException ("Recursive fallback not allowed", "chars");
865	}
866
867	};
868
869	class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer
870	{
871	// Store our default string
872	WCHAR strDefault[`4`];
873	int strDefaultLength;
874	int fallbackCount = -`1`;
875	int fallbackIndex = -`1`;
876	public:
877	// Construction
878	EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback)
879	{
880	// 2X in case we're a surrogate pair
881	wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
882	wcscat_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
883	strDefaultLength = `2` * PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
884
885	}
886
887	// Fallback Methods
888	virtual bool Fallback(WCHAR charUnknown, int index)
889	{
890	// If we had a buffer already we're being recursive, throw, it's probably at the suspect
891	// character in our array.
892	if (fallbackCount >= `1`)
893	{
894	// If we're recursive we may still have something in our buffer that makes this a surrogate
895	if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= `0` &&
896	Char::IsLowSurrogate(strDefault[fallbackIndex + `1`]))
897	ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + `1`]);
898
899	// Nope, just one character
900	ThrowLastCharRecursive((int)charUnknown);
901	}
902
903	// Go ahead and get our fallback
904	// Divide by 2 because we aren't a surrogate pair
905	fallbackCount = strDefaultLength / `2`;
906	fallbackIndex = -`1`;
907
908	return fallbackCount != `0`;
909	}
910
911	virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
912	{
913	// Double check input surrogate pair
914	if (!Char::IsHighSurrogate(charUnknownHigh))
915	throw ArgumentOutOfRangeException ("charUnknownHigh",
916	"Argument out of range 0xD800..0xDBFF");
917
918	if (!Char::IsLowSurrogate(charUnknownLow))
919	throw ArgumentOutOfRangeException ("charUnknownLow",
920	"Argument out of range 0xDC00..0xDFFF");
921	Contract::EndContractBlock();
922
923	// If we had a buffer already we're being recursive, throw, it's probably at the suspect
924	// character in our array.
925	if (fallbackCount >= `1`)
926	ThrowLastCharRecursive(charUnknownHigh, charUnknownLow);
927
928	// Go ahead and get our fallback
929	fallbackCount = strDefaultLength;
930	fallbackIndex = -`1`;
931
932	return fallbackCount != `0`;
933	}
934
935	virtual WCHAR GetNextChar()
936	{
937	// We want it to get < 0 because == 0 means that the current/last character is a fallback
938	// and we need to detect recursion. We could have a flag but we already have this counter.
939	fallbackCount--;
940	fallbackIndex++;
941
942	// Do we have anything left? 0 is now last fallback char, negative is nothing left
943	if (fallbackCount < `0`)
944	return `'\0'`;
945
946	// Need to get it out of the buffer.
947	// Make sure it didn't wrap from the fast count-- path
948	if (fallbackCount == INT_MAX)
949	{
950	fallbackCount = -`1`;
951	return `'\0'`;
952	}
953
954	// Now make sure its in the expected range
955	Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= `0`,
956	"Index exceeds buffer range");
957
958	return strDefault[fallbackIndex];
959	}
960
961	virtual bool MovePrevious()
962	{
963	// Back up one, only if we just processed the last character (or earlier)
964	if (fallbackCount >= -`1` && fallbackIndex >= `0`)
965	{
966	fallbackIndex--;
967	fallbackCount++;
968	return true;
969	}
970
971	// Return false 'cause we couldn't do it.
972	return false;
973	}
974
975	// How many characters left to output?
976	virtual int GetRemaining()
977	{
978	// Our count is 0 for 1 character left.
979	return (fallbackCount < `0`) ? `0` : fallbackCount;
980	}
981
982	// Clear the buffer
983	virtual void Reset()
984	{
985	fallbackCount = -`1`;
986	fallbackIndex = `0`;
987	charStart = nullptr;
988	bFallingBack = false;
989	}
990	};
991
992	class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer
993	{
994	public:
995	EncoderExceptionFallbackBuffer()
996	{
997	}
998
999	virtual bool Fallback(WCHAR charUnknown, int index)
1000	{
1001	// Fall back our char
1002	throw EncoderFallbackException ("Unable to translate Unicode character to UTF-8", charUnknown, index);
1003	}
1004
1005	virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
1006	{
1007	if (!Char::IsHighSurrogate(charUnknownHigh))
1008	{
1009	throw ArgumentOutOfRangeException ("charUnknownHigh",
1010	"Argument out of range 0xD800..0xDBFF");
1011	}
1012	if (!Char::IsLowSurrogate(charUnknownLow))
1013	{
1014	throw ArgumentOutOfRangeException ("charUnknownLow",
1015	"Argument out of range 0xDC00..0xDFFF");
1016	}
1017	Contract::EndContractBlock();
1018
1019	//int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow);
1020
1021	// Fall back our char
1022	throw EncoderFallbackException (
1023	"Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index);
1024	}
1025
1026	virtual WCHAR GetNextChar()
1027	{
1028	return `0`;
1029	}
1030
1031	virtual bool MovePrevious()
1032	{
1033	// Exception fallback doesn't have anywhere to back up to.
1034	return false;
1035	}
1036
1037	// Exceptions are always empty
1038	virtual int GetRemaining()
1039	{
1040	return `0`;
1041	}
1042	};
1043
1044	class EncoderExceptionFallback : public EncoderFallback
1045	{
1046	// Construction
1047	public:
1048	EncoderExceptionFallback()
1049	{
1050	}
1051
1052	virtual EncoderFallbackBuffer* CreateFallbackBuffer()
1053	{
1054	return InternalNew<EncoderExceptionFallbackBuffer>();
1055	}
1056
1057	// Maximum number of characters that this instance of this fallback could return
1058	virtual int GetMaxCharCount()
1059	{
1060	return `0`;
1061	}
1062	};
1063
1064	EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer()
1065	{
1066	return InternalNew<EncoderReplacementFallbackBuffer>(this);
1067	}
1068
1069	class UTF8Encoding
1070	{
1071	EncoderFallback* encoderFallback;
1072	// Instances of the two possible fallbacks. The constructor parameter
1073	// determines which one to use.
1074	EncoderReplacementFallback encoderReplacementFallback;
1075	EncoderExceptionFallback encoderExceptionFallback;
1076
1077	DecoderFallback* decoderFallback;
1078	// Instances of the two possible fallbacks. The constructor parameter
1079	// determines which one to use.
1080	DecoderReplacementFallback decoderReplacementFallback;
1081	DecoderExceptionFallback decoderExceptionFallback;
1082
1083	bool InRange(WCHAR c, WCHAR begin, WCHAR end)
1084	{
1085	return begin <= c && c <= end;
1086	}
1087
1088	size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2)
1089	{
1090	return ptr1 - ptr2;
1091	}
1092
1093	size_t PtrDiff(BYTE* ptr1, BYTE* ptr2)
1094	{
1095	return ptr1 - ptr2;
1096	}
1097
1098	void ThrowBytesOverflow()
1099	{
1100	// Special message to include fallback type in case fallback's GetMaxCharCount is broken
1101	// This happens if user has implimented an encoder fallback with a broken GetMaxCharCount
1102	throw InsufficientBufferException ("The output byte buffer is too small to contain the encoded data", "bytes");
1103	}
1104
1105	void ThrowBytesOverflow(bool nothingEncoded)
1106	{
1107	// Special message to include fallback type in case fallback's GetMaxCharCount is broken
1108	// This happens if user has implimented an encoder fallback with a broken GetMaxCharCount
1109	if (nothingEncoded){
1110	ThrowBytesOverflow();
1111	}
1112	}
1113
1114	void ThrowCharsOverflow()
1115	{
1116	// Special message to include fallback type in case fallback's GetMaxCharCount is broken
1117	// This happens if user has implimented a decoder fallback with a broken GetMaxCharCount
1118	throw InsufficientBufferException ("The output char buffer is too small to contain the encoded data", "chars");
1119	}
1120
1121	void ThrowCharsOverflow(bool nothingEncoded)
1122	{
1123	// Special message to include fallback type in case fallback's GetMaxCharCount is broken
1124	// This happens if user has implimented an decoder fallback with a broken GetMaxCharCount
1125	if (nothingEncoded){
1126	ThrowCharsOverflow();
1127	}
1128	}
1129
1130	// During GetChars we had an invalid byte sequence
1131	// pSrc is backed up to the start of the bad sequence if we didn't have room to
1132	// fall it back. Otherwise pSrc remains where it is.
1133	bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget)
1134	{
1135	// Get our byte[]
1136	BYTE* pStart = *pSrc;
1137	BYTE* bytesUnknown;
1138	int size = GetBytesUnknown(pStart, ch, &bytesUnknown);
1139
1140	// Do the actual fallback
1141	if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size))
1142	{
1143	// Oops, it failed, back up to pStart
1144	*pSrc = pStart;
1145	return false;
1146	}
1147
1148	// It worked
1149	return true;
1150	}
1151
1152	int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback)
1153	{
1154	// Get our byte[]
1155	BYTE *bytesUnknown;
1156	int size = GetBytesUnknown(pSrc, ch, &bytesUnknown);
1157
1158	// Do the actual fallback
1159	int count = fallback->InternalFallback(bytesUnknown, pSrc, size);
1160
1161	// # of fallback chars expected.
1162	// Note that we only get here for "long" sequences, and have already unreserved
1163	// the count that we prereserved for the input bytes
1164	return count;
1165	}
1166
1167	int GetBytesUnknown(BYTE* pSrc, int ch, BYTE **bytesUnknown)
1168	{
1169	int size;
1170	BYTE bytes[`3`];
1171
1172	// See if it was a plain char
1173	// (have to check >= 0 because we have all sorts of wierd bit flags)
1174	if (ch < `0x100` && ch >= `0`)
1175	{
1176	pSrc--;
1177	bytes[`0`] = (BYTE)ch;
1178	size = `1`;
1179	}
1180	// See if its an unfinished 2 byte sequence
1181	else if ((ch & (SupplimentarySeq \| ThreeByteSeq)) == `0`)
1182	{
1183	pSrc--;
1184	bytes[`0`] = (BYTE)((ch & `0x1F`) \| `0xc0`);
1185	size = `1`;
1186	}
1187	// So now we're either 2nd byte of 3 or 4 byte sequence or
1188	// we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
1189	// 1st check if its a 4 byte sequence
1190	else if ((ch & SupplimentarySeq) != `0`)
1191	{
1192	// 3rd byte of 4 byte sequence?
1193	if ((ch & (FinalByte >> `6`)) != `0`)
1194	{
1195	// 3rd byte of 4 byte sequence
1196	pSrc -= `3`;
1197	bytes[`0`] = (BYTE)(((ch >> `12`) & `0x07`) \| `0xF0`);
1198	bytes[`1`] = (BYTE)(((ch >> `6`) & `0x3F`) \| `0x80`);
1199	bytes[`2`] = (BYTE)(((ch)& `0x3F`) \| `0x80`);
1200	size = `3`;
1201	}
1202	else if ((ch & (FinalByte >> `12`)) != `0`)
1203	{
1204	// 2nd byte of a 4 byte sequence
1205	pSrc -= `2`;
1206	bytes[`0`] = (BYTE)(((ch >> `6`) & `0x07`) \| `0xF0`);
1207	bytes[`1`] = (BYTE)(((ch)& `0x3F`) \| `0x80`);
1208	size = `2`;
1209	}
1210	else
1211	{
1212	// 4th byte of a 4 byte sequence
1213	pSrc--;
1214	bytes[`0`] = (BYTE)(((ch)& `0x07`) \| `0xF0`);
1215	size = `1`;
1216	}
1217	}
1218	else
1219	{
1220	// 2nd byte of 3 byte sequence?
1221	if ((ch & (FinalByte >> `6`)) != `0`)
1222	{
1223	// So its 2nd byte of a 3 byte sequence
1224	pSrc -= `2`;
1225	bytes[`0`] = (BYTE)(((ch >> `6`) & `0x0F`) \| `0xE0`);
1226	bytes[`1`] = (BYTE)(((ch)& `0x3F`) \| `0x80`);
1227	size = `2`;
1228	}
1229	else
1230	{
1231	// 1st byte of a 3 byte sequence
1232	pSrc--;
1233	bytes[`0`] = (BYTE)(((ch)& `0x0F`) \| `0xE0`);
1234	size = `1`;
1235	}
1236	}
1237
1238	*bytesUnknown = bytes;
1239	return size;
1240	}
1241
1242	public:
1243
1244	UTF8Encoding(bool isThrowException)
1245	: encoderReplacementFallback (W("\xFFFD")), decoderReplacementFallback (W("\xFFFD"))
1246	{
1247	if (isThrowException)
1248	{
1249	encoderFallback = &encoderExceptionFallback;
1250	decoderFallback = &decoderExceptionFallback;
1251	}
1252	else
1253	{
1254	encoderFallback = &encoderReplacementFallback;
1255	decoderFallback = &decoderReplacementFallback;
1256	}
1257	}
1258
// Decoder state markers. While a multi-byte sequence is being decoded, the partially built
// character sits in the low bits of the state word and these markers sit above it. Every
// trail byte shifts the whole word left by 6, so a lead byte seeds each marker shifted right
// by 6 for every trail byte still expected.
//
// Bits 30 and 31 are not markers: they carry the pending character-count fixup.

// Reaches this bit position once the last byte of the sequence has been folded in.
const int FinalByte = 1 << 29;
// Set while decoding a 4 byte (supplementary, surrogate pair) sequence.
const int SupplimentarySeq = 1 << 28;
// Set while decoding a 3 byte sequence.
const int ThreeByteSeq = 1 << 27;
1267
1268	int GetCharCount(BYTE* bytes, int count)
1269	{
1270	Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr");
1271	Contract::Assert(count >= `0`, "[UTF8Encoding.GetCharCount]count >=0");
1272
1273	// Initialize stuff
1274	BYTE *pSrc = bytes;
1275	BYTE *pEnd = pSrc + count;
1276
1277	// Start by assuming we have as many as count, charCount always includes the adjustment
1278	// for the character being decoded
1279	int charCount = count;
1280	int ch = `0`;
1281	DecoderFallbackBuffer fallback = nullptr*;
1282
1283	for (;;)
1284	{
1285	// SLOWLOOP: does all range checks, handles all special cases, but it is slow
1286	if (pSrc >= pEnd) {
1287	break;
1288	}
1289
1290	// read next byte. The JIT optimization seems to be getting confused when
1291	// compiling "ch = pSrc++;", so rather use "ch = pSrc; pSrc++;" instead
1292	int cha = *pSrc;
1293
1294	if (ch == `0`) {
1295	// no pending bits
1296	goto ReadChar;
1297	}
1298
1299	pSrc++;
1300
1301	// we are expecting to see trailing bytes like 10vvvvvv
1302	if ((cha & `0xC0`) != `0x80`) {
1303	// This can be a valid starting byte for another UTF8 byte sequence, so let's put
1304	// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1305	pSrc--;
1306	charCount += (ch >> `30`);
1307	goto InvalidByteSequence;
1308	}
1309
1310	// fold in the new byte
1311	ch = (ch << `6`) \| (cha & `0x3F`);
1312
1313	if ((ch & FinalByte) == `0`) {
1314	Contract::Assert((ch & (SupplimentarySeq \| ThreeByteSeq)) != `0`,
1315	"[UTF8Encoding.GetChars]Invariant volation");
1316
1317	if ((ch & SupplimentarySeq) != `0`) {
1318	if ((ch & (FinalByte >> `6`)) != `0`) {
1319	// this is 3rd byte (of 4 byte supplimentary) - nothing to do
1320	continue;
1321	}
1322
1323	// 2nd byte, check for non-shortest form of supplimentary char and the valid
1324	// supplimentary characters in range 0x010000 - 0x10FFFF at the same time
1325	if (!InRange(ch & `0x1F0`, `0x10`, `0x100`)) {
1326	goto InvalidByteSequence;
1327	}
1328	}
1329	else {
1330	// Must be 2nd byte of a 3-byte sequence
1331	// check for non-shortest form of 3 byte seq
1332	if ((ch & (`0x1F` << `5`)) == `0` \|\| // non-shortest form
1333	(ch & (`0xF800` >> `6`)) == (`0xD800` >> `6`)) // illegal individually encoded surrogate
1334	{
1335	goto InvalidByteSequence;
1336	}
1337	}
1338	continue;
1339	}
1340
1341	// ready to punch
1342
1343	// adjust for surrogates in non-shortest form
1344	if ((ch & (SupplimentarySeq \| `0x1F0000`)) == SupplimentarySeq) {
1345	charCount--;
1346	}
1347	goto EncodeChar;
1348
1349	InvalidByteSequence:
1350	// this code fragment should be close to the gotos referencing it
1351	// Have to do fallback for invalid bytes
1352	if (fallback == nullptr)
1353	{
1354	fallback = decoderFallback->CreateFallbackBuffer();
1355	fallback->InternalInitialize(bytes, nullptr);
1356	}
1357	charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1358
1359	ch = `0`;
1360	continue;
1361
1362	ReadChar:
1363	ch = *pSrc;
1364	pSrc++;
1365
1366	ProcessChar:
1367	if (ch > `0x7F`) {
1368	// If its > 0x7F, its start of a new multi-byte sequence
1369
1370	// Long sequence, so unreserve our char.
1371	charCount--;
1372
1373	// bit 6 has to be non-zero for start of multibyte chars.
1374	if ((ch & `0x40`) == `0`) {
1375	// Unexpected trail byte
1376	goto InvalidByteSequence;
1377	}
1378
1379	// start a new long code
1380	if ((ch & `0x20`) != `0`) {
1381	if ((ch & `0x10`) != `0`) {
1382	// 4 byte encoding - supplimentary character (2 surrogates)
1383
1384	ch &= `0x0F`;
1385
1386	// check that bit 4 is zero and the valid supplimentary character
1387	// range 0x000000 - 0x10FFFF at the same time
1388	if (ch > `0x04`) {
1389	ch \|= `0xf0`;
1390	goto InvalidByteSequence;
1391	}
1392
1393	// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1394	// Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
1395	ch \|= (FinalByte >> `3` * `6`) \| // Final byte is 3 more bytes from now
1396	(`1` << `30`) \| // If it dies on next byte we'll need an extra char
1397	(`3` << (`30` - `2` * `6`)) \| // If it dies on last byte we'll need to subtract a char
1398	(SupplimentarySeq) \| (SupplimentarySeq >> `6`) \|
1399	(SupplimentarySeq >> `2` * `6`) \| (SupplimentarySeq >> `3` * `6`);
1400
1401	// Our character count will be 2 characters for these 4 bytes, so subtract another char
1402	charCount--;
1403	}
1404	else {
1405	// 3 byte encoding
1406	// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1407	ch = (ch & `0x0F`) \| ((FinalByte >> `2` * `6`) \| (`1` << `30`) \|
1408	(ThreeByteSeq) \| (ThreeByteSeq >> `6`) \| (ThreeByteSeq >> `2` * `6`));
1409
1410	// We'll expect 1 character for these 3 bytes, so subtract another char.
1411	charCount--;
1412	}
1413	}
1414	else {
1415	// 2 byte encoding
1416
1417	ch &= `0x1F`;
1418
1419	// check for non-shortest form
1420	if (ch <= `1`) {
1421	ch \|= `0xc0`;
1422	goto InvalidByteSequence;
1423	}
1424
1425	// Add bit flags so we'll be flagged correctly
1426	ch \|= (FinalByte >> `6`);
1427	}
1428	continue;
1429	}
1430
1431	EncodeChar:
1432
1433	#ifdef FASTLOOP
1434	int availableBytes = PtrDiff(pEnd, pSrc);
1435
1436	// don't fall into the fast decoding loop if we don't have enough bytes
1437	if (availableBytes <= `13`) {
1438	// try to get over the remainder of the ascii characters fast though
1439	BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1440	while (pSrc < pLocalEnd) {
1441	ch = *pSrc;
1442	pSrc++;
1443
1444	if (ch > `0x7F`)
1445	goto ProcessChar;
1446	}
1447	// we are done
1448	ch = `0`;
1449	break;
1450	}
1451
1452	// To compute the upper bound, assume that all characters are ASCII characters at this point,
1453	// the boundary will be decreased for every non-ASCII character we encounter
1454	// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1455	BYTE *pStop = pSrc + availableBytes - `7`;
1456
1457	while (pSrc < pStop) {
1458	ch = *pSrc;
1459	pSrc++;
1460
1461	if (ch > `0x7F`) {
1462	goto LongCode;
1463	}
1464
1465	// get pSrc 2-byte aligned
1466	if (((int)pSrc & `0x1`) != `0`) {
1467	ch = *pSrc;
1468	pSrc++;
1469	if (ch > `0x7F`) {
1470	goto LongCode;
1471	}
1472	}
1473
1474	// get pSrc 4-byte aligned
1475	if (((int)pSrc & `0x2`) != `0`) {
1476	ch = (USHORT)pSrc;
1477	if ((ch & `0x8080`) != `0`) {
1478	goto LongCodeWithMask16;
1479	}
1480	pSrc += `2`;
1481	}
1482
1483
1484	// Run 8 + 8 characters at a time!
1485	while (pSrc < pStop) {
1486	ch = (int**)pSrc;
1487	int chb = (int**)(pSrc + `4`);
1488	if (((ch \| chb) & (int)`0x80808080`) != `0`) {
1489	goto LongCodeWithMask32;
1490	}
1491	pSrc += `8`;
1492
1493	// This is a really small loop - unroll it
1494	if (pSrc >= pStop)
1495	break;
1496
1497	ch = (int**)pSrc;
1498	chb = (int**)(pSrc + `4`);
1499	if (((ch \| chb) & (int)`0x80808080`) != `0`) {
1500	goto LongCodeWithMask32;
1501	}
1502	pSrc += `8`;
1503	}
1504	break;
1505
1506	#if BIGENDIAN
1507	LongCodeWithMask32 :
1508	// be careful about the sign extension
1509	ch = (int)(((uint)ch) >> `16`);
1510	LongCodeWithMask16:
1511	ch = (int)(((uint)ch) >> `8`);
1512	#else // BIGENDIAN
1513	LongCodeWithMask32:
1514	LongCodeWithMask16:
1515	ch &= `0xFF`;
1516	#endif // BIGENDIAN
1517	pSrc++;
1518	if (ch <= `0x7F`) {
1519	continue;
1520	}
1521
1522	LongCode:
1523	int chc = *pSrc;
1524	pSrc++;
1525
1526	if (
1527	// bit 6 has to be zero
1528	(ch & `0x40`) == `0` \|\|
1529	// we are expecting to see trailing bytes like 10vvvvvv
1530	(chc & `0xC0`) != `0x80`)
1531	{
1532	goto BadLongCode;
1533	}
1534
1535	chc &= `0x3F`;
1536
1537	// start a new long code
1538	if ((ch & `0x20`) != `0`) {
1539
1540	// fold the first two bytes together
1541	chc \|= (ch & `0x0F`) << `6`;
1542
1543	if ((ch & `0x10`) != `0`) {
1544	// 4 byte encoding - surrogate
1545	ch = *pSrc;
1546	if (
1547	// check that bit 4 is zero, the non-shortest form of surrogate
1548	// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1549	!InRange(chc >> `4`, `0x01`, `0x10`) \|\|
1550	// we are expecting to see trailing bytes like 10vvvvvv
1551	(ch & `0xC0`) != `0x80`)
1552	{
1553	goto BadLongCode;
1554	}
1555
1556	chc = (chc << `6`) \| (ch & `0x3F`);
1557
1558	ch = *(pSrc + `1`);
1559	// we are expecting to see trailing bytes like 10vvvvvv
1560	if ((ch & `0xC0`) != `0x80`) {
1561	goto BadLongCode;
1562	}
1563	pSrc += `2`;
1564
1565	// extra byte
1566	charCount--;
1567	}
1568	else {
1569	// 3 byte encoding
1570	ch = *pSrc;
1571	if (
1572	// check for non-shortest form of 3 byte seq
1573	(chc & (`0x1F` << `5`)) == `0` \|\|
1574	// Can't have surrogates here.
1575	(chc & (`0xF800` >> `6`)) == (`0xD800` >> `6`) \|\|
1576	// we are expecting to see trailing bytes like 10vvvvvv
1577	(ch & `0xC0`) != `0x80`)
1578	{
1579	goto BadLongCode;
1580	}
1581	pSrc++;
1582
1583	// extra byte
1584	charCount--;
1585	}
1586	}
1587	else {
1588	// 2 byte encoding
1589
1590	// check for non-shortest form
1591	if ((ch & `0x1E`) == `0`) {
1592	goto BadLongCode;
1593	}
1594	}
1595
1596	// extra byte
1597	charCount--;
1598	}
1599	#endif // FASTLOOP
1600
1601	// no pending bits at this point
1602	ch = `0`;
1603	continue;
1604
1605	BadLongCode:
1606	pSrc -= `2`;
1607	ch = `0`;
1608	continue;
1609	}
1610
1611	// May have a problem if we have to flush
1612	if (ch != `0`)
1613	{
1614	// We were already adjusting for these, so need to unadjust
1615	charCount += (ch >> `30`);
1616	// Have to do fallback for invalid bytes
1617	if (fallback == nullptr)
1618	{
1619	fallback = decoderFallback->CreateFallbackBuffer();
1620	fallback->InternalInitialize(bytes, nullptr);
1621	}
1622	charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1623	}
1624
1625	// Shouldn't have anything in fallback buffer for GetCharCount
1626	// (don't have to check m_throwOnOverflow for count)
1627	Contract::Assert(fallback == nullptr \|\| fallback->GetRemaining() == `0`,
1628	"[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1629
1630	InternalDelete(fallback);
1631
1632	return charCount;
1633
1634	}
1635
1636	int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount)
1637	{
1638	Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr");
1639	Contract::Assert(byteCount >= `0`, "[UTF8Encoding.GetChars]byteCount >=0");
1640	Contract::Assert(charCount >= `0`, "[UTF8Encoding.GetChars]charCount >=0");
1641	Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr");
1642
1643	BYTE *pSrc = bytes;
1644	WCHAR *pTarget = chars;
1645
1646	BYTE *pEnd = pSrc + byteCount;
1647	WCHAR *pAllocatedBufferEnd = pTarget + charCount;
1648
1649	int ch = `0`;
1650
1651	DecoderFallbackBuffer fallback = nullptr*;
1652
1653	for (;;)
1654	{
1655	// SLOWLOOP: does all range checks, handles all special cases, but it is slow
1656
1657	if (pSrc >= pEnd) {
1658	break;
1659	}
1660
1661	// read next byte. The JIT optimization seems to be getting confused when
1662	// compiling "ch = pSrc++;", so rather use "ch = pSrc; pSrc++;" instead
1663	int cha = *pSrc;
1664
1665	if (ch == `0`) {
1666	// no pending bits
1667	goto ReadChar;
1668	}
1669
1670	pSrc++;
1671
1672	// we are expecting to see trailing bytes like 10vvvvvv
1673	if ((cha & `0xC0`) != `0x80`) {
1674	// This can be a valid starting byte for another UTF8 byte sequence, so let's put
1675	// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1676	pSrc--;
1677	goto InvalidByteSequence;
1678	}
1679
1680	// fold in the new byte
1681	ch = (ch << `6`) \| (cha & `0x3F`);
1682
1683	if ((ch & FinalByte) == `0`) {
1684	// Not at last byte yet
1685	Contract::Assert((ch & (SupplimentarySeq \| ThreeByteSeq)) != `0`,
1686	"[UTF8Encoding.GetChars]Invariant volation");
1687
1688	if ((ch & SupplimentarySeq) != `0`) {
1689	// Its a 4-byte supplimentary sequence
1690	if ((ch & (FinalByte >> `6`)) != `0`) {
1691	// this is 3rd byte of 4 byte sequence - nothing to do
1692	continue;
1693	}
1694
1695	// 2nd byte of 4 bytes
1696	// check for non-shortest form of surrogate and the valid surrogate
1697	// range 0x000000 - 0x10FFFF at the same time
1698	if (!InRange(ch & `0x1F0`, `0x10`, `0x100`)) {
1699	goto InvalidByteSequence;
1700	}
1701	}
1702	else {
1703	// Must be 2nd byte of a 3-byte sequence
1704	// check for non-shortest form of 3 byte seq
1705	if ((ch & (`0x1F` << `5`)) == `0` \|\| // non-shortest form
1706	(ch & (`0xF800` >> `6`)) == (`0xD800` >> `6`)) // illegal individually encoded surrogate
1707	{
1708	goto InvalidByteSequence;
1709	}
1710	}
1711	continue;
1712	}
1713
1714	// ready to punch
1715
1716	// surrogate in shortest form?
1717	// Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1718	if ((ch & (SupplimentarySeq \| `0x1F0000`)) > SupplimentarySeq) {
1719	// let the range check for the second char throw the exception
1720	if (pTarget < pAllocatedBufferEnd) {
1721	*pTarget = (WCHAR)(((ch >> `10`) & `0x7FF`) +
1722	(SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (`0x10000` >> `10`))));
1723	pTarget++;
1724
1725	ch = (ch & `0x3FF`) +
1726	(int)(CharUnicodeInfo::LOW_SURROGATE_START);
1727	}
1728	}
1729
1730	goto EncodeChar;
1731
1732	InvalidByteSequence:
1733	// this code fragment should be close to the gotos referencing it
1734	// Have to do fallback for invalid bytes
1735	if (fallback == nullptr)
1736	{
1737	fallback = decoderFallback->CreateFallbackBuffer();
1738	fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
1739	}
1740
1741	// That'll back us up the appropriate # of bytes if we didn't get anywhere
1742	if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
1743	{
1744	// Ran out of buffer space
1745	// Need to throw an exception?
1746	Contract::Assert(pSrc >= bytes \|\| pTarget == chars,
1747	"[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1748	fallback->InternalReset();
1749	ThrowCharsOverflow(pTarget == chars);
1750	ch = `0`;
1751	break;
1752	}
1753	Contract::Assert(pSrc >= bytes,
1754	"[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1755	ch = `0`;
1756	continue;
1757
1758	ReadChar:
1759	ch = *pSrc;
1760	pSrc++;
1761
1762	ProcessChar:
1763	if (ch > `0x7F`) {
1764	// If its > 0x7F, its start of a new multi-byte sequence
1765
1766	// bit 6 has to be non-zero
1767	if ((ch & `0x40`) == `0`) {
1768	goto InvalidByteSequence;
1769	}
1770
1771	// start a new long code
1772	if ((ch & `0x20`) != `0`) {
1773	if ((ch & `0x10`) != `0`) {
1774	// 4 byte encoding - supplimentary character (2 surrogates)
1775
1776	ch &= `0x0F`;
1777
1778	// check that bit 4 is zero and the valid supplimentary character
1779	// range 0x000000 - 0x10FFFF at the same time
1780	if (ch > `0x04`) {
1781	ch \|= `0xf0`;
1782	goto InvalidByteSequence;
1783	}
1784
1785	ch \|= (FinalByte >> `3` * `6`) \| (`1` << `30`) \| (`3` << (`30` - `2` * `6`)) \|
1786	(SupplimentarySeq) \| (SupplimentarySeq >> `6`) \|
1787	(SupplimentarySeq >> `2` * `6`) \| (SupplimentarySeq >> `3` * `6`);
1788	}
1789	else {
1790	// 3 byte encoding
1791	ch = (ch & `0x0F`) \| ((FinalByte >> `2` * `6`) \| (`1` << `30`) \|
1792	(ThreeByteSeq) \| (ThreeByteSeq >> `6`) \| (ThreeByteSeq >> `2` * `6`));
1793	}
1794	}
1795	else {
1796	// 2 byte encoding
1797
1798	ch &= `0x1F`;
1799
1800	// check for non-shortest form
1801	if (ch <= `1`) {
1802	ch \|= `0xc0`;
1803	goto InvalidByteSequence;
1804	}
1805
1806	ch \|= (FinalByte >> `6`);
1807	}
1808	continue;
1809	}
1810
1811	EncodeChar:
1812	// write the pending character
1813	if (pTarget >= pAllocatedBufferEnd)
1814	{
1815	// Fix chars so we make sure to throw if we didn't output anything
1816	ch &= `0x1fffff`;
1817	if (ch > `0x7f`)
1818	{
1819	if (ch > `0x7ff`)
1820	{
1821	if (ch >= CharUnicodeInfo::LOW_SURROGATE_START &&
1822	ch <= CharUnicodeInfo::LOW_SURROGATE_END)
1823	{
1824	pSrc--; // It was 4 bytes
1825	pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
1826	}
1827	else if (ch > `0xffff`)
1828	{
1829	pSrc--; // It was 4 bytes, nothing was stored
1830	}
1831	pSrc--; // It was at least 3 bytes
1832	}
1833	pSrc--; // It was at least 2 bytes
1834	}
1835	pSrc--;
1836
1837	// Throw that we don't have enough room (pSrc could be < chars if we had started to process
1838	// a 4 byte sequence alredy)
1839	Contract::Assert(pSrc >= bytes \|\| pTarget == chars,
1840	"[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1841	ThrowCharsOverflow(pTarget == chars);
1842
1843	// Don't store ch in decoder, we already backed up to its start
1844	ch = `0`;
1845
1846	// Didn't throw, just use this buffer size.
1847	break;
1848	}
1849	*pTarget = (WCHAR)ch;
1850	pTarget++;
1851
1852	#ifdef FASTLOOP
1853	int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1854	int availableBytes = PtrDiff(pEnd, pSrc);
1855
1856	// don't fall into the fast decoding loop if we don't have enough bytes
1857	// Test for availableChars is done because pStop would be <= pTarget.
1858	if (availableBytes <= `13`) {
1859	// we may need as many as 1 character per byte
1860	if (availableChars < availableBytes) {
1861	// not enough output room. no pending bits at this point
1862	ch = `0`;
1863	continue;
1864	}
1865
1866	// try to get over the remainder of the ascii characters fast though
1867	BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1868	while (pSrc < pLocalEnd) {
1869	ch = *pSrc;
1870	pSrc++;
1871
1872	if (ch > `0x7F`)
1873	goto ProcessChar;
1874
1875	*pTarget = (WCHAR)ch;
1876	pTarget++;
1877	}
1878	// we are done
1879	ch = `0`;
1880	break;
1881	}
1882
1883	// we may need as many as 1 character per byte, so reduce the byte count if necessary.
1884	// If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
1885	if (availableChars < availableBytes) {
1886	availableBytes = availableChars;
1887	}
1888
1889	// To compute the upper bound, assume that all characters are ASCII characters at this point,
1890	// the boundary will be decreased for every non-ASCII character we encounter
1891	// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1892	WCHAR *pStop = pTarget + availableBytes - `7`;
1893
1894	while (pTarget < pStop) {
1895	ch = *pSrc;
1896	pSrc++;
1897
1898	if (ch > `0x7F`) {
1899	goto LongCode;
1900	}
1901	*pTarget = (WCHAR)ch;
1902	pTarget++;
1903
1904	// get pSrc to be 2-byte aligned
1905	if ((((int)pSrc) & `0x1`) != `0`) {
1906	ch = *pSrc;
1907	pSrc++;
1908	if (ch > `0x7F`) {
1909	goto LongCode;
1910	}
1911	*pTarget = (WCHAR)ch;
1912	pTarget++;
1913	}
1914
1915	// get pSrc to be 4-byte aligned
1916	if ((((int)pSrc) & `0x2`) != `0`) {
1917	ch = (USHORT)pSrc;
1918	if ((ch & `0x8080`) != `0`) {
1919	goto LongCodeWithMask16;
1920	}
1921
1922	// Unfortunately, this is endianess sensitive
1923	#if BIGENDIAN
1924	*pTarget = (WCHAR)((ch >> `8`) & `0x7F`);
1925	pSrc += `2`;
1926	*(pTarget + `1`) = (WCHAR)(ch & `0x7F`);
1927	pTarget += `2`;
1928	#else // BIGENDIAN
1929	*pTarget = (WCHAR)(ch & `0x7F`);
1930	pSrc += `2`;
1931	*(pTarget + `1`) = (WCHAR)((ch >> `8`) & `0x7F`);
1932	pTarget += `2`;
1933	#endif // BIGENDIAN
1934	}
1935
1936	// Run 8 characters at a time!
1937	while (pTarget < pStop) {
1938	ch = (int**)pSrc;
1939	int chb = (int**)(pSrc + `4`);
1940	if (((ch \| chb) & (int)`0x80808080`) != `0`) {
1941	goto LongCodeWithMask32;
1942	}
1943
1944	// Unfortunately, this is endianess sensitive
1945	#if BIGENDIAN
1946	*pTarget = (WCHAR)((ch >> `24`) & `0x7F`);
1947	*(pTarget + `1`) = (WCHAR)((ch >> `16`) & `0x7F`);
1948	*(pTarget + `2`) = (WCHAR)((ch >> `8`) & `0x7F`);
1949	*(pTarget + `3`) = (WCHAR)(ch & `0x7F`);
1950	pSrc += `8`;
1951	*(pTarget + `4`) = (WCHAR)((chb >> `24`) & `0x7F`);
1952	*(pTarget + `5`) = (WCHAR)((chb >> `16`) & `0x7F`);
1953	*(pTarget + `6`) = (WCHAR)((chb >> `8`) & `0x7F`);
1954	*(pTarget + `7`) = (WCHAR)(chb & `0x7F`);
1955	pTarget += `8`;
1956	#else // BIGENDIAN
1957	*pTarget = (WCHAR)(ch & `0x7F`);
1958	*(pTarget + `1`) = (WCHAR)((ch >> `8`) & `0x7F`);
1959	*(pTarget + `2`) = (WCHAR)((ch >> `16`) & `0x7F`);
1960	*(pTarget + `3`) = (WCHAR)((ch >> `24`) & `0x7F`);
1961	pSrc += `8`;
1962	*(pTarget + `4`) = (WCHAR)(chb & `0x7F`);
1963	*(pTarget + `5`) = (WCHAR)((chb >> `8`) & `0x7F`);
1964	*(pTarget + `6`) = (WCHAR)((chb >> `16`) & `0x7F`);
1965	*(pTarget + `7`) = (WCHAR)((chb >> `24`) & `0x7F`);
1966	pTarget += `8`;
1967	#endif // BIGENDIAN
1968	}
1969	break;
1970
1971	#if BIGENDIAN
1972	LongCodeWithMask32 :
1973	// be careful about the sign extension
1974	ch = (int)(((uint)ch) >> `16`);
1975	LongCodeWithMask16:
1976	ch = (int)(((uint)ch) >> `8`);
1977	#else // BIGENDIAN
1978	LongCodeWithMask32:
1979	LongCodeWithMask16:
1980	ch &= `0xFF`;
1981	#endif // BIGENDIAN
1982	pSrc++;
1983	if (ch <= `0x7F`) {
1984	*pTarget = (WCHAR)ch;
1985	pTarget++;
1986	continue;
1987	}
1988
1989	LongCode:
1990	int chc = *pSrc;
1991	pSrc++;
1992
1993	if (
1994	// bit 6 has to be zero
1995	(ch & `0x40`) == `0` \|\|
1996	// we are expecting to see trailing bytes like 10vvvvvv
1997	(chc & `0xC0`) != `0x80`)
1998	{
1999	goto BadLongCode;
2000	}
2001
2002	chc &= `0x3F`;
2003
2004	// start a new long code
2005	if ((ch & `0x20`) != `0`) {
2006
2007	// fold the first two bytes together
2008	chc \|= (ch & `0x0F`) << `6`;
2009
2010	if ((ch & `0x10`) != `0`) {
2011	// 4 byte encoding - surrogate
2012	ch = *pSrc;
2013	if (
2014	// check that bit 4 is zero, the non-shortest form of surrogate
2015	// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2016	!InRange(chc >> `4`, `0x01`, `0x10`) \|\|
2017	// we are expecting to see trailing bytes like 10vvvvvv
2018	(ch & `0xC0`) != `0x80`)
2019	{
2020	goto BadLongCode;
2021	}
2022
2023	chc = (chc << `6`) \| (ch & `0x3F`);
2024
2025	ch = *(pSrc + `1`);
2026	// we are expecting to see trailing bytes like 10vvvvvv
2027	if ((ch & `0xC0`) != `0x80`) {
2028	goto BadLongCode;
2029	}
2030	pSrc += `2`;
2031
2032	ch = (chc << `6`) \| (ch & `0x3F`);
2033
2034	*pTarget = (WCHAR)(((ch >> `10`) & `0x7FF`) +
2035	(SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (`0x10000` >> `10`)));
2036	pTarget++;
2037
2038	ch = (ch & `0x3FF`) +
2039	(SHORT)(CharUnicodeInfo::LOW_SURROGATE_START);
2040
2041	// extra byte, we're already planning 2 chars for 2 of these bytes,
2042	// but the big loop is testing the target against pStop, so we need
2043	// to subtract 2 more or we risk overrunning the input. Subtract
2044	// one here and one below.
2045	pStop--;
2046	}
2047	else {
2048	// 3 byte encoding
2049	ch = *pSrc;
2050	if (
2051	// check for non-shortest form of 3 byte seq
2052	(chc & (`0x1F` << `5`)) == `0` \|\|
2053	// Can't have surrogates here.
2054	(chc & (`0xF800` >> `6`)) == (`0xD800` >> `6`) \|\|
2055	// we are expecting to see trailing bytes like 10vvvvvv
2056	(ch & `0xC0`) != `0x80`)
2057	{
2058	goto BadLongCode;
2059	}
2060	pSrc++;
2061
2062	ch = (chc << `6`) \| (ch & `0x3F`);
2063
2064	// extra byte, we're only expecting 1 char for each of these 3 bytes,
2065	// but the loop is testing the target (not source) against pStop, so
2066	// we need to subtract 2 more or we risk overrunning the input.
2067	// Subtract 1 here and one more below
2068	pStop--;
2069	}
2070	}
2071	else {
2072	// 2 byte encoding
2073
2074	ch &= `0x1F`;
2075
2076	// check for non-shortest form
2077	if (ch <= `1`) {
2078	goto BadLongCode;
2079	}
2080	ch = (ch << `6`) \| chc;
2081	}
2082
2083	*pTarget = (WCHAR)ch;
2084	pTarget++;
2085
2086	// extra byte, we're only expecting 1 char for each of these 2 bytes,
2087	// but the loop is testing the target (not source) against pStop.
2088	// subtract an extra count from pStop so that we don't overrun the input.
2089	pStop--;
2090	}
2091	#endif // FASTLOOP
2092
2093	Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2094
2095	// no pending bits at this point
2096	ch = `0`;
2097	continue;
2098
2099	BadLongCode:
2100	pSrc -= `2`;
2101	ch = `0`;
2102	continue;
2103	}
2104
2105	if (ch != `0`)
2106	{
2107	// Have to do fallback for invalid bytes
2108	if (fallback == nullptr)
2109	{
2110	fallback = decoderFallback->CreateFallbackBuffer();
2111	fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
2112	}
2113
2114	// This'll back us up the appropriate # of bytes if we didn't get anywhere
2115	if (!FallbackInvalidByteSequence(pSrc, ch, fallback))
2116	{
2117	Contract::Assert(pSrc >= bytes \|\| pTarget == chars,
2118	"[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2119
2120	// Ran out of buffer space
2121	// Need to throw an exception?
2122	fallback->InternalReset();
2123	ThrowCharsOverflow(pTarget == chars);
2124	}
2125	Contract::Assert(pSrc >= bytes,
2126	"[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2127	ch = `0`;
2128	}
2129
2130	// Shouldn't have anything in fallback buffer for GetChars
2131	// (don't have to check m_throwOnOverflow for chars)
2132	Contract::Assert(fallback == nullptr \|\| fallback->GetRemaining() == `0`,
2133	"[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2134
2135	InternalDelete(fallback);
2136
2137	return PtrDiff(pTarget, chars);
2138	}
2139
2140	int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount)
2141	{
2142	Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr");
2143	Contract::Assert(byteCount >= `0`, "[UTF8Encoding.GetBytes]byteCount >=0");
2144	Contract::Assert(charCount >= `0`, "[UTF8Encoding.GetBytes]charCount >=0");
2145	Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr");
2146
2147	// For fallback we may need a fallback buffer.
2148	// We wait to initialize it though in case we don't have any broken input unicode
2149	EncoderFallbackBuffer* fallbackBuffer = nullptr;
2150	WCHAR *pSrc = chars;
2151	BYTE *pTarget = bytes;
2152
2153	WCHAR *pEnd = pSrc + charCount;
2154	BYTE *pAllocatedBufferEnd = pTarget + byteCount;
2155
2156	int ch = `0`;
2157
2158	// assume that JIT will enregister pSrc, pTarget and ch
2159
2160	for (;;) {
2161	// SLOWLOOP: does all range checks, handles all special cases, but it is slow
2162
2163	if (pSrc >= pEnd) {
2164
2165	if (ch == `0`) {
2166	// Check if there's anything left to get out of the fallback buffer
2167	ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : `0`;
2168	if (ch > `0`) {
2169	goto ProcessChar;
2170	}
2171	}
2172	else {
2173	// Case of leftover surrogates in the fallback buffer
2174	if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
2175	Contract::Assert(ch >= `0xD800` && ch <= `0xDBFF`,
2176	"[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2177
2178	int cha = ch;
2179
2180	ch = fallbackBuffer->InternalGetNextChar();
2181
2182	if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2183	ch = ch + (cha << `10`) + (`0x10000` - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << `10`));
2184	goto EncodeChar;
2185	}
2186	else if (ch > `0`){
2187	goto ProcessChar;
2188	}
2189	else {
2190	break;
2191	}
2192	}
2193	}
2194
2195	// attempt to encode the partial surrogate (will fail or ignore)
2196	if (ch > `0`)
2197	goto EncodeChar;
2198
2199	// We're done
2200	break;
2201	}
2202
2203	if (ch > `0`) {
2204	// We have a high surrogate left over from a previous loop.
2205	Contract::Assert(ch >= `0xD800` && ch <= `0xDBFF`,
2206	"[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2207
2208	// use separate helper variables for local contexts so that the jit optimizations
2209	// won't get confused about the variable lifetimes
2210	int cha = *pSrc;
2211
2212	// In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
2213	// if (IsLowSurrogate(cha)) {
2214	if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2215	ch = cha + (ch << `10`) +
2216	(`0x10000`
2217	- CharUnicodeInfo::LOW_SURROGATE_START
2218	- (CharUnicodeInfo::HIGH_SURROGATE_START << `10`));
2219
2220	pSrc++;
2221	}
2222	// else ch is still high surrogate and encoding will fail
2223
2224	// attempt to encode the surrogate or partial surrogate
2225	goto EncodeChar;
2226	}
2227
2228	// If we've used a fallback, then we have to check for it
2229	if (fallbackBuffer != nullptr)
2230	{
2231	ch = fallbackBuffer->InternalGetNextChar();
2232	if (ch > `0`) goto ProcessChar;
2233	}
2234
2235	// read next char. The JIT optimization seems to be getting confused when
2236	// compiling "ch = pSrc++;", so rather use "ch = pSrc; pSrc++;" instead
2237	ch = *pSrc;
2238	pSrc++;
2239
2240	ProcessChar:
2241	if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
2242	continue;
2243	}
2244	// either good char or partial surrogate
2245
2246	EncodeChar:
2247	// throw exception on partial surrogate if necessary
2248	if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
2249	{
2250	// Lone surrogates aren't allowed, we have to do fallback for them
2251	// Have to make a fallback buffer if we don't have one
2252	if (fallbackBuffer == nullptr)
2253	{
2254	// wait on fallbacks if we can
2255	// For fallback we may need a fallback buffer
2256	fallbackBuffer = encoderFallback->CreateFallbackBuffer();
2257
2258	// Set our internal fallback interesting things.
2259	fallbackBuffer->InternalInitialize(chars, pEnd, true);
2260	}
2261
2262	// Do our fallback. Actually we already know its a mixed up surrogate,
2263	// so the ref pSrc isn't gonna do anything.
2264	fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
2265
2266	// Ignore it if we don't throw
2267	ch = `0`;
2268	continue;
2269	}
2270
2271	// Count bytes needed
2272	int bytesNeeded = `1`;
2273	if (ch > `0x7F`) {
2274	if (ch > `0x7FF`) {
2275	if (ch > `0xFFFF`) {
2276	bytesNeeded++; // 4 bytes (surrogate pair)
2277	}
2278	bytesNeeded++; // 3 bytes (800-FFFF)
2279	}
2280	bytesNeeded++; // 2 bytes (80-7FF)
2281	}
2282
2283	if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
2284	// Left over surrogate from last time will cause pSrc == chars, so we'll throw
2285	if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack)
2286	{
2287	fallbackBuffer->MovePrevious(); // Didn't use this fallback char
2288	if (ch > `0xFFFF`)
2289	fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either
2290	}
2291	else
2292	{
2293	pSrc--; // Didn't use this char
2294	if (ch > `0xFFFF`)
2295	pSrc--; // Was surrogate, didn't use 2nd part either
2296	}
2297	Contract::Assert(pSrc >= chars \|\| pTarget == bytes,
2298	"[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
2299	ThrowBytesOverflow(pTarget == bytes); // Throw if we must
2300	ch = `0`; // Nothing left over (we backed up to start of pair if supplimentary)
2301	break;
2302	}
2303
2304	if (ch <= `0x7F`) {
2305	*pTarget = (BYTE)ch;
2306	}
2307	else {
2308	// use separate helper variables for local contexts so that the jit optimizations
2309	// won't get confused about the variable lifetimes
2310	int chb;
2311	if (ch <= `0x7FF`) {
2312	// 2 BYTE encoding
2313	chb = (BYTE)(`0xC0` \| (ch >> `6`));
2314	}
2315	else
2316	{
2317	if (ch <= `0xFFFF`) {
2318	chb = (BYTE)(`0xE0` \| (ch >> `12`));
2319	}
2320	else
2321	{
2322	*pTarget = (BYTE)(`0xF0` \| (ch >> `18`));
2323	pTarget++;
2324
2325	chb = `0x80` \| ((ch >> `12`) & `0x3F`);
2326	}
2327	*pTarget = (BYTE)chb;
2328	pTarget++;
2329
2330	chb = `0x80` \| ((ch >> `6`) & `0x3F`);
2331	}
2332	*pTarget = (BYTE)chb;
2333	pTarget++;
2334
2335	*pTarget = (BYTE)`0x80` \| (ch & `0x3F`);
2336	}
2337	pTarget++;
2338
2339
2340	#ifdef FASTLOOP
2341	// If still have fallback don't do fast loop
2342	if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != `0`)
2343	goto ProcessChar;
2344
2345	int availableChars = PtrDiff(pEnd, pSrc);
2346	int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
2347
2348	// don't fall into the fast decoding loop if we don't have enough characters
2349	// Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
2350	if (availableChars <= `13`) {
2351	// we are hoping for 1 BYTE per char
2352	if (availableBytes < availableChars) {
2353	// not enough output room. no pending bits at this point
2354	ch = `0`;
2355	continue;
2356	}
2357
2358	// try to get over the remainder of the ascii characters fast though
2359	WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2360	while (pSrc < pLocalEnd) {
2361	ch = *pSrc;
2362	pSrc++;
2363
2364	// Not ASCII, need more than 1 BYTE per char
2365	if (ch > `0x7F`)
2366	goto ProcessChar;
2367
2368	*pTarget = (BYTE)ch;
2369	pTarget++;
2370	}
2371	// we are done, let ch be 0 to clear encoder
2372	ch = `0`;
2373	break;
2374	}
2375
2376	// we need at least 1 BYTE per character, but Convert might allow us to convert
2377	// only part of the input, so try as much as we can. Reduce charCount if necessary
2378	if (availableBytes < availableChars)
2379	{
2380	availableChars = availableBytes;
2381	}
2382
2383	// FASTLOOP:
2384	// - optimistic range checks
2385	// - fallbacks to the slow loop for all special cases, exception throwing, etc.
2386
2387	// To compute the upper bound, assume that all characters are ASCII characters at this point,
2388	// the boundary will be decreased for every non-ASCII character we encounter
2389	// Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
2390	// If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
2391	WCHAR *pStop = pSrc + availableChars - `5`;
2392
2393	while (pSrc < pStop) {
2394	ch = *pSrc;
2395	pSrc++;
2396
2397	if (ch > `0x7F`) {
2398	goto LongCode;
2399	}
2400	*pTarget = (BYTE)ch;
2401	pTarget++;
2402
2403	// get pSrc aligned
2404	if (((size_t)pSrc & `0x2`) != `0`) {
2405	ch = *pSrc;
2406	pSrc++;
2407	if (ch > `0x7F`) {
2408	goto LongCode;
2409	}
2410	*pTarget = (BYTE)ch;
2411	pTarget++;
2412	}
2413
2414	// Run 4 characters at a time!
2415	while (pSrc < pStop) {
2416	ch = (int**)pSrc;
2417	int chc = (int**)(pSrc + `2`);
2418	if (((ch \| chc) & (int)`0xFF80FF80`) != `0`) {
2419	goto LongCodeWithMask;
2420	}
2421
2422	// Unfortunately, this is endianess sensitive
2423	#if BIGENDIAN
2424	*pTarget = (BYTE)(ch >> `16`);
2425	*(pTarget + `1`) = (BYTE)ch;
2426	pSrc += `4`;
2427	*(pTarget + `2`) = (BYTE)(chc >> `16`);
2428	*(pTarget + `3`) = (BYTE)chc;
2429	pTarget += `4`;
2430	#else // BIGENDIAN
2431	*pTarget = (BYTE)ch;
2432	*(pTarget + `1`) = (BYTE)(ch >> `16`);
2433	pSrc += `4`;
2434	*(pTarget + `2`) = (BYTE)chc;
2435	*(pTarget + `3`) = (BYTE)(chc >> `16`);
2436	pTarget += `4`;
2437	#endif // BIGENDIAN
2438	}
2439	continue;
2440
2441	LongCodeWithMask:
2442	#if BIGENDIAN
2443	// be careful about the sign extension
2444	ch = (int)(((uint)ch) >> `16`);
2445	#else // BIGENDIAN
2446	ch = (WCHAR)ch;
2447	#endif // BIGENDIAN
2448	pSrc++;
2449
2450	if (ch > `0x7F`) {
2451	goto LongCode;
2452	}
2453	*pTarget = (BYTE)ch;
2454	pTarget++;
2455	continue;
2456
2457	LongCode:
2458	// use separate helper variables for slow and fast loop so that the jit optimizations
2459	// won't get confused about the variable lifetimes
2460	int chd;
2461	if (ch <= `0x7FF`) {
2462	// 2 BYTE encoding
2463	chd = `0xC0` \| (ch >> `6`);
2464	}
2465	else {
2466	if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2467	// 3 BYTE encoding
2468	chd = `0xE0` \| (ch >> `12`);
2469	}
2470	else
2471	{
2472	// 4 BYTE encoding - high surrogate + low surrogate
2473	if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) {
2474	// low without high -> bad, try again in slow loop
2475	pSrc -= `1`;
2476	break;
2477	}
2478
2479	chd = *pSrc;
2480	pSrc++;
2481
2482	// if (!IsLowSurrogate(chd)) {
2483	if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2484	// high not followed by low -> bad, try again in slow loop
2485	pSrc -= `2`;
2486	break;
2487	}
2488
2489	ch = chd + (ch << `10`) +
2490	(`0x10000`
2491	- CharUnicodeInfo::LOW_SURROGATE_START
2492	- (CharUnicodeInfo::HIGH_SURROGATE_START << `10`));
2493
2494	*pTarget = (BYTE)(`0xF0` \| (ch >> `18`));
2495	// pStop - this BYTE is compensated by the second surrogate character
2496	// 2 input chars require 4 output bytes. 2 have been anticipated already
2497	// and 2 more will be accounted for by the 2 pStop-- calls below.
2498	pTarget++;
2499
2500	chd = `0x80` \| ((ch >> `12`) & `0x3F`);
2501	}
2502	*pTarget = (BYTE)chd;
2503	pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too.
2504	pTarget++;
2505
2506	chd = `0x80` \| ((ch >> `6`) & `0x3F`);
2507	}
2508	*pTarget = (BYTE)chd;
2509	pStop--; // 2 BYTE sequence for 1 char so need pStop--.
2510	pTarget++;
2511
2512	*pTarget = (BYTE)(`0x80` \| (ch & `0x3F`));
2513	// pStop - this BYTE is already included
2514	pTarget++;
2515	}
2516
2517	Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
2518
2519	#endif // FASTLOOP
2520
2521	// no pending char at this point
2522	ch = `0`;
2523	}
2524
2525	InternalDelete(fallbackBuffer);
2526
2527	return (int)(pTarget - bytes);
2528	}
2529
2530	int GetByteCount(WCHAR chars, int* count)
2531	{
2532	// For fallback we may need a fallback buffer.
2533	// We wait to initialize it though in case we don't have any broken input unicode
2534	EncoderFallbackBuffer* fallbackBuffer = nullptr;
2535	WCHAR *pSrc = chars;
2536	WCHAR *pEnd = pSrc + count;
2537
2538	// Start by assuming we have as many as count
2539	int byteCount = count;
2540
2541	int ch = `0`;
2542
2543	for (;;) {
2544	// SLOWLOOP: does all range checks, handles all special cases, but it is slow
2545	if (pSrc >= pEnd) {
2546
2547	if (ch == `0`) {
2548	// Unroll any fallback that happens at the end
2549	ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : `0`;
2550	if (ch > `0`) {
2551	byteCount++;
2552	goto ProcessChar;
2553	}
2554	}
2555	else {
2556	// Case of surrogates in the fallback.
2557	if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
2558	Contract::Assert(ch >= `0xD800` && ch <= `0xDBFF`,
2559	"[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2560
2561	ch = fallbackBuffer->InternalGetNextChar();
2562	byteCount++;
2563
2564	if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2565	ch = `0xfffd`;
2566	byteCount++;
2567	goto EncodeChar;
2568	}
2569	else if (ch > `0`){
2570	goto ProcessChar;
2571	}
2572	else {
2573	byteCount--; // ignore last one.
2574	break;
2575	}
2576	}
2577	}
2578
2579	if (ch <= `0`) {
2580	break;
2581	}
2582
2583	// attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
2584	byteCount++;
2585	goto EncodeChar;
2586	}
2587
2588	if (ch > `0`) {
2589	Contract::Assert(ch >= `0xD800` && ch <= `0xDBFF`,
2590	"[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
2591
2592	// use separate helper variables for local contexts so that the jit optimizations
2593	// won't get confused about the variable lifetimes
2594	int cha = *pSrc;
2595
2596	// count the pending surrogate
2597	byteCount++;
2598
2599	// In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
2600	// if (IsLowSurrogate(cha)) {
2601	if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2602	// Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
2603	ch = `0xfffd`;
2604	// ch = cha + (ch << 10) +
2605	// (0x10000
2606	// - CharUnicodeInfo::LOW_SURROGATE_START
2607	// - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) );
2608
2609	// Use this next char
2610	pSrc++;
2611	}
2612	// else ch is still high surrogate and encoding will fail (so don't add count)
2613
2614	// attempt to encode the surrogate or partial surrogate
2615	goto EncodeChar;
2616	}
2617
2618	// If we've used a fallback, then we have to check for it
2619	if (fallbackBuffer != nullptr)
2620	{
2621	ch = fallbackBuffer->InternalGetNextChar();
2622	if (ch > `0`)
2623	{
2624	// We have an extra byte we weren't expecting.
2625	byteCount++;
2626	goto ProcessChar;
2627	}
2628	}
2629
2630	// read next char. The JIT optimization seems to be getting confused when
2631	// compiling "ch = pSrc++;", so rather use "ch = pSrc; pSrc++;" instead
2632	ch = *pSrc;
2633	pSrc++;
2634
2635	ProcessChar:
2636	if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
2637	// we will count this surrogate next time around
2638	byteCount--;
2639	continue;
2640	}
2641	// either good char or partial surrogate
2642
2643	EncodeChar:
2644	// throw exception on partial surrogate if necessary
2645	if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
2646	{
2647	// Lone surrogates aren't allowed
2648	// Have to make a fallback buffer if we don't have one
2649	if (fallbackBuffer == nullptr)
2650	{
2651	// wait on fallbacks if we can
2652	// For fallback we may need a fallback buffer
2653	fallbackBuffer = encoderFallback->CreateFallbackBuffer();
2654
2655	// Set our internal fallback interesting things.
2656	fallbackBuffer->InternalInitialize(chars, chars + count, false);
2657	}
2658
2659	// Do our fallback. Actually we already know its a mixed up surrogate,
2660	// so the ref pSrc isn't gonna do anything.
2661	fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
2662
2663	// Ignore it if we don't throw (we had preallocated this ch)
2664	byteCount--;
2665	ch = `0`;
2666	continue;
2667	}
2668
2669	// Count them
2670	if (ch > `0x7F`) {
2671	if (ch > `0x7FF`) {
2672	// the extra surrogate byte was compensated by the second surrogate character
2673	// (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
2674	byteCount++;
2675	}
2676	byteCount++;
2677	}
2678
2679	#if WIN64
2680	// check for overflow
2681	if (byteCount < `0`) {
2682	break;
2683	}
2684	#endif
2685
2686	#ifdef FASTLOOP
2687	// If still have fallback don't do fast loop
2688	if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != `0`)
2689	{
2690	// We're reserving 1 byte for each char by default
2691	byteCount++;
2692	goto ProcessChar;
2693	}
2694
2695	int availableChars = PtrDiff(pEnd, pSrc);
2696
2697	// don't fall into the fast decoding loop if we don't have enough characters
2698	if (availableChars <= `13`) {
2699	// try to get over the remainder of the ascii characters fast though
2700	WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
2701	while (pSrc < pLocalEnd) {
2702	ch = *pSrc;
2703	pSrc++;
2704	if (ch > `0x7F`)
2705	goto ProcessChar;
2706	}
2707
2708	// we are done
2709	break;
2710	}
2711
2712	#if WIN64
2713	// make sure that we won't get a silent overflow inside the fast loop
2714	// (Fall out to slow loop if we have this many characters)
2715	availableChars &= `0x0FFFFFFF`;
2716	#endif
2717
2718	// To compute the upper bound, assume that all characters are ASCII characters at this point,
2719	// the boundary will be decreased for every non-ASCII character we encounter
2720	// Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
2721	WCHAR *pStop = pSrc + availableChars - (`3` + `4`);
2722
2723	while (pSrc < pStop) {
2724	ch = *pSrc;
2725	pSrc++;
2726
2727	if (ch > `0x7F`) // Not ASCII
2728	{
2729	if (ch > `0x7FF`) // Not 2 Byte
2730	{
2731	if ((ch & `0xF800`) == `0xD800`) // See if its a Surrogate
2732	goto LongCode;
2733	byteCount++;
2734	}
2735	byteCount++;
2736	}
2737
2738	// get pSrc aligned
2739	if (((int)pSrc & `0x2`) != `0`) {
2740	ch = *pSrc;
2741	pSrc++;
2742	if (ch > `0x7F`) // Not ASCII
2743	{
2744	if (ch > `0x7FF`) // Not 2 Byte
2745	{
2746	if ((ch & `0xF800`) == `0xD800`) // See if its a Surrogate
2747	goto LongCode;
2748	byteCount++;
2749	}
2750	byteCount++;
2751	}
2752	}
2753
2754	// Run 2 4 characters at a time!*
2755	while (pSrc < pStop) {
2756	ch = (int**)pSrc;
2757	int chc = (int**)(pSrc + `2`);
2758	if (((ch \| chc) & (int)`0xFF80FF80`) != `0`) // See if not ASCII
2759	{
2760	if (((ch \| chc) & (int)`0xF800F800`) != `0`) // See if not 2 Byte
2761	{
2762	goto LongCodeWithMask;
2763	}
2764
2765
2766	if ((ch & (int)`0xFF800000`) != `0`) // Actually 0x07800780 is all we care about (4 bits)
2767	byteCount++;
2768	if ((ch & (int)`0xFF80`) != `0`)
2769	byteCount++;
2770	if ((chc & (int)`0xFF800000`) != `0`)
2771	byteCount++;
2772	if ((chc & (int)`0xFF80`) != `0`)
2773	byteCount++;
2774	}
2775	pSrc += `4`;
2776
2777	ch = (int**)pSrc;
2778	chc = (int**)(pSrc + `2`);
2779	if (((ch \| chc) & (int)`0xFF80FF80`) != `0`) // See if not ASCII
2780	{
2781	if (((ch \| chc) & (int)`0xF800F800`) != `0`) // See if not 2 Byte
2782	{
2783	goto LongCodeWithMask;
2784	}
2785
2786	if ((ch & (int)`0xFF800000`) != `0`)
2787	byteCount++;
2788	if ((ch & (int)`0xFF80`) != `0`)
2789	byteCount++;
2790	if ((chc & (int)`0xFF800000`) != `0`)
2791	byteCount++;
2792	if ((chc & (int)`0xFF80`) != `0`)
2793	byteCount++;
2794	}
2795	pSrc += `4`;
2796	}
2797	break;
2798
2799	LongCodeWithMask:
2800	#if BIGENDIAN
2801	// be careful about the sign extension
2802	ch = (int)(((uint)ch) >> `16`);
2803	#else // BIGENDIAN
2804	ch = (WCHAR)ch;
2805	#endif // BIGENDIAN
2806	pSrc++;
2807
2808	if (ch <= `0x7F`) {
2809	continue;
2810	}
2811
2812	LongCode:
2813	// use separate helper variables for slow and fast loop so that the jit optimizations
2814	// won't get confused about the variable lifetimes
2815	if (ch > `0x7FF`) {
2816	if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
2817	// 4 byte encoding - high surrogate + low surrogate
2818
2819	int chd = *pSrc;
2820	if (
2821	ch > CharUnicodeInfo::HIGH_SURROGATE_END \|\|
2822	!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
2823	{
2824	// Back up and drop out to slow loop to figure out error
2825	pSrc--;
2826	break;
2827	}
2828	pSrc++;
2829
2830	// byteCount - this byte is compensated by the second surrogate character
2831	}
2832	byteCount++;
2833	}
2834	byteCount++;
2835
2836	// byteCount - the last byte is already included
2837	}
2838	#endif // FASTLOOP
2839
2840	// no pending char at this point
2841	ch = `0`;
2842	}
2843
2844	#if WIN64
2845	// check for overflow
2846	if (byteCount < `0`) {
2847	throw ArgumentException("Conversion buffer overflow.");
2848	}
2849	#endif
2850
2851	Contract::Assert(fallbackBuffer == nullptr \|\| fallbackBuffer->GetRemaining() == `0`,
2852	"[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
2853
2854	InternalDelete(fallbackBuffer);
2855
2856	return byteCount;
2857	}
2858
2859	};
2860
2861
2862	////////////////////////////////////////////////////////////////////////////
2863	//
2864	// UTF8ToUnicode
2865	//
2866	// Maps a UTF-8 character string to its wide character string counterpart.
2867	//
2868	////////////////////////////////////////////////////////////////////////////
2869
2870	int UTF8ToUnicode(
2871	LPCSTR lpSrcStr,
2872	int cchSrc,
2873	LPWSTR lpDestStr,
2874	int cchDest,
2875	DWORD dwFlags
2876	)
2877	{
2878	int ret;
2879	UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS);
2880	try {
2881	ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc);
2882	if (cchDest){
2883	if (ret > cchDest){
2884	SetLastError(ERROR_INSUFFICIENT_BUFFER);
2885	ret = `0`;
2886	}
2887	enc.GetChars((BYTE)lpSrcStr, cchSrc, (WCHAR)lpDestStr, ret);
2888	}
2889	}
2890	catch (const InsufficientBufferException& e){
2891	SetLastError(ERROR_INSUFFICIENT_BUFFER);
2892	return `0`;
2893	}
2894	catch (const DecoderFallbackException& e){
2895	SetLastError(ERROR_NO_UNICODE_TRANSLATION);
2896	return `0`;
2897	}
2898	catch (const ArgumentException& e){
2899	SetLastError(ERROR_INVALID_PARAMETER);
2900	return `0`;
2901	}
2902	return ret;
2903	}
2904
2905	////////////////////////////////////////////////////////////////////////////
2906	//
2907	// UnicodeToUTF8
2908	//
2909	// Maps a Unicode character string to its UTF-8 string counterpart.
2910	//
2911	////////////////////////////////////////////////////////////////////////////
2912
2913	int UnicodeToUTF8(
2914	LPCWSTR lpSrcStr,
2915	int cchSrc,
2916	LPSTR lpDestStr,
2917	int cchDest)
2918	{
2919	int ret;
2920	UTF8Encoding enc(false);
2921	try{
2922	ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc);
2923	if (cchDest){
2924	if (ret > cchDest){
2925	SetLastError(ERROR_INSUFFICIENT_BUFFER);
2926	ret = `0`;
2927	}
2928	enc.GetBytes((WCHAR)lpSrcStr, cchSrc, (BYTE)lpDestStr, ret);
2929	}
2930	}
2931	catch (const InsufficientBufferException& e){
2932	SetLastError(ERROR_INSUFFICIENT_BUFFER);
2933	return `0`;
2934	}
2935	catch (const EncoderFallbackException& e){
2936	SetLastError(ERROR_NO_UNICODE_TRANSLATION);
2937	return `0`;
2938	}
2939	catch (const ArgumentException& e){
2940	SetLastError(ERROR_INVALID_PARAMETER);
2941	return `0`;
2942	}
2943	return ret;
2944	}
2945

Browse the source code of CoreCLR/pal/src/locale/utf8.cpp