ucnv_u16.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnv_u16.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2002-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* file name: ucnv_u16.c
9	* encoding: UTF-8
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 2002jul01
14	* created by: Markus W. Scherer
15	*
16	* UTF-16 converter implementation. Used to be in ucnv_utf.c.
17	*/
18
19	#include "unicode/utypes.h"
20
21	#if !UCONFIG_NO_CONVERSION
22
23	#include "unicode/ucnv.h"
24	#include "unicode/uversion.h"
25	#include "ucnv_bld.h"
26	#include "ucnv_cnv.h"
27	#include "cmemory.h"
28
/*
 * Value stored in UConverter.fromUnicodeStatus by the reset functions in this
 * file to request that the fromUnicode functions write a BOM before any other
 * output; those functions clear the field back to 0 once the BOM is written.
 */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
32
33	U_CDECL_BEGIN
34	/*
35	* The UTF-16 toUnicode implementation is also used for the Java-specific
36	* "with BOM" variants of UTF-16BE and UTF-16LE.
37	*/
38	static void U_CALLCONV
39	_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
40	UErrorCode *pErrorCode);
41
/* UTF-16BE ----------------------------------------------------------------- */
43
/*
 * "PE" = platform endianness: alias the fromUnicode function whose byte order
 * matches the build target, as selected by U_IS_BIG_ENDIAN.
 */
#if U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
#endif
49
50
51	static void U_CALLCONV
52	_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
53	UErrorCode *pErrorCode) {
54	UConverter *cnv;
55	const UChar *source;
56	char *target;
57	int32_t *offsets;
58
59	uint32_t targetCapacity, length, sourceIndex;
60	UChar c, trail;
61	char overflow[`4`];
62
63	source=pArgs->source;
64	length=(int32_t)(pArgs->sourceLimit-source);
65	if(length<=`0`) {
66	/ no input, nothing to do /
67	return;
68	}
69
70	cnv=pArgs->converter;
71
72	/ write the BOM if necessary /
73	if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
74	static const char bom[]={ (char)`0xfeu`, (char)`0xffu` };
75	ucnv_fromUWriteBytes(cnv,
76	bom, `2`,
77	&pArgs->target, pArgs->targetLimit,
78	&pArgs->offsets, -`1`,
79	pErrorCode);
80	cnv->fromUnicodeStatus=`0`;
81	}
82
83	target=pArgs->target;
84	if(target >= pArgs->targetLimit) {
85	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86	return;
87	}
88
89	targetCapacity=(uint32_t)(pArgs->targetLimit-target);
90	offsets=pArgs->offsets;
91	sourceIndex=`0`;
92
93	/ c!=0 indicates in several places outside the main loops that a surrogate was found /
94
95	if((c=(UChar)cnv->fromUChar32)!=`0` && U16_IS_TRAIL(trail=*source) && targetCapacity>=`4`) {
96	/ the last buffer ended with a lead surrogate, output the surrogate pair /
97	++source;
98	--length;
99	target[`0`]=(uint8_t)(c>>`8`);
100	target[`1`]=(uint8_t)c;
101	target[`2`]=(uint8_t)(trail>>`8`);
102	target[`3`]=(uint8_t)trail;
103	target+=`4`;
104	targetCapacity-=`4`;
105	if(offsets!=NULL) {
106	*offsets++=-`1`;
107	*offsets++=-`1`;
108	*offsets++=-`1`;
109	*offsets++=-`1`;
110	}
111	sourceIndex=`1`;
112	cnv->fromUChar32=c=`0`;
113	}
114
115	if(c==`0`) {
116	/ copy an even number of bytes for complete UChars /
117	uint32_t count=`2`*length;
118	if(count>targetCapacity) {
119	count=targetCapacity&~`1`;
120	}
121	/ count is even /
122	targetCapacity-=count;
123	count>>=`1`;
124	length-=count;
125
126	if(offsets==NULL) {
127	while(count>`0`) {
128	c=*source++;
129	if(U16_IS_SINGLE(c)) {
130	target[`0`]=(uint8_t)(c>>`8`);
131	target[`1`]=(uint8_t)c;
132	target+=`2`;
133	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` && U16_IS_TRAIL(trail=*source)) {
134	++source;
135	--count;
136	target[`0`]=(uint8_t)(c>>`8`);
137	target[`1`]=(uint8_t)c;
138	target[`2`]=(uint8_t)(trail>>`8`);
139	target[`3`]=(uint8_t)trail;
140	target+=`4`;
141	} else {
142	break;
143	}
144	--count;
145	}
146	} else {
147	while(count>`0`) {
148	c=*source++;
149	if(U16_IS_SINGLE(c)) {
150	target[`0`]=(uint8_t)(c>>`8`);
151	target[`1`]=(uint8_t)c;
152	target+=`2`;
153	*offsets++=sourceIndex;
154	*offsets++=sourceIndex++;
155	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` && U16_IS_TRAIL(trail=*source)) {
156	++source;
157	--count;
158	target[`0`]=(uint8_t)(c>>`8`);
159	target[`1`]=(uint8_t)c;
160	target[`2`]=(uint8_t)(trail>>`8`);
161	target[`3`]=(uint8_t)trail;
162	target+=`4`;
163	*offsets++=sourceIndex;
164	*offsets++=sourceIndex;
165	*offsets++=sourceIndex;
166	*offsets++=sourceIndex;
167	sourceIndex+=`2`;
168	} else {
169	break;
170	}
171	--count;
172	}
173	}
174
175	if(count==`0`) {
176	/ done with the loop for complete UChars /
177	if(length>`0` && targetCapacity>`0`) {
178	/*
179	* there is more input and some target capacity -
180	* it must be targetCapacity==1 because otherwise
181	* the above would have copied more;
182	* prepare for overflow output
183	*/
184	if(U16_IS_SINGLE(c=*source++)) {
185	overflow[`0`]=(char)(c>>`8`);
186	overflow[`1`]=(char)c;
187	length=`2`; / 2 bytes to output /
188	c=`0`;
189	/ } else { keep c for surrogate handling, length will be set there /
190	}
191	} else {
192	length=`0`;
193	c=`0`;
194	}
195	} else {
196	/ keep c for surrogate handling, length will be set there /
197	targetCapacity+=`2`*count;
198	}
199	} else {
200	length=`0`; / from here on, length counts the bytes in overflow[] /
201	}
202
203	if(c!=`0`) {
204	/*
205	* c is a surrogate, and
206	* - source or target too short
207	* - or the surrogate is unmatched
208	*/
209	length=`0`;
210	if(U16_IS_SURROGATE_LEAD(c)) {
211	if(source<pArgs->sourceLimit) {
212	if(U16_IS_TRAIL(trail=*source)) {
213	/ output the surrogate pair, will overflow (see conditions comment above) /
214	++source;
215	overflow[`0`]=(char)(c>>`8`);
216	overflow[`1`]=(char)c;
217	overflow[`2`]=(char)(trail>>`8`);
218	overflow[`3`]=(char)trail;
219	length=`4`; / 4 bytes to output /
220	c=`0`;
221	} else {
222	/ unmatched lead surrogate /
223	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
224	}
225	} else {
226	/ see if the trail surrogate is in the next buffer /
227	}
228	} else {
229	/ unmatched trail surrogate /
230	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
231	}
232	cnv->fromUChar32=c;
233	}
234
235	if(length>`0`) {
236	/ output length bytes with overflow (length>targetCapacity>0) /
237	ucnv_fromUWriteBytes(cnv,
238	overflow, length,
239	(char **)&target, pArgs->targetLimit,
240	&offsets, sourceIndex,
241	pErrorCode);
242	targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
243	}
244
245	if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==`0`) {
246	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
247	}
248
249	/ write back the updated pointers /
250	pArgs->source=source;
251	pArgs->target=(char *)target;
252	pArgs->offsets=offsets;
253	}
254
255	static void U_CALLCONV
256	_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
257	UErrorCode *pErrorCode) {
258	UConverter *cnv;
259	const uint8_t *source;
260	UChar *target;
261	int32_t *offsets;
262
263	uint32_t targetCapacity, length, count, sourceIndex;
264	UChar c, trail;
265
266	if(pArgs->converter->mode<`8`) {
267	_UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
268	return;
269	}
270
271	cnv=pArgs->converter;
272	source=(const uint8_t *)pArgs->source;
273	length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
274	if(length<=`0` && cnv->toUnicodeStatus==`0`) {
275	/ no input, nothing to do /
276	return;
277	}
278
279	target=pArgs->target;
280	if(target >= pArgs->targetLimit) {
281	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
282	return;
283	}
284
285	targetCapacity=(uint32_t)(pArgs->targetLimit-target);
286	offsets=pArgs->offsets;
287	sourceIndex=`0`;
288	c=`0`;
289
290	/ complete a partial UChar or pair from the last call /
291	if(cnv->toUnicodeStatus!=`0`) {
292	/*
293	* special case: single byte from a previous buffer,
294	* where the byte turned out not to belong to a trail surrogate
295	* and the preceding, unmatched lead surrogate was put into toUBytes[]
296	* for error handling
297	*/
298	cnv->toUBytes[`0`]=(uint8_t)cnv->toUnicodeStatus;
299	cnv->toULength=`1`;
300	cnv->toUnicodeStatus=`0`;
301	}
302	if((count=cnv->toULength)!=`0`) {
303	uint8_t *p=cnv->toUBytes;
304	do {
305	p[count++]=*source++;
306	++sourceIndex;
307	--length;
308	if(count==`2`) {
309	c=((UChar)p[`0`]<<`8`)\|p[`1`];
310	if(U16_IS_SINGLE(c)) {
311	/ output the BMP code point /
312	*target++=c;
313	if(offsets!=NULL) {
314	*offsets++=-`1`;
315	}
316	--targetCapacity;
317	count=`0`;
318	c=`0`;
319	break;
320	} else if(U16_IS_SURROGATE_LEAD(c)) {
321	/ continue collecting bytes for the trail surrogate /
322	c=`0`; / avoid unnecessary surrogate handling below /
323	} else {
324	/ fall through to error handling for an unmatched trail surrogate /
325	break;
326	}
327	} else if(count==`4`) {
328	c=((UChar)p[`0`]<<`8`)\|p[`1`];
329	trail=((UChar)p[`2`]<<`8`)\|p[`3`];
330	if(U16_IS_TRAIL(trail)) {
331	/ output the surrogate pair /
332	*target++=c;
333	if(targetCapacity>=`2`) {
334	*target++=trail;
335	if(offsets!=NULL) {
336	*offsets++=-`1`;
337	*offsets++=-`1`;
338	}
339	targetCapacity-=`2`;
340	} else / targetCapacity==1 / {
341	targetCapacity=`0`;
342	cnv->UCharErrorBuffer[`0`]=trail;
343	cnv->UCharErrorBufferLength=`1`;
344	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
345	}
346	count=`0`;
347	c=`0`;
348	break;
349	} else {
350	/ unmatched lead surrogate, handle here for consistent toUBytes[] /
351	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
352
353	/ back out reading the code unit after it /
354	if(((const uint8_t *)pArgs->source-source)>=`2`) {
355	source-=`2`;
356	} else {
357	/*
358	* if the trail unit's first byte was in a previous buffer, then
359	* we need to put it into a special place because toUBytes[] will be
360	* used for the lead unit's bytes
361	*/
362	cnv->toUnicodeStatus=`0x100`\|p[`2`];
363	--source;
364	}
365	cnv->toULength=`2`;
366
367	/ write back the updated pointers /
368	pArgs->source=(const char *)source;
369	pArgs->target=target;
370	pArgs->offsets=offsets;
371	return;
372	}
373	}
374	} while(length>`0`);
375	cnv->toULength=(int8_t)count;
376	}
377
378	/ copy an even number of bytes for complete UChars /
379	count=`2`*targetCapacity;
380	if(count>length) {
381	count=length&~`1`;
382	}
383	if(c==`0` && count>`0`) {
384	length-=count;
385	count>>=`1`;
386	targetCapacity-=count;
387	if(offsets==NULL) {
388	do {
389	c=((UChar)source[`0`]<<`8`)\|source[`1`];
390	source+=`2`;
391	if(U16_IS_SINGLE(c)) {
392	*target++=c;
393	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` &&
394	U16_IS_TRAIL(trail=((UChar)source[`0`]<<`8`)\|source[`1`])
395	) {
396	source+=`2`;
397	--count;
398	*target++=c;
399	*target++=trail;
400	} else {
401	break;
402	}
403	} while(--count>`0`);
404	} else {
405	do {
406	c=((UChar)source[`0`]<<`8`)\|source[`1`];
407	source+=`2`;
408	if(U16_IS_SINGLE(c)) {
409	*target++=c;
410	*offsets++=sourceIndex;
411	sourceIndex+=`2`;
412	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` &&
413	U16_IS_TRAIL(trail=((UChar)source[`0`]<<`8`)\|source[`1`])
414	) {
415	source+=`2`;
416	--count;
417	*target++=c;
418	*target++=trail;
419	*offsets++=sourceIndex;
420	*offsets++=sourceIndex;
421	sourceIndex+=`4`;
422	} else {
423	break;
424	}
425	} while(--count>`0`);
426	}
427
428	if(count==`0`) {
429	/ done with the loop for complete UChars /
430	c=`0`;
431	} else {
432	/ keep c for surrogate handling, trail will be set there /
433	length+=`2`(count-`1`); /* one more byte pair was consumed than count decremented /
434	targetCapacity+=count;
435	}
436	}
437
438	if(c!=`0`) {
439	/*
440	* c is a surrogate, and
441	* - source or target too short
442	* - or the surrogate is unmatched
443	*/
444	cnv->toUBytes[`0`]=(uint8_t)(c>>`8`);
445	cnv->toUBytes[`1`]=(uint8_t)c;
446	cnv->toULength=`2`;
447
448	if(U16_IS_SURROGATE_LEAD(c)) {
449	if(length>=`2`) {
450	if(U16_IS_TRAIL(trail=((UChar)source[`0`]<<`8`)\|source[`1`])) {
451	/ output the surrogate pair, will overflow (see conditions comment above) /
452	source+=`2`;
453	length-=`2`;
454	*target++=c;
455	if(offsets!=NULL) {
456	*offsets++=sourceIndex;
457	}
458	cnv->UCharErrorBuffer[`0`]=trail;
459	cnv->UCharErrorBufferLength=`1`;
460	cnv->toULength=`0`;
461	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
462	} else {
463	/ unmatched lead surrogate /
464	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
465	}
466	} else {
467	/ see if the trail surrogate is in the next buffer /
468	}
469	} else {
470	/ unmatched trail surrogate /
471	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
472	}
473	}
474
475	if(U_SUCCESS(*pErrorCode)) {
476	/ check for a remaining source byte /
477	if(length>`0`) {
478	if(targetCapacity==`0`) {
479	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
480	} else {
481	/ it must be length==1 because otherwise the above would have copied more /
482	cnv->toUBytes[cnv->toULength++]=*source++;
483	}
484	}
485	}
486
487	/ write back the updated pointers /
488	pArgs->source=(const char *)source;
489	pArgs->target=target;
490	pArgs->offsets=offsets;
491	}
492
493	static UChar32 U_CALLCONV
494	_UTF16BEGetNextUChar(UConverterToUnicodeArgs pArgs, UErrorCode err) {
495	const uint8_t s, sourceLimit;
496	UChar32 c;
497
498	if(pArgs->converter->mode<`8`) {
499	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
500	}
501
502	s=(const uint8_t *)pArgs->source;
503	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
504
505	if(s>=sourceLimit) {
506	/ no input /
507	*err=U_INDEX_OUTOFBOUNDS_ERROR;
508	return `0xffff`;
509	}
510
511	if(s+`2`>sourceLimit) {
512	/ only one byte: truncated UChar /
513	pArgs->converter->toUBytes[`0`]=*s++;
514	pArgs->converter->toULength=`1`;
515	pArgs->source=(const char *)s;
516	*err = U_TRUNCATED_CHAR_FOUND;
517	return `0xffff`;
518	}
519
520	/ get one UChar /
521	c=((UChar32)*s<<`8`)\|s[`1`];
522	s+=`2`;
523
524	/ check for a surrogate pair /
525	if(U_IS_SURROGATE(c)) {
526	if(U16_IS_SURROGATE_LEAD(c)) {
527	if(s+`2`<=sourceLimit) {
528	UChar trail;
529
530	/ get a second UChar and see if it is a trail surrogate /
531	trail=((UChar)*s<<`8`)\|s[`1`];
532	if(U16_IS_TRAIL(trail)) {
533	c=U16_GET_SUPPLEMENTARY(c, trail);
534	s+=`2`;
535	} else {
536	/ unmatched lead surrogate /
537	c=-`2`;
538	}
539	} else {
540	/ too few (2 or 3) bytes for a surrogate pair: truncated code point /
541	uint8_t *bytes=pArgs->converter->toUBytes;
542	s-=`2`;
543	pArgs->converter->toULength=(int8_t)(sourceLimit-s);
544	do {
545	bytes++=s++;
546	} while(s<sourceLimit);
547
548	c=`0xffff`;
549	*err=U_TRUNCATED_CHAR_FOUND;
550	}
551	} else {
552	/ unmatched trail surrogate /
553	c=-`2`;
554	}
555
556	if(c<`0`) {
557	/ write the unmatched surrogate /
558	uint8_t *bytes=pArgs->converter->toUBytes;
559	pArgs->converter->toULength=`2`;
560	bytes=(s-`2`);
561	bytes[`1`]=*(s-`1`);
562
563	c=`0xffff`;
564	*err=U_ILLEGAL_CHAR_FOUND;
565	}
566	}
567
568	pArgs->source=(const char *)s;
569	return c;
570	}
571
572	static void U_CALLCONV
573	_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
574	if(choice<=UCNV_RESET_TO_UNICODE) {
575	/ reset toUnicode state /
576	if(UCNV_GET_VERSION(cnv)==`0`) {
577	cnv->mode=`8`; / no BOM handling /
578	} else {
579	cnv->mode=`0`; / Java-specific "UnicodeBig" requires BE BOM or no BOM /
580	}
581	}
582	if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==`1`) {
583	/ reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM /
584	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
585	}
586	}
587
588	static void U_CALLCONV
589	_UTF16BEOpen(UConverter *cnv,
590	UConverterLoadArgs *pArgs,
591	UErrorCode *pErrorCode) {
592	(void)pArgs;
593	if(UCNV_GET_VERSION(cnv)<=`1`) {
594	_UTF16BEReset(cnv, UCNV_RESET_BOTH);
595	} else {
596	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
597	}
598	}
599
600	static const char * U_CALLCONV
601	_UTF16BEGetName(const UConverter *cnv) {
602	if(UCNV_GET_VERSION(cnv)==`0`) {
603	return "UTF-16BE";
604	} else {
605	return "UTF-16BE,version=1";
606	}
607	}
608	U_CDECL_END
609
610	static const UConverterImpl _UTF16BEImpl={
611	UCNV_UTF16_BigEndian,
612
613	NULL,
614	NULL,
615
616	_UTF16BEOpen,
617	NULL,
618	_UTF16BEReset,
619
620	_UTF16BEToUnicodeWithOffsets,
621	_UTF16BEToUnicodeWithOffsets,
622	_UTF16BEFromUnicodeWithOffsets,
623	_UTF16BEFromUnicodeWithOffsets,
624	_UTF16BEGetNextUChar,
625
626	NULL,
627	_UTF16BEGetName,
628	NULL,
629	NULL,
630	ucnv_getNonSurrogateUnicodeSet,
631
632	NULL,
633	NULL
634	};
635
636	static const UConverterStaticData _UTF16BEStaticData={
637	sizeof(UConverterStaticData),
638	"UTF-16BE",
639	`1200`, UCNV_IBM, UCNV_UTF16_BigEndian, `2`, `2`,
640	{ `0xff`, `0xfd`, `0`, `0` },`2`,FALSE,FALSE,
641	`0`,
642	`0`,
643	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
644	};
645
646
647	const UConverterSharedData _UTF16BEData=
648	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
649
/* UTF-16LE ----------------------------------------------------------------- */
651	U_CDECL_BEGIN
652	static void U_CALLCONV
653	_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
654	UErrorCode *pErrorCode) {
655	UConverter *cnv;
656	const UChar *source;
657	char *target;
658	int32_t *offsets;
659
660	uint32_t targetCapacity, length, sourceIndex;
661	UChar c, trail;
662	char overflow[`4`];
663
664	source=pArgs->source;
665	length=(int32_t)(pArgs->sourceLimit-source);
666	if(length<=`0`) {
667	/ no input, nothing to do /
668	return;
669	}
670
671	cnv=pArgs->converter;
672
673	/ write the BOM if necessary /
674	if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
675	static const char bom[]={ (char)`0xffu`, (char)`0xfeu` };
676	ucnv_fromUWriteBytes(cnv,
677	bom, `2`,
678	&pArgs->target, pArgs->targetLimit,
679	&pArgs->offsets, -`1`,
680	pErrorCode);
681	cnv->fromUnicodeStatus=`0`;
682	}
683
684	target=pArgs->target;
685	if(target >= pArgs->targetLimit) {
686	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
687	return;
688	}
689
690	targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
691	offsets=pArgs->offsets;
692	sourceIndex=`0`;
693
694	/ c!=0 indicates in several places outside the main loops that a surrogate was found /
695
696	if((c=(UChar)cnv->fromUChar32)!=`0` && U16_IS_TRAIL(trail=*source) && targetCapacity>=`4`) {
697	/ the last buffer ended with a lead surrogate, output the surrogate pair /
698	++source;
699	--length;
700	target[`0`]=(uint8_t)c;
701	target[`1`]=(uint8_t)(c>>`8`);
702	target[`2`]=(uint8_t)trail;
703	target[`3`]=(uint8_t)(trail>>`8`);
704	target+=`4`;
705	targetCapacity-=`4`;
706	if(offsets!=NULL) {
707	*offsets++=-`1`;
708	*offsets++=-`1`;
709	*offsets++=-`1`;
710	*offsets++=-`1`;
711	}
712	sourceIndex=`1`;
713	cnv->fromUChar32=c=`0`;
714	}
715
716	if(c==`0`) {
717	/ copy an even number of bytes for complete UChars /
718	uint32_t count=`2`*length;
719	if(count>targetCapacity) {
720	count=targetCapacity&~`1`;
721	}
722	/ count is even /
723	targetCapacity-=count;
724	count>>=`1`;
725	length-=count;
726
727	if(offsets==NULL) {
728	while(count>`0`) {
729	c=*source++;
730	if(U16_IS_SINGLE(c)) {
731	target[`0`]=(uint8_t)c;
732	target[`1`]=(uint8_t)(c>>`8`);
733	target+=`2`;
734	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` && U16_IS_TRAIL(trail=*source)) {
735	++source;
736	--count;
737	target[`0`]=(uint8_t)c;
738	target[`1`]=(uint8_t)(c>>`8`);
739	target[`2`]=(uint8_t)trail;
740	target[`3`]=(uint8_t)(trail>>`8`);
741	target+=`4`;
742	} else {
743	break;
744	}
745	--count;
746	}
747	} else {
748	while(count>`0`) {
749	c=*source++;
750	if(U16_IS_SINGLE(c)) {
751	target[`0`]=(uint8_t)c;
752	target[`1`]=(uint8_t)(c>>`8`);
753	target+=`2`;
754	*offsets++=sourceIndex;
755	*offsets++=sourceIndex++;
756	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` && U16_IS_TRAIL(trail=*source)) {
757	++source;
758	--count;
759	target[`0`]=(uint8_t)c;
760	target[`1`]=(uint8_t)(c>>`8`);
761	target[`2`]=(uint8_t)trail;
762	target[`3`]=(uint8_t)(trail>>`8`);
763	target+=`4`;
764	*offsets++=sourceIndex;
765	*offsets++=sourceIndex;
766	*offsets++=sourceIndex;
767	*offsets++=sourceIndex;
768	sourceIndex+=`2`;
769	} else {
770	break;
771	}
772	--count;
773	}
774	}
775
776	if(count==`0`) {
777	/ done with the loop for complete UChars /
778	if(length>`0` && targetCapacity>`0`) {
779	/*
780	* there is more input and some target capacity -
781	* it must be targetCapacity==1 because otherwise
782	* the above would have copied more;
783	* prepare for overflow output
784	*/
785	if(U16_IS_SINGLE(c=*source++)) {
786	overflow[`0`]=(char)c;
787	overflow[`1`]=(char)(c>>`8`);
788	length=`2`; / 2 bytes to output /
789	c=`0`;
790	/ } else { keep c for surrogate handling, length will be set there /
791	}
792	} else {
793	length=`0`;
794	c=`0`;
795	}
796	} else {
797	/ keep c for surrogate handling, length will be set there /
798	targetCapacity+=`2`*count;
799	}
800	} else {
801	length=`0`; / from here on, length counts the bytes in overflow[] /
802	}
803
804	if(c!=`0`) {
805	/*
806	* c is a surrogate, and
807	* - source or target too short
808	* - or the surrogate is unmatched
809	*/
810	length=`0`;
811	if(U16_IS_SURROGATE_LEAD(c)) {
812	if(source<pArgs->sourceLimit) {
813	if(U16_IS_TRAIL(trail=*source)) {
814	/ output the surrogate pair, will overflow (see conditions comment above) /
815	++source;
816	overflow[`0`]=(char)c;
817	overflow[`1`]=(char)(c>>`8`);
818	overflow[`2`]=(char)trail;
819	overflow[`3`]=(char)(trail>>`8`);
820	length=`4`; / 4 bytes to output /
821	c=`0`;
822	} else {
823	/ unmatched lead surrogate /
824	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
825	}
826	} else {
827	/ see if the trail surrogate is in the next buffer /
828	}
829	} else {
830	/ unmatched trail surrogate /
831	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
832	}
833	cnv->fromUChar32=c;
834	}
835
836	if(length>`0`) {
837	/ output length bytes with overflow (length>targetCapacity>0) /
838	ucnv_fromUWriteBytes(cnv,
839	overflow, length,
840	&target, pArgs->targetLimit,
841	&offsets, sourceIndex,
842	pErrorCode);
843	targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
844	}
845
846	if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==`0`) {
847	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
848	}
849
850	/ write back the updated pointers /
851	pArgs->source=source;
852	pArgs->target=target;
853	pArgs->offsets=offsets;
854	}
855
856	static void U_CALLCONV
857	_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
858	UErrorCode *pErrorCode) {
859	UConverter *cnv;
860	const uint8_t *source;
861	UChar *target;
862	int32_t *offsets;
863
864	uint32_t targetCapacity, length, count, sourceIndex;
865	UChar c, trail;
866
867	if(pArgs->converter->mode<`8`) {
868	_UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
869	return;
870	}
871
872	cnv=pArgs->converter;
873	source=(const uint8_t *)pArgs->source;
874	length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
875	if(length<=`0` && cnv->toUnicodeStatus==`0`) {
876	/ no input, nothing to do /
877	return;
878	}
879
880	target=pArgs->target;
881	if(target >= pArgs->targetLimit) {
882	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
883	return;
884	}
885
886	targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
887	offsets=pArgs->offsets;
888	sourceIndex=`0`;
889	c=`0`;
890
891	/ complete a partial UChar or pair from the last call /
892	if(cnv->toUnicodeStatus!=`0`) {
893	/*
894	* special case: single byte from a previous buffer,
895	* where the byte turned out not to belong to a trail surrogate
896	* and the preceding, unmatched lead surrogate was put into toUBytes[]
897	* for error handling
898	*/
899	cnv->toUBytes[`0`]=(uint8_t)cnv->toUnicodeStatus;
900	cnv->toULength=`1`;
901	cnv->toUnicodeStatus=`0`;
902	}
903	if((count=cnv->toULength)!=`0`) {
904	uint8_t *p=cnv->toUBytes;
905	do {
906	p[count++]=*source++;
907	++sourceIndex;
908	--length;
909	if(count==`2`) {
910	c=((UChar)p[`1`]<<`8`)\|p[`0`];
911	if(U16_IS_SINGLE(c)) {
912	/ output the BMP code point /
913	*target++=c;
914	if(offsets!=NULL) {
915	*offsets++=-`1`;
916	}
917	--targetCapacity;
918	count=`0`;
919	c=`0`;
920	break;
921	} else if(U16_IS_SURROGATE_LEAD(c)) {
922	/ continue collecting bytes for the trail surrogate /
923	c=`0`; / avoid unnecessary surrogate handling below /
924	} else {
925	/ fall through to error handling for an unmatched trail surrogate /
926	break;
927	}
928	} else if(count==`4`) {
929	c=((UChar)p[`1`]<<`8`)\|p[`0`];
930	trail=((UChar)p[`3`]<<`8`)\|p[`2`];
931	if(U16_IS_TRAIL(trail)) {
932	/ output the surrogate pair /
933	*target++=c;
934	if(targetCapacity>=`2`) {
935	*target++=trail;
936	if(offsets!=NULL) {
937	*offsets++=-`1`;
938	*offsets++=-`1`;
939	}
940	targetCapacity-=`2`;
941	} else / targetCapacity==1 / {
942	targetCapacity=`0`;
943	cnv->UCharErrorBuffer[`0`]=trail;
944	cnv->UCharErrorBufferLength=`1`;
945	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
946	}
947	count=`0`;
948	c=`0`;
949	break;
950	} else {
951	/ unmatched lead surrogate, handle here for consistent toUBytes[] /
952	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
953
954	/ back out reading the code unit after it /
955	if(((const uint8_t *)pArgs->source-source)>=`2`) {
956	source-=`2`;
957	} else {
958	/*
959	* if the trail unit's first byte was in a previous buffer, then
960	* we need to put it into a special place because toUBytes[] will be
961	* used for the lead unit's bytes
962	*/
963	cnv->toUnicodeStatus=`0x100`\|p[`2`];
964	--source;
965	}
966	cnv->toULength=`2`;
967
968	/ write back the updated pointers /
969	pArgs->source=(const char *)source;
970	pArgs->target=target;
971	pArgs->offsets=offsets;
972	return;
973	}
974	}
975	} while(length>`0`);
976	cnv->toULength=(int8_t)count;
977	}
978
979	/ copy an even number of bytes for complete UChars /
980	count=`2`*targetCapacity;
981	if(count>length) {
982	count=length&~`1`;
983	}
984	if(c==`0` && count>`0`) {
985	length-=count;
986	count>>=`1`;
987	targetCapacity-=count;
988	if(offsets==NULL) {
989	do {
990	c=((UChar)source[`1`]<<`8`)\|source[`0`];
991	source+=`2`;
992	if(U16_IS_SINGLE(c)) {
993	*target++=c;
994	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` &&
995	U16_IS_TRAIL(trail=((UChar)source[`1`]<<`8`)\|source[`0`])
996	) {
997	source+=`2`;
998	--count;
999	*target++=c;
1000	*target++=trail;
1001	} else {
1002	break;
1003	}
1004	} while(--count>`0`);
1005	} else {
1006	do {
1007	c=((UChar)source[`1`]<<`8`)\|source[`0`];
1008	source+=`2`;
1009	if(U16_IS_SINGLE(c)) {
1010	*target++=c;
1011	*offsets++=sourceIndex;
1012	sourceIndex+=`2`;
1013	} else if(U16_IS_SURROGATE_LEAD(c) && count>=`2` &&
1014	U16_IS_TRAIL(trail=((UChar)source[`1`]<<`8`)\|source[`0`])
1015	) {
1016	source+=`2`;
1017	--count;
1018	*target++=c;
1019	*target++=trail;
1020	*offsets++=sourceIndex;
1021	*offsets++=sourceIndex;
1022	sourceIndex+=`4`;
1023	} else {
1024	break;
1025	}
1026	} while(--count>`0`);
1027	}
1028
1029	if(count==`0`) {
1030	/ done with the loop for complete UChars /
1031	c=`0`;
1032	} else {
1033	/ keep c for surrogate handling, trail will be set there /
1034	length+=`2`(count-`1`); /* one more byte pair was consumed than count decremented /
1035	targetCapacity+=count;
1036	}
1037	}
1038
1039	if(c!=`0`) {
1040	/*
1041	* c is a surrogate, and
1042	* - source or target too short
1043	* - or the surrogate is unmatched
1044	*/
1045	cnv->toUBytes[`0`]=(uint8_t)c;
1046	cnv->toUBytes[`1`]=(uint8_t)(c>>`8`);
1047	cnv->toULength=`2`;
1048
1049	if(U16_IS_SURROGATE_LEAD(c)) {
1050	if(length>=`2`) {
1051	if(U16_IS_TRAIL(trail=((UChar)source[`1`]<<`8`)\|source[`0`])) {
1052	/ output the surrogate pair, will overflow (see conditions comment above) /
1053	source+=`2`;
1054	length-=`2`;
1055	*target++=c;
1056	if(offsets!=NULL) {
1057	*offsets++=sourceIndex;
1058	}
1059	cnv->UCharErrorBuffer[`0`]=trail;
1060	cnv->UCharErrorBufferLength=`1`;
1061	cnv->toULength=`0`;
1062	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1063	} else {
1064	/ unmatched lead surrogate /
1065	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1066	}
1067	} else {
1068	/ see if the trail surrogate is in the next buffer /
1069	}
1070	} else {
1071	/ unmatched trail surrogate /
1072	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1073	}
1074	}
1075
1076	if(U_SUCCESS(*pErrorCode)) {
1077	/ check for a remaining source byte /
1078	if(length>`0`) {
1079	if(targetCapacity==`0`) {
1080	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1081	} else {
1082	/ it must be length==1 because otherwise the above would have copied more /
1083	cnv->toUBytes[cnv->toULength++]=*source++;
1084	}
1085	}
1086	}
1087
1088	/ write back the updated pointers /
1089	pArgs->source=(const char *)source;
1090	pArgs->target=target;
1091	pArgs->offsets=offsets;
1092	}
1093
1094	static UChar32 U_CALLCONV
1095	_UTF16LEGetNextUChar(UConverterToUnicodeArgs pArgs, UErrorCode err) {
1096	const uint8_t s, sourceLimit;
1097	UChar32 c;
1098
1099	if(pArgs->converter->mode<`8`) {
1100	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1101	}
1102
1103	s=(const uint8_t *)pArgs->source;
1104	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1105
1106	if(s>=sourceLimit) {
1107	/ no input /
1108	*err=U_INDEX_OUTOFBOUNDS_ERROR;
1109	return `0xffff`;
1110	}
1111
1112	if(s+`2`>sourceLimit) {
1113	/ only one byte: truncated UChar /
1114	pArgs->converter->toUBytes[`0`]=*s++;
1115	pArgs->converter->toULength=`1`;
1116	pArgs->source=(const char *)s;
1117	*err = U_TRUNCATED_CHAR_FOUND;
1118	return `0xffff`;
1119	}
1120
1121	/ get one UChar /
1122	c=((UChar32)s[`1`]<<`8`)\|*s;
1123	s+=`2`;
1124
1125	/ check for a surrogate pair /
1126	if(U_IS_SURROGATE(c)) {
1127	if(U16_IS_SURROGATE_LEAD(c)) {
1128	if(s+`2`<=sourceLimit) {
1129	UChar trail;
1130
1131	/ get a second UChar and see if it is a trail surrogate /
1132	trail=((UChar)s[`1`]<<`8`)\|*s;
1133	if(U16_IS_TRAIL(trail)) {
1134	c=U16_GET_SUPPLEMENTARY(c, trail);
1135	s+=`2`;
1136	} else {
1137	/ unmatched lead surrogate /
1138	c=-`2`;
1139	}
1140	} else {
1141	/ too few (2 or 3) bytes for a surrogate pair: truncated code point /
1142	uint8_t *bytes=pArgs->converter->toUBytes;
1143	s-=`2`;
1144	pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1145	do {
1146	bytes++=s++;
1147	} while(s<sourceLimit);
1148
1149	c=`0xffff`;
1150	*err=U_TRUNCATED_CHAR_FOUND;
1151	}
1152	} else {
1153	/ unmatched trail surrogate /
1154	c=-`2`;
1155	}
1156
1157	if(c<`0`) {
1158	/ write the unmatched surrogate /
1159	uint8_t *bytes=pArgs->converter->toUBytes;
1160	pArgs->converter->toULength=`2`;
1161	bytes=(s-`2`);
1162	bytes[`1`]=*(s-`1`);
1163
1164	c=`0xffff`;
1165	*err=U_ILLEGAL_CHAR_FOUND;
1166	}
1167	}
1168
1169	pArgs->source=(const char *)s;
1170	return c;
1171	}
1172
1173	static void U_CALLCONV
1174	_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1175	if(choice<=UCNV_RESET_TO_UNICODE) {
1176	/ reset toUnicode state /
1177	if(UCNV_GET_VERSION(cnv)==`0`) {
1178	cnv->mode=`8`; / no BOM handling /
1179	} else {
1180	cnv->mode=`0`; / Java-specific "UnicodeLittle" requires LE BOM or no BOM /
1181	}
1182	}
1183	if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==`1`) {
1184	/ reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM /
1185	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1186	}
1187	}
1188
1189	static void U_CALLCONV
1190	_UTF16LEOpen(UConverter *cnv,
1191	UConverterLoadArgs *pArgs,
1192	UErrorCode *pErrorCode) {
1193	(void)pArgs;
1194	if(UCNV_GET_VERSION(cnv)<=`1`) {
1195	_UTF16LEReset(cnv, UCNV_RESET_BOTH);
1196	} else {
1197	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1198	}
1199	}
1200
1201	static const char * U_CALLCONV
1202	_UTF16LEGetName(const UConverter *cnv) {
1203	if(UCNV_GET_VERSION(cnv)==`0`) {
1204	return "UTF-16LE";
1205	} else {
1206	return "UTF-16LE,version=1";
1207	}
1208	}
1209	U_CDECL_END
1210
1211	static const UConverterImpl _UTF16LEImpl={
1212	UCNV_UTF16_LittleEndian,
1213
1214	NULL,
1215	NULL,
1216
1217	_UTF16LEOpen,
1218	NULL,
1219	_UTF16LEReset,
1220
1221	_UTF16LEToUnicodeWithOffsets,
1222	_UTF16LEToUnicodeWithOffsets,
1223	_UTF16LEFromUnicodeWithOffsets,
1224	_UTF16LEFromUnicodeWithOffsets,
1225	_UTF16LEGetNextUChar,
1226
1227	NULL,
1228	_UTF16LEGetName,
1229	NULL,
1230	NULL,
1231	ucnv_getNonSurrogateUnicodeSet,
1232
1233	NULL,
1234	NULL
1235	};
1236
1237
1238	static const UConverterStaticData _UTF16LEStaticData={
1239	sizeof(UConverterStaticData),
1240	"UTF-16LE",
1241	`1202`, UCNV_IBM, UCNV_UTF16_LittleEndian, `2`, `2`,
1242	{ `0xfd`, `0xff`, `0`, `0` },`2`,FALSE,FALSE,
1243	`0`,
1244	`0`,
1245	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
1246	};
1247
1248
1249	const UConverterSharedData _UTF16LEData=
1250	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1251
/* UTF-16 (Detect BOM) ------------------------------------------------------ */

/*
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
 * accordingly.
 * This is a simpler version of the UTF-32 converter, with
 * fewer states for shorter BOMs.
 *
 * State values:
 * 0    initial state
 * 1    saw first byte
 * 2..5 -
 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
 * 8    UTF-16BE mode
 * 9    UTF-16LE mode
 *
 * During detection: state==number of initial bytes seen so far.
 *
 * On output, emit U+FEFF as the first code point.
 *
 * Variants:
 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
 *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
 */
1277	U_CDECL_BEGIN
1278	static void U_CALLCONV
1279	_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1280	if(choice<=UCNV_RESET_TO_UNICODE) {
1281	/ reset toUnicode: state=0 /
1282	cnv->mode=`0`;
1283	}
1284	if(choice!=UCNV_RESET_TO_UNICODE) {
1285	/ reset fromUnicode: prepare to output the UTF-16PE BOM /
1286	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1287	}
1288	}
1289	U_CDECL_END
1290	extern const UConverterSharedData _UTF16v2Data;
1291	U_CDECL_BEGIN
1292	static void U_CALLCONV
1293	_UTF16Open(UConverter *cnv,
1294	UConverterLoadArgs *pArgs,
1295	UErrorCode *pErrorCode) {
1296	if(UCNV_GET_VERSION(cnv)<=`2`) {
1297	if(UCNV_GET_VERSION(cnv)==`2` && !pArgs->onlyTestIsLoadable) {
1298	/*
1299	* Switch implementation, and switch the staticData that's different
1300	* and was copied into the UConverter.
1301	* (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302	* UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1303	*/
1304	cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1305	uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1306	}
1307	_UTF16Reset(cnv, UCNV_RESET_BOTH);
1308	} else {
1309	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1310	}
1311	}
1312
1313	static const char * U_CALLCONV
1314	_UTF16GetName(const UConverter *cnv) {
1315	if(UCNV_GET_VERSION(cnv)==`0`) {
1316	return "UTF-16";
1317	} else if(UCNV_GET_VERSION(cnv)==`1`) {
1318	return "UTF-16,version=1";
1319	} else {
1320	return "UTF-16,version=2";
1321	}
1322	}
1323	U_CDECL_END
1324	extern const UConverterSharedData _UTF16Data;
1325
1326	static inline bool IS_UTF16BE(const UConverter *cnv) {
1327	return ((cnv)->sharedData == &_UTF16BEData);
1328	}
1329
1330	static inline bool IS_UTF16LE(const UConverter *cnv) {
1331	return ((cnv)->sharedData == &_UTF16LEData);
1332	}
1333
1334	static inline bool IS_UTF16(const UConverter *cnv) {
1335	return ((cnv)->sharedData==&_UTF16Data) \|\| ((cnv)->sharedData == &_UTF16v2Data);
1336	}
1337
1338	U_CDECL_BEGIN
1339	static void U_CALLCONV
1340	_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1341	UErrorCode *pErrorCode) {
1342	UConverter *cnv=pArgs->converter;
1343	const char *source=pArgs->source;
1344	const char *sourceLimit=pArgs->sourceLimit;
1345	int32_t *offsets=pArgs->offsets;
1346
1347	int32_t state, offsetDelta;
1348	uint8_t b;
1349
1350	state=cnv->mode;
1351
1352	/*
1353	* If we detect a BOM in this buffer, then we must add the BOM size to the
1354	* offsets because the actual converter function will not see and count the BOM.
1355	* offsetDelta will have the number of the BOM bytes that are in the current buffer.
1356	*/
1357	offsetDelta=`0`;
1358
1359	while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1360	switch(state) {
1361	case `0`:
1362	cnv->toUBytes[`0`]=(uint8_t)*source++;
1363	cnv->toULength=`1`;
1364	state=`1`;
1365	break;
1366	case `1`:
1367	/*
1368	* Only inside this switch case can the state variable
1369	* temporarily take two additional values:
1370	* 6: BOM error, continue with BE
1371	* 7: BOM error, continue with LE
1372	*/
1373	b=*source;
1374	if(cnv->toUBytes[`0`]==`0xfe` && b==`0xff`) {
1375	if(IS_UTF16LE(cnv)) {
1376	state=`7`; / illegal reverse BOM for Java "UnicodeLittle" /
1377	} else {
1378	state=`8`; / detect UTF-16BE /
1379	}
1380	} else if(cnv->toUBytes[`0`]==`0xff` && b==`0xfe`) {
1381	if(IS_UTF16BE(cnv)) {
1382	state=`6`; / illegal reverse BOM for Java "UnicodeBig" /
1383	} else {
1384	state=`9`; / detect UTF-16LE /
1385	}
1386	} else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==`1`)) {
1387	state=`6`; / illegal missing BOM for Java "Unicode" /
1388	}
1389	if(state>=`8`) {
1390	/ BOM detected, consume it /
1391	++source;
1392	cnv->toULength=`0`;
1393	offsetDelta=(int32_t)(source-pArgs->source);
1394	} else if(state<`6`) {
1395	/ ok: no BOM, and not a reverse BOM /
1396	if(source!=pArgs->source) {
1397	/ reset the source for a correct first offset /
1398	source=pArgs->source;
1399	cnv->toULength=`0`;
1400	}
1401	if(IS_UTF16LE(cnv)) {
1402	/ Make Java "UnicodeLittle" default to LE. /
1403	state=`9`;
1404	} else {
1405	/ Make standard UTF-16 and Java "UnicodeBig" default to BE. /
1406	state=`8`;
1407	}
1408	} else {
1409	/*
1410	* error: missing BOM, or reverse BOM
1411	* UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1412	* UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1413	* UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1414	*/
1415	/ report the non-BOM or reverse BOM as an illegal sequence /
1416	cnv->toUBytes[`1`]=b;
1417	cnv->toULength=`2`;
1418	pArgs->source=source+`1`;
1419	/ continue with conversion if the callback resets the error /
1420	/*
1421	* Make Java "Unicode" default to BE like standard UTF-16.
1422	* Make Java "UnicodeBig" and "UnicodeLittle" default
1423	* to their normal endiannesses.
1424	*/
1425	cnv->mode=state+`2`;
1426	*pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1427	return;
1428	}
1429	/ convert the rest of the stream /
1430	cnv->mode=state;
1431	continue;
1432	case `8`:
1433	/ call UTF-16BE /
1434	pArgs->source=source;
1435	_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1436	source=pArgs->source;
1437	break;
1438	case `9`:
1439	/ call UTF-16LE /
1440	pArgs->source=source;
1441	_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1442	source=pArgs->source;
1443	break;
1444	default:
1445	break; / does not occur /
1446	}
1447	}
1448
1449	/ add BOM size to offsets - see comment at offsetDelta declaration /
1450	if(offsets!=NULL && offsetDelta!=`0`) {
1451	int32_t *offsetsLimit=pArgs->offsets;
1452	while(offsets<offsetsLimit) {
1453	*offsets++ += offsetDelta;
1454	}
1455	}
1456
1457	pArgs->source=source;
1458
1459	if(source==sourceLimit && pArgs->flush) {
1460	/ handle truncated input /
1461	switch(state) {
1462	case `0`:
1463	break; / no input at all, nothing to do /
1464	case `8`:
1465	_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1466	break;
1467	case `9`:
1468	_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1469	break;
1470	default:
1471	/ 0<state<8: framework will report truncation, nothing to do here /
1472	break;
1473	}
1474	}
1475
1476	cnv->mode=state;
1477	}
1478
1479	static UChar32 U_CALLCONV
1480	_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1481	UErrorCode *pErrorCode) {
1482	switch(pArgs->converter->mode) {
1483	case `8`:
1484	return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1485	case `9`:
1486	return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1487	default:
1488	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1489	}
1490	}
1491	U_CDECL_END
1492
1493	static const UConverterImpl _UTF16Impl = {
1494	UCNV_UTF16,
1495
1496	NULL,
1497	NULL,
1498
1499	_UTF16Open,
1500	NULL,
1501	_UTF16Reset,
1502
1503	_UTF16ToUnicodeWithOffsets,
1504	_UTF16ToUnicodeWithOffsets,
1505	_UTF16PEFromUnicodeWithOffsets,
1506	_UTF16PEFromUnicodeWithOffsets,
1507	_UTF16GetNextUChar,
1508
1509	NULL, / ### TODO implement getStarters for all Unicode encodings?! /
1510	_UTF16GetName,
1511	NULL,
1512	NULL,
1513	ucnv_getNonSurrogateUnicodeSet,
1514
1515	NULL,
1516	NULL
1517	};
1518
1519	static const UConverterStaticData _UTF16StaticData = {
1520	sizeof(UConverterStaticData),
1521	"UTF-16",
1522	`1204`, / CCSID for BOM sensitive UTF-16 /
1523	UCNV_IBM, UCNV_UTF16, `2`, `2`,
1524	#if U_IS_BIG_ENDIAN
1525	{ `0xff`, `0xfd`, `0`, `0` }, `2`,
1526	#else
1527	{ `0xfd`, `0xff`, `0`, `0` }, `2`,
1528	#endif
1529	FALSE, FALSE,
1530	`0`,
1531	`0`,
1532	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
1533	};
1534
1535	const UConverterSharedData _UTF16Data =
1536	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1537
1538	static const UConverterImpl _UTF16v2Impl = {
1539	UCNV_UTF16,
1540
1541	NULL,
1542	NULL,
1543
1544	_UTF16Open,
1545	NULL,
1546	_UTF16Reset,
1547
1548	_UTF16ToUnicodeWithOffsets,
1549	_UTF16ToUnicodeWithOffsets,
1550	_UTF16BEFromUnicodeWithOffsets,
1551	_UTF16BEFromUnicodeWithOffsets,
1552	_UTF16GetNextUChar,
1553
1554	NULL, / ### TODO implement getStarters for all Unicode encodings?! /
1555	_UTF16GetName,
1556	NULL,
1557	NULL,
1558	ucnv_getNonSurrogateUnicodeSet,
1559
1560	NULL,
1561	NULL
1562	};
1563
1564	static const UConverterStaticData _UTF16v2StaticData = {
1565	sizeof(UConverterStaticData),
1566	"UTF-16,version=2",
1567	`1204`, / CCSID for BOM sensitive UTF-16 /
1568	UCNV_IBM, UCNV_UTF16, `2`, `2`,
1569	{ `0xff`, `0xfd`, `0`, `0` }, `2`,
1570	FALSE, FALSE,
1571	`0`,
1572	`0`,
1573	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
1574	};
1575
1576	const UConverterSharedData _UTF16v2Data =
1577	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1578
1579	#endif
1580

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnv_u16.cpp