ucnv_u32.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnv_u32.cpp]

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u32.c
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
*/
18
19	#include "unicode/utypes.h"
20
21	#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23	#include "unicode/ucnv.h"
24	#include "unicode/utf.h"
25	#include "ucnv_bld.h"
26	#include "ucnv_cnv.h"
27	#include "cmemory.h"
28
/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest valid Unicode code point. */
#define MAXIMUM_UTF             0x0010FFFF
/* Number of payload bits carried by each surrogate code unit. */
#define HALF_SHIFT              10
/* First supplementary code point (U+10000). */
#define HALF_BASE               0x0010000
/* Mask for the 10 payload bits of a surrogate code unit. */
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * -SURROGATE_LOW_START + HALF_BASE (== 9216).
 * Added to ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail to obtain
 * the supplementary code point from a surrogate pair.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of fromUnicodeStatus that means "the BOM has not been written yet". */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
43
44	/ UTF-32BE ----------------------------------------------------------------- /
45	U_CDECL_BEGIN
46	static void U_CALLCONV
47	T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
48	UErrorCode * err)
49	{
50	const unsigned char mySource = (unsigned* char *) args->source;
51	UChar *myTarget = args->target;
52	const unsigned char sourceLimit = (unsigned* char *) args->sourceLimit;
53	const UChar *targetLimit = args->targetLimit;
54	unsigned char *toUBytes = args->converter->toUBytes;
55	uint32_t ch, i;
56
57	/ Restore state of current sequence /
58	if (args->converter->toULength > `0` && myTarget < targetLimit) {
59	i = args->converter->toULength; / restore # of bytes consumed /
60	args->converter->toULength = `0`;
61
62	ch = args->converter->toUnicodeStatus - `1`;/Stores the previously calculated ch from a previous call/
63	args->converter->toUnicodeStatus = `0`;
64	goto morebytes;
65	}
66
67	while (mySource < sourceLimit && myTarget < targetLimit) {
68	i = `0`;
69	ch = `0`;
70	morebytes:
71	while (i < sizeof(uint32_t)) {
72	if (mySource < sourceLimit) {
73	ch = (ch << `8`) \| (uint8_t)(*mySource);
74	toUBytes[i++] = (char) *(mySource++);
75	}
76	else {
77	/ stores a partially calculated target/
78	/ + 1 to make 0 a valid character /
79	args->converter->toUnicodeStatus = ch + `1`;
80	args->converter->toULength = (int8_t) i;
81	goto donefornow;
82	}
83	}
84
85	if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
86	/ Normal valid byte when the loop has not prematurely terminated (i < inBytes) /
87	if (ch <= MAXIMUM_UCS2)
88	{
89	/ fits in 16 bits /
90	*(myTarget++) = (UChar) ch;
91	}
92	else {
93	/ write out the surrogates /
94	*(myTarget++) = U16_LEAD(ch);
95	ch = U16_TRAIL(ch);
96	if (myTarget < targetLimit) {
97	*(myTarget++) = (UChar)ch;
98	}
99	else {
100	/ Put in overflow buffer (not handled here) /
101	args->converter->UCharErrorBuffer[`0`] = (UChar) ch;
102	args->converter->UCharErrorBufferLength = `1`;
103	*err = U_BUFFER_OVERFLOW_ERROR;
104	break;
105	}
106	}
107	}
108	else {
109	args->converter->toULength = (int8_t)i;
110	*err = U_ILLEGAL_CHAR_FOUND;
111	break;
112	}
113	}
114
115	donefornow:
116	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
117	/ End of target buffer /
118	*err = U_BUFFER_OVERFLOW_ERROR;
119	}
120
121	args->target = myTarget;
122	args->source = (const char *) mySource;
123	}
124
125	static void U_CALLCONV
126	T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
127	UErrorCode * err)
128	{
129	const unsigned char mySource = (unsigned* char *) args->source;
130	UChar *myTarget = args->target;
131	int32_t *myOffsets = args->offsets;
132	const unsigned char sourceLimit = (unsigned* char *) args->sourceLimit;
133	const UChar *targetLimit = args->targetLimit;
134	unsigned char *toUBytes = args->converter->toUBytes;
135	uint32_t ch, i;
136	int32_t offsetNum = `0`;
137
138	/ Restore state of current sequence /
139	if (args->converter->toULength > `0` && myTarget < targetLimit) {
140	i = args->converter->toULength; / restore # of bytes consumed /
141	args->converter->toULength = `0`;
142
143	ch = args->converter->toUnicodeStatus - `1`;/Stores the previously calculated ch from a previous call/
144	args->converter->toUnicodeStatus = `0`;
145	goto morebytes;
146	}
147
148	while (mySource < sourceLimit && myTarget < targetLimit) {
149	i = `0`;
150	ch = `0`;
151	morebytes:
152	while (i < sizeof(uint32_t)) {
153	if (mySource < sourceLimit) {
154	ch = (ch << `8`) \| (uint8_t)(*mySource);
155	toUBytes[i++] = (char) *(mySource++);
156	}
157	else {
158	/ stores a partially calculated target/
159	/ + 1 to make 0 a valid character /
160	args->converter->toUnicodeStatus = ch + `1`;
161	args->converter->toULength = (int8_t) i;
162	goto donefornow;
163	}
164	}
165
166	if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
167	/ Normal valid byte when the loop has not prematurely terminated (i < inBytes) /
168	if (ch <= MAXIMUM_UCS2) {
169	/ fits in 16 bits /
170	*(myTarget++) = (UChar) ch;
171	*(myOffsets++) = offsetNum;
172	}
173	else {
174	/ write out the surrogates /
175	*(myTarget++) = U16_LEAD(ch);
176	*myOffsets++ = offsetNum;
177	ch = U16_TRAIL(ch);
178	if (myTarget < targetLimit)
179	{
180	*(myTarget++) = (UChar)ch;
181	*(myOffsets++) = offsetNum;
182	}
183	else {
184	/ Put in overflow buffer (not handled here) /
185	args->converter->UCharErrorBuffer[`0`] = (UChar) ch;
186	args->converter->UCharErrorBufferLength = `1`;
187	*err = U_BUFFER_OVERFLOW_ERROR;
188	break;
189	}
190	}
191	}
192	else {
193	args->converter->toULength = (int8_t)i;
194	*err = U_ILLEGAL_CHAR_FOUND;
195	break;
196	}
197	offsetNum += i;
198	}
199
200	donefornow:
201	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
202	{
203	/ End of target buffer /
204	*err = U_BUFFER_OVERFLOW_ERROR;
205	}
206
207	args->target = myTarget;
208	args->source = (const char *) mySource;
209	args->offsets = myOffsets;
210	}
211
212	static void U_CALLCONV
213	T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
214	UErrorCode * err)
215	{
216	const UChar *mySource = args->source;
217	unsigned char *myTarget;
218	const UChar *sourceLimit = args->sourceLimit;
219	const unsigned char targetLimit = (unsigned* char *) args->targetLimit;
220	UChar32 ch, ch2;
221	unsigned int indexToWrite;
222	unsigned char temp[sizeof(uint32_t)];
223
224	if(mySource >= sourceLimit) {
225	/ no input, nothing to do /
226	return;
227	}
228
229	/ write the BOM if necessary /
230	if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
231	static const char bom[]={ `0`, `0`, (char)`0xfeu`, (char)`0xffu` };
232	ucnv_fromUWriteBytes(args->converter,
233	bom, `4`,
234	&args->target, args->targetLimit,
235	&args->offsets, -`1`,
236	err);
237	args->converter->fromUnicodeStatus=`0`;
238	}
239
240	myTarget = (unsigned char *) args->target;
241	temp[`0`] = `0`;
242
243	if (args->converter->fromUChar32) {
244	ch = args->converter->fromUChar32;
245	args->converter->fromUChar32 = `0`;
246	goto lowsurogate;
247	}
248
249	while (mySource < sourceLimit && myTarget < targetLimit) {
250	ch = *(mySource++);
251
252	if (U_IS_SURROGATE(ch)) {
253	if (U_IS_LEAD(ch)) {
254	lowsurogate:
255	if (mySource < sourceLimit) {
256	ch2 = *mySource;
257	if (U_IS_TRAIL(ch2)) {
258	ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
259	mySource++;
260	}
261	else {
262	/ this is an unmatched trail code unit (2nd surrogate) /
263	/ callback(illegal) /
264	args->converter->fromUChar32 = ch;
265	*err = U_ILLEGAL_CHAR_FOUND;
266	break;
267	}
268	}
269	else {
270	/ ran out of source /
271	args->converter->fromUChar32 = ch;
272	if (args->flush) {
273	/ this is an unmatched trail code unit (2nd surrogate) /
274	/ callback(illegal) /
275	*err = U_ILLEGAL_CHAR_FOUND;
276	}
277	break;
278	}
279	}
280	else {
281	/ this is an unmatched trail code unit (2nd surrogate) /
282	/ callback(illegal) /
283	args->converter->fromUChar32 = ch;
284	*err = U_ILLEGAL_CHAR_FOUND;
285	break;
286	}
287	}
288
289	/ We cannot get any larger than 10FFFF because we are coming from UTF-16 /
290	temp[`1`] = (uint8_t) (ch >> `16` & `0x1F`);
291	temp[`2`] = (uint8_t) (ch >> `8`); / unsigned cast implicitly does (ch & FF) /
292	temp[`3`] = (uint8_t) (ch); / unsigned cast implicitly does (ch & FF) /
293
294	for (indexToWrite = `0`; indexToWrite <= sizeof(uint32_t) - `1`; indexToWrite++) {
295	if (myTarget < targetLimit) {
296	*(myTarget++) = temp[indexToWrite];
297	}
298	else {
299	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
300	*err = U_BUFFER_OVERFLOW_ERROR;
301	}
302	}
303	}
304
305	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
306	*err = U_BUFFER_OVERFLOW_ERROR;
307	}
308
309	args->target = (char *) myTarget;
310	args->source = mySource;
311	}
312
313	static void U_CALLCONV
314	T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
315	UErrorCode * err)
316	{
317	const UChar *mySource = args->source;
318	unsigned char *myTarget;
319	int32_t *myOffsets;
320	const UChar *sourceLimit = args->sourceLimit;
321	const unsigned char targetLimit = (unsigned* char *) args->targetLimit;
322	UChar32 ch, ch2;
323	int32_t offsetNum = `0`;
324	unsigned int indexToWrite;
325	unsigned char temp[sizeof(uint32_t)];
326
327	if(mySource >= sourceLimit) {
328	/ no input, nothing to do /
329	return;
330	}
331
332	/ write the BOM if necessary /
333	if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
334	static const char bom[]={ `0`, `0`, (char)`0xfeu`, (char)`0xffu` };
335	ucnv_fromUWriteBytes(args->converter,
336	bom, `4`,
337	&args->target, args->targetLimit,
338	&args->offsets, -`1`,
339	err);
340	args->converter->fromUnicodeStatus=`0`;
341	}
342
343	myTarget = (unsigned char *) args->target;
344	myOffsets = args->offsets;
345	temp[`0`] = `0`;
346
347	if (args->converter->fromUChar32) {
348	ch = args->converter->fromUChar32;
349	args->converter->fromUChar32 = `0`;
350	goto lowsurogate;
351	}
352
353	while (mySource < sourceLimit && myTarget < targetLimit) {
354	ch = *(mySource++);
355
356	if (U_IS_SURROGATE(ch)) {
357	if (U_IS_LEAD(ch)) {
358	lowsurogate:
359	if (mySource < sourceLimit) {
360	ch2 = *mySource;
361	if (U_IS_TRAIL(ch2)) {
362	ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
363	mySource++;
364	}
365	else {
366	/ this is an unmatched trail code unit (2nd surrogate) /
367	/ callback(illegal) /
368	args->converter->fromUChar32 = ch;
369	*err = U_ILLEGAL_CHAR_FOUND;
370	break;
371	}
372	}
373	else {
374	/ ran out of source /
375	args->converter->fromUChar32 = ch;
376	if (args->flush) {
377	/ this is an unmatched trail code unit (2nd surrogate) /
378	/ callback(illegal) /
379	*err = U_ILLEGAL_CHAR_FOUND;
380	}
381	break;
382	}
383	}
384	else {
385	/ this is an unmatched trail code unit (2nd surrogate) /
386	/ callback(illegal) /
387	args->converter->fromUChar32 = ch;
388	*err = U_ILLEGAL_CHAR_FOUND;
389	break;
390	}
391	}
392
393	/ We cannot get any larger than 10FFFF because we are coming from UTF-16 /
394	temp[`1`] = (uint8_t) (ch >> `16` & `0x1F`);
395	temp[`2`] = (uint8_t) (ch >> `8`); / unsigned cast implicitly does (ch & FF) /
396	temp[`3`] = (uint8_t) (ch); / unsigned cast implicitly does (ch & FF) /
397
398	for (indexToWrite = `0`; indexToWrite <= sizeof(uint32_t) - `1`; indexToWrite++) {
399	if (myTarget < targetLimit) {
400	*(myTarget++) = temp[indexToWrite];
401	*(myOffsets++) = offsetNum;
402	}
403	else {
404	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
405	*err = U_BUFFER_OVERFLOW_ERROR;
406	}
407	}
408	offsetNum = offsetNum + `1` + (temp[`1`] != `0`);
409	}
410
411	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
412	*err = U_BUFFER_OVERFLOW_ERROR;
413	}
414
415	args->target = (char *) myTarget;
416	args->source = mySource;
417	args->offsets = myOffsets;
418	}
419
420	static UChar32 U_CALLCONV
421	T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
422	UErrorCode* err)
423	{
424	const uint8_t *mySource;
425	UChar32 myUChar;
426	int32_t length;
427
428	mySource = (const uint8_t *)args->source;
429	if (mySource >= (const uint8_t *)args->sourceLimit)
430	{
431	/ no input /
432	*err = U_INDEX_OUTOFBOUNDS_ERROR;
433	return `0xffff`;
434	}
435
436	length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
437	if (length < `4`)
438	{
439	/ got a partial character /
440	uprv_memcpy(args->converter->toUBytes, mySource, length);
441	args->converter->toULength = (int8_t)length;
442	args->source = (const char *)(mySource + length);
443	*err = U_TRUNCATED_CHAR_FOUND;
444	return `0xffff`;
445	}
446
447	/ Don't even try to do a direct cast because the value may be on an odd address. /
448	myUChar = ((UChar32)mySource[`0`] << `24`)
449	\| ((UChar32)mySource[`1`] << `16`)
450	\| ((UChar32)mySource[`2`] << `8`)
451	\| ((UChar32)mySource[`3`]);
452
453	args->source = (const char *)(mySource + `4`);
454	if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
455	return myUChar;
456	}
457
458	uprv_memcpy(args->converter->toUBytes, mySource, `4`);
459	args->converter->toULength = `4`;
460
461	*err = U_ILLEGAL_CHAR_FOUND;
462	return `0xffff`;
463	}
464	U_CDECL_END
465	static const UConverterImpl _UTF32BEImpl = {
466	UCNV_UTF32_BigEndian,
467
468	NULL,
469	NULL,
470
471	NULL,
472	NULL,
473	NULL,
474
475	T_UConverter_toUnicode_UTF32_BE,
476	T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
477	T_UConverter_fromUnicode_UTF32_BE,
478	T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
479	T_UConverter_getNextUChar_UTF32_BE,
480
481	NULL,
482	NULL,
483	NULL,
484	NULL,
485	ucnv_getNonSurrogateUnicodeSet,
486
487	NULL,
488	NULL
489	};
490
491	/ The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 /
492	static const UConverterStaticData _UTF32BEStaticData = {
493	sizeof(UConverterStaticData),
494	"UTF-32BE",
495	`1232`,
496	UCNV_IBM, UCNV_UTF32_BigEndian, `4`, `4`,
497	{ `0`, `0`, `0xff`, `0xfd` }, `4`, FALSE, FALSE,
498	`0`,
499	`0`,
500	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
501	};
502
503	const UConverterSharedData _UTF32BEData =
504	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
505
506	/ UTF-32LE ---------------------------------------------------------- /
507	U_CDECL_BEGIN
508	static void U_CALLCONV
509	T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
510	UErrorCode * err)
511	{
512	const unsigned char mySource = (unsigned* char *) args->source;
513	UChar *myTarget = args->target;
514	const unsigned char sourceLimit = (unsigned* char *) args->sourceLimit;
515	const UChar *targetLimit = args->targetLimit;
516	unsigned char *toUBytes = args->converter->toUBytes;
517	uint32_t ch, i;
518
519	/ Restore state of current sequence /
520	if (args->converter->toULength > `0` && myTarget < targetLimit)
521	{
522	i = args->converter->toULength; / restore # of bytes consumed /
523	args->converter->toULength = `0`;
524
525	/ Stores the previously calculated ch from a previous call/
526	ch = args->converter->toUnicodeStatus - `1`;
527	args->converter->toUnicodeStatus = `0`;
528	goto morebytes;
529	}
530
531	while (mySource < sourceLimit && myTarget < targetLimit)
532	{
533	i = `0`;
534	ch = `0`;
535	morebytes:
536	while (i < sizeof(uint32_t))
537	{
538	if (mySource < sourceLimit)
539	{
540	ch \|= ((uint8_t)(mySource)) << (i `8`);
541	toUBytes[i++] = (char) *(mySource++);
542	}
543	else
544	{
545	/ stores a partially calculated target/
546	/ + 1 to make 0 a valid character /
547	args->converter->toUnicodeStatus = ch + `1`;
548	args->converter->toULength = (int8_t) i;
549	goto donefornow;
550	}
551	}
552
553	if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
554	/ Normal valid byte when the loop has not prematurely terminated (i < inBytes) /
555	if (ch <= MAXIMUM_UCS2) {
556	/ fits in 16 bits /
557	*(myTarget++) = (UChar) ch;
558	}
559	else {
560	/ write out the surrogates /
561	*(myTarget++) = U16_LEAD(ch);
562	ch = U16_TRAIL(ch);
563	if (myTarget < targetLimit) {
564	*(myTarget++) = (UChar)ch;
565	}
566	else {
567	/ Put in overflow buffer (not handled here) /
568	args->converter->UCharErrorBuffer[`0`] = (UChar) ch;
569	args->converter->UCharErrorBufferLength = `1`;
570	*err = U_BUFFER_OVERFLOW_ERROR;
571	break;
572	}
573	}
574	}
575	else {
576	args->converter->toULength = (int8_t)i;
577	*err = U_ILLEGAL_CHAR_FOUND;
578	break;
579	}
580	}
581
582	donefornow:
583	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
584	{
585	/ End of target buffer /
586	*err = U_BUFFER_OVERFLOW_ERROR;
587	}
588
589	args->target = myTarget;
590	args->source = (const char *) mySource;
591	}
592
593	static void U_CALLCONV
594	T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
595	UErrorCode * err)
596	{
597	const unsigned char mySource = (unsigned* char *) args->source;
598	UChar *myTarget = args->target;
599	int32_t *myOffsets = args->offsets;
600	const unsigned char sourceLimit = (unsigned* char *) args->sourceLimit;
601	const UChar *targetLimit = args->targetLimit;
602	unsigned char *toUBytes = args->converter->toUBytes;
603	uint32_t ch, i;
604	int32_t offsetNum = `0`;
605
606	/ Restore state of current sequence /
607	if (args->converter->toULength > `0` && myTarget < targetLimit)
608	{
609	i = args->converter->toULength; / restore # of bytes consumed /
610	args->converter->toULength = `0`;
611
612	/ Stores the previously calculated ch from a previous call/
613	ch = args->converter->toUnicodeStatus - `1`;
614	args->converter->toUnicodeStatus = `0`;
615	goto morebytes;
616	}
617
618	while (mySource < sourceLimit && myTarget < targetLimit)
619	{
620	i = `0`;
621	ch = `0`;
622	morebytes:
623	while (i < sizeof(uint32_t))
624	{
625	if (mySource < sourceLimit)
626	{
627	ch \|= ((uint8_t)(mySource)) << (i `8`);
628	toUBytes[i++] = (char) *(mySource++);
629	}
630	else
631	{
632	/ stores a partially calculated target/
633	/ + 1 to make 0 a valid character /
634	args->converter->toUnicodeStatus = ch + `1`;
635	args->converter->toULength = (int8_t) i;
636	goto donefornow;
637	}
638	}
639
640	if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
641	{
642	/ Normal valid byte when the loop has not prematurely terminated (i < inBytes) /
643	if (ch <= MAXIMUM_UCS2)
644	{
645	/ fits in 16 bits /
646	*(myTarget++) = (UChar) ch;
647	*(myOffsets++) = offsetNum;
648	}
649	else {
650	/ write out the surrogates /
651	*(myTarget++) = U16_LEAD(ch);
652	*(myOffsets++) = offsetNum;
653	ch = U16_TRAIL(ch);
654	if (myTarget < targetLimit)
655	{
656	*(myTarget++) = (UChar)ch;
657	*(myOffsets++) = offsetNum;
658	}
659	else
660	{
661	/ Put in overflow buffer (not handled here) /
662	args->converter->UCharErrorBuffer[`0`] = (UChar) ch;
663	args->converter->UCharErrorBufferLength = `1`;
664	*err = U_BUFFER_OVERFLOW_ERROR;
665	break;
666	}
667	}
668	}
669	else
670	{
671	args->converter->toULength = (int8_t)i;
672	*err = U_ILLEGAL_CHAR_FOUND;
673	break;
674	}
675	offsetNum += i;
676	}
677
678	donefornow:
679	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
680	{
681	/ End of target buffer /
682	*err = U_BUFFER_OVERFLOW_ERROR;
683	}
684
685	args->target = myTarget;
686	args->source = (const char *) mySource;
687	args->offsets = myOffsets;
688	}
689
690	static void U_CALLCONV
691	T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
692	UErrorCode * err)
693	{
694	const UChar *mySource = args->source;
695	unsigned char *myTarget;
696	const UChar *sourceLimit = args->sourceLimit;
697	const unsigned char targetLimit = (unsigned* char *) args->targetLimit;
698	UChar32 ch, ch2;
699	unsigned int indexToWrite;
700	unsigned char temp[sizeof(uint32_t)];
701
702	if(mySource >= sourceLimit) {
703	/ no input, nothing to do /
704	return;
705	}
706
707	/ write the BOM if necessary /
708	if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
709	static const char bom[]={ (char)`0xffu`, (char)`0xfeu`, `0`, `0` };
710	ucnv_fromUWriteBytes(args->converter,
711	bom, `4`,
712	&args->target, args->targetLimit,
713	&args->offsets, -`1`,
714	err);
715	args->converter->fromUnicodeStatus=`0`;
716	}
717
718	myTarget = (unsigned char *) args->target;
719	temp[`3`] = `0`;
720
721	if (args->converter->fromUChar32)
722	{
723	ch = args->converter->fromUChar32;
724	args->converter->fromUChar32 = `0`;
725	goto lowsurogate;
726	}
727
728	while (mySource < sourceLimit && myTarget < targetLimit)
729	{
730	ch = *(mySource++);
731
732	if (U16_IS_SURROGATE(ch)) {
733	if (U16_IS_LEAD(ch))
734	{
735	lowsurogate:
736	if (mySource < sourceLimit)
737	{
738	ch2 = *mySource;
739	if (U16_IS_TRAIL(ch2)) {
740	ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
741	mySource++;
742	}
743	else {
744	/ this is an unmatched trail code unit (2nd surrogate) /
745	/ callback(illegal) /
746	args->converter->fromUChar32 = ch;
747	*err = U_ILLEGAL_CHAR_FOUND;
748	break;
749	}
750	}
751	else {
752	/ ran out of source /
753	args->converter->fromUChar32 = ch;
754	if (args->flush) {
755	/ this is an unmatched trail code unit (2nd surrogate) /
756	/ callback(illegal) /
757	*err = U_ILLEGAL_CHAR_FOUND;
758	}
759	break;
760	}
761	}
762	else {
763	/ this is an unmatched trail code unit (2nd surrogate) /
764	/ callback(illegal) /
765	args->converter->fromUChar32 = ch;
766	*err = U_ILLEGAL_CHAR_FOUND;
767	break;
768	}
769	}
770
771	/ We cannot get any larger than 10FFFF because we are coming from UTF-16 /
772	temp[`2`] = (uint8_t) (ch >> `16` & `0x1F`);
773	temp[`1`] = (uint8_t) (ch >> `8`); / unsigned cast implicitly does (ch & FF) /
774	temp[`0`] = (uint8_t) (ch); / unsigned cast implicitly does (ch & FF) /
775
776	for (indexToWrite = `0`; indexToWrite <= sizeof(uint32_t) - `1`; indexToWrite++)
777	{
778	if (myTarget < targetLimit)
779	{
780	*(myTarget++) = temp[indexToWrite];
781	}
782	else
783	{
784	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
785	*err = U_BUFFER_OVERFLOW_ERROR;
786	}
787	}
788	}
789
790	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
791	{
792	*err = U_BUFFER_OVERFLOW_ERROR;
793	}
794
795	args->target = (char *) myTarget;
796	args->source = mySource;
797	}
798
799	static void U_CALLCONV
800	T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
801	UErrorCode * err)
802	{
803	const UChar *mySource = args->source;
804	unsigned char *myTarget;
805	int32_t *myOffsets;
806	const UChar *sourceLimit = args->sourceLimit;
807	const unsigned char targetLimit = (unsigned* char *) args->targetLimit;
808	UChar32 ch, ch2;
809	unsigned int indexToWrite;
810	unsigned char temp[sizeof(uint32_t)];
811	int32_t offsetNum = `0`;
812
813	if(mySource >= sourceLimit) {
814	/ no input, nothing to do /
815	return;
816	}
817
818	/ write the BOM if necessary /
819	if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
820	static const char bom[]={ (char)`0xffu`, (char)`0xfeu`, `0`, `0` };
821	ucnv_fromUWriteBytes(args->converter,
822	bom, `4`,
823	&args->target, args->targetLimit,
824	&args->offsets, -`1`,
825	err);
826	args->converter->fromUnicodeStatus=`0`;
827	}
828
829	myTarget = (unsigned char *) args->target;
830	myOffsets = args->offsets;
831	temp[`3`] = `0`;
832
833	if (args->converter->fromUChar32)
834	{
835	ch = args->converter->fromUChar32;
836	args->converter->fromUChar32 = `0`;
837	goto lowsurogate;
838	}
839
840	while (mySource < sourceLimit && myTarget < targetLimit)
841	{
842	ch = *(mySource++);
843
844	if (U16_IS_SURROGATE(ch)) {
845	if (U16_IS_LEAD(ch))
846	{
847	lowsurogate:
848	if (mySource < sourceLimit)
849	{
850	ch2 = *mySource;
851	if (U16_IS_TRAIL(ch2))
852	{
853	ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
854	mySource++;
855	}
856	else {
857	/ this is an unmatched trail code unit (2nd surrogate) /
858	/ callback(illegal) /
859	args->converter->fromUChar32 = ch;
860	*err = U_ILLEGAL_CHAR_FOUND;
861	break;
862	}
863	}
864	else {
865	/ ran out of source /
866	args->converter->fromUChar32 = ch;
867	if (args->flush) {
868	/ this is an unmatched trail code unit (2nd surrogate) /
869	/ callback(illegal) /
870	*err = U_ILLEGAL_CHAR_FOUND;
871	}
872	break;
873	}
874	}
875	else {
876	/ this is an unmatched trail code unit (2nd surrogate) /
877	/ callback(illegal) /
878	args->converter->fromUChar32 = ch;
879	*err = U_ILLEGAL_CHAR_FOUND;
880	break;
881	}
882	}
883
884	/ We cannot get any larger than 10FFFF because we are coming from UTF-16 /
885	temp[`2`] = (uint8_t) (ch >> `16` & `0x1F`);
886	temp[`1`] = (uint8_t) (ch >> `8`); / unsigned cast implicitly does (ch & FF) /
887	temp[`0`] = (uint8_t) (ch); / unsigned cast implicitly does (ch & FF) /
888
889	for (indexToWrite = `0`; indexToWrite <= sizeof(uint32_t) - `1`; indexToWrite++)
890	{
891	if (myTarget < targetLimit)
892	{
893	*(myTarget++) = temp[indexToWrite];
894	*(myOffsets++) = offsetNum;
895	}
896	else
897	{
898	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
899	*err = U_BUFFER_OVERFLOW_ERROR;
900	}
901	}
902	offsetNum = offsetNum + `1` + (temp[`2`] != `0`);
903	}
904
905	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
906	{
907	*err = U_BUFFER_OVERFLOW_ERROR;
908	}
909
910	args->target = (char *) myTarget;
911	args->source = mySource;
912	args->offsets = myOffsets;
913	}
914
915	static UChar32 U_CALLCONV
916	T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
917	UErrorCode* err)
918	{
919	const uint8_t *mySource;
920	UChar32 myUChar;
921	int32_t length;
922
923	mySource = (const uint8_t *)args->source;
924	if (mySource >= (const uint8_t *)args->sourceLimit)
925	{
926	/ no input /
927	*err = U_INDEX_OUTOFBOUNDS_ERROR;
928	return `0xffff`;
929	}
930
931	length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
932	if (length < `4`)
933	{
934	/ got a partial character /
935	uprv_memcpy(args->converter->toUBytes, mySource, length);
936	args->converter->toULength = (int8_t)length;
937	args->source = (const char *)(mySource + length);
938	*err = U_TRUNCATED_CHAR_FOUND;
939	return `0xffff`;
940	}
941
942	/ Don't even try to do a direct cast because the value may be on an odd address. /
943	myUChar = ((UChar32)mySource[`3`] << `24`)
944	\| ((UChar32)mySource[`2`] << `16`)
945	\| ((UChar32)mySource[`1`] << `8`)
946	\| ((UChar32)mySource[`0`]);
947
948	args->source = (const char *)(mySource + `4`);
949	if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
950	return myUChar;
951	}
952
953	uprv_memcpy(args->converter->toUBytes, mySource, `4`);
954	args->converter->toULength = `4`;
955
956	*err = U_ILLEGAL_CHAR_FOUND;
957	return `0xffff`;
958	}
959	U_CDECL_END
960	static const UConverterImpl _UTF32LEImpl = {
961	UCNV_UTF32_LittleEndian,
962
963	NULL,
964	NULL,
965
966	NULL,
967	NULL,
968	NULL,
969
970	T_UConverter_toUnicode_UTF32_LE,
971	T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
972	T_UConverter_fromUnicode_UTF32_LE,
973	T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
974	T_UConverter_getNextUChar_UTF32_LE,
975
976	NULL,
977	NULL,
978	NULL,
979	NULL,
980	ucnv_getNonSurrogateUnicodeSet,
981
982	NULL,
983	NULL
984	};
985
986	/ The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 /
987	static const UConverterStaticData _UTF32LEStaticData = {
988	sizeof(UConverterStaticData),
989	"UTF-32LE",
990	`1234`,
991	UCNV_IBM, UCNV_UTF32_LittleEndian, `4`, `4`,
992	{ `0xfd`, `0xff`, `0`, `0` }, `4`, FALSE, FALSE,
993	`0`,
994	`0`,
995	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
996	};
997
998
999	const UConverterSharedData _UTF32LEData =
1000	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
1001
/* UTF-32 (Detect BOM) ------------------------------------------------------ */

/*
 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 * accordingly.
 *
 * State values:
 * 0    initial state
 * 1    saw 00
 * 2    saw 00 00
 * 3    saw 00 00 FE
 * 4    -
 * 5    saw FF
 * 6    saw FF FE
 * 7    saw FF FE 00
 * 8    UTF-32BE mode
 * 9    UTF-32LE mode
 *
 * During detection: state&3==number of matching bytes so far.
 *
 * On output, emit U+FEFF as the first code point.
 */
1024	U_CDECL_BEGIN
1025	static void U_CALLCONV
1026	_UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1027	if(choice<=UCNV_RESET_TO_UNICODE) {
1028	/ reset toUnicode: state=0 /
1029	cnv->mode=`0`;
1030	}
1031	if(choice!=UCNV_RESET_TO_UNICODE) {
1032	/ reset fromUnicode: prepare to output the UTF-32PE BOM /
1033	cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1034	}
1035	}
1036
1037	static void U_CALLCONV
1038	_UTF32Open(UConverter *cnv,
1039	UConverterLoadArgs *pArgs,
1040	UErrorCode *pErrorCode) {
1041	(void)pArgs;
1042	(void)pErrorCode;
1043	_UTF32Reset(cnv, UCNV_RESET_BOTH);
1044	}
1045
/*
 * Both BOM byte sequences back to back: bytes 0..3 are the UTF-32BE BOM
 * (00 00 FE FF) and bytes 4..7 are the UTF-32LE BOM (FF FE 00 00).
 * Indexed directly by the detection state (1..3 and 5..7).
 */
static const char utf32BOM[8]={ 0, 0, (char)0xfeu, (char)0xffu,    (char)0xffu, (char)0xfeu, 0, 0 };
1047
1048	static void U_CALLCONV
1049	_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1050	UErrorCode *pErrorCode) {
1051	UConverter *cnv=pArgs->converter;
1052	const char *source=pArgs->source;
1053	const char *sourceLimit=pArgs->sourceLimit;
1054	int32_t *offsets=pArgs->offsets;
1055
1056	int32_t state, offsetDelta;
1057	char b;
1058
1059	state=cnv->mode;
1060
1061	/*
1062	* If we detect a BOM in this buffer, then we must add the BOM size to the
1063	* offsets because the actual converter function will not see and count the BOM.
1064	* offsetDelta will have the number of the BOM bytes that are in the current buffer.
1065	*/
1066	offsetDelta=`0`;
1067
1068	while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1069	switch(state) {
1070	case `0`:
1071	b=*source;
1072	if(b==`0`) {
1073	state=`1`; / could be 00 00 FE FF /
1074	} else if(b==(char)`0xffu`) {
1075	state=`5`; / could be FF FE 00 00 /
1076	} else {
1077	state=`8`; / default to UTF-32BE /
1078	continue;
1079	}
1080	++source;
1081	break;
1082	case `1`:
1083	case `2`:
1084	case `3`:
1085	case `5`:
1086	case `6`:
1087	case `7`:
1088	if(*source==utf32BOM[state]) {
1089	++state;
1090	++source;
1091	if(state==`4`) {
1092	state=`8`; / detect UTF-32BE /
1093	offsetDelta=(int32_t)(source-pArgs->source);
1094	} else if(state==`8`) {
1095	state=`9`; / detect UTF-32LE /
1096	offsetDelta=(int32_t)(source-pArgs->source);
1097	}
1098	} else {
1099	/ switch to UTF-32BE and pass the previous bytes /
1100	int32_t count=(int32_t)(source-pArgs->source); / number of bytes from this buffer /
1101
1102	/ reset the source /
1103	source=pArgs->source;
1104
1105	if(count==(state&`3`)) {
1106	/ simple: all in the same buffer, just reset source /
1107	} else {
1108	UBool oldFlush=pArgs->flush;
1109
1110	/ some of the bytes are from a previous buffer, replay those first /
1111	pArgs->source=utf32BOM+(state&`4`); / select the correct BOM /
1112	pArgs->sourceLimit=pArgs->source+((state&`3`)-count); / replay previous bytes /
1113	pArgs->flush=FALSE; / this sourceLimit is not the real source stream limit /
1114
1115	/ no offsets: bytes from previous buffer, and not enough for output /
1116	T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1117
1118	/ restore real pointers; pArgs->source will be set in case 8/9 /
1119	pArgs->sourceLimit=sourceLimit;
1120	pArgs->flush=oldFlush;
1121	}
1122	state=`8`;
1123	continue;
1124	}
1125	break;
1126	case `8`:
1127	/ call UTF-32BE /
1128	pArgs->source=source;
1129	if(offsets==NULL) {
1130	T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1131	} else {
1132	T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1133	}
1134	source=pArgs->source;
1135	break;
1136	case `9`:
1137	/ call UTF-32LE /
1138	pArgs->source=source;
1139	if(offsets==NULL) {
1140	T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1141	} else {
1142	T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1143	}
1144	source=pArgs->source;
1145	break;
1146	default:
1147	break; / does not occur /
1148	}
1149	}
1150
1151	/ add BOM size to offsets - see comment at offsetDelta declaration /
1152	if(offsets!=NULL && offsetDelta!=`0`) {
1153	int32_t *offsetsLimit=pArgs->offsets;
1154	while(offsets<offsetsLimit) {
1155	*offsets++ += offsetDelta;
1156	}
1157	}
1158
1159	pArgs->source=source;
1160
1161	if(source==sourceLimit && pArgs->flush) {
1162	/ handle truncated input /
1163	switch(state) {
1164	case `0`:
1165	break; / no input at all, nothing to do /
1166	case `8`:
1167	T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1168	break;
1169	case `9`:
1170	T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1171	break;
1172	default:
1173	/ handle 0<state<8: call UTF-32BE with too-short input /
1174	pArgs->source=utf32BOM+(state&`4`); / select the correct BOM /
1175	pArgs->sourceLimit=pArgs->source+(state&`3`); / replay bytes /
1176
1177	/ no offsets: not enough for output /
1178	T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1179	pArgs->source=source;
1180	pArgs->sourceLimit=sourceLimit;
1181	state=`8`;
1182	break;
1183	}
1184	}
1185
1186	cnv->mode=state;
1187	}
1188
1189	static UChar32 U_CALLCONV
1190	_UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1191	UErrorCode *pErrorCode) {
1192	switch(pArgs->converter->mode) {
1193	case `8`:
1194	return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1195	case `9`:
1196	return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1197	default:
1198	return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1199	}
1200	}
1201	U_CDECL_END
1202	static const UConverterImpl _UTF32Impl = {
1203	UCNV_UTF32,
1204
1205	NULL,
1206	NULL,
1207
1208	_UTF32Open,
1209	NULL,
1210	_UTF32Reset,
1211
1212	_UTF32ToUnicodeWithOffsets,
1213	_UTF32ToUnicodeWithOffsets,
1214	#if U_IS_BIG_ENDIAN
1215	T_UConverter_fromUnicode_UTF32_BE,
1216	T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1217	#else
1218	T_UConverter_fromUnicode_UTF32_LE,
1219	T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1220	#endif
1221	_UTF32GetNextUChar,
1222
1223	NULL, / ### TODO implement getStarters for all Unicode encodings?! /
1224	NULL,
1225	NULL,
1226	NULL,
1227	ucnv_getNonSurrogateUnicodeSet,
1228
1229	NULL,
1230	NULL
1231	};
1232
1233	/ The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 /
1234	static const UConverterStaticData _UTF32StaticData = {
1235	sizeof(UConverterStaticData),
1236	"UTF-32",
1237	`1236`,
1238	UCNV_IBM, UCNV_UTF32, `4`, `4`,
1239	#if U_IS_BIG_ENDIAN
1240	{ `0`, `0`, `0xff`, `0xfd` }, `4`,
1241	#else
1242	{ `0xfd`, `0xff`, `0`, `0` }, `4`,
1243	#endif
1244	FALSE, FALSE,
1245	`0`,
1246	`0`,
1247	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
1248	};
1249
1250	const UConverterSharedData _UTF32Data =
1251	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1252
1253	#endif
1254

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnv_u32.cpp