ucnv_u8.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnv_u8.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2002-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* file name: ucnv_u8.c
9	* encoding: UTF-8
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 2002jul01
14	* created by: Markus W. Scherer
15	*
16	* UTF-8 converter implementation. Used to be in ucnv_utf.c.
17	*
18	* Also, CESU-8 implementation, see UTR 26.
19	* The CESU-8 converter uses all the same functions as the
20	* UTF-8 converter, with a branch for converting supplementary code points.
21	*/
22
23	#include "unicode/utypes.h"
24
25	#if !UCONFIG_NO_CONVERSION
26
27	#include "unicode/ucnv.h"
28	#include "unicode/utf.h"
29	#include "unicode/utf8.h"
30	#include "unicode/utf16.h"
31	#include "uassert.h"
32	#include "ucnv_bld.h"
33	#include "ucnv_cnv.h"
34	#include "cmemory.h"
35	#include "ustr_imp.h"
36
37	/ Prototypes --------------------------------------------------------------- /
38
39	/ Keep these here to make finicky compilers happy /
40
41	U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
42	UErrorCode *err);
43	U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
44	UErrorCode *err);
45
46
/* UTF-8 -------------------------------------------------------------------- */

/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF

/*
 * Value to subtract from the raw accumulation ch = (ch << 6) + byte over a
 * whole sequence, indexed by the sequence length in bytes (1..4; index 0 is
 * unused). It removes the lead-byte marker bits and the 0x80 marker of every
 * trail byte, e.g. for two bytes: (0xC0 << 6) + 0x80 == 0x3080.
 */
static const uint32_t offsetsFromUTF8[5] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080
};
55
56	static UBool hasCESU8Data(const UConverter *cnv)
57	{
58	#if UCONFIG_ONLY_HTML_CONVERSION
59	return FALSE;
60	#else
61	return (UBool)(cnv->sharedData == &_CESU8Data);
62	#endif
63	}
64	U_CDECL_BEGIN
65	static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
66	UErrorCode * err)
67	{
68	UConverter *cnv = args->converter;
69	const unsigned char mySource = (unsigned* char *) args->source;
70	UChar *myTarget = args->target;
71	const unsigned char sourceLimit = (unsigned* char *) args->sourceLimit;
72	const UChar *targetLimit = args->targetLimit;
73	unsigned char *toUBytes = cnv->toUBytes;
74	UBool isCESU8 = hasCESU8Data(cnv);
75	uint32_t ch, ch2 = `0`;
76	int32_t i, inBytes;
77
78	/ Restore size of current sequence /
79	if (cnv->toULength > `0` && myTarget < targetLimit)
80	{
81	inBytes = cnv->mode; / restore # of bytes to consume /
82	i = cnv->toULength; / restore # of bytes consumed /
83	cnv->toULength = `0`;
84
85	ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a previous call/
86	cnv->toUnicodeStatus = `0`;
87	goto morebytes;
88	}
89
90
91	while (mySource < sourceLimit && myTarget < targetLimit)
92	{
93	ch = *(mySource++);
94	if (U8_IS_SINGLE(ch)) / Simple case /
95	{
96	*(myTarget++) = (UChar) ch;
97	}
98	else
99	{
100	/ store the first char /
101	toUBytes[`0`] = (char)ch;
102	inBytes = U8_COUNT_BYTES_NON_ASCII(ch); / lookup current sequence length /
103	i = `1`;
104
105	morebytes:
106	while (i < inBytes)
107	{
108	if (mySource < sourceLimit)
109	{
110	toUBytes[i] = (char) (ch2 = *mySource);
111	if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
112	!(isCESU8 && i == `1` && ch == `0xed` && U8_IS_TRAIL(ch2)))
113	{
114	break; / i < inBytes /
115	}
116	ch = (ch << `6`) + ch2;
117	++mySource;
118	i++;
119	}
120	else
121	{
122	/ stores a partially calculated target/
123	cnv->toUnicodeStatus = ch;
124	cnv->mode = inBytes;
125	cnv->toULength = (int8_t) i;
126	goto donefornow;
127	}
128	}
129
130	// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
131	if (i == inBytes && (!isCESU8 \|\| i <= `3`))
132	{
133	/ Remove the accumulated high bits /
134	ch -= offsetsFromUTF8[inBytes];
135
136	/ Normal valid byte when the loop has not prematurely terminated (i < inBytes) /
137	if (ch <= MAXIMUM_UCS2)
138	{
139	/ fits in 16 bits /
140	*(myTarget++) = (UChar) ch;
141	}
142	else
143	{
144	/ write out the surrogates /
145	*(myTarget++) = U16_LEAD(ch);
146	ch = U16_TRAIL(ch);
147	if (myTarget < targetLimit)
148	{
149	*(myTarget++) = (UChar)ch;
150	}
151	else
152	{
153	/ Put in overflow buffer (not handled here) /
154	cnv->UCharErrorBuffer[`0`] = (UChar) ch;
155	cnv->UCharErrorBufferLength = `1`;
156	*err = U_BUFFER_OVERFLOW_ERROR;
157	break;
158	}
159	}
160	}
161	else
162	{
163	cnv->toULength = (int8_t)i;
164	*err = U_ILLEGAL_CHAR_FOUND;
165	break;
166	}
167	}
168	}
169
170	donefornow:
171	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
172	{
173	/ End of target buffer /
174	*err = U_BUFFER_OVERFLOW_ERROR;
175	}
176
177	args->target = myTarget;
178	args->source = (const char *) mySource;
179	}
180
181	static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
182	UErrorCode * err)
183	{
184	UConverter *cnv = args->converter;
185	const unsigned char mySource = (unsigned* char *) args->source;
186	UChar *myTarget = args->target;
187	int32_t *myOffsets = args->offsets;
188	int32_t offsetNum = `0`;
189	const unsigned char sourceLimit = (unsigned* char *) args->sourceLimit;
190	const UChar *targetLimit = args->targetLimit;
191	unsigned char *toUBytes = cnv->toUBytes;
192	UBool isCESU8 = hasCESU8Data(cnv);
193	uint32_t ch, ch2 = `0`;
194	int32_t i, inBytes;
195
196	/ Restore size of current sequence /
197	if (cnv->toULength > `0` && myTarget < targetLimit)
198	{
199	inBytes = cnv->mode; / restore # of bytes to consume /
200	i = cnv->toULength; / restore # of bytes consumed /
201	cnv->toULength = `0`;
202
203	ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a previous call/
204	cnv->toUnicodeStatus = `0`;
205	goto morebytes;
206	}
207
208	while (mySource < sourceLimit && myTarget < targetLimit)
209	{
210	ch = *(mySource++);
211	if (U8_IS_SINGLE(ch)) / Simple case /
212	{
213	*(myTarget++) = (UChar) ch;
214	*(myOffsets++) = offsetNum++;
215	}
216	else
217	{
218	toUBytes[`0`] = (char)ch;
219	inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
220	i = `1`;
221
222	morebytes:
223	while (i < inBytes)
224	{
225	if (mySource < sourceLimit)
226	{
227	toUBytes[i] = (char) (ch2 = *mySource);
228	if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
229	!(isCESU8 && i == `1` && ch == `0xed` && U8_IS_TRAIL(ch2)))
230	{
231	break; / i < inBytes /
232	}
233	ch = (ch << `6`) + ch2;
234	++mySource;
235	i++;
236	}
237	else
238	{
239	cnv->toUnicodeStatus = ch;
240	cnv->mode = inBytes;
241	cnv->toULength = (int8_t)i;
242	goto donefornow;
243	}
244	}
245
246	// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247	if (i == inBytes && (!isCESU8 \|\| i <= `3`))
248	{
249	/ Remove the accumulated high bits /
250	ch -= offsetsFromUTF8[inBytes];
251
252	/ Normal valid byte when the loop has not prematurely terminated (i < inBytes) /
253	if (ch <= MAXIMUM_UCS2)
254	{
255	/ fits in 16 bits /
256	*(myTarget++) = (UChar) ch;
257	*(myOffsets++) = offsetNum;
258	}
259	else
260	{
261	/ write out the surrogates /
262	*(myTarget++) = U16_LEAD(ch);
263	*(myOffsets++) = offsetNum;
264	ch = U16_TRAIL(ch);
265	if (myTarget < targetLimit)
266	{
267	*(myTarget++) = (UChar)ch;
268	*(myOffsets++) = offsetNum;
269	}
270	else
271	{
272	cnv->UCharErrorBuffer[`0`] = (UChar) ch;
273	cnv->UCharErrorBufferLength = `1`;
274	*err = U_BUFFER_OVERFLOW_ERROR;
275	}
276	}
277	offsetNum += i;
278	}
279	else
280	{
281	cnv->toULength = (int8_t)i;
282	*err = U_ILLEGAL_CHAR_FOUND;
283	break;
284	}
285	}
286	}
287
288	donefornow:
289	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
290	{ / End of target buffer /
291	*err = U_BUFFER_OVERFLOW_ERROR;
292	}
293
294	args->target = myTarget;
295	args->source = (const char *) mySource;
296	args->offsets = myOffsets;
297	}
298	U_CDECL_END
299
300	U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
301	UErrorCode * err)
302	{
303	UConverter *cnv = args->converter;
304	const UChar *mySource = args->source;
305	const UChar *sourceLimit = args->sourceLimit;
306	uint8_t myTarget = (uint8_t ) args->target;
307	const uint8_t targetLimit = (uint8_t ) args->targetLimit;
308	uint8_t *tempPtr;
309	UChar32 ch;
310	uint8_t tempBuf[`4`];
311	int32_t indexToWrite;
312	UBool isNotCESU8 = !hasCESU8Data(cnv);
313
314	if (cnv->fromUChar32 && myTarget < targetLimit)
315	{
316	ch = cnv->fromUChar32;
317	cnv->fromUChar32 = `0`;
318	goto lowsurrogate;
319	}
320
321	while (mySource < sourceLimit && myTarget < targetLimit)
322	{
323	ch = *(mySource++);
324
325	if (ch < `0x80`) / Single byte /
326	{
327	*(myTarget++) = (uint8_t) ch;
328	}
329	else if (ch < `0x800`) / Double byte /
330	{
331	*(myTarget++) = (uint8_t) ((ch >> `6`) \| `0xc0`);
332	if (myTarget < targetLimit)
333	{
334	*(myTarget++) = (uint8_t) ((ch & `0x3f`) \| `0x80`);
335	}
336	else
337	{
338	cnv->charErrorBuffer[`0`] = (uint8_t) ((ch & `0x3f`) \| `0x80`);
339	cnv->charErrorBufferLength = `1`;
340	*err = U_BUFFER_OVERFLOW_ERROR;
341	}
342	}
343	else {
344	/ Check for surrogates /
345	if(U16_IS_SURROGATE(ch) && isNotCESU8) {
346	lowsurrogate:
347	if (mySource < sourceLimit) {
348	/ test both code units /
349	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
350	/ convert and consume this supplementary code point /
351	ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
352	++mySource;
353	/ exit this condition tree /
354	}
355	else {
356	/ this is an unpaired trail or lead code unit /
357	/ callback(illegal) /
358	cnv->fromUChar32 = ch;
359	*err = U_ILLEGAL_CHAR_FOUND;
360	break;
361	}
362	}
363	else {
364	/ no more input /
365	cnv->fromUChar32 = ch;
366	break;
367	}
368	}
369
370	/ Do we write the buffer directly for speed,*
371	or do we have to be careful about target buffer space? /*
372	tempPtr = (((targetLimit - myTarget) >= `4`) ? myTarget : tempBuf);
373
374	if (ch <= MAXIMUM_UCS2) {
375	indexToWrite = `2`;
376	tempPtr[`0`] = (uint8_t) ((ch >> `12`) \| `0xe0`);
377	}
378	else {
379	indexToWrite = `3`;
380	tempPtr[`0`] = (uint8_t) ((ch >> `18`) \| `0xf0`);
381	tempPtr[`1`] = (uint8_t) (((ch >> `12`) & `0x3f`) \| `0x80`);
382	}
383	tempPtr[indexToWrite-`1`] = (uint8_t) (((ch >> `6`) & `0x3f`) \| `0x80`);
384	tempPtr[indexToWrite] = (uint8_t) ((ch & `0x3f`) \| `0x80`);
385
386	if (tempPtr == myTarget) {
387	/ There was enough space to write the codepoint directly. /
388	myTarget += (indexToWrite + `1`);
389	}
390	else {
391	/ We might run out of room soon. Write it slowly. /
392	for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
393	if (myTarget < targetLimit) {
394	(myTarget++) = tempPtr;
395	}
396	else {
397	cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
398	*err = U_BUFFER_OVERFLOW_ERROR;
399	}
400	}
401	}
402	}
403	}
404
405	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
406	{
407	*err = U_BUFFER_OVERFLOW_ERROR;
408	}
409
410	args->target = (char *) myTarget;
411	args->source = mySource;
412	}
413
414	U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
415	UErrorCode * err)
416	{
417	UConverter *cnv = args->converter;
418	const UChar *mySource = args->source;
419	int32_t *myOffsets = args->offsets;
420	const UChar *sourceLimit = args->sourceLimit;
421	uint8_t myTarget = (uint8_t ) args->target;
422	const uint8_t targetLimit = (uint8_t ) args->targetLimit;
423	uint8_t *tempPtr;
424	UChar32 ch;
425	int32_t offsetNum, nextSourceIndex;
426	int32_t indexToWrite;
427	uint8_t tempBuf[`4`];
428	UBool isNotCESU8 = !hasCESU8Data(cnv);
429
430	if (cnv->fromUChar32 && myTarget < targetLimit)
431	{
432	ch = cnv->fromUChar32;
433	cnv->fromUChar32 = `0`;
434	offsetNum = -`1`;
435	nextSourceIndex = `0`;
436	goto lowsurrogate;
437	} else {
438	offsetNum = `0`;
439	}
440
441	while (mySource < sourceLimit && myTarget < targetLimit)
442	{
443	ch = *(mySource++);
444
445	if (ch < `0x80`) / Single byte /
446	{
447	*(myOffsets++) = offsetNum++;
448	(myTarget++) = (char*) ch;
449	}
450	else if (ch < `0x800`) / Double byte /
451	{
452	*(myOffsets++) = offsetNum;
453	*(myTarget++) = (uint8_t) ((ch >> `6`) \| `0xc0`);
454	if (myTarget < targetLimit)
455	{
456	*(myOffsets++) = offsetNum++;
457	*(myTarget++) = (uint8_t) ((ch & `0x3f`) \| `0x80`);
458	}
459	else
460	{
461	cnv->charErrorBuffer[`0`] = (uint8_t) ((ch & `0x3f`) \| `0x80`);
462	cnv->charErrorBufferLength = `1`;
463	*err = U_BUFFER_OVERFLOW_ERROR;
464	}
465	}
466	else
467	/ Check for surrogates /
468	{
469	nextSourceIndex = offsetNum + `1`;
470
471	if(U16_IS_SURROGATE(ch) && isNotCESU8) {
472	lowsurrogate:
473	if (mySource < sourceLimit) {
474	/ test both code units /
475	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
476	/ convert and consume this supplementary code point /
477	ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
478	++mySource;
479	++nextSourceIndex;
480	/ exit this condition tree /
481	}
482	else {
483	/ this is an unpaired trail or lead code unit /
484	/ callback(illegal) /
485	cnv->fromUChar32 = ch;
486	*err = U_ILLEGAL_CHAR_FOUND;
487	break;
488	}
489	}
490	else {
491	/ no more input /
492	cnv->fromUChar32 = ch;
493	break;
494	}
495	}
496
497	/ Do we write the buffer directly for speed,*
498	or do we have to be careful about target buffer space? /*
499	tempPtr = (((targetLimit - myTarget) >= `4`) ? myTarget : tempBuf);
500
501	if (ch <= MAXIMUM_UCS2) {
502	indexToWrite = `2`;
503	tempPtr[`0`] = (uint8_t) ((ch >> `12`) \| `0xe0`);
504	}
505	else {
506	indexToWrite = `3`;
507	tempPtr[`0`] = (uint8_t) ((ch >> `18`) \| `0xf0`);
508	tempPtr[`1`] = (uint8_t) (((ch >> `12`) & `0x3f`) \| `0x80`);
509	}
510	tempPtr[indexToWrite-`1`] = (uint8_t) (((ch >> `6`) & `0x3f`) \| `0x80`);
511	tempPtr[indexToWrite] = (uint8_t) ((ch & `0x3f`) \| `0x80`);
512
513	if (tempPtr == myTarget) {
514	/ There was enough space to write the codepoint directly. /
515	myTarget += (indexToWrite + `1`);
516	myOffsets[`0`] = offsetNum;
517	myOffsets[`1`] = offsetNum;
518	myOffsets[`2`] = offsetNum;
519	if (indexToWrite >= `3`) {
520	myOffsets[`3`] = offsetNum;
521	}
522	myOffsets += (indexToWrite + `1`);
523	}
524	else {
525	/ We might run out of room soon. Write it slowly. /
526	for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
527	if (myTarget < targetLimit)
528	{
529	*(myOffsets++) = offsetNum;
530	(myTarget++) = tempPtr;
531	}
532	else
533	{
534	cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
535	*err = U_BUFFER_OVERFLOW_ERROR;
536	}
537	}
538	}
539	offsetNum = nextSourceIndex;
540	}
541	}
542
543	if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
544	{
545	*err = U_BUFFER_OVERFLOW_ERROR;
546	}
547
548	args->target = (char *) myTarget;
549	args->source = mySource;
550	args->offsets = myOffsets;
551	}
552
553	U_CDECL_BEGIN
554	static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
555	UErrorCode *err) {
556	UConverter *cnv;
557	const uint8_t *sourceInitial;
558	const uint8_t *source;
559	uint8_t myByte;
560	UChar32 ch;
561	int8_t i;
562
563	/ UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs /
564
565	cnv = args->converter;
566	sourceInitial = source = (const uint8_t *)args->source;
567	if (source >= (const uint8_t *)args->sourceLimit)
568	{
569	/ no input /
570	*err = U_INDEX_OUTOFBOUNDS_ERROR;
571	return `0xffff`;
572	}
573
574	myByte = (uint8_t)*(source++);
575	if (U8_IS_SINGLE(myByte))
576	{
577	args->source = (const char *)source;
578	return (UChar32)myByte;
579	}
580
581	uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
582	if (countTrailBytes == `0`) {
583	cnv->toUBytes[`0`] = myByte;
584	cnv->toULength = `1`;
585	*err = U_ILLEGAL_CHAR_FOUND;
586	args->source = (const char *)source;
587	return `0xffff`;
588	}
589
590	/The byte sequence is longer than the buffer area passed/
591	if (((const char *)source + countTrailBytes) > args->sourceLimit)
592	{
593	/ check if all of the remaining bytes are trail bytes /
594	uint16_t extraBytesToWrite = countTrailBytes + `1`;
595	cnv->toUBytes[`0`] = myByte;
596	i = `1`;
597	*err = U_TRUNCATED_CHAR_FOUND;
598	while(source < (const uint8_t *)args->sourceLimit) {
599	uint8_t b = *source;
600	if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
601	cnv->toUBytes[i++] = b;
602	++source;
603	} else {
604	/ error even before we run out of input /
605	*err = U_ILLEGAL_CHAR_FOUND;
606	break;
607	}
608	}
609	cnv->toULength = i;
610	args->source = (const char *)source;
611	return `0xffff`;
612	}
613
614	ch = myByte << `6`;
615	if(countTrailBytes == `2`) {
616	uint8_t t1 = *source, t2;
617	if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
618	args->source = (const char *)(source + `1`);
619	return (((ch + t1) << `6`) + t2) - offsetsFromUTF8[`3`];
620	}
621	} else if(countTrailBytes == `1`) {
622	uint8_t t1 = *source;
623	if(U8_IS_TRAIL(t1)) {
624	args->source = (const char *)(source + `1`);
625	return (ch + t1) - offsetsFromUTF8[`2`];
626	}
627	} else { // countTrailBytes == 3
628	uint8_t t1 = *source, t2, t3;
629	if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
630	U8_IS_TRAIL(t3 = *++source)) {
631	args->source = (const char *)(source + `1`);
632	return (((((ch + t1) << `6`) + t2) << `6`) + t3) - offsetsFromUTF8[`4`];
633	}
634	}
635	args->source = (const char *)source;
636
637	for(i = `0`; sourceInitial < source; ++i) {
638	cnv->toUBytes[i] = *sourceInitial++;
639	}
640	cnv->toULength = i;
641	*err = U_ILLEGAL_CHAR_FOUND;
642	return `0xffff`;
643	}
644	U_CDECL_END
645
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647
648	U_CDECL_BEGIN
649	/ "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). /
650	static void U_CALLCONV
651	ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652	UConverterToUnicodeArgs *pToUArgs,
653	UErrorCode *pErrorCode) {
654	UConverter *utf8;
655	const uint8_t source, sourceLimit;
656	uint8_t *target;
657	int32_t targetCapacity;
658	int32_t count;
659
660	int8_t oldToULength, toULength, toULimit;
661
662	UChar32 c;
663	uint8_t b, t1, t2;
664
665	/ set up the local pointers /
666	utf8=pToUArgs->converter;
667	source=(uint8_t *)pToUArgs->source;
668	sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669	target=(uint8_t *)pFromUArgs->target;
670	targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671
672	/ get the converter state from the UTF-8 UConverter /
673	if(utf8->toULength > `0`) {
674	toULength=oldToULength=utf8->toULength;
675	toULimit=(int8_t)utf8->mode;
676	c=(UChar32)utf8->toUnicodeStatus;
677	} else {
678	toULength=oldToULength=toULimit=`0`;
679	c = `0`;
680	}
681
682	count=(int32_t)(sourceLimit-source)+oldToULength;
683	if(count<toULimit) {
684	/*
685	* Not enough input to complete the partial character.
686	* Jump to moreBytes below - it will not output to target.
687	*/
688	} else if(targetCapacity<toULimit) {
689	/*
690	* Not enough target capacity to output the partial character.
691	* Let the standard converter handle this.
692	*/
693	*pErrorCode=U_USING_DEFAULT_WARNING;
694	return;
695	} else {
696	// Use a single counter for source and target, counting the minimum of
697	// the source length and the target capacity.
698	// Let the standard converter handle edge cases.
699	if(count>targetCapacity) {
700	count=targetCapacity;
701	}
702
703	// The conversion loop checks count>0 only once per character.
704	// If the buffer ends with a truncated sequence,
705	// then we reduce the count to stop before that,
706	// and collect the remaining bytes after the conversion loop.
707
708	// Do not go back into the bytes that will be read for finishing a partial
709	// sequence from the previous buffer.
710	int32_t length=count-toULimit;
711	U8_TRUNCATE_IF_INCOMPLETE(source, `0`, length);
712	count=toULimit+length;
713	}
714
715	if(c!=`0`) {
716	utf8->toUnicodeStatus=`0`;
717	utf8->toULength=`0`;
718	goto moreBytes;
719	/ See note in ucnv_SBCSFromUTF8() about this goto. /
720	}
721
722	/ conversion loop /
723	while(count>`0`) {
724	b=*source++;
725	if(U8_IS_SINGLE(b)) {
726	/ convert ASCII /
727	*target++=b;
728	--count;
729	continue;
730	} else {
731	if(b>=`0xe0`) {
732	if( / handle U+0800..U+FFFF inline /
733	b<`0xf0` &&
734	U8_IS_VALID_LEAD3_AND_T1(b, t1=source[`0`]) &&
735	U8_IS_TRAIL(t2=source[`1`])
736	) {
737	source+=`2`;
738	*target++=b;
739	*target++=t1;
740	*target++=t2;
741	count-=`3`;
742	continue;
743	}
744	} else {
745	if( / handle U+0080..U+07FF inline /
746	b>=`0xc2` &&
747	U8_IS_TRAIL(t1=*source)
748	) {
749	++source;
750	*target++=b;
751	*target++=t1;
752	count-=`2`;
753	continue;
754	}
755	}
756
757	/ handle "complicated" and error cases, and continuing partial characters /
758	oldToULength=`0`;
759	toULength=`1`;
760	toULimit=U8_COUNT_BYTES_NON_ASCII(b);
761	c=b;
762	moreBytes:
763	while(toULength<toULimit) {
764	if(source<sourceLimit) {
765	b=*source;
766	if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
767	++source;
768	++toULength;
769	c=(c<<`6`)+b;
770	} else {
771	break; / sequence too short, stop with toULength<toULimit /
772	}
773	} else {
774	/ store the partial UTF-8 character, compatible with the regular UTF-8 converter /
775	source-=(toULength-oldToULength);
776	while(oldToULength<toULength) {
777	utf8->toUBytes[oldToULength++]=*source++;
778	}
779	utf8->toUnicodeStatus=c;
780	utf8->toULength=toULength;
781	utf8->mode=toULimit;
782	pToUArgs->source=(char *)source;
783	pFromUArgs->target=(char *)target;
784	return;
785	}
786	}
787
788	if(toULength!=toULimit) {
789	/ error handling: illegal UTF-8 byte sequence /
790	source-=(toULength-oldToULength);
791	while(oldToULength<toULength) {
792	utf8->toUBytes[oldToULength++]=*source++;
793	}
794	utf8->toULength=toULength;
795	pToUArgs->source=(char *)source;
796	pFromUArgs->target=(char *)target;
797	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
798	return;
799	}
800
801	/ copy the legal byte sequence to the target /
802	{
803	int8_t i;
804
805	for(i=`0`; i<oldToULength; ++i) {
806	*target++=utf8->toUBytes[i];
807	}
808	source-=(toULength-oldToULength);
809	for(; i<toULength; ++i) {
810	target++=source++;
811	}
812	count-=toULength;
813	}
814	}
815	}
816	U_ASSERT(count>=`0`);
817
818	if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
819	if(target==(const uint8_t *)pFromUArgs->targetLimit) {
820	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
821	} else {
822	b=*source;
823	toULimit=U8_COUNT_BYTES(b);
824	if(toULimit>(sourceLimit-source)) {
825	/ collect a truncated byte sequence /
826	toULength=`0`;
827	c=b;
828	for(;;) {
829	utf8->toUBytes[toULength++]=b;
830	if(++source==sourceLimit) {
831	/ partial byte sequence at end of source /
832	utf8->toUnicodeStatus=c;
833	utf8->toULength=toULength;
834	utf8->mode=toULimit;
835	break;
836	} else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
837	utf8->toULength=toULength;
838	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
839	break;
840	}
841	c=(c<<`6`)+b;
842	}
843	} else {
844	/ partial-sequence target overflow: fall back to the pivoting implementation /
845	*pErrorCode=U_USING_DEFAULT_WARNING;
846	}
847	}
848	}
849
850	/ write back the updated pointers /
851	pToUArgs->source=(char *)source;
852	pFromUArgs->target=(char *)target;
853	}
854
855	U_CDECL_END
856
/* UTF-8 converter data ----------------------------------------------------- */
858
859	static const UConverterImpl _UTF8Impl={
860	UCNV_UTF8,
861
862	NULL,
863	NULL,
864
865	NULL,
866	NULL,
867	NULL,
868
869	ucnv_toUnicode_UTF8,
870	ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
871	ucnv_fromUnicode_UTF8,
872	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
873	ucnv_getNextUChar_UTF8,
874
875	NULL,
876	NULL,
877	NULL,
878	NULL,
879	ucnv_getNonSurrogateUnicodeSet,
880
881	ucnv_UTF8FromUTF8,
882	ucnv_UTF8FromUTF8
883	};
884
885	/ The 1208 CCSID refers to any version of Unicode of UTF-8 /
886	static const UConverterStaticData _UTF8StaticData={
887	sizeof(UConverterStaticData),
888	"UTF-8",
889	`1208`, UCNV_IBM, UCNV_UTF8,
890	`1`, `3`, / max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) /
891	{ `0xef`, `0xbf`, `0xbd`, `0` },`3`,FALSE,FALSE,
892	`0`,
893	`0`,
894	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
895	};
896
897
898	const UConverterSharedData _UTF8Data=
899	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
900
/* CESU-8 converter data ---------------------------------------------------- */
902
903	static const UConverterImpl _CESU8Impl={
904	UCNV_CESU8,
905
906	NULL,
907	NULL,
908
909	NULL,
910	NULL,
911	NULL,
912
913	ucnv_toUnicode_UTF8,
914	ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
915	ucnv_fromUnicode_UTF8,
916	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
917	NULL,
918
919	NULL,
920	NULL,
921	NULL,
922	NULL,
923	ucnv_getCompleteUnicodeSet,
924
925	NULL,
926	NULL
927	};
928
929	static const UConverterStaticData _CESU8StaticData={
930	sizeof(UConverterStaticData),
931	"CESU-8",
932	`9400`, / CCSID for CESU-8 /
933	UCNV_UNKNOWN, UCNV_CESU8, `1`, `3`,
934	{ `0xef`, `0xbf`, `0xbd`, `0` },`3`,FALSE,FALSE,
935	`0`,
936	`0`,
937	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
938	};
939
940
941	const UConverterSharedData _CESU8Data=
942	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
943
944	#endif
945

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnv_u8.cpp