ustrtrns.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ustrtrns.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 2001-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	*
11	* File ustrtrns.cpp
12	*
13	* Modification History:
14	*
15	* Date Name Description
16	* 9/10/2001 Ram Creation.
17	******************************************************************************
18	*/
19
20	/*******************************************************************************
21	*
22	* u_strTo* and u_strFrom* APIs
23	* WCS functions moved to ustr_wcs.c for better modularization
24	*
25	*******************************************************************************
26	*/
27
28
29	#include "unicode/putil.h"
30	#include "unicode/ustring.h"
31	#include "unicode/utf.h"
32	#include "unicode/utf8.h"
33	#include "unicode/utf16.h"
34	#include "cstring.h"
35	#include "cmemory.h"
36	#include "ustr_imp.h"
37	#include "uassert.h"
38
39	U_CAPI UChar* U_EXPORT2
40	u_strFromUTF32WithSub(UChar *dest,
41	int32_t destCapacity,
42	int32_t *pDestLength,
43	const UChar32 *src,
44	int32_t srcLength,
45	UChar32 subchar, int32_t *pNumSubstitutions,
46	UErrorCode *pErrorCode) {
47	const UChar32 *srcLimit;
48	UChar32 ch;
49	UChar *destLimit;
50	UChar *pDest;
51	int32_t reqLength;
52	int32_t numSubstitutions;
53
54	/ args check /
55	if(U_FAILURE(*pErrorCode)){
56	return NULL;
57	}
58	if( (src==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
59	(destCapacity<`0`) \|\| (dest == NULL && destCapacity > `0`) \|\|
60	subchar > `0x10ffff` \|\| U_IS_SURROGATE(subchar)
61	) {
62	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63	return NULL;
64	}
65
66	if(pNumSubstitutions != NULL) {
67	*pNumSubstitutions = `0`;
68	}
69
70	pDest = dest;
71	destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72	reqLength = `0`;
73	numSubstitutions = `0`;
74
75	if(srcLength < `0`) {
76	/ simple loop for conversion of a NUL-terminated BMP string /
77	while((ch=*src) != `0` &&
78	((uint32_t)ch < `0xd800` \|\| (`0xe000` <= ch && ch <= `0xffff`))) {
79	++src;
80	if(pDest < destLimit) {
81	*pDest++ = (UChar)ch;
82	} else {
83	++reqLength;
84	}
85	}
86	srcLimit = src;
87	if(ch != `0`) {
88	/ "complicated" case, find the end of the remaining string /
89	while(*++srcLimit != `0`) {}
90	}
91	} else {
92	srcLimit = (src!=NULL)?(src + srcLength):NULL;
93	}
94
95	/ convert with length /
96	while(src < srcLimit) {
97	ch = *src++;
98	do {
99	/ usually "loops" once; twice only for writing subchar /
100	if((uint32_t)ch < `0xd800` \|\| (`0xe000` <= ch && ch <= `0xffff`)) {
101	if(pDest < destLimit) {
102	*pDest++ = (UChar)ch;
103	} else {
104	++reqLength;
105	}
106	break;
107	} else if(`0x10000` <= ch && ch <= `0x10ffff`) {
108	if(pDest!=NULL && ((pDest + `2`) <= destLimit)) {
109	*pDest++ = U16_LEAD(ch);
110	*pDest++ = U16_TRAIL(ch);
111	} else {
112	reqLength += `2`;
113	}
114	break;
115	} else if((ch = subchar) < `0`) {
116	/ surrogate code point, or not a Unicode code point at all /
117	*pErrorCode = U_INVALID_CHAR_FOUND;
118	return NULL;
119	} else {
120	++numSubstitutions;
121	}
122	} while(TRUE);
123	}
124
125	reqLength += (int32_t)(pDest - dest);
126	if(pDestLength) {
127	*pDestLength = reqLength;
128	}
129	if(pNumSubstitutions != NULL) {
130	*pNumSubstitutions = numSubstitutions;
131	}
132
133	/ Terminate the buffer /
134	u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135
136	return dest;
137	}
138
139	U_CAPI UChar* U_EXPORT2
140	u_strFromUTF32(UChar *dest,
141	int32_t destCapacity,
142	int32_t *pDestLength,
143	const UChar32 *src,
144	int32_t srcLength,
145	UErrorCode *pErrorCode) {
146	return u_strFromUTF32WithSub(
147	dest, destCapacity, pDestLength,
148	src, srcLength,
149	U_SENTINEL, NULL,
150	pErrorCode);
151	}
152
153	U_CAPI UChar32* U_EXPORT2
154	u_strToUTF32WithSub(UChar32 *dest,
155	int32_t destCapacity,
156	int32_t *pDestLength,
157	const UChar *src,
158	int32_t srcLength,
159	UChar32 subchar, int32_t *pNumSubstitutions,
160	UErrorCode *pErrorCode) {
161	const UChar *srcLimit;
162	UChar32 ch;
163	UChar ch2;
164	UChar32 *destLimit;
165	UChar32 *pDest;
166	int32_t reqLength;
167	int32_t numSubstitutions;
168
169	/ args check /
170	if(U_FAILURE(*pErrorCode)){
171	return NULL;
172	}
173	if( (src==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
174	(destCapacity<`0`) \|\| (dest == NULL && destCapacity > `0`) \|\|
175	subchar > `0x10ffff` \|\| U_IS_SURROGATE(subchar)
176	) {
177	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178	return NULL;
179	}
180
181	if(pNumSubstitutions != NULL) {
182	*pNumSubstitutions = `0`;
183	}
184
185	pDest = dest;
186	destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187	reqLength = `0`;
188	numSubstitutions = `0`;
189
190	if(srcLength < `0`) {
191	/ simple loop for conversion of a NUL-terminated BMP string /
192	while((ch=*src) != `0` && !U16_IS_SURROGATE(ch)) {
193	++src;
194	if(pDest < destLimit) {
195	*pDest++ = ch;
196	} else {
197	++reqLength;
198	}
199	}
200	srcLimit = src;
201	if(ch != `0`) {
202	/ "complicated" case, find the end of the remaining string /
203	while(*++srcLimit != `0`) {}
204	}
205	} else {
206	srcLimit = (src!=NULL)?(src + srcLength):NULL;
207	}
208
209	/ convert with length /
210	while(src < srcLimit) {
211	ch = *src++;
212	if(!U16_IS_SURROGATE(ch)) {
213	/ write or count ch below /
214	} else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215	++src;
216	ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217	} else if((ch = subchar) < `0`) {
218	/ unpaired surrogate /
219	*pErrorCode = U_INVALID_CHAR_FOUND;
220	return NULL;
221	} else {
222	++numSubstitutions;
223	}
224	if(pDest < destLimit) {
225	*pDest++ = ch;
226	} else {
227	++reqLength;
228	}
229	}
230
231	reqLength += (int32_t)(pDest - dest);
232	if(pDestLength) {
233	*pDestLength = reqLength;
234	}
235	if(pNumSubstitutions != NULL) {
236	*pNumSubstitutions = numSubstitutions;
237	}
238
239	/ Terminate the buffer /
240	u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242	return dest;
243	}
244
245	U_CAPI UChar32* U_EXPORT2
246	u_strToUTF32(UChar32 *dest,
247	int32_t destCapacity,
248	int32_t *pDestLength,
249	const UChar *src,
250	int32_t srcLength,
251	UErrorCode *pErrorCode) {
252	return u_strToUTF32WithSub(
253	dest, destCapacity, pDestLength,
254	src, srcLength,
255	U_SENTINEL, NULL,
256	pErrorCode);
257	}
258
259	U_CAPI UChar* U_EXPORT2
260	u_strFromUTF8WithSub(UChar *dest,
261	int32_t destCapacity,
262	int32_t *pDestLength,
263	const char* src,
264	int32_t srcLength,
265	UChar32 subchar, int32_t *pNumSubstitutions,
266	UErrorCode *pErrorCode){
267	/ args check /
268	if(U_FAILURE(*pErrorCode)) {
269	return NULL;
270	}
271	if( (src==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
272	(destCapacity<`0`) \|\| (dest == NULL && destCapacity > `0`) \|\|
273	subchar > `0x10ffff` \|\| U_IS_SURROGATE(subchar)
274	) {
275	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
276	return NULL;
277	}
278
279	if(pNumSubstitutions!=NULL) {
280	*pNumSubstitutions=`0`;
281	}
282	UChar *pDest = dest;
283	UChar *pDestLimit = dest+destCapacity;
284	int32_t reqLength = `0`;
285	int32_t numSubstitutions=`0`;
286
287	/*
288	* Inline processing of UTF-8 byte sequences:
289	*
290	* Byte sequences for the most common characters are handled inline in
291	* the conversion loops. In order to reduce the path lengths for those
292	* characters, the tests are arranged in a kind of binary search.
293	* ASCII (<=0x7f) is checked first, followed by the dividing point
294	* between 2- and 3-byte sequences (0xe0).
295	* The 3-byte branch is tested first to speed up CJK text.
296	* The compiler should combine the subtractions for the two tests for 0xe0.
297	* Each branch then tests for the other end of its range.
298	*/
299
300	if(srcLength < `0`){
301	/*
302	* Transform a NUL-terminated string.
303	* The code explicitly checks for NULs only in the lead byte position.
304	* A NUL byte in the trail byte position fails the trail byte range check anyway.
305	*/
306	int32_t i;
307	UChar32 c;
308	for(i = `0`; (c = (uint8_t)src[i]) != `0` && (pDest < pDestLimit);) {
309	// modified copy of U8_NEXT()
310	++i;
311	if(U8_IS_SINGLE(c)) {
312	*pDest++=(UChar)c;
313	} else {
314	uint8_t __t1, __t2;
315	if( / handle U+0800..U+FFFF inline /
316	(`0xe0`<=(c) && (c)<`0xf0`) &&
317	U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
318	(__t2=src[(i)+`1`]-`0x80`)<=`0x3f`) {
319	*pDest++ = (((c)&`0xf`)<<`12`)\|((src[i]&`0x3f`)<<`6`)\|__t2;
320	i+=`2`;
321	} else if( / handle U+0080..U+07FF inline /
322	((c)<`0xe0` && (c)>=`0xc2`) &&
323	(__t1=src[i]-`0x80`)<=`0x3f`) {
324	*pDest++ = (((c)&`0x1f`)<<`6`)\|__t1;
325	++(i);
326	} else {
327	/ function call for "complicated" and error cases /
328	(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -`1`, c, -`1`);
329	if(c<`0` && (++numSubstitutions, c = subchar) < `0`) {
330	*pErrorCode = U_INVALID_CHAR_FOUND;
331	return NULL;
332	} else if(c<=`0xFFFF`) {
333	*(pDest++)=(UChar)c;
334	} else {
335	*(pDest++)=U16_LEAD(c);
336	if(pDest<pDestLimit) {
337	*(pDest++)=U16_TRAIL(c);
338	} else {
339	reqLength++;
340	break;
341	}
342	}
343	}
344	}
345	}
346
347	/ Pre-flight the rest of the string. /
348	while((c = (uint8_t)src[i]) != `0`) {
349	// modified copy of U8_NEXT()
350	++i;
351	if(U8_IS_SINGLE(c)) {
352	++reqLength;
353	} else {
354	uint8_t __t1, __t2;
355	if( / handle U+0800..U+FFFF inline /
356	(`0xe0`<=(c) && (c)<`0xf0`) &&
357	U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
358	(__t2=src[(i)+`1`]-`0x80`)<=`0x3f`) {
359	++reqLength;
360	i+=`2`;
361	} else if( / handle U+0080..U+07FF inline /
362	((c)<`0xe0` && (c)>=`0xc2`) &&
363	(__t1=src[i]-`0x80`)<=`0x3f`) {
364	++reqLength;
365	++(i);
366	} else {
367	/ function call for "complicated" and error cases /
368	(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -`1`, c, -`1`);
369	if(c<`0` && (++numSubstitutions, c = subchar) < `0`) {
370	*pErrorCode = U_INVALID_CHAR_FOUND;
371	return NULL;
372	}
373	reqLength += U16_LENGTH(c);
374	}
375	}
376	}
377	} else / srcLength >= 0 / {
378	/ Faster loop without ongoing checking for srcLength and pDestLimit. /
379	int32_t i = `0`;
380	UChar32 c;
381	for(;;) {
382	/*
383	* Each iteration of the inner loop progresses by at most 3 UTF-8
384	* bytes and one UChar, for most characters.
385	* For supplementary code points (4 & 2), which are rare,
386	* there is an additional adjustment.
387	*/
388	int32_t count = (int32_t)(pDestLimit - pDest);
389	int32_t count2 = (srcLength - i) / `3`;
390	if(count > count2) {
391	count = count2; / min(remaining dest, remaining src/3) /
392	}
393	if(count < `3`) {
394	/*
395	* Too much overhead if we get near the end of the string,
396	* continue with the next loop.
397	*/
398	break;
399	}
400
401	do {
402	// modified copy of U8_NEXT()
403	c = (uint8_t)src[i++];
404	if(U8_IS_SINGLE(c)) {
405	*pDest++=(UChar)c;
406	} else {
407	uint8_t __t1, __t2;
408	if( / handle U+0800..U+FFFF inline /
409	(`0xe0`<=(c) && (c)<`0xf0`) &&
410	((i)+`1`)<srcLength &&
411	U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
412	(__t2=src[(i)+`1`]-`0x80`)<=`0x3f`) {
413	*pDest++ = (((c)&`0xf`)<<`12`)\|((src[i]&`0x3f`)<<`6`)\|__t2;
414	i+=`2`;
415	} else if( / handle U+0080..U+07FF inline /
416	((c)<`0xe0` && (c)>=`0xc2`) &&
417	((i)!=srcLength) &&
418	(__t1=src[i]-`0x80`)<=`0x3f`) {
419	*pDest++ = (((c)&`0x1f`)<<`6`)\|__t1;
420	++(i);
421	} else {
422	if(c >= `0xf0` \|\| subchar > `0xffff`) {
423	// We may read up to four bytes and write up to two UChars,
424	// which we didn't account for with computing count,
425	// so we adjust it here.
426	if(--count == `0`) {
427	--i; // back out byte c
428	break;
429	}
430	}
431
432	/ function call for "complicated" and error cases /
433	(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -`1`);
434	if(c<`0` && (++numSubstitutions, c = subchar) < `0`) {
435	*pErrorCode = U_INVALID_CHAR_FOUND;
436	return NULL;
437	} else if(c<=`0xFFFF`) {
438	*(pDest++)=(UChar)c;
439	} else {
440	*(pDest++)=U16_LEAD(c);
441	*(pDest++)=U16_TRAIL(c);
442	}
443	}
444	}
445	} while(--count > `0`);
446	}
447
448	while(i < srcLength && (pDest < pDestLimit)) {
449	// modified copy of U8_NEXT()
450	c = (uint8_t)src[i++];
451	if(U8_IS_SINGLE(c)) {
452	*pDest++=(UChar)c;
453	} else {
454	uint8_t __t1, __t2;
455	if( / handle U+0800..U+FFFF inline /
456	(`0xe0`<=(c) && (c)<`0xf0`) &&
457	((i)+`1`)<srcLength &&
458	U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
459	(__t2=src[(i)+`1`]-`0x80`)<=`0x3f`) {
460	*pDest++ = (((c)&`0xf`)<<`12`)\|((src[i]&`0x3f`)<<`6`)\|__t2;
461	i+=`2`;
462	} else if( / handle U+0080..U+07FF inline /
463	((c)<`0xe0` && (c)>=`0xc2`) &&
464	((i)!=srcLength) &&
465	(__t1=src[i]-`0x80`)<=`0x3f`) {
466	*pDest++ = (((c)&`0x1f`)<<`6`)\|__t1;
467	++(i);
468	} else {
469	/ function call for "complicated" and error cases /
470	(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -`1`);
471	if(c<`0` && (++numSubstitutions, c = subchar) < `0`) {
472	*pErrorCode = U_INVALID_CHAR_FOUND;
473	return NULL;
474	} else if(c<=`0xFFFF`) {
475	*(pDest++)=(UChar)c;
476	} else {
477	*(pDest++)=U16_LEAD(c);
478	if(pDest<pDestLimit) {
479	*(pDest++)=U16_TRAIL(c);
480	} else {
481	reqLength++;
482	break;
483	}
484	}
485	}
486	}
487	}
488
489	/ Pre-flight the rest of the string. /
490	while(i < srcLength) {
491	// modified copy of U8_NEXT()
492	c = (uint8_t)src[i++];
493	if(U8_IS_SINGLE(c)) {
494	++reqLength;
495	} else {
496	uint8_t __t1, __t2;
497	if( / handle U+0800..U+FFFF inline /
498	(`0xe0`<=(c) && (c)<`0xf0`) &&
499	((i)+`1`)<srcLength &&
500	U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
501	(__t2=src[(i)+`1`]-`0x80`)<=`0x3f`) {
502	++reqLength;
503	i+=`2`;
504	} else if( / handle U+0080..U+07FF inline /
505	((c)<`0xe0` && (c)>=`0xc2`) &&
506	((i)!=srcLength) &&
507	(__t1=src[i]-`0x80`)<=`0x3f`) {
508	++reqLength;
509	++(i);
510	} else {
511	/ function call for "complicated" and error cases /
512	(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -`1`);
513	if(c<`0` && (++numSubstitutions, c = subchar) < `0`) {
514	*pErrorCode = U_INVALID_CHAR_FOUND;
515	return NULL;
516	}
517	reqLength += U16_LENGTH(c);
518	}
519	}
520	}
521	}
522
523	reqLength+=(int32_t)(pDest - dest);
524
525	if(pNumSubstitutions!=NULL) {
526	*pNumSubstitutions=numSubstitutions;
527	}
528
529	if(pDestLength){
530	*pDestLength = reqLength;
531	}
532
533	/ Terminate the buffer /
534	u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
535
536	return dest;
537	}
538
539	U_CAPI UChar* U_EXPORT2
540	u_strFromUTF8(UChar *dest,
541	int32_t destCapacity,
542	int32_t *pDestLength,
543	const char* src,
544	int32_t srcLength,
545	UErrorCode *pErrorCode){
546	return u_strFromUTF8WithSub(
547	dest, destCapacity, pDestLength,
548	src, srcLength,
549	U_SENTINEL, NULL,
550	pErrorCode);
551	}
552
553	U_CAPI UChar * U_EXPORT2
554	u_strFromUTF8Lenient(UChar *dest,
555	int32_t destCapacity,
556	int32_t *pDestLength,
557	const char *src,
558	int32_t srcLength,
559	UErrorCode *pErrorCode) {
560	UChar *pDest = dest;
561	UChar32 ch;
562	int32_t reqLength = `0`;
563	uint8_t* pSrc = (uint8_t*) src;
564
565	/ args check /
566	if(U_FAILURE(*pErrorCode)){
567	return NULL;
568	}
569
570	if( (src==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
571	(destCapacity<`0`) \|\| (dest == NULL && destCapacity > `0`)
572	) {
573	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
574	return NULL;
575	}
576
577	if(srcLength < `0`) {
578	/ Transform a NUL-terminated string. /
579	UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
580	uint8_t t1, t2, t3; / trail bytes /
581
582	while(((ch = *pSrc) != `0`) && (pDest < pDestLimit)) {
583	if(ch < `0xc0`) {
584	/*
585	* ASCII, or a trail byte in lead position which is treated like
586	* a single-byte sequence for better character boundary
587	* resynchronization after illegal sequences.
588	*/
589	*pDest++=(UChar)ch;
590	++pSrc;
591	continue;
592	} else if(ch < `0xe0`) { / U+0080..U+07FF /
593	if((t1 = pSrc[`1`]) != `0`) {
594	/ 0x3080 = (0xc0 << 6) + 0x80 /
595	*pDest++ = (UChar)((ch << `6`) + t1 - `0x3080`);
596	pSrc += `2`;
597	continue;
598	}
599	} else if(ch < `0xf0`) { / U+0800..U+FFFF /
600	if((t1 = pSrc[`1`]) != `0` && (t2 = pSrc[`2`]) != `0`) {
601	/ no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) /
602	/ 0x2080 = (0x80 << 6) + 0x80 /
603	*pDest++ = (UChar)((ch << `12`) + (t1 << `6`) + t2 - `0x2080`);
604	pSrc += `3`;
605	continue;
606	}
607	} else / f0..f4 / { / U+10000..U+10FFFF /
608	if((t1 = pSrc[`1`]) != `0` && (t2 = pSrc[`2`]) != `0` && (t3 = pSrc[`3`]) != `0`) {
609	pSrc += `4`;
610	/ 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 /
611	ch = (ch << `18`) + (t1 << `12`) + (t2 << `6`) + t3 - `0x3c82080`;
612	*(pDest++) = U16_LEAD(ch);
613	if(pDest < pDestLimit) {
614	*(pDest++) = U16_TRAIL(ch);
615	} else {
616	reqLength = `1`;
617	break;
618	}
619	continue;
620	}
621	}
622
623	/ truncated character at the end /
624	*pDest++ = `0xfffd`;
625	while(*++pSrc != `0`) {}
626	break;
627	}
628
629	/ Pre-flight the rest of the string. /
630	while((ch = *pSrc) != `0`) {
631	if(ch < `0xc0`) {
632	/*
633	* ASCII, or a trail byte in lead position which is treated like
634	* a single-byte sequence for better character boundary
635	* resynchronization after illegal sequences.
636	*/
637	++reqLength;
638	++pSrc;
639	continue;
640	} else if(ch < `0xe0`) { / U+0080..U+07FF /
641	if(pSrc[`1`] != `0`) {
642	++reqLength;
643	pSrc += `2`;
644	continue;
645	}
646	} else if(ch < `0xf0`) { / U+0800..U+FFFF /
647	if(pSrc[`1`] != `0` && pSrc[`2`] != `0`) {
648	++reqLength;
649	pSrc += `3`;
650	continue;
651	}
652	} else / f0..f4 / { / U+10000..U+10FFFF /
653	if(pSrc[`1`] != `0` && pSrc[`2`] != `0` && pSrc[`3`] != `0`) {
654	reqLength += `2`;
655	pSrc += `4`;
656	continue;
657	}
658	}
659
660	/ truncated character at the end /
661	++reqLength;
662	break;
663	}
664	} else / srcLength >= 0 / {
665	const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
666
667	/*
668	* This function requires that if srcLength is given, then it must be
669	* destCapatity >= srcLength so that we need not check for
670	* destination buffer overflow in the loop.
671	*/
672	if(destCapacity < srcLength) {
673	if(pDestLength != NULL) {
674	pDestLength = srcLength; /* this likely overestimates the true destLength! /
675	}
676	*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
677	return NULL;
678	}
679
680	if((pSrcLimit - pSrc) >= `4`) {
681	pSrcLimit -= `3`; / temporarily reduce pSrcLimit /
682
683	/ in this loop, we can always access at least 4 bytes, up to pSrc+3 /
684	do {
685	ch = *pSrc++;
686	if(ch < `0xc0`) {
687	/*
688	* ASCII, or a trail byte in lead position which is treated like
689	* a single-byte sequence for better character boundary
690	* resynchronization after illegal sequences.
691	*/
692	*pDest++=(UChar)ch;
693	} else if(ch < `0xe0`) { / U+0080..U+07FF /
694	/ 0x3080 = (0xc0 << 6) + 0x80 /
695	pDest++ = (UChar)((ch << `6`) + pSrc++ - `0x3080`);
696	} else if(ch < `0xf0`) { / U+0800..U+FFFF /
697	/ no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) /
698	/ 0x2080 = (0x80 << 6) + 0x80 /
699	ch = (ch << `12`) + (*pSrc++ << `6`);
700	pDest++ = (UChar)(ch + pSrc++ - `0x2080`);
701	} else / f0..f4 / { / U+10000..U+10FFFF /
702	/ 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 /
703	ch = (ch << `18`) + (*pSrc++ << `12`);
704	ch += *pSrc++ << `6`;
705	ch += *pSrc++ - `0x3c82080`;
706	*(pDest++) = U16_LEAD(ch);
707	*(pDest++) = U16_TRAIL(ch);
708	}
709	} while(pSrc < pSrcLimit);
710
711	pSrcLimit += `3`; / restore original pSrcLimit /
712	}
713
714	while(pSrc < pSrcLimit) {
715	ch = *pSrc++;
716	if(ch < `0xc0`) {
717	/*
718	* ASCII, or a trail byte in lead position which is treated like
719	* a single-byte sequence for better character boundary
720	* resynchronization after illegal sequences.
721	*/
722	*pDest++=(UChar)ch;
723	continue;
724	} else if(ch < `0xe0`) { / U+0080..U+07FF /
725	if(pSrc < pSrcLimit) {
726	/ 0x3080 = (0xc0 << 6) + 0x80 /
727	pDest++ = (UChar)((ch << `6`) + pSrc++ - `0x3080`);
728	continue;
729	}
730	} else if(ch < `0xf0`) { / U+0800..U+FFFF /
731	if((pSrcLimit - pSrc) >= `2`) {
732	/ no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) /
733	/ 0x2080 = (0x80 << 6) + 0x80 /
734	ch = (ch << `12`) + (*pSrc++ << `6`);
735	pDest++ = (UChar)(ch + pSrc++ - `0x2080`);
736	pSrc += `3`;
737	continue;
738	}
739	} else / f0..f4 / { / U+10000..U+10FFFF /
740	if((pSrcLimit - pSrc) >= `3`) {
741	/ 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 /
742	ch = (ch << `18`) + (*pSrc++ << `12`);
743	ch += *pSrc++ << `6`;
744	ch += *pSrc++ - `0x3c82080`;
745	*(pDest++) = U16_LEAD(ch);
746	*(pDest++) = U16_TRAIL(ch);
747	pSrc += `4`;
748	continue;
749	}
750	}
751
752	/ truncated character at the end /
753	*pDest++ = `0xfffd`;
754	break;
755	}
756	}
757
758	reqLength+=(int32_t)(pDest - dest);
759
760	if(pDestLength){
761	*pDestLength = reqLength;
762	}
763
764	/ Terminate the buffer /
765	u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
766
767	return dest;
768	}
769
770	static inline uint8_t *
771	_appendUTF8(uint8_t *pDest, UChar32 c) {
772	/ it is 0<=c<=0x10ffff and not a surrogate if called by a validating function /
773	if((c)<=`0x7f`) {
774	*pDest++=(uint8_t)c;
775	} else if(c<=`0x7ff`) {
776	*pDest++=(uint8_t)((c>>`6`)\|`0xc0`);
777	*pDest++=(uint8_t)((c&`0x3f`)\|`0x80`);
778	} else if(c<=`0xffff`) {
779	*pDest++=(uint8_t)((c>>`12`)\|`0xe0`);
780	*pDest++=(uint8_t)(((c>>`6`)&`0x3f`)\|`0x80`);
781	*pDest++=(uint8_t)(((c)&`0x3f`)\|`0x80`);
782	} else / if((uint32_t)(c)<=0x10ffff) / {
783	*pDest++=(uint8_t)(((c)>>`18`)\|`0xf0`);
784	*pDest++=(uint8_t)((((c)>>`12`)&`0x3f`)\|`0x80`);
785	*pDest++=(uint8_t)((((c)>>`6`)&`0x3f`)\|`0x80`);
786	*pDest++=(uint8_t)(((c)&`0x3f`)\|`0x80`);
787	}
788	return pDest;
789	}
790
791
792	U_CAPI char* U_EXPORT2
793	u_strToUTF8WithSub(char *dest,
794	int32_t destCapacity,
795	int32_t *pDestLength,
796	const UChar *pSrc,
797	int32_t srcLength,
798	UChar32 subchar, int32_t *pNumSubstitutions,
799	UErrorCode *pErrorCode){
800	int32_t reqLength=`0`;
801	uint32_t ch=`0`,ch2=`0`;
802	uint8_t pDest = (uint8_t )dest;
803	uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
804	int32_t numSubstitutions;
805
806	/ args check /
807	if(U_FAILURE(*pErrorCode)){
808	return NULL;
809	}
810
811	if( (pSrc==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
812	(destCapacity<`0`) \|\| (dest == NULL && destCapacity > `0`) \|\|
813	subchar > `0x10ffff` \|\| U_IS_SURROGATE(subchar)
814	) {
815	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
816	return NULL;
817	}
818
819	if(pNumSubstitutions!=NULL) {
820	*pNumSubstitutions=`0`;
821	}
822	numSubstitutions=`0`;
823
824	if(srcLength==-`1`) {
825	while((ch=*pSrc)!=`0`) {
826	++pSrc;
827	if(ch <= `0x7f`) {
828	if(pDest<pDestLimit) {
829	*pDest++ = (uint8_t)ch;
830	} else {
831	reqLength = `1`;
832	break;
833	}
834	} else if(ch <= `0x7ff`) {
835	if((pDestLimit - pDest) >= `2`) {
836	*pDest++=(uint8_t)((ch>>`6`)\|`0xc0`);
837	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
838	} else {
839	reqLength = `2`;
840	break;
841	}
842	} else if(ch <= `0xd7ff` \|\| ch >= `0xe000`) {
843	if((pDestLimit - pDest) >= `3`) {
844	*pDest++=(uint8_t)((ch>>`12`)\|`0xe0`);
845	*pDest++=(uint8_t)(((ch>>`6`)&`0x3f`)\|`0x80`);
846	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
847	} else {
848	reqLength = `3`;
849	break;
850	}
851	} else / ch is a surrogate / {
852	int32_t length;
853
854	/need not check for NUL because NUL fails U16_IS_TRAIL() anyway/
855	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
856	++pSrc;
857	ch=U16_GET_SUPPLEMENTARY(ch, ch2);
858	} else if(subchar>=`0`) {
859	ch=subchar;
860	++numSubstitutions;
861	} else {
862	/ Unicode 3.2 forbids surrogate code points in UTF-8 /
863	*pErrorCode = U_INVALID_CHAR_FOUND;
864	return NULL;
865	}
866
867	length = U8_LENGTH(ch);
868	if((pDestLimit - pDest) >= length) {
869	/ convert and append/
870	pDest=_appendUTF8(pDest, ch);
871	} else {
872	reqLength = length;
873	break;
874	}
875	}
876	}
877	while((ch=*pSrc++)!=`0`) {
878	if(ch<=`0x7f`) {
879	++reqLength;
880	} else if(ch<=`0x7ff`) {
881	reqLength+=`2`;
882	} else if(!U16_IS_SURROGATE(ch)) {
883	reqLength+=`3`;
884	} else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
885	++pSrc;
886	reqLength+=`4`;
887	} else if(subchar>=`0`) {
888	reqLength+=U8_LENGTH(subchar);
889	++numSubstitutions;
890	} else {
891	/ Unicode 3.2 forbids surrogate code points in UTF-8 /
892	*pErrorCode = U_INVALID_CHAR_FOUND;
893	return NULL;
894	}
895	}
896	} else {
897	const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
898	int32_t count;
899
900	/ Faster loop without ongoing checking for pSrcLimit and pDestLimit. /
901	for(;;) {
902	/*
903	* Each iteration of the inner loop progresses by at most 3 UTF-8
904	* bytes and one UChar, for most characters.
905	* For supplementary code points (4 & 2), which are rare,
906	* there is an additional adjustment.
907	*/
908	count = (int32_t)((pDestLimit - pDest) / `3`);
909	srcLength = (int32_t)(pSrcLimit - pSrc);
910	if(count > srcLength) {
911	count = srcLength; / min(remaining dest/3, remaining src) /
912	}
913	if(count < `3`) {
914	/*
915	* Too much overhead if we get near the end of the string,
916	* continue with the next loop.
917	*/
918	break;
919	}
920	do {
921	ch=*pSrc++;
922	if(ch <= `0x7f`) {
923	*pDest++ = (uint8_t)ch;
924	} else if(ch <= `0x7ff`) {
925	*pDest++=(uint8_t)((ch>>`6`)\|`0xc0`);
926	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
927	} else if(ch <= `0xd7ff` \|\| ch >= `0xe000`) {
928	*pDest++=(uint8_t)((ch>>`12`)\|`0xe0`);
929	*pDest++=(uint8_t)(((ch>>`6`)&`0x3f`)\|`0x80`);
930	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
931	} else / ch is a surrogate / {
932	/*
933	* We will read two UChars and probably output four bytes,
934	* which we didn't account for with computing count,
935	* so we adjust it here.
936	*/
937	if(--count == `0`) {
938	--pSrc; / undo ch=pSrc++ for the lead surrogate /*
939	break; / recompute count /
940	}
941
942	if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
943	++pSrc;
944	ch=U16_GET_SUPPLEMENTARY(ch, ch2);
945
946	/ writing 4 bytes per 2 UChars is ok /
947	*pDest++=(uint8_t)((ch>>`18`)\|`0xf0`);
948	*pDest++=(uint8_t)(((ch>>`12`)&`0x3f`)\|`0x80`);
949	*pDest++=(uint8_t)(((ch>>`6`)&`0x3f`)\|`0x80`);
950	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
951	} else {
952	/ Unicode 3.2 forbids surrogate code points in UTF-8 /
953	if(subchar>=`0`) {
954	ch=subchar;
955	++numSubstitutions;
956	} else {
957	*pErrorCode = U_INVALID_CHAR_FOUND;
958	return NULL;
959	}
960
961	/ convert and append/
962	pDest=_appendUTF8(pDest, ch);
963	}
964	}
965	} while(--count > `0`);
966	}
967
968	while(pSrc<pSrcLimit) {
969	ch=*pSrc++;
970	if(ch <= `0x7f`) {
971	if(pDest<pDestLimit) {
972	*pDest++ = (uint8_t)ch;
973	} else {
974	reqLength = `1`;
975	break;
976	}
977	} else if(ch <= `0x7ff`) {
978	if((pDestLimit - pDest) >= `2`) {
979	*pDest++=(uint8_t)((ch>>`6`)\|`0xc0`);
980	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
981	} else {
982	reqLength = `2`;
983	break;
984	}
985	} else if(ch <= `0xd7ff` \|\| ch >= `0xe000`) {
986	if((pDestLimit - pDest) >= `3`) {
987	*pDest++=(uint8_t)((ch>>`12`)\|`0xe0`);
988	*pDest++=(uint8_t)(((ch>>`6`)&`0x3f`)\|`0x80`);
989	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
990	} else {
991	reqLength = `3`;
992	break;
993	}
994	} else / ch is a surrogate / {
995	int32_t length;
996
997	if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
998	++pSrc;
999	ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1000	} else if(subchar>=`0`) {
1001	ch=subchar;
1002	++numSubstitutions;
1003	} else {
1004	/ Unicode 3.2 forbids surrogate code points in UTF-8 /
1005	*pErrorCode = U_INVALID_CHAR_FOUND;
1006	return NULL;
1007	}
1008
1009	length = U8_LENGTH(ch);
1010	if((pDestLimit - pDest) >= length) {
1011	/ convert and append/
1012	pDest=_appendUTF8(pDest, ch);
1013	} else {
1014	reqLength = length;
1015	break;
1016	}
1017	}
1018	}
1019	while(pSrc<pSrcLimit) {
1020	ch=*pSrc++;
1021	if(ch<=`0x7f`) {
1022	++reqLength;
1023	} else if(ch<=`0x7ff`) {
1024	reqLength+=`2`;
1025	} else if(!U16_IS_SURROGATE(ch)) {
1026	reqLength+=`3`;
1027	} else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1028	++pSrc;
1029	reqLength+=`4`;
1030	} else if(subchar>=`0`) {
1031	reqLength+=U8_LENGTH(subchar);
1032	++numSubstitutions;
1033	} else {
1034	/ Unicode 3.2 forbids surrogate code points in UTF-8 /
1035	*pErrorCode = U_INVALID_CHAR_FOUND;
1036	return NULL;
1037	}
1038	}
1039	}
1040
1041	reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1042
1043	if(pNumSubstitutions!=NULL) {
1044	*pNumSubstitutions=numSubstitutions;
1045	}
1046
1047	if(pDestLength){
1048	*pDestLength = reqLength;
1049	}
1050
1051	/ Terminate the buffer /
1052	u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1053	return dest;
1054	}
1055
1056	U_CAPI char* U_EXPORT2
1057	u_strToUTF8(char *dest,
1058	int32_t destCapacity,
1059	int32_t *pDestLength,
1060	const UChar *pSrc,
1061	int32_t srcLength,
1062	UErrorCode *pErrorCode){
1063	return u_strToUTF8WithSub(
1064	dest, destCapacity, pDestLength,
1065	pSrc, srcLength,
1066	U_SENTINEL, NULL,
1067	pErrorCode);
1068	}
1069
1070	U_CAPI UChar* U_EXPORT2
1071	u_strFromJavaModifiedUTF8WithSub(
1072	UChar *dest,
1073	int32_t destCapacity,
1074	int32_t *pDestLength,
1075	const char *src,
1076	int32_t srcLength,
1077	UChar32 subchar, int32_t *pNumSubstitutions,
1078	UErrorCode *pErrorCode) {
1079	/ args check /
1080	if(U_FAILURE(*pErrorCode)) {
1081	return NULL;
1082	}
1083	if( (src==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
1084	(dest==NULL && destCapacity!=`0`) \|\| destCapacity<`0` \|\|
1085	subchar > `0x10ffff` \|\| U_IS_SURROGATE(subchar)
1086	) {
1087	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1088	return NULL;
1089	}
1090
1091	if(pNumSubstitutions!=NULL) {
1092	*pNumSubstitutions=`0`;
1093	}
1094	UChar *pDest = dest;
1095	UChar *pDestLimit = dest+destCapacity;
1096	int32_t reqLength = `0`;
1097	int32_t numSubstitutions=`0`;
1098
1099	if(srcLength < `0`) {
1100	/*
1101	* Transform a NUL-terminated ASCII string.
1102	* Handle non-ASCII strings with slower code.
1103	*/
1104	UChar32 c;
1105	while(((c = (uint8_t)*src) != `0`) && c <= `0x7f` && (pDest < pDestLimit)) {
1106	*pDest++=(UChar)c;
1107	++src;
1108	}
1109	if(c == `0`) {
1110	reqLength=(int32_t)(pDest - dest);
1111	if(pDestLength) {
1112	*pDestLength = reqLength;
1113	}
1114
1115	/ Terminate the buffer /
1116	u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1117	return dest;
1118	}
1119	srcLength = static_cast<int32_t>(uprv_strlen(src));
1120	}
1121
1122	/ Faster loop without ongoing checking for srcLength and pDestLimit. /
1123	UChar32 ch;
1124	uint8_t t1, t2;
1125	int32_t i = `0`;
1126	for(;;) {
1127	int32_t count = (int32_t)(pDestLimit - pDest);
1128	int32_t count2 = srcLength - i;
1129	if(count >= count2 && srcLength > `0` && U8_IS_SINGLE(*src)) {
1130	/ fast ASCII loop /
1131	int32_t start = i;
1132	uint8_t b;
1133	while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
1134	*pDest++=b;
1135	++i;
1136	}
1137	int32_t delta = i - start;
1138	count -= delta;
1139	count2 -= delta;
1140	}
1141	/*
1142	* Each iteration of the inner loop progresses by at most 3 UTF-8
1143	* bytes and one UChar.
1144	*/
1145	if(subchar > `0xFFFF`) {
1146	break;
1147	}
1148	count2 /= `3`;
1149	if(count > count2) {
1150	count = count2; / min(remaining dest, remaining src/3) /
1151	}
1152	if(count < `3`) {
1153	/*
1154	* Too much overhead if we get near the end of the string,
1155	* continue with the next loop.
1156	*/
1157	break;
1158	}
1159	do {
1160	ch = (uint8_t)src[i++];
1161	if(U8_IS_SINGLE(ch)) {
1162	*pDest++=(UChar)ch;
1163	} else {
1164	if(ch >= `0xe0`) {
1165	if( / handle U+0000..U+FFFF inline /
1166	ch <= `0xef` &&
1167	(t1 = (uint8_t)(src[i] - `0x80`)) <= `0x3f` &&
1168	(t2 = (uint8_t)(src[i+`1`] - `0x80`)) <= `0x3f`
1169	) {
1170	/ no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) /
1171	*pDest++ = (UChar)((ch << `12`) \| (t1 << `6`) \| t2);
1172	i += `2`;
1173	continue;
1174	}
1175	} else {
1176	if( / handle U+0000..U+07FF inline /
1177	ch >= `0xc0` &&
1178	(t1 = (uint8_t)(src[i] - `0x80`)) <= `0x3f`
1179	) {
1180	*pDest++ = (UChar)(((ch & `0x1f`) << `6`) \| t1);
1181	++i;
1182	continue;
1183	}
1184	}
1185
1186	if(subchar < `0`) {
1187	*pErrorCode = U_INVALID_CHAR_FOUND;
1188	return NULL;
1189	} else if(subchar > `0xffff` && --count == `0`) {
1190	/*
1191	* We need to write two UChars, adjusted count for that,
1192	* and ran out of space.
1193	*/
1194	--i; // back out byte ch
1195	break;
1196	} else {
1197	/ function call for error cases /
1198	utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -`1`);
1199	++numSubstitutions;
1200	*(pDest++)=(UChar)subchar;
1201	}
1202	}
1203	} while(--count > `0`);
1204	}
1205
1206	while(i < srcLength && (pDest < pDestLimit)) {
1207	ch = (uint8_t)src[i++];
1208	if(U8_IS_SINGLE(ch)){
1209	*pDest++=(UChar)ch;
1210	} else {
1211	if(ch >= `0xe0`) {
1212	if( / handle U+0000..U+FFFF inline /
1213	ch <= `0xef` &&
1214	(i+`1`) < srcLength &&
1215	(t1 = (uint8_t)(src[i] - `0x80`)) <= `0x3f` &&
1216	(t2 = (uint8_t)(src[i+`1`] - `0x80`)) <= `0x3f`
1217	) {
1218	/ no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) /
1219	*pDest++ = (UChar)((ch << `12`) \| (t1 << `6`) \| t2);
1220	i += `2`;
1221	continue;
1222	}
1223	} else {
1224	if( / handle U+0000..U+07FF inline /
1225	ch >= `0xc0` &&
1226	i < srcLength &&
1227	(t1 = (uint8_t)(src[i] - `0x80`)) <= `0x3f`
1228	) {
1229	*pDest++ = (UChar)(((ch & `0x1f`) << `6`) \| t1);
1230	++i;
1231	continue;
1232	}
1233	}
1234
1235	if(subchar < `0`) {
1236	*pErrorCode = U_INVALID_CHAR_FOUND;
1237	return NULL;
1238	} else {
1239	/ function call for error cases /
1240	utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -`1`);
1241	++numSubstitutions;
1242	if(subchar<=`0xFFFF`) {
1243	*(pDest++)=(UChar)subchar;
1244	} else {
1245	*(pDest++)=U16_LEAD(subchar);
1246	if(pDest<pDestLimit) {
1247	*(pDest++)=U16_TRAIL(subchar);
1248	} else {
1249	reqLength++;
1250	break;
1251	}
1252	}
1253	}
1254	}
1255	}
1256
1257	/ Pre-flight the rest of the string. /
1258	while(i < srcLength) {
1259	ch = (uint8_t)src[i++];
1260	if(U8_IS_SINGLE(ch)) {
1261	reqLength++;
1262	} else {
1263	if(ch >= `0xe0`) {
1264	if( / handle U+0000..U+FFFF inline /
1265	ch <= `0xef` &&
1266	(i+`1`) < srcLength &&
1267	(uint8_t)(src[i] - `0x80`) <= `0x3f` &&
1268	(uint8_t)(src[i+`1`] - `0x80`) <= `0x3f`
1269	) {
1270	reqLength++;
1271	i += `2`;
1272	continue;
1273	}
1274	} else {
1275	if( / handle U+0000..U+07FF inline /
1276	ch >= `0xc0` &&
1277	i < srcLength &&
1278	(uint8_t)(src[i] - `0x80`) <= `0x3f`
1279	) {
1280	reqLength++;
1281	++i;
1282	continue;
1283	}
1284	}
1285
1286	if(subchar < `0`) {
1287	*pErrorCode = U_INVALID_CHAR_FOUND;
1288	return NULL;
1289	} else {
1290	/ function call for error cases /
1291	utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -`1`);
1292	++numSubstitutions;
1293	reqLength+=U16_LENGTH(ch);
1294	}
1295	}
1296	}
1297
1298	if(pNumSubstitutions!=NULL) {
1299	*pNumSubstitutions=numSubstitutions;
1300	}
1301
1302	reqLength+=(int32_t)(pDest - dest);
1303	if(pDestLength) {
1304	*pDestLength = reqLength;
1305	}
1306
1307	/ Terminate the buffer /
1308	u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309	return dest;
1310	}
1311
1312	U_CAPI char* U_EXPORT2
1313	u_strToJavaModifiedUTF8(
1314	char *dest,
1315	int32_t destCapacity,
1316	int32_t *pDestLength,
1317	const UChar *src,
1318	int32_t srcLength,
1319	UErrorCode *pErrorCode) {
1320	int32_t reqLength=`0`;
1321	uint32_t ch=`0`;
1322	uint8_t pDest = (uint8_t )dest;
1323	uint8_t *pDestLimit = pDest + destCapacity;
1324	const UChar *pSrcLimit;
1325	int32_t count;
1326
1327	/ args check /
1328	if(U_FAILURE(*pErrorCode)){
1329	return NULL;
1330	}
1331	if( (src==NULL && srcLength!=`0`) \|\| srcLength < -`1` \|\|
1332	(dest==NULL && destCapacity!=`0`) \|\| destCapacity<`0`
1333	) {
1334	*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1335	return NULL;
1336	}
1337
1338	if(srcLength==-`1`) {
1339	/ Convert NUL-terminated ASCII, then find the string length. /
1340	while((ch=*src)<=`0x7f` && ch != `0` && pDest<pDestLimit) {
1341	*pDest++ = (uint8_t)ch;
1342	++src;
1343	}
1344	if(ch == `0`) {
1345	reqLength=(int32_t)(pDest - (uint8_t *)dest);
1346	if(pDestLength) {
1347	*pDestLength = reqLength;
1348	}
1349
1350	/ Terminate the buffer /
1351	u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1352	return dest;
1353	}
1354	srcLength = u_strlen(src);
1355	}
1356
1357	/ Faster loop without ongoing checking for pSrcLimit and pDestLimit. /
1358	pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1359	for(;;) {
1360	count = (int32_t)(pDestLimit - pDest);
1361	srcLength = (int32_t)(pSrcLimit - src);
1362	if(count >= srcLength && srcLength > `0` && *src <= `0x7f`) {
1363	/ fast ASCII loop /
1364	const UChar *prevSrc = src;
1365	int32_t delta;
1366	while(src < pSrcLimit && (ch = *src) <= `0x7f` && ch != `0`) {
1367	*pDest++=(uint8_t)ch;
1368	++src;
1369	}
1370	delta = (int32_t)(src - prevSrc);
1371	count -= delta;
1372	srcLength -= delta;
1373	}
1374	/*
1375	* Each iteration of the inner loop progresses by at most 3 UTF-8
1376	* bytes and one UChar.
1377	*/
1378	count /= `3`;
1379	if(count > srcLength) {
1380	count = srcLength; / min(remaining dest/3, remaining src) /
1381	}
1382	if(count < `3`) {
1383	/*
1384	* Too much overhead if we get near the end of the string,
1385	* continue with the next loop.
1386	*/
1387	break;
1388	}
1389	do {
1390	ch=*src++;
1391	if(ch <= `0x7f` && ch != `0`) {
1392	*pDest++ = (uint8_t)ch;
1393	} else if(ch <= `0x7ff`) {
1394	*pDest++=(uint8_t)((ch>>`6`)\|`0xc0`);
1395	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
1396	} else {
1397	*pDest++=(uint8_t)((ch>>`12`)\|`0xe0`);
1398	*pDest++=(uint8_t)(((ch>>`6`)&`0x3f`)\|`0x80`);
1399	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
1400	}
1401	} while(--count > `0`);
1402	}
1403
1404	while(src<pSrcLimit) {
1405	ch=*src++;
1406	if(ch <= `0x7f` && ch != `0`) {
1407	if(pDest<pDestLimit) {
1408	*pDest++ = (uint8_t)ch;
1409	} else {
1410	reqLength = `1`;
1411	break;
1412	}
1413	} else if(ch <= `0x7ff`) {
1414	if((pDestLimit - pDest) >= `2`) {
1415	*pDest++=(uint8_t)((ch>>`6`)\|`0xc0`);
1416	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
1417	} else {
1418	reqLength = `2`;
1419	break;
1420	}
1421	} else {
1422	if((pDestLimit - pDest) >= `3`) {
1423	*pDest++=(uint8_t)((ch>>`12`)\|`0xe0`);
1424	*pDest++=(uint8_t)(((ch>>`6`)&`0x3f`)\|`0x80`);
1425	*pDest++=(uint8_t)((ch&`0x3f`)\|`0x80`);
1426	} else {
1427	reqLength = `3`;
1428	break;
1429	}
1430	}
1431	}
1432	while(src<pSrcLimit) {
1433	ch=*src++;
1434	if(ch <= `0x7f` && ch != `0`) {
1435	++reqLength;
1436	} else if(ch<=`0x7ff`) {
1437	reqLength+=`2`;
1438	} else {
1439	reqLength+=`3`;
1440	}
1441	}
1442
1443	reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1444	if(pDestLength){
1445	*pDestLength = reqLength;
1446	}
1447
1448	/ Terminate the buffer /
1449	u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1450	return dest;
1451	}
1452

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ustrtrns.cpp