ucnvbocu.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnvbocu.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 2002-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	* file name: ucnvbocu.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2002mar27
16	* created by: Markus W. Scherer
17	*
18	* This is an implementation of the Binary Ordered Compression for Unicode,
19	* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20	*/
21
22	#include "unicode/utypes.h"
23
24	#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25
26	#include "unicode/ucnv.h"
27	#include "unicode/ucnv_cb.h"
28	#include "unicode/utf16.h"
29	#include "putilimp.h"
30	#include "ucnv_bld.h"
31	#include "ucnv_cnv.h"
32	#include "uassert.h"
33
/* BOCU-1 constants and macros ---------------------------------------------- */
35
36	/*
37	* BOCU-1 encodes the code points of a Unicode string as
38	* a sequence of byte-encoded differences (slope detection),
39	* preserving lexical order.
40	*
41	* Optimize the difference-taking for runs of Unicode text within
42	* small scripts:
43	*
44	* Most small scripts are allocated within aligned 128-blocks of Unicode
45	* code points. Lexical order is preserved if the "previous code point" state
46	* is always moved into the middle of such a block.
47	*
48	* Additionally, "prev" is moved from anywhere in the Unihan and Hangul
49	* areas into the middle of those areas.
50	*
51	* C0 control codes and space are encoded with their US-ASCII bytes.
52	* "prev" is reset for C0 controls but not for space.
53	*/
54
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form:
 * packed values below 0x04000000 carry the length in their top byte,
 * anything else is a 4-byte sequence whose top byte is the lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
120
121	/*
122	* 12 commonly used C0 control codes (and space) are only used to encode
123	* themselves directly,
124	* which makes BOCU-1 MIME-usable and reasonably safe for
125	* ASCII-oriented software.
126	*
127	* These controls are
128	* 0 NUL
129	*
130	* 7 BEL
131	* 8 BS
132	*
133	* 9 TAB
134	* a LF
135	* b VT
136	* c FF
137	* d CR
138	*
139	* e SO
140	* f SI
141	*
142	* 1a SUB
143	* 1b ESC
144	*
145	* The other 20 C0 controls are also encoded directly (to preserve order)
146	* but are also used as trail bytes in difference encoding
147	* (for better compression).
148	*/
149	#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
150
151	/*
152	* Byte value map for control codes,
153	* from external byte values 0x00..0x20
154	* to trail byte values 0..19 (0..0x13) as used in the difference calculation.
155	* External byte values that are illegal as trail bytes are mapped to -1.
156	*/
157	static const int8_t
158	bocu1ByteToTrail[BOCU1_MIN]={
159	/ 0 1 2 3 4 5 6 7 /
160	-`1`, `0x00`, `0x01`, `0x02`, `0x03`, `0x04`, `0x05`, -`1`,
161
162	/ 8 9 a b c d e f /
163	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
164
165	/ 10 11 12 13 14 15 16 17 /
166	`0x06`, `0x07`, `0x08`, `0x09`, `0x0a`, `0x0b`, `0x0c`, `0x0d`,
167
168	/ 18 19 1a 1b 1c 1d 1e 1f /
169	`0x0e`, `0x0f`, -`1`, -`1`, `0x10`, `0x11`, `0x12`, `0x13`,
170
171	/ 20 /
172	-`1`
173	};
174
175	/*
176	* Byte value map for control codes,
177	* from trail byte values 0..19 (0..0x13) as used in the difference calculation
178	* to external byte values 0x00..0x20.
179	*/
180	static const int8_t
181	bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
182	/ 0 1 2 3 4 5 6 7 /
183	`0x01`, `0x02`, `0x03`, `0x04`, `0x05`, `0x06`, `0x10`, `0x11`,
184
185	/ 8 9 a b c d e f /
186	`0x12`, `0x13`, `0x14`, `0x15`, `0x16`, `0x17`, `0x18`, `0x19`,
187
188	/ 10 11 12 13 /
189	`0x1c`, `0x1d`, `0x1e`, `0x1f`
190	};
191
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always false.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} UPRV_BLOCK_MACRO_END

/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes? */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)

/* BOCU-1 implementation functions ------------------------------------------ */

/* "prev" for small scripts: the middle (offset 0x40) of c's aligned 128-block. */
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
228
229	/**
230	* Compute the next "previous" value for differencing
231	* from the current code point.
232	*
233	* @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234	* @return "previous code point" state value
235	*/
236	static inline int32_t
237	bocu1Prev(int32_t c) {
238	/ compute new prev /
239	if(/ 0x3040<=c && / c<=`0x309f`) {
240	/ Hiragana is not 128-aligned /
241	return `0x3070`;
242	} else if(`0x4e00`<=c && c<=`0x9fa5`) {
243	/ CJK Unihan /
244	return `0x4e00`-BOCU1_REACH_NEG_2;
245	} else if(`0xac00`<=c / && c<=0xd7a3 /) {
246	/ Korean Hangul /
247	return (`0xd7a3`+`0xac00`)/`2`;
248	} else {
249	/ mostly small scripts /
250	return BOCU1_SIMPLE_PREV(c);
251	}
252	}
253
254	/* Fast version of bocu1Prev() for most scripts. /
255	#define BOCU1_PREV(c) ((c)<0x3040 \|\| (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
256
257	/*
258	* The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
259	* The UConverter fields are used as follows:
260	*
261	* fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262	*
263	* toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
264	* mode decoder's incomplete (diff<<2)\|count (ignored when toULength==0)
265	*/
266
/* BOCU-1-from-Unicode conversion functions --------------------------------- */
268
269	/**
270	* Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271	* and return a packed integer with them.
272	*
273	* The encoding favors small absolute differences with short encodings
274	* to compress runs of same-script characters.
275	*
276	* Optimized version with unrolled loops and fewer floating-point operations
277	* than the standard packDiff().
278	*
279	* @param diff difference value -0x10ffff..0x10ffff
280	* @return
281	* 0x010000zz for 1-byte sequence zz
282	* 0x0200yyzz for 2-byte sequence yy zz
283	* 0x03xxyyzz for 3-byte sequence xx yy zz
284	* 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285	*/
286	static int32_t
287	packDiff(int32_t diff) {
288	int32_t result, m;
289
290	U_ASSERT(!DIFF_IS_SINGLE(diff)); / assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 /
291	if(diff>=BOCU1_REACH_NEG_1) {
292	/ mostly positive differences, and single-byte negative ones /
293	#if 0 /* single-byte case handled in macros, see below */
294	if(diff<=BOCU1_REACH_POS_1) {
295	/ single byte /
296	return `0x01000000`\|(BOCU1_MIDDLE+diff);
297	} else
298	#endif
299	if(diff<=BOCU1_REACH_POS_2) {
300	/ two bytes /
301	diff-=BOCU1_REACH_POS_1+`1`;
302	result=`0x02000000`;
303
304	m=diff%BOCU1_TRAIL_COUNT;
305	diff/=BOCU1_TRAIL_COUNT;
306	result\|=BOCU1_TRAIL_TO_BYTE(m);
307
308	result\|=(BOCU1_START_POS_2+diff)<<`8`;
309	} else if(diff<=BOCU1_REACH_POS_3) {
310	/ three bytes /
311	diff-=BOCU1_REACH_POS_2+`1`;
312	result=`0x03000000`;
313
314	m=diff%BOCU1_TRAIL_COUNT;
315	diff/=BOCU1_TRAIL_COUNT;
316	result\|=BOCU1_TRAIL_TO_BYTE(m);
317
318	m=diff%BOCU1_TRAIL_COUNT;
319	diff/=BOCU1_TRAIL_COUNT;
320	result\|=BOCU1_TRAIL_TO_BYTE(m)<<`8`;
321
322	result\|=(BOCU1_START_POS_3+diff)<<`16`;
323	} else {
324	/ four bytes /
325	diff-=BOCU1_REACH_POS_3+`1`;
326
327	m=diff%BOCU1_TRAIL_COUNT;
328	diff/=BOCU1_TRAIL_COUNT;
329	result=BOCU1_TRAIL_TO_BYTE(m);
330
331	m=diff%BOCU1_TRAIL_COUNT;
332	diff/=BOCU1_TRAIL_COUNT;
333	result\|=BOCU1_TRAIL_TO_BYTE(m)<<`8`;
334
335	/*
336	* We know that / and % would deliver quotient 0 and rest=diff.
337	* Avoid division and modulo for performance.
338	*/
339	result\|=BOCU1_TRAIL_TO_BYTE(diff)<<`16`;
340
341	result\|=((uint32_t)BOCU1_START_POS_4)<<`24`;
342	}
343	} else {
344	/ two- to four-byte negative differences /
345	if(diff>=BOCU1_REACH_NEG_2) {
346	/ two bytes /
347	diff-=BOCU1_REACH_NEG_1;
348	result=`0x02000000`;
349
350	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
351	result\|=BOCU1_TRAIL_TO_BYTE(m);
352
353	result\|=(BOCU1_START_NEG_2+diff)<<`8`;
354	} else if(diff>=BOCU1_REACH_NEG_3) {
355	/ three bytes /
356	diff-=BOCU1_REACH_NEG_2;
357	result=`0x03000000`;
358
359	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360	result\|=BOCU1_TRAIL_TO_BYTE(m);
361
362	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
363	result\|=BOCU1_TRAIL_TO_BYTE(m)<<`8`;
364
365	result\|=(BOCU1_START_NEG_3+diff)<<`16`;
366	} else {
367	/ four bytes /
368	diff-=BOCU1_REACH_NEG_3;
369
370	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371	result=BOCU1_TRAIL_TO_BYTE(m);
372
373	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
374	result\|=BOCU1_TRAIL_TO_BYTE(m)<<`8`;
375
376	/*
377	* We know that NEGDIVMOD would deliver
378	* quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379	* Avoid division and modulo for performance.
380	*/
381	m=diff+BOCU1_TRAIL_COUNT;
382	result\|=BOCU1_TRAIL_TO_BYTE(m)<<`16`;
383
384	result\|=BOCU1_MIN<<`24`;
385	}
386	}
387	return result;
388	}
389
390
391	static void U_CALLCONV
392	_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
393	UErrorCode *pErrorCode) {
394	UConverter *cnv;
395	const UChar source, sourceLimit;
396	uint8_t *target;
397	int32_t targetCapacity;
398	int32_t *offsets;
399
400	int32_t prev, c, diff;
401
402	int32_t sourceIndex, nextSourceIndex;
403
404	/ set up the local pointers /
405	cnv=pArgs->converter;
406	source=pArgs->source;
407	sourceLimit=pArgs->sourceLimit;
408	target=(uint8_t *)pArgs->target;
409	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410	offsets=pArgs->offsets;
411
412	/ get the converter state from UConverter /
413	c=cnv->fromUChar32;
414	prev=(int32_t)cnv->fromUnicodeStatus;
415	if(prev==`0`) {
416	prev=BOCU1_ASCII_PREV;
417	}
418
419	/ sourceIndex=-1 if the current character began in the previous buffer /
420	sourceIndex= c==`0` ? `0` : -`1`;
421	nextSourceIndex=`0`;
422
423	/ conversion loop /
424	if(c!=`0` && targetCapacity>`0`) {
425	goto getTrail;
426	}
427
428	fastSingle:
429	/ fast loop for single-byte differences /
430	/ use only one loop counter variable, targetCapacity, not also source /
431	diff=(int32_t)(sourceLimit-source);
432	if(targetCapacity>diff) {
433	targetCapacity=diff;
434	}
435	while(targetCapacity>`0` && (c=*source)<`0x3000`) {
436	if(c<=`0x20`) {
437	if(c!=`0x20`) {
438	prev=BOCU1_ASCII_PREV;
439	}
440	*target++=(uint8_t)c;
441	*offsets++=nextSourceIndex++;
442	++source;
443	--targetCapacity;
444	} else {
445	diff=c-prev;
446	if(DIFF_IS_SINGLE(diff)) {
447	prev=BOCU1_SIMPLE_PREV(c);
448	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449	*offsets++=nextSourceIndex++;
450	++source;
451	--targetCapacity;
452	} else {
453	break;
454	}
455	}
456	}
457	/ restore real values /
458	targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459	sourceIndex=nextSourceIndex; / wrong if offsets==NULL but does not matter /
460
461	/ regular loop for all cases /
462	while(source<sourceLimit) {
463	if(targetCapacity>`0`) {
464	c=*source++;
465	++nextSourceIndex;
466
467	if(c<=`0x20`) {
468	/*
469	* ISO C0 control & space:
470	* Encode directly for MIME compatibility,
471	* and reset state except for space, to not disrupt compression.
472	*/
473	if(c!=`0x20`) {
474	prev=BOCU1_ASCII_PREV;
475	}
476	*target++=(uint8_t)c;
477	*offsets++=sourceIndex;
478	--targetCapacity;
479
480	sourceIndex=nextSourceIndex;
481	continue;
482	}
483
484	if(U16_IS_LEAD(c)) {
485	getTrail:
486	if(source<sourceLimit) {
487	/ test the following code unit /
488	UChar trail=*source;
489	if(U16_IS_TRAIL(trail)) {
490	++source;
491	++nextSourceIndex;
492	c=U16_GET_SUPPLEMENTARY(c, trail);
493	}
494	} else {
495	/ no more input /
496	c=-c; / negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else /
497	break;
498	}
499	}
500
501	/*
502	* all other Unicode code points c==U+0021..U+10ffff
503	* are encoded with the difference c-prev
504	*
505	* a new prev is computed from c,
506	* placed in the middle of a 0x80-block (for most small scripts) or
507	* in the middle of the Unihan and Hangul blocks
508	* to statistically minimize the following difference
509	*/
510	diff=c-prev;
511	prev=BOCU1_PREV(c);
512	if(DIFF_IS_SINGLE(diff)) {
513	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514	*offsets++=sourceIndex;
515	--targetCapacity;
516	sourceIndex=nextSourceIndex;
517	if(c<`0x3000`) {
518	goto fastSingle;
519	}
520	} else if(DIFF_IS_DOUBLE(diff) && `2`<=targetCapacity) {
521	/ optimize 2-byte case /
522	int32_t m;
523
524	if(diff>=`0`) {
525	diff-=BOCU1_REACH_POS_1+`1`;
526	m=diff%BOCU1_TRAIL_COUNT;
527	diff/=BOCU1_TRAIL_COUNT;
528	diff+=BOCU1_START_POS_2;
529	} else {
530	diff-=BOCU1_REACH_NEG_1;
531	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532	diff+=BOCU1_START_NEG_2;
533	}
534	*target++=(uint8_t)diff;
535	*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536	*offsets++=sourceIndex;
537	*offsets++=sourceIndex;
538	targetCapacity-=`2`;
539	sourceIndex=nextSourceIndex;
540	} else {
541	int32_t length; / will be 2..4 /
542
543	diff=packDiff(diff);
544	length=BOCU1_LENGTH_FROM_PACKED(diff);
545
546	/ write the output character bytes from diff and length /
547	/ from the first if in the loop we know that targetCapacity>0 /
548	if(length<=targetCapacity) {
549	switch(length) {
550	/ each branch falls through to the next one /
551	case `4`:
552	*target++=(uint8_t)(diff>>`24`);
553	*offsets++=sourceIndex;
554	U_FALLTHROUGH;
555	case `3`:
556	*target++=(uint8_t)(diff>>`16`);
557	*offsets++=sourceIndex;
558	U_FALLTHROUGH;
559	case `2`:
560	*target++=(uint8_t)(diff>>`8`);
561	*offsets++=sourceIndex;
562	/ case 1: handled above /
563	*target++=(uint8_t)diff;
564	*offsets++=sourceIndex;
565	U_FALLTHROUGH;
566	default:
567	/ will never occur /
568	break;
569	}
570	targetCapacity-=length;
571	sourceIndex=nextSourceIndex;
572	} else {
573	uint8_t *charErrorBuffer;
574
575	/*
576	* We actually do this backwards here:
577	* In order to save an intermediate variable, we output
578	* first to the overflow buffer what does not fit into the
579	* regular target.
580	*/
581	/ we know that 1<=targetCapacity<length<=4 /
582	length-=targetCapacity;
583	charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
584	switch(length) {
585	/ each branch falls through to the next one /
586	case `3`:
587	*charErrorBuffer++=(uint8_t)(diff>>`16`);
588	U_FALLTHROUGH;
589	case `2`:
590	*charErrorBuffer++=(uint8_t)(diff>>`8`);
591	U_FALLTHROUGH;
592	case `1`:
593	*charErrorBuffer=(uint8_t)diff;
594	U_FALLTHROUGH;
595	default:
596	/ will never occur /
597	break;
598	}
599	cnv->charErrorBufferLength=(int8_t)length;
600
601	/ now output what fits into the regular target /
602	diff>>=`8`length; /* length was reduced by targetCapacity /
603	switch(targetCapacity) {
604	/ each branch falls through to the next one /
605	case `3`:
606	*target++=(uint8_t)(diff>>`16`);
607	*offsets++=sourceIndex;
608	U_FALLTHROUGH;
609	case `2`:
610	*target++=(uint8_t)(diff>>`8`);
611	*offsets++=sourceIndex;
612	U_FALLTHROUGH;
613	case `1`:
614	*target++=(uint8_t)diff;
615	*offsets++=sourceIndex;
616	U_FALLTHROUGH;
617	default:
618	/ will never occur /
619	break;
620	}
621
622	/ target overflow /
623	targetCapacity=`0`;
624	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
625	break;
626	}
627	}
628	} else {
629	/ target is full /
630	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631	break;
632	}
633	}
634
635	/ set the converter state back into UConverter /
636	cnv->fromUChar32= c<`0` ? -c : `0`;
637	cnv->fromUnicodeStatus=(uint32_t)prev;
638
639	/ write back the updated pointers /
640	pArgs->source=source;
641	pArgs->target=(char *)target;
642	pArgs->offsets=offsets;
643	}
644
645	/*
646	* Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647	* If a change is made in the original function, then either
648	* change this function the same way or
649	* re-copy the original function and remove the variables
650	* offsets, sourceIndex, and nextSourceIndex.
651	*/
652	static void U_CALLCONV
653	_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
654	UErrorCode *pErrorCode) {
655	UConverter *cnv;
656	const UChar source, sourceLimit;
657	uint8_t *target;
658	int32_t targetCapacity;
659
660	int32_t prev, c, diff;
661
662	/ set up the local pointers /
663	cnv=pArgs->converter;
664	source=pArgs->source;
665	sourceLimit=pArgs->sourceLimit;
666	target=(uint8_t *)pArgs->target;
667	targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
668
669	/ get the converter state from UConverter /
670	c=cnv->fromUChar32;
671	prev=(int32_t)cnv->fromUnicodeStatus;
672	if(prev==`0`) {
673	prev=BOCU1_ASCII_PREV;
674	}
675
676	/ conversion loop /
677	if(c!=`0` && targetCapacity>`0`) {
678	goto getTrail;
679	}
680
681	fastSingle:
682	/ fast loop for single-byte differences /
683	/ use only one loop counter variable, targetCapacity, not also source /
684	diff=(int32_t)(sourceLimit-source);
685	if(targetCapacity>diff) {
686	targetCapacity=diff;
687	}
688	while(targetCapacity>`0` && (c=*source)<`0x3000`) {
689	if(c<=`0x20`) {
690	if(c!=`0x20`) {
691	prev=BOCU1_ASCII_PREV;
692	}
693	*target++=(uint8_t)c;
694	} else {
695	diff=c-prev;
696	if(DIFF_IS_SINGLE(diff)) {
697	prev=BOCU1_SIMPLE_PREV(c);
698	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
699	} else {
700	break;
701	}
702	}
703	++source;
704	--targetCapacity;
705	}
706	/ restore real values /
707	targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
708
709	/ regular loop for all cases /
710	while(source<sourceLimit) {
711	if(targetCapacity>`0`) {
712	c=*source++;
713
714	if(c<=`0x20`) {
715	/*
716	* ISO C0 control & space:
717	* Encode directly for MIME compatibility,
718	* and reset state except for space, to not disrupt compression.
719	*/
720	if(c!=`0x20`) {
721	prev=BOCU1_ASCII_PREV;
722	}
723	*target++=(uint8_t)c;
724	--targetCapacity;
725	continue;
726	}
727
728	if(U16_IS_LEAD(c)) {
729	getTrail:
730	if(source<sourceLimit) {
731	/ test the following code unit /
732	UChar trail=*source;
733	if(U16_IS_TRAIL(trail)) {
734	++source;
735	c=U16_GET_SUPPLEMENTARY(c, trail);
736	}
737	} else {
738	/ no more input /
739	c=-c; / negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else /
740	break;
741	}
742	}
743
744	/*
745	* all other Unicode code points c==U+0021..U+10ffff
746	* are encoded with the difference c-prev
747	*
748	* a new prev is computed from c,
749	* placed in the middle of a 0x80-block (for most small scripts) or
750	* in the middle of the Unihan and Hangul blocks
751	* to statistically minimize the following difference
752	*/
753	diff=c-prev;
754	prev=BOCU1_PREV(c);
755	if(DIFF_IS_SINGLE(diff)) {
756	*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
757	--targetCapacity;
758	if(c<`0x3000`) {
759	goto fastSingle;
760	}
761	} else if(DIFF_IS_DOUBLE(diff) && `2`<=targetCapacity) {
762	/ optimize 2-byte case /
763	int32_t m;
764
765	if(diff>=`0`) {
766	diff-=BOCU1_REACH_POS_1+`1`;
767	m=diff%BOCU1_TRAIL_COUNT;
768	diff/=BOCU1_TRAIL_COUNT;
769	diff+=BOCU1_START_POS_2;
770	} else {
771	diff-=BOCU1_REACH_NEG_1;
772	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
773	diff+=BOCU1_START_NEG_2;
774	}
775	*target++=(uint8_t)diff;
776	*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
777	targetCapacity-=`2`;
778	} else {
779	int32_t length; / will be 2..4 /
780
781	diff=packDiff(diff);
782	length=BOCU1_LENGTH_FROM_PACKED(diff);
783
784	/ write the output character bytes from diff and length /
785	/ from the first if in the loop we know that targetCapacity>0 /
786	if(length<=targetCapacity) {
787	switch(length) {
788	/ each branch falls through to the next one /
789	case `4`:
790	*target++=(uint8_t)(diff>>`24`);
791	U_FALLTHROUGH;
792	case `3`:
793	*target++=(uint8_t)(diff>>`16`);
794	/ case 2: handled above /
795	*target++=(uint8_t)(diff>>`8`);
796	/ case 1: handled above /
797	*target++=(uint8_t)diff;
798	U_FALLTHROUGH;
799	default:
800	/ will never occur /
801	break;
802	}
803	targetCapacity-=length;
804	} else {
805	uint8_t *charErrorBuffer;
806
807	/*
808	* We actually do this backwards here:
809	* In order to save an intermediate variable, we output
810	* first to the overflow buffer what does not fit into the
811	* regular target.
812	*/
813	/ we know that 1<=targetCapacity<length<=4 /
814	length-=targetCapacity;
815	charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
816	switch(length) {
817	/ each branch falls through to the next one /
818	case `3`:
819	*charErrorBuffer++=(uint8_t)(diff>>`16`);
820	U_FALLTHROUGH;
821	case `2`:
822	*charErrorBuffer++=(uint8_t)(diff>>`8`);
823	U_FALLTHROUGH;
824	case `1`:
825	*charErrorBuffer=(uint8_t)diff;
826	U_FALLTHROUGH;
827	default:
828	/ will never occur /
829	break;
830	}
831	cnv->charErrorBufferLength=(int8_t)length;
832
833	/ now output what fits into the regular target /
834	diff>>=`8`length; /* length was reduced by targetCapacity /
835	switch(targetCapacity) {
836	/ each branch falls through to the next one /
837	case `3`:
838	*target++=(uint8_t)(diff>>`16`);
839	U_FALLTHROUGH;
840	case `2`:
841	*target++=(uint8_t)(diff>>`8`);
842	U_FALLTHROUGH;
843	case `1`:
844	*target++=(uint8_t)diff;
845	U_FALLTHROUGH;
846	default:
847	/ will never occur /
848	break;
849	}
850
851	/ target overflow /
852	targetCapacity=`0`;
853	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
854	break;
855	}
856	}
857	} else {
858	/ target is full /
859	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
860	break;
861	}
862	}
863
864	/ set the converter state back into UConverter /
865	cnv->fromUChar32= c<`0` ? -c : `0`;
866	cnv->fromUnicodeStatus=(uint32_t)prev;
867
868	/ write back the updated pointers /
869	pArgs->source=source;
870	pArgs->target=(char *)target;
871	}
872
/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874
875	/**
876	* Function for BOCU-1 decoder; handles multi-byte lead bytes.
877	*
878	* @param b lead byte;
879	* BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880	* @return (diff<<2)\|count
881	*/
882	static inline int32_t
883	decodeBocu1LeadByte(int32_t b) {
884	int32_t diff, count;
885
886	if(b>=BOCU1_START_NEG_2) {
887	/ positive difference /
888	if(b<BOCU1_START_POS_3) {
889	/ two bytes /
890	diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+`1`;
891	count=`1`;
892	} else if(b<BOCU1_START_POS_4) {
893	/ three bytes /
894	diff=((int32_t)b-BOCU1_START_POS_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+`1`;
895	count=`2`;
896	} else {
897	/ four bytes /
898	diff=BOCU1_REACH_POS_3+`1`;
899	count=`3`;
900	}
901	} else {
902	/ negative difference /
903	if(b>=BOCU1_START_NEG_3) {
904	/ two bytes /
905	diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
906	count=`1`;
907	} else if(b>BOCU1_MIN) {
908	/ three bytes /
909	diff=((int32_t)b-BOCU1_START_NEG_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
910	count=`2`;
911	} else {
912	/ four bytes /
913	diff=-BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
914	count=`3`;
915	}
916	}
917
918	/ return the state for decoding the trail byte(s) /
919	return (diff<<`2`)\|count;
920	}
921
922	/**
923	* Function for BOCU-1 decoder; handles multi-byte trail bytes.
924	*
925	* @param count number of remaining trail bytes including this one
926	* @param b trail byte
927	* @return new delta for diff including b - <0 indicates an error
928	*
929	* @see decodeBocu1
930	*/
931	static inline int32_t
932	decodeBocu1TrailByte(int32_t count, int32_t b) {
933	if(b<=`0x20`) {
934	/ skip some C0 controls and make the trail byte range contiguous /
935	b=bocu1ByteToTrail[b];
936	/ b<0 for an illegal trail byte value will result in return<0 below /
937	#if BOCU1_MAX_TRAIL<0xff
938	} else if(b>BOCU1_MAX_TRAIL) {
939	return -`99`;
940	#endif
941	} else {
942	b-=BOCU1_TRAIL_BYTE_OFFSET;
943	}
944
945	/ add trail byte into difference and decrement count /
946	if(count==`1`) {
947	return b;
948	} else if(count==`2`) {
949	return b*BOCU1_TRAIL_COUNT;
950	} else / count==3 / {
951	return b(BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT);
952	}
953	}
954
955	static void U_CALLCONV
956	_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
957	UErrorCode *pErrorCode) {
958	UConverter *cnv;
959	const uint8_t source, sourceLimit;
960	UChar *target;
961	const UChar *targetLimit;
962	int32_t *offsets;
963
964	int32_t prev, count, diff, c;
965
966	int8_t byteIndex;
967	uint8_t *bytes;
968
969	int32_t sourceIndex, nextSourceIndex;
970
971	/ set up the local pointers /
972	cnv=pArgs->converter;
973	source=(const uint8_t *)pArgs->source;
974	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
975	target=pArgs->target;
976	targetLimit=pArgs->targetLimit;
977	offsets=pArgs->offsets;
978
979	/ get the converter state from UConverter /
980	prev=(int32_t)cnv->toUnicodeStatus;
981	if(prev==`0`) {
982	prev=BOCU1_ASCII_PREV;
983	}
984	diff=cnv->mode; / mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 /
985	count=diff&`3`;
986	diff>>=`2`;
987
988	byteIndex=cnv->toULength;
989	bytes=cnv->toUBytes;
990
991	/ sourceIndex=-1 if the current character began in the previous buffer /
992	sourceIndex=byteIndex==`0` ? `0` : -`1`;
993	nextSourceIndex=`0`;
994
995	/ conversion "loop" similar to _SCSUToUnicodeWithOffsets() /
996	if(count>`0` && byteIndex>`0` && target<targetLimit) {
997	goto getTrail;
998	}
999
1000	fastSingle:
1001	/ fast loop for single-byte differences /
1002	/ use count as the only loop counter variable /
1003	diff=(int32_t)(sourceLimit-source);
1004	count=(int32_t)(pArgs->targetLimit-target);
1005	if(count>diff) {
1006	count=diff;
1007	}
1008	while(count>`0`) {
1009	if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010	c=prev+(c-BOCU1_MIDDLE);
1011	if(c<`0x3000`) {
1012	*target++=(UChar)c;
1013	*offsets++=nextSourceIndex++;
1014	prev=BOCU1_SIMPLE_PREV(c);
1015	} else {
1016	break;
1017	}
1018	} else if(c<=`0x20`) {
1019	if(c!=`0x20`) {
1020	prev=BOCU1_ASCII_PREV;
1021	}
1022	*target++=(UChar)c;
1023	*offsets++=nextSourceIndex++;
1024	} else {
1025	break;
1026	}
1027	++source;
1028	--count;
1029	}
1030	sourceIndex=nextSourceIndex; / wrong if offsets==NULL but does not matter /
1031
1032	/ decode a sequence of single and lead bytes /
1033	while(source<sourceLimit) {
1034	if(target>=targetLimit) {
1035	/ target is full /
1036	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037	break;
1038	}
1039
1040	++nextSourceIndex;
1041	c=*source++;
1042	if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043	/ Write a code point directly from a single-byte difference. /
1044	c=prev+(c-BOCU1_MIDDLE);
1045	if(c<`0x3000`) {
1046	*target++=(UChar)c;
1047	*offsets++=sourceIndex;
1048	prev=BOCU1_SIMPLE_PREV(c);
1049	sourceIndex=nextSourceIndex;
1050	goto fastSingle;
1051	}
1052	} else if(c<=`0x20`) {
1053	/*
1054	* Direct-encoded C0 control code or space.
1055	* Reset prev for C0 control codes but not for space.
1056	*/
1057	if(c!=`0x20`) {
1058	prev=BOCU1_ASCII_PREV;
1059	}
1060	*target++=(UChar)c;
1061	*offsets++=sourceIndex;
1062	sourceIndex=nextSourceIndex;
1063	continue;
1064	} else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065	/ Optimize two-byte case. /
1066	if(c>=BOCU1_MIDDLE) {
1067	diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+`1`;
1068	} else {
1069	diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070	}
1071
1072	/ trail byte /
1073	++nextSourceIndex;
1074	c=decodeBocu1TrailByte(`1`, *source++);
1075	if(c<`0` \|\| (uint32_t)(c=prev+diff+c)>`0x10ffff`) {
1076	bytes[`0`]=source[-`2`];
1077	bytes[`1`]=source[-`1`];
1078	byteIndex=`2`;
1079	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080	break;
1081	}
1082	} else if(c==BOCU1_RESET) {
1083	/ only reset the state, no code point /
1084	prev=BOCU1_ASCII_PREV;
1085	sourceIndex=nextSourceIndex;
1086	continue;
1087	} else {
1088	/*
1089	* For multi-byte difference lead bytes, set the decoder state
1090	* with the partial difference value from the lead byte and
1091	* with the number of trail bytes.
1092	*/
1093	bytes[`0`]=(uint8_t)c;
1094	byteIndex=`1`;
1095
1096	diff=decodeBocu1LeadByte(c);
1097	count=diff&`3`;
1098	diff>>=`2`;
1099	getTrail:
1100	for(;;) {
1101	if(source>=sourceLimit) {
1102	goto endloop;
1103	}
1104	++nextSourceIndex;
1105	c=bytes[byteIndex++]=*source++;
1106
1107	/ trail byte in any position /
1108	c=decodeBocu1TrailByte(count, c);
1109	if(c<`0`) {
1110	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111	goto endloop;
1112	}
1113
1114	diff+=c;
1115	if(--count==`0`) {
1116	/ final trail byte, deliver a code point /
1117	byteIndex=`0`;
1118	c=prev+diff;
1119	if((uint32_t)c>`0x10ffff`) {
1120	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121	goto endloop;
1122	}
1123	break;
1124	}
1125	}
1126	}
1127
1128	/ calculate the next prev and output c /
1129	prev=BOCU1_PREV(c);
1130	if(c<=`0xffff`) {
1131	*target++=(UChar)c;
1132	*offsets++=sourceIndex;
1133	} else {
1134	/ output surrogate pair /
1135	*target++=U16_LEAD(c);
1136	if(target<targetLimit) {
1137	*target++=U16_TRAIL(c);
1138	*offsets++=sourceIndex;
1139	*offsets++=sourceIndex;
1140	} else {
1141	/ target overflow /
1142	*offsets++=sourceIndex;
1143	cnv->UCharErrorBuffer[`0`]=U16_TRAIL(c);
1144	cnv->UCharErrorBufferLength=`1`;
1145	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146	break;
1147	}
1148	}
1149	sourceIndex=nextSourceIndex;
1150	}
1151	endloop:
1152
1153	if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154	/ set the converter state in UConverter to deal with the next character /
1155	cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156	cnv->mode=`0`;
1157	} else {
1158	/ set the converter state back into UConverter /
1159	cnv->toUnicodeStatus=(uint32_t)prev;
1160	cnv->mode=(diff<<`2`)\|count;
1161	}
1162	cnv->toULength=byteIndex;
1163
1164	/ write back the updated pointers /
1165	pArgs->source=(const char *)source;
1166	pArgs->target=target;
1167	pArgs->offsets=offsets;
1168	return;
1169	}
1170
1171	/*
1172	* Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173	* If a change is made in the original function, then either
1174	* change this function the same way or
1175	* re-copy the original function and remove the variables
1176	* offsets, sourceIndex, and nextSourceIndex.
1177	*/
1178	static void U_CALLCONV
1179	_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180	UErrorCode *pErrorCode) {
1181	UConverter *cnv;
1182	const uint8_t source, sourceLimit;
1183	UChar *target;
1184	const UChar *targetLimit;
1185
1186	int32_t prev, count, diff, c;
1187
1188	int8_t byteIndex;
1189	uint8_t *bytes;
1190
1191	/ set up the local pointers /
1192	cnv=pArgs->converter;
1193	source=(const uint8_t *)pArgs->source;
1194	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195	target=pArgs->target;
1196	targetLimit=pArgs->targetLimit;
1197
1198	/ get the converter state from UConverter /
1199	prev=(int32_t)cnv->toUnicodeStatus;
1200	if(prev==`0`) {
1201	prev=BOCU1_ASCII_PREV;
1202	}
1203	diff=cnv->mode; / mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 /
1204	count=diff&`3`;
1205	diff>>=`2`;
1206
1207	byteIndex=cnv->toULength;
1208	bytes=cnv->toUBytes;
1209
1210	/ conversion "loop" similar to _SCSUToUnicodeWithOffsets() /
1211	if(count>`0` && byteIndex>`0` && target<targetLimit) {
1212	goto getTrail;
1213	}
1214
1215	fastSingle:
1216	/ fast loop for single-byte differences /
1217	/ use count as the only loop counter variable /
1218	diff=(int32_t)(sourceLimit-source);
1219	count=(int32_t)(pArgs->targetLimit-target);
1220	if(count>diff) {
1221	count=diff;
1222	}
1223	while(count>`0`) {
1224	if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1225	c=prev+(c-BOCU1_MIDDLE);
1226	if(c<`0x3000`) {
1227	*target++=(UChar)c;
1228	prev=BOCU1_SIMPLE_PREV(c);
1229	} else {
1230	break;
1231	}
1232	} else if(c<=`0x20`) {
1233	if(c!=`0x20`) {
1234	prev=BOCU1_ASCII_PREV;
1235	}
1236	*target++=(UChar)c;
1237	} else {
1238	break;
1239	}
1240	++source;
1241	--count;
1242	}
1243
1244	/ decode a sequence of single and lead bytes /
1245	while(source<sourceLimit) {
1246	if(target>=targetLimit) {
1247	/ target is full /
1248	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249	break;
1250	}
1251
1252	c=*source++;
1253	if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254	/ Write a code point directly from a single-byte difference. /
1255	c=prev+(c-BOCU1_MIDDLE);
1256	if(c<`0x3000`) {
1257	*target++=(UChar)c;
1258	prev=BOCU1_SIMPLE_PREV(c);
1259	goto fastSingle;
1260	}
1261	} else if(c<=`0x20`) {
1262	/*
1263	* Direct-encoded C0 control code or space.
1264	* Reset prev for C0 control codes but not for space.
1265	*/
1266	if(c!=`0x20`) {
1267	prev=BOCU1_ASCII_PREV;
1268	}
1269	*target++=(UChar)c;
1270	continue;
1271	} else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1272	/ Optimize two-byte case. /
1273	if(c>=BOCU1_MIDDLE) {
1274	diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+`1`;
1275	} else {
1276	diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1277	}
1278
1279	/ trail byte /
1280	c=decodeBocu1TrailByte(`1`, *source++);
1281	if(c<`0` \|\| (uint32_t)(c=prev+diff+c)>`0x10ffff`) {
1282	bytes[`0`]=source[-`2`];
1283	bytes[`1`]=source[-`1`];
1284	byteIndex=`2`;
1285	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286	break;
1287	}
1288	} else if(c==BOCU1_RESET) {
1289	/ only reset the state, no code point /
1290	prev=BOCU1_ASCII_PREV;
1291	continue;
1292	} else {
1293	/*
1294	* For multi-byte difference lead bytes, set the decoder state
1295	* with the partial difference value from the lead byte and
1296	* with the number of trail bytes.
1297	*/
1298	bytes[`0`]=(uint8_t)c;
1299	byteIndex=`1`;
1300
1301	diff=decodeBocu1LeadByte(c);
1302	count=diff&`3`;
1303	diff>>=`2`;
1304	getTrail:
1305	for(;;) {
1306	if(source>=sourceLimit) {
1307	goto endloop;
1308	}
1309	c=bytes[byteIndex++]=*source++;
1310
1311	/ trail byte in any position /
1312	c=decodeBocu1TrailByte(count, c);
1313	if(c<`0`) {
1314	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315	goto endloop;
1316	}
1317
1318	diff+=c;
1319	if(--count==`0`) {
1320	/ final trail byte, deliver a code point /
1321	byteIndex=`0`;
1322	c=prev+diff;
1323	if((uint32_t)c>`0x10ffff`) {
1324	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325	goto endloop;
1326	}
1327	break;
1328	}
1329	}
1330	}
1331
1332	/ calculate the next prev and output c /
1333	prev=BOCU1_PREV(c);
1334	if(c<=`0xffff`) {
1335	*target++=(UChar)c;
1336	} else {
1337	/ output surrogate pair /
1338	*target++=U16_LEAD(c);
1339	if(target<targetLimit) {
1340	*target++=U16_TRAIL(c);
1341	} else {
1342	/ target overflow /
1343	cnv->UCharErrorBuffer[`0`]=U16_TRAIL(c);
1344	cnv->UCharErrorBufferLength=`1`;
1345	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346	break;
1347	}
1348	}
1349	}
1350	endloop:
1351
1352	if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353	/ set the converter state in UConverter to deal with the next character /
1354	cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355	cnv->mode=`0`;
1356	} else {
1357	/ set the converter state back into UConverter /
1358	cnv->toUnicodeStatus=(uint32_t)prev;
1359	cnv->mode=(diff<<`2`)\|count;
1360	}
1361	cnv->toULength=byteIndex;
1362
1363	/ write back the updated pointers /
1364	pArgs->source=(const char *)source;
1365	pArgs->target=target;
1366	return;
1367	}
1368
/* miscellaneous ------------------------------------------------------------ */
1370
1371	static const UConverterImpl _Bocu1Impl={
1372	UCNV_BOCU1,
1373
1374	NULL,
1375	NULL,
1376
1377	NULL,
1378	NULL,
1379	NULL,
1380
1381	_Bocu1ToUnicode,
1382	_Bocu1ToUnicodeWithOffsets,
1383	_Bocu1FromUnicode,
1384	_Bocu1FromUnicodeWithOffsets,
1385	NULL,
1386
1387	NULL,
1388	NULL,
1389	NULL,
1390	NULL,
1391	ucnv_getCompleteUnicodeSet,
1392
1393	NULL,
1394	NULL
1395	};
1396
1397	static const UConverterStaticData _Bocu1StaticData={
1398	sizeof(UConverterStaticData),
1399	"BOCU-1",
1400	`1214`, / CCSID for BOCU-1 /
1401	UCNV_IBM, UCNV_BOCU1,
1402	`1`, `4`, / one UChar generates at least 1 byte and at most 4 bytes /
1403	{ `0x1a`, `0`, `0`, `0` }, `1`, / BOCU-1 never needs to write a subchar /
1404	FALSE, FALSE,
1405	`0`,
1406	`0`,
1407	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
1408	};
1409
1410	const UConverterSharedData _Bocu1Data=
1411	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1412
1413	#endif
1414

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnvbocu.cpp