ucnv_u7.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnv_u7.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2002-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* file name: ucnv_u7.c
9	* encoding: UTF-8
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 2002jul01
14	* created by: Markus W. Scherer
15	*
16	* UTF-7 converter implementation. Used to be in ucnv_utf.c.
17	*/
18
19	#include "unicode/utypes.h"
20
21	#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23	#include "cmemory.h"
24	#include "unicode/ucnv.h"
25	#include "ucnv_bld.h"
26	#include "ucnv_cnv.h"
27	#include "uassert.h"
28
29	/* UTF-7 -------------------------------------------------------------------- */
30
31	/*
32	* UTF-7 is a stateful encoding of Unicode.
33	* It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34	* It was intended for use in Internet email systems, using in its bytewise
35	* encoding only a subset of 7-bit US-ASCII.
36	* UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37	* occasionally used.
38	*
39	* For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
40	* characters directly or in base64. Especially, the characters in set O
41	* as defined in the RFC (see below) may be encoded directly but are not
42	* allowed in, e.g., email headers.
43	* By default, the ICU UTF-7 converter encodes set O directly.
44	* By choosing the option "version=1", set O will be escaped instead.
45	* For example:
46	* utf7Converter=ucnv_open("UTF-7,version=1");
47	*
48	* For details about email headers see RFC 2047.
49	*/
50
51	/*
52	* Tests for US-ASCII characters belonging to character classes
53	* defined in UTF-7.
54	*
55	* Set D (directly encoded characters) consists of the following
56	* characters: the upper and lower case letters A through Z
57	* and a through z, the 10 digits 0-9, and the following nine special
58	* characters (note that "+" and "=" are omitted):
59	* '(),-./:?
60	*
61	* Set O (optional direct characters) consists of the following
62	* characters (note that "\" and "~" are omitted):
63	* !"#$%&*;<=>@[]^_`{|}
64	*
65	* According to the rules in RFC 2152, the byte values for the following
66	* US-ASCII characters are not used in UTF-7 and are therefore illegal:
67	* - all C0 control codes except for CR LF TAB
68	* - BACKSLASH
69	* - TILDE
70	* - DEL
71	* - all codes beyond US-ASCII, i.e. all >127
72	*/
/*
 * Nonzero if c is in UTF-7 "Set D" (directly encoded characters):
 * ASCII letters, digits, and the nine specials '(),-./:?
 * Each range test subtracts the range start and compares as uint8_t, so a
 * value below the range wraps to a large number and fails the test.
 * Note: c is evaluated several times; do not pass expressions with side effects.
 */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )
80
/*
 * Nonzero if c is in UTF-7 "Set O" (optional direct characters):
 * !"#$%&*;<=>@[]^_`{|}
 * Backslash (92) and tilde (126) are deliberately not matched.
 * Note: c is evaluated several times; do not pass expressions with side effects.
 */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )
88
/* whitespace classes: CR (13), LF (10), TAB (9), optionally SPACE (32) */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code points with special meaning in UTF-7 */
#define PLUS        43
#define MINUS       45
#define BACKSLASH   92
#define TILDE       126

/*
 * legal byte values: all US-ASCII graphic characters from space to before tilde
 * (32..125, tested with an unsigned wrap-around range check), except backslash,
 * and CR LF TAB
 */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
99
100	/ encode directly sets D and O and CR LF SP TAB /
101	static const UBool encodeDirectlyMaximum[`128`]={
102	/ 0 1 2 3 4 5 6 7 8 9 a b c d e f /
103	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `0`, `0`, `1`, `0`, `0`,
104	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
105
106	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `1`, `1`, `1`, `1`,
107	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
108
109	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
110	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `1`, `1`, `1`,
111
112	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
113	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`
114	};
115
116	/ encode directly set D and CR LF SP TAB but not set O /
117	static const UBool encodeDirectlyRestricted[`128`]={
118	/ 0 1 2 3 4 5 6 7 8 9 a b c d e f /
119	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `0`, `0`, `1`, `0`, `0`,
120	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
121
122	`1`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `0`, `0`, `1`, `1`, `1`, `1`,
123	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `0`, `1`,
124
125	`0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
126	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `0`, `0`,
127
128	`0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
129	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `0`, `0`
130	};
131
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit:
 * A-Z, a-z, 0-9, '+', '/'.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
145
/*
 * Maps a US-ASCII byte (0..127) to its base64 digit value (0..63), or to a
 * negative marker:
 *   -1  legal UTF-7 byte that is not a base64 digit (ends a base64 run)
 *   -2  the minus sign, which terminates a base64 run and is absorbed
 *   -3  byte that is illegal in UTF-7
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165
166	/*
167	* converter status values:
168	*
169	* toUnicodeStatus:
170	* 24 inDirectMode (boolean)
171	* 23..16 base64Counter (-1..7)
172	* 15..0 bits (up to 14 bits incoming base64)
173	*
174	* fromUnicodeStatus:
175	* 31..28 version (0: set O direct 1: set O escaped)
176	* 24 inDirectMode (boolean)
177	* 23..16 base64Counter (0..2)
178	* 7..0 bits (6 bits outgoing base64)
179	*
180	*/
181
182	U_CDECL_BEGIN
183	static void U_CALLCONV
184	_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185	if(choice<=UCNV_RESET_TO_UNICODE) {
186	/ reset toUnicode /
187	cnv->toUnicodeStatus=`0x1000000`; / inDirectMode=TRUE /
188	cnv->toULength=`0`;
189	}
190	if(choice!=UCNV_RESET_TO_UNICODE) {
191	/ reset fromUnicode /
192	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&`0xf0000000`)\|`0x1000000`; / keep version, inDirectMode=TRUE /
193	}
194	}
195
196	static void U_CALLCONV
197	_UTF7Open(UConverter *cnv,
198	UConverterLoadArgs *pArgs,
199	UErrorCode *pErrorCode) {
200	(void)pArgs;
201	if(UCNV_GET_VERSION(cnv)<=`1`) {
202	/ TODO(markus): Should just use cnv->options rather than copying the version number. /
203	cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<`28`;
204	_UTF7Reset(cnv, UCNV_RESET_BOTH);
205	} else {
206	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207	}
208	}
209
210	static void U_CALLCONV
211	_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
212	UErrorCode *pErrorCode) {
213	UConverter *cnv;
214	const uint8_t source, sourceLimit;
215	UChar *target;
216	const UChar *targetLimit;
217	int32_t *offsets;
218
219	uint8_t *bytes;
220	uint8_t byteIndex;
221
222	int32_t length, targetCapacity;
223
224	/ UTF-7 state /
225	uint16_t bits;
226	int8_t base64Counter;
227	UBool inDirectMode;
228
229	int8_t base64Value;
230
231	int32_t sourceIndex, nextSourceIndex;
232
233	uint8_t b;
234	/ set up the local pointers /
235	cnv=pArgs->converter;
236
237	source=(const uint8_t *)pArgs->source;
238	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
239	target=pArgs->target;
240	targetLimit=pArgs->targetLimit;
241	offsets=pArgs->offsets;
242	/ get the state machine state /
243	{
244	uint32_t status=cnv->toUnicodeStatus;
245	inDirectMode=(UBool)((status>>`24`)&`1`);
246	base64Counter=(int8_t)(status>>`16`);
247	bits=(uint16_t)status;
248	}
249	bytes=cnv->toUBytes;
250	byteIndex=cnv->toULength;
251
252	/ sourceIndex=-1 if the current character began in the previous buffer /
253	sourceIndex=byteIndex==`0` ? `0` : -`1`;
254	nextSourceIndex=`0`;
255
256	if(inDirectMode) {
257	directMode:
258	/*
259	* In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
260	* with their US-ASCII byte values.
261	* Backslash and Tilde and most control characters are not allowed in UTF-7.
262	* A plus sign starts Unicode (or "escape") Mode.
263	*
264	* In Direct Mode, only the sourceIndex is used.
265	*/
266	byteIndex=`0`;
267	length=(int32_t)(sourceLimit-source);
268	targetCapacity=(int32_t)(targetLimit-target);
269	if(length>targetCapacity) {
270	length=targetCapacity;
271	}
272	while(length>`0`) {
273	b=*source++;
274	if(!isLegalUTF7(b)) {
275	/ illegal /
276	bytes[`0`]=b;
277	byteIndex=`1`;
278	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
279	break;
280	} else if(b!=PLUS) {
281	/ write directly encoded character /
282	*target++=b;
283	if(offsets!=NULL) {
284	*offsets++=sourceIndex++;
285	}
286	} else / PLUS / {
287	/ switch to Unicode mode /
288	nextSourceIndex=++sourceIndex;
289	inDirectMode=FALSE;
290	byteIndex=`0`;
291	bits=`0`;
292	base64Counter=-`1`;
293	goto unicodeMode;
294	}
295	--length;
296	}
297	if(source<sourceLimit && target>=targetLimit) {
298	/ target is full /
299	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
300	}
301	} else {
302	unicodeMode:
303	/*
304	* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
305	* The base64 sequence ends with any character that is not in the base64 alphabet.
306	* A terminating minus sign is consumed.
307	*
308	* In Unicode Mode, the sourceIndex has the index to the start of the current
309	* base64 bytes, while nextSourceIndex is precisely parallel to source,
310	* keeping the index to the following byte.
311	* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
312	*/
313	while(source<sourceLimit) {
314	if(target<targetLimit) {
315	bytes[byteIndex++]=b=*source++;
316	++nextSourceIndex;
317	base64Value = -`3`; / initialize as illegal /
318	if(b>=`126` \|\| (base64Value=fromBase64[b])==-`3` \|\| base64Value==-`1`) {
319	/ either*
320	* base64Value==-1 for any legal character except base64 and minus sign, or
321	* base64Value==-3 for illegal characters:
322	* 1. In either case, leave Unicode mode.
323	* 2.1. If we ended with an incomplete UChar or none after the +, then
324	* generate an error for the preceding erroneous sequence and deal with
325	* the current (possibly illegal) character next time through.
326	* 2.2. Else the current char comes after a complete UChar, which was already
327	* pushed to the output buf, so:
328	* 2.2.1. If the current char is legal, just save it for processing next time.
329	* It may be for example, a plus which we need to deal with in direct mode.
330	* 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331	*/
332	inDirectMode=TRUE;
333	if(base64Counter==-`1`) {
334	/ illegal: + immediately followed by something other than base64 or minus sign /
335	/ include the plus sign in the reported sequence, but not the subsequent char /
336	--source;
337	bytes[`0`]=PLUS;
338	byteIndex=`1`;
339	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
340	break;
341	} else if(bits!=`0`) {
342	/ bits are illegally left over, a UChar is incomplete /
343	/ don't include current char (legal or illegal) in error seq /
344	--source;
345	--byteIndex;
346	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
347	break;
348	} else {
349	/ previous UChar was complete /
350	if(base64Value==-`3`) {
351	/ current character is illegal, deal with it here /
352	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
353	break;
354	} else {
355	/ un-read the current character in case it is a plus sign /
356	--source;
357	sourceIndex=nextSourceIndex-`1`;
358	goto directMode;
359	}
360	}
361	} else if(base64Value>=`0`) {
362	/ collect base64 bytes into UChars /
363	switch(base64Counter) {
364	case -`1`: / -1 is immediately after the + /
365	case `0`:
366	bits=base64Value;
367	base64Counter=`1`;
368	break;
369	case `1`:
370	case `3`:
371	case `4`:
372	case `6`:
373	bits=(uint16_t)((bits<<`6`)\|base64Value);
374	++base64Counter;
375	break;
376	case `2`:
377	*target++=(UChar)((bits<<`4`)\|(base64Value>>`2`));
378	if(offsets!=NULL) {
379	*offsets++=sourceIndex;
380	sourceIndex=nextSourceIndex-`1`;
381	}
382	bytes[`0`]=b; / keep this byte in case an error occurs /
383	byteIndex=`1`;
384	bits=(uint16_t)(base64Value&`3`);
385	base64Counter=`3`;
386	break;
387	case `5`:
388	*target++=(UChar)((bits<<`2`)\|(base64Value>>`4`));
389	if(offsets!=NULL) {
390	*offsets++=sourceIndex;
391	sourceIndex=nextSourceIndex-`1`;
392	}
393	bytes[`0`]=b; / keep this byte in case an error occurs /
394	byteIndex=`1`;
395	bits=(uint16_t)(base64Value&`15`);
396	base64Counter=`6`;
397	break;
398	case `7`:
399	*target++=(UChar)((bits<<`6`)\|base64Value);
400	if(offsets!=NULL) {
401	*offsets++=sourceIndex;
402	sourceIndex=nextSourceIndex;
403	}
404	byteIndex=`0`;
405	bits=`0`;
406	base64Counter=`0`;
407	break;
408	default:
409	/ will never occur /
410	break;
411	}
412	} else /base64Value==-2/ {
413	/ minus sign terminates the base64 sequence /
414	inDirectMode=TRUE;
415	if(base64Counter==-`1`) {
416	/ +- i.e. a minus immediately following a plus /
417	*target++=PLUS;
418	if(offsets!=NULL) {
419	*offsets++=sourceIndex-`1`;
420	}
421	} else {
422	/ absorb the minus and leave the Unicode Mode /
423	if(bits!=`0`) {
424	/ bits are illegally left over, a UChar is incomplete /
425	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
426	break;
427	}
428	}
429	sourceIndex=nextSourceIndex;
430	goto directMode;
431	}
432	} else {
433	/ target is full /
434	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435	break;
436	}
437	}
438	}
439
440	if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==`0`) {
441	/*
442	* if we are in Unicode mode, then the byteIndex might not be 0,
443	* but that is ok if bits==0
444	* -> we set byteIndex=0 at the end of the stream to avoid a truncated error
445	* (not true for IMAP-mailbox-name where we must end in direct mode)
446	*/
447	byteIndex=`0`;
448	}
449
450	/ set the converter state back into UConverter /
451	cnv->toUnicodeStatus=((uint32_t)inDirectMode<<`24`)\|((uint32_t)((uint8_t)base64Counter)<<`16`)\|(uint32_t)bits;
452	cnv->toULength=byteIndex;
453
454	/ write back the updated pointers /
455	pArgs->source=(const char *)source;
456	pArgs->target=target;
457	pArgs->offsets=offsets;
458	return;
459	}
460
461	static void U_CALLCONV
462	_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
463	UErrorCode *pErrorCode) {
464	UConverter *cnv;
465	const UChar source, sourceLimit;
466	uint8_t target, targetLimit;
467	int32_t *offsets;
468
469	int32_t length, targetCapacity, sourceIndex;
470	UChar c;
471
472	/ UTF-7 state /
473	const UBool *encodeDirectly;
474	uint8_t bits;
475	int8_t base64Counter;
476	UBool inDirectMode;
477
478	/ set up the local pointers /
479	cnv=pArgs->converter;
480
481	/ set up the local pointers /
482	source=pArgs->source;
483	sourceLimit=pArgs->sourceLimit;
484	target=(uint8_t *)pArgs->target;
485	targetLimit=(uint8_t *)pArgs->targetLimit;
486	offsets=pArgs->offsets;
487
488	/ get the state machine state /
489	{
490	uint32_t status=cnv->fromUnicodeStatus;
491	encodeDirectly= status<`0x10000000` ? encodeDirectlyMaximum : encodeDirectlyRestricted;
492	inDirectMode=(UBool)((status>>`24`)&`1`);
493	base64Counter=(int8_t)(status>>`16`);
494	bits=(uint8_t)status;
495	U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
496	}
497
498	/ UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex /
499	sourceIndex=`0`;
500
501	if(inDirectMode) {
502	directMode:
503	length=(int32_t)(sourceLimit-source);
504	targetCapacity=(int32_t)(targetLimit-target);
505	if(length>targetCapacity) {
506	length=targetCapacity;
507	}
508	while(length>`0`) {
509	c=*source++;
510	/ currently always encode CR LF SP TAB directly /
511	if(c<=`127` && encodeDirectly[c]) {
512	/ encode directly /
513	*target++=(uint8_t)c;
514	if(offsets!=NULL) {
515	*offsets++=sourceIndex++;
516	}
517	} else if(c==PLUS) {
518	/ output +- for + /
519	*target++=PLUS;
520	if(target<targetLimit) {
521	*target++=MINUS;
522	if(offsets!=NULL) {
523	*offsets++=sourceIndex;
524	*offsets++=sourceIndex++;
525	}
526	/ realign length and targetCapacity /
527	goto directMode;
528	} else {
529	if(offsets!=NULL) {
530	*offsets++=sourceIndex++;
531	}
532	cnv->charErrorBuffer[`0`]=MINUS;
533	cnv->charErrorBufferLength=`1`;
534	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
535	break;
536	}
537	} else {
538	/ un-read this character and switch to Unicode Mode /
539	--source;
540	*target++=PLUS;
541	if(offsets!=NULL) {
542	*offsets++=sourceIndex;
543	}
544	inDirectMode=FALSE;
545	base64Counter=`0`;
546	goto unicodeMode;
547	}
548	--length;
549	}
550	if(source<sourceLimit && target>=targetLimit) {
551	/ target is full /
552	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
553	}
554	} else {
555	unicodeMode:
556	while(source<sourceLimit) {
557	if(target<targetLimit) {
558	c=*source++;
559	if(c<=`127` && encodeDirectly[c]) {
560	/ encode directly /
561	inDirectMode=TRUE;
562
563	/ trick: back out this character to make this easier /
564	--source;
565
566	/ terminate the base64 sequence /
567	if(base64Counter!=`0`) {
568	/ write remaining bits for the previous character /
569	*target++=toBase64[bits];
570	if(offsets!=NULL) {
571	*offsets++=sourceIndex-`1`;
572	}
573	}
574	if(fromBase64[c]!=-`1`) {
575	/ need to terminate with a minus /
576	if(target<targetLimit) {
577	*target++=MINUS;
578	if(offsets!=NULL) {
579	*offsets++=sourceIndex-`1`;
580	}
581	} else {
582	cnv->charErrorBuffer[`0`]=MINUS;
583	cnv->charErrorBufferLength=`1`;
584	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
585	break;
586	}
587	}
588	goto directMode;
589	} else {
590	/*
591	* base64 this character:
592	* Output 2 or 3 base64 bytes for the remaining bits of the previous character
593	* and the bits of this character, each implicitly in UTF-16BE.
594	*
595	* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
596	* character to the next. The actual 2 or 4 bits are shifted to the left edge
597	* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
598	*/
599	switch(base64Counter) {
600	case `0`:
601	*target++=toBase64[c>>`10`];
602	if(target<targetLimit) {
603	*target++=toBase64[(c>>`4`)&`0x3f`];
604	if(offsets!=NULL) {
605	*offsets++=sourceIndex;
606	*offsets++=sourceIndex++;
607	}
608	} else {
609	if(offsets!=NULL) {
610	*offsets++=sourceIndex++;
611	}
612	cnv->charErrorBuffer[`0`]=toBase64[(c>>`4`)&`0x3f`];
613	cnv->charErrorBufferLength=`1`;
614	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
615	}
616	bits=(uint8_t)((c&`15`)<<`2`);
617	base64Counter=`1`;
618	break;
619	case `1`:
620	*target++=toBase64[bits\|(c>>`14`)];
621	if(target<targetLimit) {
622	*target++=toBase64[(c>>`8`)&`0x3f`];
623	if(target<targetLimit) {
624	*target++=toBase64[(c>>`2`)&`0x3f`];
625	if(offsets!=NULL) {
626	*offsets++=sourceIndex;
627	*offsets++=sourceIndex;
628	*offsets++=sourceIndex++;
629	}
630	} else {
631	if(offsets!=NULL) {
632	*offsets++=sourceIndex;
633	*offsets++=sourceIndex++;
634	}
635	cnv->charErrorBuffer[`0`]=toBase64[(c>>`2`)&`0x3f`];
636	cnv->charErrorBufferLength=`1`;
637	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
638	}
639	} else {
640	if(offsets!=NULL) {
641	*offsets++=sourceIndex++;
642	}
643	cnv->charErrorBuffer[`0`]=toBase64[(c>>`8`)&`0x3f`];
644	cnv->charErrorBuffer[`1`]=toBase64[(c>>`2`)&`0x3f`];
645	cnv->charErrorBufferLength=`2`;
646	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
647	}
648	bits=(uint8_t)((c&`3`)<<`4`);
649	base64Counter=`2`;
650	break;
651	case `2`:
652	*target++=toBase64[bits\|(c>>`12`)];
653	if(target<targetLimit) {
654	*target++=toBase64[(c>>`6`)&`0x3f`];
655	if(target<targetLimit) {
656	*target++=toBase64[c&`0x3f`];
657	if(offsets!=NULL) {
658	*offsets++=sourceIndex;
659	*offsets++=sourceIndex;
660	*offsets++=sourceIndex++;
661	}
662	} else {
663	if(offsets!=NULL) {
664	*offsets++=sourceIndex;
665	*offsets++=sourceIndex++;
666	}
667	cnv->charErrorBuffer[`0`]=toBase64[c&`0x3f`];
668	cnv->charErrorBufferLength=`1`;
669	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
670	}
671	} else {
672	if(offsets!=NULL) {
673	*offsets++=sourceIndex++;
674	}
675	cnv->charErrorBuffer[`0`]=toBase64[(c>>`6`)&`0x3f`];
676	cnv->charErrorBuffer[`1`]=toBase64[c&`0x3f`];
677	cnv->charErrorBufferLength=`2`;
678	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
679	}
680	bits=`0`;
681	base64Counter=`0`;
682	break;
683	default:
684	/ will never occur /
685	break;
686	}
687	}
688	} else {
689	/ target is full /
690	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
691	break;
692	}
693	}
694	}
695
696	if(pArgs->flush && source>=sourceLimit) {
697	/ flush remaining bits to the target /
698	if(!inDirectMode) {
699	if (base64Counter!=`0`) {
700	if(target<targetLimit) {
701	*target++=toBase64[bits];
702	if(offsets!=NULL) {
703	*offsets++=sourceIndex-`1`;
704	}
705	} else {
706	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
707	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
708	}
709	}
710	/ Add final MINUS to terminate unicodeMode /
711	if(target<targetLimit) {
712	*target++=MINUS;
713	if(offsets!=NULL) {
714	*offsets++=sourceIndex-`1`;
715	}
716	} else {
717	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
718	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
719	}
720	}
721	/ reset the state for the next conversion /
722	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&`0xf0000000`)\|`0x1000000`; / keep version, inDirectMode=TRUE /
723	} else {
724	/ set the converter state back into UConverter /
725	cnv->fromUnicodeStatus=
726	(cnv->fromUnicodeStatus&`0xf0000000`)\| / keep version/
727	((uint32_t)inDirectMode<<`24`)\|((uint32_t)base64Counter<<`16`)\|(uint32_t)bits;
728	}
729
730	/ write back the updated pointers /
731	pArgs->source=source;
732	pArgs->target=(char *)target;
733	pArgs->offsets=offsets;
734	return;
735	}
736
737	static const char * U_CALLCONV
738	_UTF7GetName(const UConverter *cnv) {
739	switch(cnv->fromUnicodeStatus>>`28`) {
740	case `1`:
741	return "UTF-7,version=1";
742	default:
743	return "UTF-7";
744	}
745	}
746	U_CDECL_END
747
748	static const UConverterImpl _UTF7Impl={
749	UCNV_UTF7,
750
751	NULL,
752	NULL,
753
754	_UTF7Open,
755	NULL,
756	_UTF7Reset,
757
758	_UTF7ToUnicodeWithOffsets,
759	_UTF7ToUnicodeWithOffsets,
760	_UTF7FromUnicodeWithOffsets,
761	_UTF7FromUnicodeWithOffsets,
762	NULL,
763
764	NULL,
765	_UTF7GetName,
766	NULL, / we don't need writeSub() because we never call a callback at fromUnicode() /
767	NULL,
768	ucnv_getCompleteUnicodeSet,
769
770	NULL,
771	NULL
772	};
773
774	static const UConverterStaticData _UTF7StaticData={
775	sizeof(UConverterStaticData),
776	"UTF-7",
777	`0`, / TODO CCSID for UTF-7 /
778	UCNV_IBM, UCNV_UTF7,
779	`1`, `4`,
780	{ `0x3f`, `0`, `0`, `0` }, `1`, / the subchar is not used /
781	FALSE, FALSE,
782	`0`,
783	`0`,
784	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
785	};
786
787	const UConverterSharedData _UTF7Data=
788	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
789
790	/* IMAP mailbox name encoding ----------------------------------------------- */
791
792	/*
793	* RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
794	* http://www.ietf.org/rfc/rfc2060.txt
795	*
796	* 5.1.3. Mailbox International Naming Convention
797	*
798	* By convention, international mailbox names are specified using a
799	* modified version of the UTF-7 encoding described in [UTF-7]. The
800	* purpose of these modifications is to correct the following problems
801	* with UTF-7:
802	*
803	* 1) UTF-7 uses the "+" character for shifting; this conflicts with
804	* the common use of "+" in mailbox names, in particular USENET
805	* newsgroup names.
806	*
807	* 2) UTF-7's encoding is BASE64 which uses the "/" character; this
808	* conflicts with the use of "/" as a popular hierarchy delimiter.
809	*
810	* 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
811	* the use of "\" as a popular hierarchy delimiter.
812	*
813	* 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
814	* the use of "~" in some servers as a home directory indicator.
815	*
816	* 5) UTF-7 permits multiple alternate forms to represent the same
817	* string; in particular, printable US-ASCII chararacters can be
818	* represented in encoded form.
819	*
820	* In modified UTF-7, printable US-ASCII characters except for "&"
821	* represent themselves; that is, characters with octet values 0x20-0x25
822	* and 0x27-0x7e. The character "&" (0x26) is represented by the two-
823	* octet sequence "&-".
824	*
825	* All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
826	* Unicode 16-bit octets) are represented in modified BASE64, with a
827	* further modification from [UTF-7] that "," is used instead of "/".
828	* Modified BASE64 MUST NOT be used to represent any printing US-ASCII
829	* character which can represent itself.
830	*
831	* "&" is used to shift to modified BASE64 and "-" to shift back to US-
832	* ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
833	* is, a name that ends with a Unicode 16-bit octet MUST end with a "-
834	* ").
835	*
836	* For example, here is a mailbox name which mixes English, Japanese,
837	* and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
838	*/
839
840	/*
841	* Tests for US-ASCII characters belonging to character classes
842	* defined in UTF-7.
843	*
844	* Set D (directly encoded characters) consists of the following
845	* characters: the upper and lower case letters A through Z
846	* and a through z, the 10 digits 0-9, and the following nine special
847	* characters (note that "+" and "=" are omitted):
848	* '(),-./:?
849	*
850	* Set O (optional direct characters) consists of the following
851	* characters (note that "\" and "~" are omitted):
852	* !"#$%&*;<=>@[]^_`{|}
853	*
854	* According to the rules in RFC 2152, the byte values for the following
855	* US-ASCII characters are not used in UTF-7 and are therefore illegal:
856	* - all C0 control codes except for CR LF TAB
857	* - BACKSLASH
858	* - TILDE
859	* - DEL
860	* - all codes beyond US-ASCII, i.e. all >127
861	*/
862
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that expression arguments such as
 * a conditional expression are compared as a whole)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/*
 * modified base64 for IMAP: digit 63 is ',' instead of '/'.
 * TO_BASE64_IMAP maps 0..63 to the digit's byte value;
 * FROM_BASE64_IMAP maps a byte to 0..63, treating '/' as a non-digit (-1).
 */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
876
877	/*
878	* converter status values:
879	*
880	* toUnicodeStatus:
881	* 24 inDirectMode (boolean)
882	* 23..16 base64Counter (-1..7)
883	* 15..0 bits (up to 14 bits incoming base64)
884	*
885	* fromUnicodeStatus:
886	* 24 inDirectMode (boolean)
887	* 23..16 base64Counter (0..2)
888	* 7..0 bits (6 bits outgoing base64)
889	*
890	* ignore bits 31..25
891	*/
892
893	U_CDECL_BEGIN
894	static void U_CALLCONV
895	_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
896	UErrorCode *pErrorCode) {
897	UConverter *cnv;
898	const uint8_t source, sourceLimit;
899	UChar *target;
900	const UChar *targetLimit;
901	int32_t *offsets;
902
903	uint8_t *bytes;
904	uint8_t byteIndex;
905
906	int32_t length, targetCapacity;
907
908	/ UTF-7 state /
909	uint16_t bits;
910	int8_t base64Counter;
911	UBool inDirectMode;
912
913	int8_t base64Value;
914
915	int32_t sourceIndex, nextSourceIndex;
916
917	UChar c;
918	uint8_t b;
919
920	/ set up the local pointers /
921	cnv=pArgs->converter;
922
923	source=(const uint8_t *)pArgs->source;
924	sourceLimit=(const uint8_t *)pArgs->sourceLimit;
925	target=pArgs->target;
926	targetLimit=pArgs->targetLimit;
927	offsets=pArgs->offsets;
928	/ get the state machine state /
929	{
930	uint32_t status=cnv->toUnicodeStatus;
931	inDirectMode=(UBool)((status>>`24`)&`1`);
932	base64Counter=(int8_t)(status>>`16`);
933	bits=(uint16_t)status;
934	}
935	bytes=cnv->toUBytes;
936	byteIndex=cnv->toULength;
937
938	/ sourceIndex=-1 if the current character began in the previous buffer /
939	sourceIndex=byteIndex==`0` ? `0` : -`1`;
940	nextSourceIndex=`0`;
941
942	if(inDirectMode) {
943	directMode:
944	/*
945	* In Direct Mode, US-ASCII characters are encoded directly, i.e.,
946	* with their US-ASCII byte values.
947	* An ampersand starts Unicode (or "escape") Mode.
948	*
949	* In Direct Mode, only the sourceIndex is used.
950	*/
951	byteIndex=`0`;
952	length=(int32_t)(sourceLimit-source);
953	targetCapacity=(int32_t)(targetLimit-target);
954	if(length>targetCapacity) {
955	length=targetCapacity;
956	}
957	while(length>`0`) {
958	b=*source++;
959	if(!isLegalIMAP(b)) {
960	/ illegal /
961	bytes[`0`]=b;
962	byteIndex=`1`;
963	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
964	break;
965	} else if(b!=AMPERSAND) {
966	/ write directly encoded character /
967	*target++=b;
968	if(offsets!=NULL) {
969	*offsets++=sourceIndex++;
970	}
971	} else / AMPERSAND / {
972	/ switch to Unicode mode /
973	nextSourceIndex=++sourceIndex;
974	inDirectMode=FALSE;
975	byteIndex=`0`;
976	bits=`0`;
977	base64Counter=-`1`;
978	goto unicodeMode;
979	}
980	--length;
981	}
982	if(source<sourceLimit && target>=targetLimit) {
983	/ target is full /
984	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
985	}
986	} else {
987	unicodeMode:
988	/*
989	* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
990	* The base64 sequence ends with any character that is not in the base64 alphabet.
991	* A terminating minus sign is consumed.
992	* US-ASCII must not be base64-ed.
993	*
994	* In Unicode Mode, the sourceIndex has the index to the start of the current
995	* base64 bytes, while nextSourceIndex is precisely parallel to source,
996	* keeping the index to the following byte.
997	* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
998	*/
999	while(source<sourceLimit) {
1000	if(target<targetLimit) {
1001	bytes[byteIndex++]=b=*source++;
1002	++nextSourceIndex;
1003	if(b>`0x7e`) {
1004	/ illegal - test other illegal US-ASCII values by base64Value==-3 /
1005	inDirectMode=TRUE;
1006	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1007	break;
1008	} else if((base64Value=FROM_BASE64_IMAP(b))>=`0`) {
1009	/ collect base64 bytes into UChars /
1010	switch(base64Counter) {
1011	case -`1`: / -1 is immediately after the & /
1012	case `0`:
1013	bits=base64Value;
1014	base64Counter=`1`;
1015	break;
1016	case `1`:
1017	case `3`:
1018	case `4`:
1019	case `6`:
1020	bits=(uint16_t)((bits<<`6`)\|base64Value);
1021	++base64Counter;
1022	break;
1023	case `2`:
1024	c=(UChar)((bits<<`4`)\|(base64Value>>`2`));
1025	if(isLegalIMAP(c)) {
1026	/ illegal /
1027	inDirectMode=TRUE;
1028	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1029	goto endloop;
1030	}
1031	*target++=c;
1032	if(offsets!=NULL) {
1033	*offsets++=sourceIndex;
1034	sourceIndex=nextSourceIndex-`1`;
1035	}
1036	bytes[`0`]=b; / keep this byte in case an error occurs /
1037	byteIndex=`1`;
1038	bits=(uint16_t)(base64Value&`3`);
1039	base64Counter=`3`;
1040	break;
1041	case `5`:
1042	c=(UChar)((bits<<`2`)\|(base64Value>>`4`));
1043	if(isLegalIMAP(c)) {
1044	/ illegal /
1045	inDirectMode=TRUE;
1046	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1047	goto endloop;
1048	}
1049	*target++=c;
1050	if(offsets!=NULL) {
1051	*offsets++=sourceIndex;
1052	sourceIndex=nextSourceIndex-`1`;
1053	}
1054	bytes[`0`]=b; / keep this byte in case an error occurs /
1055	byteIndex=`1`;
1056	bits=(uint16_t)(base64Value&`15`);
1057	base64Counter=`6`;
1058	break;
1059	case `7`:
1060	c=(UChar)((bits<<`6`)\|base64Value);
1061	if(isLegalIMAP(c)) {
1062	/ illegal /
1063	inDirectMode=TRUE;
1064	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1065	goto endloop;
1066	}
1067	*target++=c;
1068	if(offsets!=NULL) {
1069	*offsets++=sourceIndex;
1070	sourceIndex=nextSourceIndex;
1071	}
1072	byteIndex=`0`;
1073	bits=`0`;
1074	base64Counter=`0`;
1075	break;
1076	default:
1077	/ will never occur /
1078	break;
1079	}
1080	} else if(base64Value==-`2`) {
1081	/ minus sign terminates the base64 sequence /
1082	inDirectMode=TRUE;
1083	if(base64Counter==-`1`) {
1084	/ &- i.e. a minus immediately following an ampersand /
1085	*target++=AMPERSAND;
1086	if(offsets!=NULL) {
1087	*offsets++=sourceIndex-`1`;
1088	}
1089	} else {
1090	/ absorb the minus and leave the Unicode Mode /
1091	if(bits!=`0` \|\| (base64Counter!=`0` && base64Counter!=`3` && base64Counter!=`6`)) {
1092	/ bits are illegally left over, a UChar is incomplete /
1093	/ base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal /
1094	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1095	break;
1096	}
1097	}
1098	sourceIndex=nextSourceIndex;
1099	goto directMode;
1100	} else {
1101	if(base64Counter==-`1`) {
1102	/ illegal: & immediately followed by something other than base64 or minus sign /
1103	/ include the ampersand in the reported sequence /
1104	--sourceIndex;
1105	bytes[`0`]=AMPERSAND;
1106	bytes[`1`]=b;
1107	byteIndex=`2`;
1108	}
1109	/ base64Value==-1 for characters that are illegal only in Unicode mode /
1110	/ base64Value==-3 for illegal characters /
1111	/ illegal /
1112	inDirectMode=TRUE;
1113	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
1114	break;
1115	}
1116	} else {
1117	/ target is full /
1118	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1119	break;
1120	}
1121	}
1122	}
1123	endloop:
1124
1125	/*
1126	* the end of the input stream and detection of truncated input
1127	* are handled by the framework, but here we must check if we are in Unicode
1128	* mode and byteIndex==0 because we must end in direct mode
1129	*
1130	* conditions:
1131	* successful
1132	* in Unicode mode and byteIndex==0
1133	* end of input and no truncated input
1134	*/
1135	if( U_SUCCESS(*pErrorCode) &&
1136	!inDirectMode && byteIndex==`0` &&
1137	pArgs->flush && source>=sourceLimit
1138	) {
1139	if(base64Counter==-`1`) {
1140	/ & at the very end of the input /
1141	/ make the ampersand the reported sequence /
1142	bytes[`0`]=AMPERSAND;
1143	byteIndex=`1`;
1144	}
1145	/ else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence /
1146
1147	inDirectMode=TRUE; / avoid looping /
1148	*pErrorCode=U_TRUNCATED_CHAR_FOUND;
1149	}
1150
1151	/ set the converter state back into UConverter /
1152	cnv->toUnicodeStatus=((uint32_t)inDirectMode<<`24`)\|((uint32_t)((uint8_t)base64Counter)<<`16`)\|(uint32_t)bits;
1153	cnv->toULength=byteIndex;
1154
1155	/ write back the updated pointers /
1156	pArgs->source=(const char *)source;
1157	pArgs->target=target;
1158	pArgs->offsets=offsets;
1159	return;
1160	}
1161
1162	static void U_CALLCONV
1163	_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1164	UErrorCode *pErrorCode) {
1165	UConverter *cnv;
1166	const UChar source, sourceLimit;
1167	uint8_t target, targetLimit;
1168	int32_t *offsets;
1169
1170	int32_t length, targetCapacity, sourceIndex;
1171	UChar c;
1172	uint8_t b;
1173
1174	/ UTF-7 state /
1175	uint8_t bits;
1176	int8_t base64Counter;
1177	UBool inDirectMode;
1178
1179	/ set up the local pointers /
1180	cnv=pArgs->converter;
1181
1182	/ set up the local pointers /
1183	source=pArgs->source;
1184	sourceLimit=pArgs->sourceLimit;
1185	target=(uint8_t *)pArgs->target;
1186	targetLimit=(uint8_t *)pArgs->targetLimit;
1187	offsets=pArgs->offsets;
1188
1189	/ get the state machine state /
1190	{
1191	uint32_t status=cnv->fromUnicodeStatus;
1192	inDirectMode=(UBool)((status>>`24`)&`1`);
1193	base64Counter=(int8_t)(status>>`16`);
1194	bits=(uint8_t)status;
1195	}
1196
1197	/ UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex /
1198	sourceIndex=`0`;
1199
1200	if(inDirectMode) {
1201	directMode:
1202	length=(int32_t)(sourceLimit-source);
1203	targetCapacity=(int32_t)(targetLimit-target);
1204	if(length>targetCapacity) {
1205	length=targetCapacity;
1206	}
1207	while(length>`0`) {
1208	c=*source++;
1209	/ encode 0x20..0x7e except '&' directly /
1210	if(inSetDIMAP(c)) {
1211	/ encode directly /
1212	*target++=(uint8_t)c;
1213	if(offsets!=NULL) {
1214	*offsets++=sourceIndex++;
1215	}
1216	} else if(c==AMPERSAND) {
1217	/ output &- for & /
1218	*target++=AMPERSAND;
1219	if(target<targetLimit) {
1220	*target++=MINUS;
1221	if(offsets!=NULL) {
1222	*offsets++=sourceIndex;
1223	*offsets++=sourceIndex++;
1224	}
1225	/ realign length and targetCapacity /
1226	goto directMode;
1227	} else {
1228	if(offsets!=NULL) {
1229	*offsets++=sourceIndex++;
1230	}
1231	cnv->charErrorBuffer[`0`]=MINUS;
1232	cnv->charErrorBufferLength=`1`;
1233	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234	break;
1235	}
1236	} else {
1237	/ un-read this character and switch to Unicode Mode /
1238	--source;
1239	*target++=AMPERSAND;
1240	if(offsets!=NULL) {
1241	*offsets++=sourceIndex;
1242	}
1243	inDirectMode=FALSE;
1244	base64Counter=`0`;
1245	goto unicodeMode;
1246	}
1247	--length;
1248	}
1249	if(source<sourceLimit && target>=targetLimit) {
1250	/ target is full /
1251	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1252	}
1253	} else {
1254	unicodeMode:
1255	while(source<sourceLimit) {
1256	if(target<targetLimit) {
1257	c=*source++;
1258	if(isLegalIMAP(c)) {
1259	/ encode directly /
1260	inDirectMode=TRUE;
1261
1262	/ trick: back out this character to make this easier /
1263	--source;
1264
1265	/ terminate the base64 sequence /
1266	if(base64Counter!=`0`) {
1267	/ write remaining bits for the previous character /
1268	*target++=TO_BASE64_IMAP(bits);
1269	if(offsets!=NULL) {
1270	*offsets++=sourceIndex-`1`;
1271	}
1272	}
1273	/ need to terminate with a minus /
1274	if(target<targetLimit) {
1275	*target++=MINUS;
1276	if(offsets!=NULL) {
1277	*offsets++=sourceIndex-`1`;
1278	}
1279	} else {
1280	cnv->charErrorBuffer[`0`]=MINUS;
1281	cnv->charErrorBufferLength=`1`;
1282	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1283	break;
1284	}
1285	goto directMode;
1286	} else {
1287	/*
1288	* base64 this character:
1289	* Output 2 or 3 base64 bytes for the remaining bits of the previous character
1290	* and the bits of this character, each implicitly in UTF-16BE.
1291	*
1292	* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1293	* character to the next. The actual 2 or 4 bits are shifted to the left edge
1294	* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1295	*/
1296	switch(base64Counter) {
1297	case `0`:
1298	b=(uint8_t)(c>>`10`);
1299	*target++=TO_BASE64_IMAP(b);
1300	if(target<targetLimit) {
1301	b=(uint8_t)((c>>`4`)&`0x3f`);
1302	*target++=TO_BASE64_IMAP(b);
1303	if(offsets!=NULL) {
1304	*offsets++=sourceIndex;
1305	*offsets++=sourceIndex++;
1306	}
1307	} else {
1308	if(offsets!=NULL) {
1309	*offsets++=sourceIndex++;
1310	}
1311	b=(uint8_t)((c>>`4`)&`0x3f`);
1312	cnv->charErrorBuffer[`0`]=TO_BASE64_IMAP(b);
1313	cnv->charErrorBufferLength=`1`;
1314	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1315	}
1316	bits=(uint8_t)((c&`15`)<<`2`);
1317	base64Counter=`1`;
1318	break;
1319	case `1`:
1320	b=(uint8_t)(bits\|(c>>`14`));
1321	*target++=TO_BASE64_IMAP(b);
1322	if(target<targetLimit) {
1323	b=(uint8_t)((c>>`8`)&`0x3f`);
1324	*target++=TO_BASE64_IMAP(b);
1325	if(target<targetLimit) {
1326	b=(uint8_t)((c>>`2`)&`0x3f`);
1327	*target++=TO_BASE64_IMAP(b);
1328	if(offsets!=NULL) {
1329	*offsets++=sourceIndex;
1330	*offsets++=sourceIndex;
1331	*offsets++=sourceIndex++;
1332	}
1333	} else {
1334	if(offsets!=NULL) {
1335	*offsets++=sourceIndex;
1336	*offsets++=sourceIndex++;
1337	}
1338	b=(uint8_t)((c>>`2`)&`0x3f`);
1339	cnv->charErrorBuffer[`0`]=TO_BASE64_IMAP(b);
1340	cnv->charErrorBufferLength=`1`;
1341	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1342	}
1343	} else {
1344	if(offsets!=NULL) {
1345	*offsets++=sourceIndex++;
1346	}
1347	b=(uint8_t)((c>>`8`)&`0x3f`);
1348	cnv->charErrorBuffer[`0`]=TO_BASE64_IMAP(b);
1349	b=(uint8_t)((c>>`2`)&`0x3f`);
1350	cnv->charErrorBuffer[`1`]=TO_BASE64_IMAP(b);
1351	cnv->charErrorBufferLength=`2`;
1352	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1353	}
1354	bits=(uint8_t)((c&`3`)<<`4`);
1355	base64Counter=`2`;
1356	break;
1357	case `2`:
1358	b=(uint8_t)(bits\|(c>>`12`));
1359	*target++=TO_BASE64_IMAP(b);
1360	if(target<targetLimit) {
1361	b=(uint8_t)((c>>`6`)&`0x3f`);
1362	*target++=TO_BASE64_IMAP(b);
1363	if(target<targetLimit) {
1364	b=(uint8_t)(c&`0x3f`);
1365	*target++=TO_BASE64_IMAP(b);
1366	if(offsets!=NULL) {
1367	*offsets++=sourceIndex;
1368	*offsets++=sourceIndex;
1369	*offsets++=sourceIndex++;
1370	}
1371	} else {
1372	if(offsets!=NULL) {
1373	*offsets++=sourceIndex;
1374	*offsets++=sourceIndex++;
1375	}
1376	b=(uint8_t)(c&`0x3f`);
1377	cnv->charErrorBuffer[`0`]=TO_BASE64_IMAP(b);
1378	cnv->charErrorBufferLength=`1`;
1379	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1380	}
1381	} else {
1382	if(offsets!=NULL) {
1383	*offsets++=sourceIndex++;
1384	}
1385	b=(uint8_t)((c>>`6`)&`0x3f`);
1386	cnv->charErrorBuffer[`0`]=TO_BASE64_IMAP(b);
1387	b=(uint8_t)(c&`0x3f`);
1388	cnv->charErrorBuffer[`1`]=TO_BASE64_IMAP(b);
1389	cnv->charErrorBufferLength=`2`;
1390	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1391	}
1392	bits=`0`;
1393	base64Counter=`0`;
1394	break;
1395	default:
1396	/ will never occur /
1397	break;
1398	}
1399	}
1400	} else {
1401	/ target is full /
1402	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1403	break;
1404	}
1405	}
1406	}
1407
1408	if(pArgs->flush && source>=sourceLimit) {
1409	/ flush remaining bits to the target /
1410	if(!inDirectMode) {
1411	if(base64Counter!=`0`) {
1412	if(target<targetLimit) {
1413	*target++=TO_BASE64_IMAP(bits);
1414	if(offsets!=NULL) {
1415	*offsets++=sourceIndex-`1`;
1416	}
1417	} else {
1418	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1419	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1420	}
1421	}
1422	/ need to terminate with a minus /
1423	if(target<targetLimit) {
1424	*target++=MINUS;
1425	if(offsets!=NULL) {
1426	*offsets++=sourceIndex-`1`;
1427	}
1428	} else {
1429	cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1430	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1431	}
1432	}
1433	/ reset the state for the next conversion /
1434	cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&`0xf0000000`)\|`0x1000000`; / keep version, inDirectMode=TRUE /
1435	} else {
1436	/ set the converter state back into UConverter /
1437	cnv->fromUnicodeStatus=
1438	(cnv->fromUnicodeStatus&`0xf0000000`)\| / keep version/
1439	((uint32_t)inDirectMode<<`24`)\|((uint32_t)base64Counter<<`16`)\|(uint32_t)bits;
1440	}
1441
1442	/ write back the updated pointers /
1443	pArgs->source=source;
1444	pArgs->target=(char *)target;
1445	pArgs->offsets=offsets;
1446	return;
1447	}
1448	U_CDECL_END
1449
1450	static const UConverterImpl _IMAPImpl={
1451	UCNV_IMAP_MAILBOX,
1452
1453	NULL,
1454	NULL,
1455
1456	_UTF7Open,
1457	NULL,
1458	_UTF7Reset,
1459
1460	_IMAPToUnicodeWithOffsets,
1461	_IMAPToUnicodeWithOffsets,
1462	_IMAPFromUnicodeWithOffsets,
1463	_IMAPFromUnicodeWithOffsets,
1464	NULL,
1465
1466	NULL,
1467	NULL,
1468	NULL, / we don't need writeSub() because we never call a callback at fromUnicode() /
1469	NULL,
1470	ucnv_getCompleteUnicodeSet,
1471	NULL,
1472	NULL
1473	};
1474
1475	static const UConverterStaticData _IMAPStaticData={
1476	sizeof(UConverterStaticData),
1477	"IMAP-mailbox-name",
1478	`0`, / TODO CCSID for IMAP-mailbox-name /
1479	UCNV_IBM, UCNV_IMAP_MAILBOX,
1480	`1`, `4`,
1481	{ `0x3f`, `0`, `0`, `0` }, `1`, / the subchar is not used /
1482	FALSE, FALSE,
1483	`0`,
1484	`0`,
1485	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
1486	};
1487
1488	const UConverterSharedData _IMAPData=
1489	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1490
1491	#endif
1492

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnv_u7.cpp