ucnv2022.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnv2022.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2000-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* file name: ucnv2022.cpp
9	* encoding: UTF-8
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 2000feb03
14	* created by: Markus W. Scherer
15	*
16	* Change history:
17	*
18	* 06/29/2000 helena Major rewrite of the callback APIs.
19	* 08/08/2000 Ram Included support for ISO-2022-JP-2
20	* Changed implementation of toUnicode
21	* function
22	* 08/21/2000 Ram Added support for ISO-2022-KR
23	* 08/29/2000 Ram Separated implementation of EBCDIC to
24	* ucnvebdc.c
25	* 09/20/2000 Ram Added support for ISO-2022-CN
26	* Added implementations for getNextUChar()
27	* for specific 2022 country variants.
28	* 10/31/2000 Ram Implemented offsets logic functions
29	*/
30
31	#include "unicode/utypes.h"
32
33	#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35	#include "unicode/ucnv.h"
36	#include "unicode/uset.h"
37	#include "unicode/ucnv_err.h"
38	#include "unicode/ucnv_cb.h"
39	#include "unicode/utf16.h"
40	#include "ucnv_imp.h"
41	#include "ucnv_bld.h"
42	#include "ucnv_cnv.h"
43	#include "ucnvmbcs.h"
44	#include "cstring.h"
45	#include "cmemory.h"
46	#include "uassert.h"
47
48	#ifdef U_ENABLE_GENERIC_ISO_2022
49	/*
50	* I am disabling the generic ISO-2022 converter after proposing to do so on
51	* the icu mailing list two days ago.
52	*
53	* Reasons:
54	* 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55	* its designation sequences, single shifts with return to the previous state,
56	* switch-with-no-return to UTF-16BE or similar, etc.
57	* This is unlike the language-specific variants like ISO-2022-JP which
58	* require a much smaller repertoire of ISO-2022 features.
59	* These variants continue to be supported.
60	* 2. I believe that no one is really using the generic ISO-2022 converter
61	* but rather always one of the language-specific variants.
62	* Note that ICU's generic ISO-2022 converter has always output one escape
63	* sequence followed by UTF-8 for the whole stream.
64	* 3. Switching between subcharsets is extremely slow, because each time
65	* the previous converter is closed and a new one opened,
66	* without any kind of caching, least-recently-used list, etc.
67	* 4. The code is currently buggy, and given the above it does not seem
68	* reasonable to spend the time on maintenance.
69	* 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70	* This means, for example, that when ISO-8859-7 is designated, the following
71	* ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72	* The ICU ISO-2022 converter does not handle this - and has no information
73	* about which subconverter would have to be shifted vs. which is designed
74	* for 7-bit ISO-2022.
75	*
76	* Markus Scherer 2003-dec-03
77	*/
78	#endif
79
80	#if !UCONFIG_ONLY_HTML_CONVERSION
81	static const char SHIFT_IN_STR[] = "\x0F";
82	// static const char SHIFT_OUT_STR[] = "\x0E";
83	#endif
84
85	#define CR 0x0D
86	#define LF 0x0A
87	#define H_TAB 0x09
88	#define V_TAB 0x0B
89	#define SPACE 0x20
90
/* Unicode code point range of the halfwidth Katakana block handled by the JP variant. */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
95
96	/*
97	* 94-character sets with native byte values A1..FE are encoded in ISO 2022
98	* as bytes 21..7E. (Subtract 0x80.)
99	* 96-character sets with native byte values A0..FF are encoded in ISO 2022
100	* as bytes 20..7F. (Subtract 0x80.)
101	* Do not encode C1 control codes with native bytes 80..9F
102	* as bytes 00..1F (C0 control codes).
103	*/
/* Native (GR) byte ranges of 94- and 96-character sets; subtract 0x80 for the ISO 2022 form. */
enum {
    GR94_START=0xa1,
    GR94_END=0xfe,
    GR96_START=0xa0,
    GR96_END=0xff
};
110
111	/*
112	* ISO 2022 control codes must not be converted from Unicode
113	* because they would mess up the byte stream.
114	* The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115	* corresponding to SO, SI, and ESC.
116	*/
117	#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118
/* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state identifiers shared by the JP and CN variants.
 * The JP and CN groups deliberately reuse the small values 1..3 because each
 * variant uses them as indexes into its own myConverterArray[].
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        SS2_STATE=0x10,
        SS3_STATE,

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7
} StateEnum;
157
/* is the StateEnum charset value for a DBCS charset? */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208==(cs))
#else
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#endif

/* CSM(cs): single-bit charset mask for StateEnum value cs (used to build jpCharsetMasks[]) */
#define CSM(cs) ((uint16_t)1<<(cs))
166
167	/*
168	* Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169	* to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170	*
171	* Note: The converter uses some leniency:
172	* - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173	* all versions, not just JIS7 and JIS8.
174	* - ICU does not distinguish between different versions of JIS X 0208.
175	*/
176	#if UCONFIG_ONLY_HTML_CONVERSION
177	enum { MAX_JA_VERSION=`0` };
178	#else
179	enum { MAX_JA_VERSION=`4` };
180	#endif
181	static const uint16_t jpCharsetMasks[MAX_JA_VERSION+`1`]={
182	CSM(ASCII)\|CSM(JISX201)\|CSM(JISX208)\|CSM(HWKANA_7BIT),
183	#if !UCONFIG_ONLY_HTML_CONVERSION
184	CSM(ASCII)\|CSM(JISX201)\|CSM(JISX208)\|CSM(HWKANA_7BIT)\|CSM(JISX212),
185	CSM(ASCII)\|CSM(JISX201)\|CSM(JISX208)\|CSM(HWKANA_7BIT)\|CSM(JISX212)\|CSM(GB2312)\|CSM(KSC5601)\|CSM(ISO8859_1)\|CSM(ISO8859_7),
186	CSM(ASCII)\|CSM(JISX201)\|CSM(JISX208)\|CSM(HWKANA_7BIT)\|CSM(JISX212)\|CSM(GB2312)\|CSM(KSC5601)\|CSM(ISO8859_1)\|CSM(ISO8859_7),
187	CSM(ASCII)\|CSM(JISX201)\|CSM(JISX208)\|CSM(HWKANA_7BIT)\|CSM(JISX212)\|CSM(GB2312)\|CSM(KSC5601)\|CSM(ISO8859_1)\|CSM(ISO8859_7)
188	#endif
189	};
190
/* Kind of charset currently selected; stored in UConverterDataISO2022::currentType. */
typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;
199
/* Designation/shift state of one conversion direction. */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;
205
206	#define UCNV_OPTIONS_VERSION_MASK 0xf
207	#define UCNV_2022_MAX_CONVERTERS 10
208
209	typedef struct{
210	UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211	UConverter *currentConverter;
212	Cnv2022Type currentType;
213	ISO2022State toU2022State, fromU2022State;
214	uint32_t key;
215	uint32_t version;
216	#ifdef U_ENABLE_GENERIC_ISO_2022
217	UBool isFirstBuffer;
218	#endif
219	UBool isEmptySegment;
220	char name[`30`];
221	char locale[`3`];
222	}UConverterDataISO2022;
223
224	/ Protos /
225	/ ISO-2022 ----------------------------------------------------------------- /
226
227	/Forward declaration /
228	U_CFUNC void U_CALLCONV
229	ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230	UErrorCode * err);
231	U_CFUNC void U_CALLCONV
232	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233	UErrorCode * err);
234
#define ESC_2022 0x1B /*ESC*/

/* Result of matching the bytes seen so far against the escape-sequence key table. */
typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;
244
245	/*
246	* The way these state transition arrays work is:
247	* ex : ESC$B is the sequence for JISX208
248	* a) First Iteration: char is ESC
249	* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250	* int x = normalize_esq_chars_2022[27] which is equal to 1
251	* ii) Search for this value in escSeqStateTable_Key_2022[]
252	* value of x is stored at escSeqStateTable_Key_2022[0]
253	* iii) Save this index as offset
254	* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255	* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256	* b) Switch on this state and continue to next char
257	* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258	* which is normalize_esq_chars_2022[36] == 4
259	* ii) x is currently 1(from above)
260	* x<<=5 -- x is now 32
261	* x+=normalize_esq_chars_2022[36]
262	* now x is 36
263	* iii) Search for this value in escSeqStateTable_Key_2022[]
264	* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265	* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266	* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267	* c) Switch on this state and continue to next char
268	* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269	* ii) x is currently 36 (from above)
270	* x<<=5 -- x is now 1152
271	* x+=normalize_esq_chars_2022[66]
272	* now x is 1161
273	* iii) Search for this value in escSeqStateTable_Key_2022[]
274	* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275	* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276	* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277	* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278	*/
279
280
/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small non-zero code (1..29) if the byte can occur
 * in a recognized escape sequence, or 0 otherwise. getKey_2022() folds these
 * codes into a key with key = (key << 5) + code.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29     ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};
312
313	#ifdef U_ENABLE_GENERIC_ISO_2022
314	/*
315	* When the generic ISO-2022 converter is completely removed, not just disabled
316	* per #ifdef, then the following state table and the associated tables that are
317	* dimensioned with MAX_STATES_2022 should be trimmed.
318	*
319	* Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320	* the associated escape sequences starting with ESC ( B should be removed.
321	* This includes the ones with key values 1097 and all of the ones above 1000000.
322	*
323	* For the latter, the tables can simply be truncated.
324	* For the former, since the tables must be kept parallel, it is probably best
325	* to simply duplicate an adjacent table cell, parallel in all tables.
326	*
327	* It may make sense to restructure the tables, especially by using small search
328	* tables for the variants instead of indexing them parallel to the table here.
329	*/
330	#endif
331
#define MAX_STATES_2022 74
/*
 * Keys of all recognized escape-sequence prefixes, in ascending order
 * (getKey_2022() binary-searches this table). Parallel to the
 * escSeqStateTable_Value_2022[] and nextStateToUnicode*[] tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};
345
346	#ifdef U_ENABLE_GENERIC_ISO_2022
347
348	static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349	/* 0 1 2 3 4 5 6 7 8 9 */
350
351	NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
352	,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
353	,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
354	,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
355	,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
356	,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357	,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
358	,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
359	};
360
361	#endif
362
363	static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364	/* 0 1 2 3 4 5 6 7 8 9 */
365	VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
366	,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367	,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
368	,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369	,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370	,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371	,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
372	,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
373	};
374
/* Type def for refactoring changeState_2022 code*/
/* Identifies which ISO-2022 variant is driving changeState_2022(). */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR=2,
    ISO_2022_CN=3
#endif
} Variant2022;
386
387	/******** ISO 2022 Converter Protos ********/
388	static void U_CALLCONV
389	_ISO2022Open(UConverter cnv, UConverterLoadArgs pArgs, UErrorCode *errorCode);
390
391	static void U_CALLCONV
392	_ISO2022Close(UConverter *converter);
393
394	static void U_CALLCONV
395	_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397	U_CDECL_BEGIN
398	static const char * U_CALLCONV
399	_ISO2022getName(const UConverter* cnv);
400	U_CDECL_END
401
402	static void U_CALLCONV
403	_ISO_2022_WriteSub(UConverterFromUnicodeArgs args, int32_t offsetIndex, UErrorCode err);
404
405	U_CDECL_BEGIN
406	static UConverter * U_CALLCONV
407	_ISO_2022_SafeClone(const UConverter cnv, void* stackBuffer, int32_t pBufferSize, UErrorCode *status);
408
409	U_CDECL_END
410
411	#ifdef U_ENABLE_GENERIC_ISO_2022
412	static void U_CALLCONV
413	T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414	#endif
415
416	namespace {
417
418	/const UConverterSharedData _ISO2022Data;/
419	extern const UConverterSharedData _ISO2022JPData;
420
421	#if !UCONFIG_ONLY_HTML_CONVERSION
422	extern const UConverterSharedData _ISO2022KRData;
423	extern const UConverterSharedData _ISO2022CNData;
424	#endif
425
426	} // namespace
427
428	/************ Converter implementations ***************/
429
430	/ The purpose of this function is to get around gcc compiler warnings. /
431	static inline void
432	fromUWriteUInt8(UConverter *cnv,
433	const char *bytes, int32_t length,
434	uint8_t *target, const* char *targetLimit,
435	int32_t **offsets,
436	int32_t sourceIndex,
437	UErrorCode *pErrorCode)
438	{
439	char targetChars = (char* )target;
440	ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441	offsets, sourceIndex, pErrorCode);
442	target = (uint8_t)targetChars;
443
444	}
445
446	static inline void
447	setInitialStateToUnicodeKR(UConverter* /converter/, UConverterDataISO2022 *myConverterData){
448	if(myConverterData->version == `1`) {
449	UConverter *cnv = myConverterData->currentConverter;
450
451	cnv->toUnicodeStatus=`0`; / offset /
452	cnv->mode=`0`; / state /
453	cnv->toULength=`0`; / byteIndex /
454	}
455	}
456
457	static inline void
458	setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459	/ in ISO-2022-KR the designator sequence appears only once*
460	* in a file so we append it only once
461	*/
462	if( converter->charErrorBufferLength==`0`){
463
464	converter->charErrorBufferLength = `4`;
465	converter->charErrorBuffer[`0`] = `0x1b`;
466	converter->charErrorBuffer[`1`] = `0x24`;
467	converter->charErrorBuffer[`2`] = `0x29`;
468	converter->charErrorBuffer[`3`] = `0x43`;
469	}
470	if(myConverterData->version == `1`) {
471	UConverter *cnv = myConverterData->currentConverter;
472
473	cnv->fromUChar32=`0`;
474	cnv->fromUnicodeStatus=`1`; / prevLength /
475	}
476	}
477
478	static void U_CALLCONV
479	_ISO2022Open(UConverter cnv, UConverterLoadArgs pArgs, UErrorCode *errorCode){
480
481	char myLocale[`6`]={`' '`,`' '`,`' '`,`' '`,`' '`,`' '`};
482
483	cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484	if(cnv->extraInfo != NULL) {
485	UConverterNamePieces stackPieces;
486	UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487	UConverterDataISO2022 myConverterData=(UConverterDataISO2022 ) cnv->extraInfo;
488	uint32_t version;
489
490	stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492	uprv_memset(myConverterData, `0`, sizeof(UConverterDataISO2022));
493	myConverterData->currentType = ASCII1;
494	cnv->fromUnicodeStatus =FALSE;
495	if(pArgs->locale){
496	uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
497	}
498	version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499	myConverterData->version = version;
500	if(myLocale[`0`]==`'j'` && (myLocale[`1`]==`'a'`\|\| myLocale[`1`]==`'p'`) &&
501	(myLocale[`2`]==`'_'` \|\| myLocale[`2`]==`'\0'`))
502	{
503	/ open the required converters and cache them /
504	if(version>MAX_JA_VERSION) {
505	// ICU 55 fails to open a converter for an unsupported version.
506	// Previously, it fell back to version 0, but that would yield
507	// unexpected behavior.
508	*errorCode = U_MISSING_RESOURCE_ERROR;
509	return;
510	}
511	if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512	myConverterData->myConverterArray[ISO8859_7] =
513	ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514	}
515	myConverterData->myConverterArray[JISX208] =
516	ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517	if(jpCharsetMasks[version]&CSM(JISX212)) {
518	myConverterData->myConverterArray[JISX212] =
519	ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520	}
521	if(jpCharsetMasks[version]&CSM(GB2312)) {
522	myConverterData->myConverterArray[GB2312] =
523	ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); / gb_2312_80-1 /
524	}
525	if(jpCharsetMasks[version]&CSM(KSC5601)) {
526	myConverterData->myConverterArray[KSC5601] =
527	ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528	}
529
530	/ set the function pointers to appropriate funtions /
531	cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532	uprv_strcpy(myConverterData->locale,"ja");
533
534	(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535	size_t len = uprv_strlen(myConverterData->name);
536	myConverterData->name[len]=(char)(myConverterData->version+(int)`'0'`);
537	myConverterData->name[len+`1`]=`'\0'`;
538	}
539	#if !UCONFIG_ONLY_HTML_CONVERSION
540	else if(myLocale[`0`]==`'k'` && (myLocale[`1`]==`'o'`\|\| myLocale[`1`]==`'r'`) &&
541	(myLocale[`2`]==`'_'` \|\| myLocale[`2`]==`'\0'`))
542	{
543	if(version>`1`) {
544	// ICU 55 fails to open a converter for an unsupported version.
545	// Previously, it fell back to version 0, but that would yield
546	// unexpected behavior.
547	*errorCode = U_MISSING_RESOURCE_ERROR;
548	return;
549	}
550	const char *cnvName;
551	if(version==`1`) {
552	cnvName="icu-internal-25546";
553	} else {
554	cnvName="ibm-949";
555	myConverterData->version=version=`0`;
556	}
557	if(pArgs->onlyTestIsLoadable) {
558	ucnv_canCreateConverter(cnvName, errorCode); / errorCode carries result /
559	uprv_free(cnv->extraInfo);
560	cnv->extraInfo=NULL;
561	return;
562	} else {
563	myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564	if (U_FAILURE(*errorCode)) {
565	_ISO2022Close(cnv);
566	return;
567	}
568
569	if(version==`1`) {
570	(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571	uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, `4`);
572	cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573	}else{
574	(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575	}
576
577	/ initialize the state variables /
578	setInitialStateToUnicodeKR(cnv, myConverterData);
579	setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581	/ set the function pointers to appropriate funtions /
582	cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583	uprv_strcpy(myConverterData->locale,"ko");
584	}
585	}
586	else if(((myLocale[`0`]==`'z'` && myLocale[`1`]==`'h'`) \|\| (myLocale[`0`]==`'c'`&& myLocale[`1`]==`'n'`))&&
587	(myLocale[`2`]==`'_'` \|\| myLocale[`2`]==`'\0'`))
588	{
589	if(version>`2`) {
590	// ICU 55 fails to open a converter for an unsupported version.
591	// Previously, it fell back to version 0, but that would yield
592	// unexpected behavior.
593	*errorCode = U_MISSING_RESOURCE_ERROR;
594	return;
595	}
596
597	/ open the required converters and cache them /
598	myConverterData->myConverterArray[GB2312_1] =
599	ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600	if(version==`1`) {
601	myConverterData->myConverterArray[ISO_IR_165] =
602	ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603	}
604	myConverterData->myConverterArray[CNS_11643] =
605	ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608	/ set the function pointers to appropriate funtions /
609	cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610	uprv_strcpy(myConverterData->locale,"cn");
611
612	if (version==`0`){
613	myConverterData->version = `0`;
614	(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615	}else if (version==`1`){
616	myConverterData->version = `1`;
617	(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618	}else {
619	myConverterData->version = `2`;
620	(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621	}
622	}
623	#endif // !UCONFIG_ONLY_HTML_CONVERSION
624	else{
625	#ifdef U_ENABLE_GENERIC_ISO_2022
626	myConverterData->isFirstBuffer = TRUE;
627
628	/ append the UTF-8 escape sequence /
629	cnv->charErrorBufferLength = `3`;
630	cnv->charErrorBuffer[`0`] = `0x1b`;
631	cnv->charErrorBuffer[`1`] = `0x25`;
632	cnv->charErrorBuffer[`2`] = `0x42`;
633
634	cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635	/ initialize the state variables /
636	uprv_strcpy(myConverterData->name,"ISO_2022");
637	#else
638	*errorCode = U_MISSING_RESOURCE_ERROR;
639	// Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640	// data loading error code.
641	return;
642	#endif
643	}
644
645	cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647	if(U_FAILURE(*errorCode) \|\| pArgs->onlyTestIsLoadable) {
648	_ISO2022Close(cnv);
649	}
650	} else {
651	*errorCode = U_MEMORY_ALLOCATION_ERROR;
652	}
653	}
654
655
656	static void U_CALLCONV
657	_ISO2022Close(UConverter *converter) {
658	UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659	UConverterSharedData **array = myData->myConverterArray;
660	int32_t i;
661
662	if (converter->extraInfo != NULL) {
663	/close the array of converter pointers and free the memory/
664	for (i=`0`; i<UCNV_2022_MAX_CONVERTERS; i++) {
665	if(array[i]!=NULL) {
666	ucnv_unloadSharedDataIfReady(array[i]);
667	}
668	}
669
670	ucnv_close(myData->currentConverter);
671
672	if(!converter->isExtraLocal){
673	uprv_free (converter->extraInfo);
674	converter->extraInfo = NULL;
675	}
676	}
677	}
678
679	static void U_CALLCONV
680	_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681	UConverterDataISO2022 myConverterData=(UConverterDataISO2022 ) (converter->extraInfo);
682	if(choice<=UCNV_RESET_TO_UNICODE) {
683	uprv_memset(&myConverterData->toU2022State, `0`, sizeof(ISO2022State));
684	myConverterData->key = `0`;
685	myConverterData->isEmptySegment = FALSE;
686	}
687	if(choice!=UCNV_RESET_TO_UNICODE) {
688	uprv_memset(&myConverterData->fromU2022State, `0`, sizeof(ISO2022State));
689	}
690	#ifdef U_ENABLE_GENERIC_ISO_2022
691	if(myConverterData->locale[`0`] == `0`){
692	if(choice<=UCNV_RESET_TO_UNICODE) {
693	myConverterData->isFirstBuffer = TRUE;
694	myConverterData->key = `0`;
695	if (converter->mode == UCNV_SO){
696	ucnv_close (myConverterData->currentConverter);
697	myConverterData->currentConverter=NULL;
698	}
699	converter->mode = UCNV_SI;
700	}
701	if(choice!=UCNV_RESET_TO_UNICODE) {
702	/ re-append UTF-8 escape sequence /
703	converter->charErrorBufferLength = `3`;
704	converter->charErrorBuffer[`0`] = `0x1b`;
705	converter->charErrorBuffer[`1`] = `0x28`;
706	converter->charErrorBuffer[`2`] = `0x42`;
707	}
708	}
709	else
710	#endif
711	{
712	/ reset the state variables /
713	if(myConverterData->locale[`0`] == `'k'`){
714	if(choice<=UCNV_RESET_TO_UNICODE) {
715	setInitialStateToUnicodeKR(converter, myConverterData);
716	}
717	if(choice!=UCNV_RESET_TO_UNICODE) {
718	setInitialStateFromUnicodeKR(converter, myConverterData);
719	}
720	}
721	}
722	}
723
724	U_CDECL_BEGIN
725
726	static const char * U_CALLCONV
727	_ISO2022getName(const UConverter* cnv){
728	if(cnv->extraInfo){
729	UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730	return myData->name;
731	}
732	return NULL;
733	}
734
735	U_CDECL_END
736
737
738	/************ to unicode ****************/
739	/****************************************************************************
740	* Recognized escape sequences are
741	* <ESC>(B ASCII
742	* <ESC>.A ISO-8859-1
743	* <ESC>.F ISO-8859-7
744	* <ESC>(J JISX-201
745	* <ESC>(I JISX-201
746	* <ESC>$B JISX-208
747	* <ESC>$@ JISX-208
748	* <ESC>$(D JISX-212
749	* <ESC>$A GB2312
750	* <ESC>$(C KSC5601
751	*/
752	static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753	/ 0 1 2 3 4 5 6 7 8 9 /
754	INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
755	,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
756	,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
757	,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
758	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
762	};
763
764	#if !UCONFIG_ONLY_HTML_CONVERSION
765	/************ to unicode ****************/
766	static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767	/ 0 1 2 3 4 5 6 7 8 9 /
768	INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
769	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
770	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
771	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
772	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
773	,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
774	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
775	,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
776	};
777	#endif
778
779
780	static UCNV_TableStates_2022
781	getKey_2022(char c,int32_t* key,int32_t* offset){
782	int32_t togo;
783	int32_t low = `0`;
784	int32_t hi = MAX_STATES_2022;
785	int32_t oldmid=`0`;
786
787	togo = normalize_esq_chars_2022[(uint8_t)c];
788	if(togo == `0`) {
789	/ not a valid character anywhere in an escape sequence /
790	*key = `0`;
791	*offset = `0`;
792	return INVALID_2022;
793	}
794	togo = (*key << `5`) + togo;
795
796	while (hi != low) /binary search/{
797
798	int32_t mid = (hi+low) >> `1`; /Finds median/
799
800	if (mid == oldmid)
801	break;
802
803	if (escSeqStateTable_Key_2022[mid] > togo){
804	hi = mid;
805	}
806	else if (escSeqStateTable_Key_2022[mid] < togo){
807	low = mid;
808	}
809	else /we found it/{
810	*key = togo;
811	*offset = mid;
812	return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
813	}
814	oldmid = mid;
815
816	}
817
818	*key = `0`;
819	*offset = `0`;
820	return INVALID_2022;
821	}
822
823	/runs through a state machine to determine the escape sequence - codepage correspondance*
824	*/
825	static void
826	changeState_2022(UConverter* _this,
827	const char** source,
828	const char* sourceLimit,
829	Variant2022 var,
830	UErrorCode* err){
831	UCNV_TableStates_2022 value;
832	UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833	uint32_t key = myData2022->key;
834	int32_t offset = `0`;
835	int8_t initialToULength = _this->toULength;
836	char c;
837
838	value = VALID_NON_TERMINAL_2022;
839	while (*source < sourceLimit) {
840	c = (source)++;
841	_this->toUBytes[_this->toULength++]=(uint8_t)c;
842	value = getKey_2022(c,(int32_t *) &key, &offset);
843
844	switch (value){
845
846	case VALID_NON_TERMINAL_2022 :
847	/ continue with the loop /
848	break;
849
850	case VALID_TERMINAL_2022:
851	key = `0`;
852	goto DONE;
853
854	case INVALID_2022:
855	goto DONE;
856
857	case VALID_MAYBE_TERMINAL_2022:
858	#ifdef U_ENABLE_GENERIC_ISO_2022
859	/ ESC ( B is ambiguous only for ISO_2022 itself /
860	if(var == ISO_2022) {
861	/ discard toUBytes[] for ESC ( B because this sequence is correct and complete /
862	_this->toULength = `0`;
863
864	/ TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay /
865
866	/ continue with the loop /
867	value = VALID_NON_TERMINAL_2022;
868	break;
869	} else
870	#endif
871	{
872	/ not ISO_2022 itself, finish here /
873	value = VALID_TERMINAL_2022;
874	key = `0`;
875	goto DONE;
876	}
877	}
878	}
879
880	DONE:
881	myData2022->key = key;
882
883	if (value == VALID_NON_TERMINAL_2022) {
884	/ indicate that the escape sequence is incomplete: key!=0 /
885	return;
886	} else if (value == INVALID_2022 ) {
887	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
888	} else / value == VALID_TERMINAL_2022 / {
889	switch(var){
890	#ifdef U_ENABLE_GENERIC_ISO_2022
891	case ISO_2022:
892	{
893	const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894	if(chosenConverterName == NULL) {
895	/ SS2 or SS3 /
896	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897	_this->toUCallbackReason = UCNV_UNASSIGNED;
898	return;
899	}
900
901	_this->mode = UCNV_SI;
902	ucnv_close(myData2022->currentConverter);
903	myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904	if(U_SUCCESS(*err)) {
905	myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906	_this->mode = UCNV_SO;
907	}
908	break;
909	}
910	#endif
911	case ISO_2022_JP:
912	{
913	StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914	switch(tempState) {
915	case INVALID_STATE:
916	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917	break;
918	case SS2_STATE:
919	if(myData2022->toU2022State.cs[`2`]!=`0`) {
920	if(myData2022->toU2022State.g<`2`) {
921	myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922	}
923	myData2022->toU2022State.g=`2`;
924	} else {
925	/ illegal to have SS2 before a matching designator /
926	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
927	}
928	break;
929	/ case SS3_STATE: not used in ISO-2022-JP-x /
930	case ISO8859_1:
931	case ISO8859_7:
932	if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == `0`) {
933	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934	} else {
935	/ G2 charset for SS2 /
936	myData2022->toU2022State.cs[`2`]=(int8_t)tempState;
937	}
938	break;
939	default:
940	if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == `0`) {
941	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942	} else {
943	/ G0 charset /
944	myData2022->toU2022State.cs[`0`]=(int8_t)tempState;
945	}
946	break;
947	}
948	}
949	break;
950	#if !UCONFIG_ONLY_HTML_CONVERSION
951	case ISO_2022_CN:
952	{
953	StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954	switch(tempState) {
955	case INVALID_STATE:
956	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957	break;
958	case SS2_STATE:
959	if(myData2022->toU2022State.cs[`2`]!=`0`) {
960	if(myData2022->toU2022State.g<`2`) {
961	myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962	}
963	myData2022->toU2022State.g=`2`;
964	} else {
965	/ illegal to have SS2 before a matching designator /
966	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
967	}
968	break;
969	case SS3_STATE:
970	if(myData2022->toU2022State.cs[`3`]!=`0`) {
971	if(myData2022->toU2022State.g<`2`) {
972	myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973	}
974	myData2022->toU2022State.g=`3`;
975	} else {
976	/ illegal to have SS3 before a matching designator /
977	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
978	}
979	break;
980	case ISO_IR_165:
981	if(myData2022->version==`0`) {
982	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983	break;
984	}
985	U_FALLTHROUGH;
986	case GB2312_1:
987	U_FALLTHROUGH;
988	case CNS_11643_1:
989	myData2022->toU2022State.cs[`1`]=(int8_t)tempState;
990	break;
991	case CNS_11643_2:
992	myData2022->toU2022State.cs[`2`]=(int8_t)tempState;
993	break;
994	default:
995	/ other CNS 11643 planes /
996	if(myData2022->version==`0`) {
997	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998	} else {
999	myData2022->toU2022State.cs[`3`]=(int8_t)tempState;
1000	}
1001	break;
1002	}
1003	}
1004	break;
1005	case ISO_2022_KR:
1006	if(offset==`0x30`){
1007	/ nothing to be done, just accept this one escape sequence /
1008	} else {
1009	*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010	}
1011	break;
1012	#endif // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014	default:
1015	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016	break;
1017	}
1018	}
1019	if(U_SUCCESS(*err)) {
1020	_this->toULength = `0`;
1021	} else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022	if(_this->toULength>`1`) {
1023	/*
1024	* Ticket 5691: consistent illegal sequences:
1025	* - We include at least the first byte (ESC) in the illegal sequence.
1026	* - If any of the non-initial bytes could be the start of a character,
1027	* we stop the illegal sequence before the first one of those.
1028	* In escape sequences, all following bytes are "printable", that is,
1029	* unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030	* they are valid single/lead bytes.
1031	* For simplicity, we always only report the initial ESC byte as the
1032	* illegal sequence and back out all other bytes we looked at.
1033	*/
1034	/ Back out some bytes. /
1035	int8_t backOutDistance=_this->toULength-`1`;
1036	int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037	if(backOutDistance<=bytesFromThisBuffer) {
1038	/ same as initialToULength<=1 /
1039	*source-=backOutDistance;
1040	} else {
1041	/ Back out bytes from the previous buffer: Need to replay them. /
1042	_this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043	/ same as -(initialToULength-1) /
1044	/ preToULength is negative! /
1045	uprv_memcpy(_this->preToU, _this->toUBytes+`1`, -_this->preToULength);
1046	*source-=bytesFromThisBuffer;
1047	}
1048	_this->toULength=`1`;
1049	}
1050	} else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051	_this->toUCallbackReason = UCNV_UNASSIGNED;
1052	}
1053	}
1054
1055	#if !UCONFIG_ONLY_HTML_CONVERSION
1056	/Checks the characters of the buffer against valid 2022 escape sequences*
1057	*if the match we return a pointer to the initial start of the sequence otherwise
1058	*we return sourceLimit
1059	*/
1060	/for 2022 looks ahead in the stream*
1061	*to determine the longest possible convertible
1062	*data stream
1063	*/
1064	static inline const char*
1065	getEndOfBuffer_2022(const char** source,
1066	const char* sourceLimit,
1067	UBool /flush/){
1068
1069	const char* mySource = *source;
1070
1071	#ifdef U_ENABLE_GENERIC_ISO_2022
1072	if (*source >= sourceLimit)
1073	return sourceLimit;
1074
1075	do{
1076
1077	if (*mySource == ESC_2022){
1078	int8_t i;
1079	int32_t key = `0`;
1080	int32_t offset;
1081	UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083	/ Kludge: I could not*
1084	* figure out the reason for validating an escape sequence
1085	* twice - once here and once in changeState_2022().
1086	* is it possible to have an ESC character in a ISO2022
1087	* byte stream which is valid in a code page? Is it legal?
1088	*/
1089	for (i=`0`;
1090	(mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091	i++) {
1092	value = getKey_2022(*(mySource+i), &key, &offset);
1093	}
1094	if (value > `0` \|\| *mySource==ESC_2022)
1095	return mySource;
1096
1097	if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098	return sourceLimit;
1099	}
1100	}while (++mySource < sourceLimit);
1101
1102	return sourceLimit;
1103	#else
1104	while(mySource < sourceLimit && *mySource != ESC_2022) {
1105	++mySource;
1106	}
1107	return mySource;
1108	#endif
1109	}
1110	#endif
1111
1112	/ This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c*
1113	* any future change in _MBCSFromUChar32() function should be reflected here.
1114	* @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115	*/
1116	static inline int32_t
1117	MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118	UChar32 c,
1119	uint32_t* value,
1120	UBool useFallback,
1121	int outputType)
1122	{
1123	const int32_t *cx;
1124	const uint16_t *table;
1125	uint32_t stage2Entry;
1126	uint32_t myValue;
1127	int32_t length;
1128	const uint8_t *p;
1129	/*
1130	* TODO(markus): Use and require new, faster MBCS conversion table structures.
1131	* Use internal version of ucnv_open() that verifies that the new structures are available,
1132	* else U_INTERNAL_PROGRAM_ERROR.
1133	*/
1134	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
1135	if(c<`0x10000` \|\| (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136	table=sharedData->mbcs.fromUnicodeTable;
1137	stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138	/ get the bytes and the length for the output /
1139	if(outputType==MBCS_OUTPUT_2){
1140	myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141	if(myValue<=`0xff`) {
1142	length=`1`;
1143	} else {
1144	length=`2`;
1145	}
1146	} else / outputType==MBCS_OUTPUT_3 / {
1147	p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148	myValue=((uint32_t)*p<<`16`)\|((uint32_t)p[`1`]<<`8`)\|p[`2`];
1149	if(myValue<=`0xff`) {
1150	length=`1`;
1151	} else if(myValue<=`0xffff`) {
1152	length=`2`;
1153	} else {
1154	length=`3`;
1155	}
1156	}
1157	/ is this code point assigned, or do we use fallbacks? /
1158	if((stage2Entry&(`1`<<(`16`+(c&`0xf`))))!=`0`) {
1159	/ assigned /
1160	*value=myValue;
1161	return length;
1162	} else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=`0`) {
1163	/*
1164	* We allow a 0 byte output if the "assigned" bit is set for this entry.
1165	* There is no way with this data structure for fallback output
1166	* to be a zero byte.
1167	*/
1168	*value=myValue;
1169	return -length;
1170	}
1171	}
1172
1173	cx=sharedData->mbcs.extIndexes;
1174	if(cx!=NULL) {
1175	return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176	}
1177
1178	/ unassigned /
1179	return `0`;
1180	}
1181
1182	/ This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c*
1183	* any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184	* @param retval pointer to output byte
1185	* @return 1 roundtrip byte 0 no mapping -1 fallback byte
1186	*/
1187	static inline int32_t
1188	MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189	UChar32 c,
1190	uint32_t* retval,
1191	UBool useFallback)
1192	{
1193	const uint16_t *table;
1194	int32_t value;
1195	/ BMP-only codepages are stored without stage 1 entries for supplementary code points /
1196	if(c>=`0x10000` && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197	return `0`;
1198	}
1199	/ convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) /
1200	table=sharedData->mbcs.fromUnicodeTable;
1201	/ get the byte for the output /
1202	value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203	/ is this code point assigned, or do we use fallbacks? /
1204	*retval=(uint32_t)(value&`0xff`);
1205	if(value>=`0xf00`) {
1206	return `1`; / roundtrip /
1207	} else if(useFallback ? value>=`0x800` : value>=`0xc00`) {
1208	return -`1`; / fallback taken /
1209	} else {
1210	return `0`; / no mapping /
1211	}
1212	}
1213
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* the 16-bit test bounds the lead byte, the 8-bit test bounds the trail byte */
    if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1230
1231	#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
1247	#endif
1248
1249	#ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251	/**********************************************************************************
1252	* ISO-2022 Converter
1253	*
1254	*
1255	*/
1256
1257	static void U_CALLCONV
1258	T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259	UErrorCode* err){
1260	const char* mySourceLimit, *realSourceLimit;
1261	const char* sourceStart;
1262	const UChar* myTargetStart;
1263	UConverter* saveThis;
1264	UConverterDataISO2022* myData;
1265	int8_t length;
1266
1267	saveThis = args->converter;
1268	myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270	realSourceLimit = args->sourceLimit;
1271	while (args->source < realSourceLimit) {
1272	if(myData->key == `0`) { / are we in the middle of an escape sequence? /
1273	/Find the end of the buffer e.g : Next Escape Seq \| end of Buffer/
1274	mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276	if(args->source < mySourceLimit) {
1277	if(myData->currentConverter==NULL) {
1278	myData->currentConverter = ucnv_open("ASCII",err);
1279	if(U_FAILURE(*err)){
1280	return;
1281	}
1282
1283	myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284	saveThis->mode = UCNV_SO;
1285	}
1286
1287	/ convert to before the ESC or until the end of the buffer /
1288	myData->isFirstBuffer=FALSE;
1289	sourceStart = args->source;
1290	myTargetStart = args->target;
1291	args->converter = myData->currentConverter;
1292	ucnv_toUnicode(args->converter,
1293	&args->target,
1294	args->targetLimit,
1295	&args->source,
1296	mySourceLimit,
1297	args->offsets,
1298	(UBool)(args->flush && mySourceLimit == realSourceLimit),
1299	err);
1300	args->converter = saveThis;
1301
1302	if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303	/ move the overflow buffer /
1304	length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305	myData->currentConverter->UCharErrorBufferLength = `0`;
1306	if(length > `0`) {
1307	uprv_memcpy(saveThis->UCharErrorBuffer,
1308	myData->currentConverter->UCharErrorBuffer,
1309	length*U_SIZEOF_UCHAR);
1310	}
1311	return;
1312	}
1313
1314	/*
1315	* At least one of:
1316	* -Error while converting
1317	* -Done with entire buffer
1318	* -Need to write offsets or update the current offset
1319	* (leave that up to the code in ucnv.c)
1320	*
1321	* or else we just stopped at an ESC byte and continue with changeState_2022()
1322	*/
1323	if (U_FAILURE(*err) \|\|
1324	(args->source == realSourceLimit) \|\|
1325	(args->offsets != NULL && (args->target != myTargetStart \|\| args->source != sourceStart) \|\|
1326	(mySourceLimit < realSourceLimit && myData->currentConverter->toULength > `0`))
1327	) {
1328	/ copy partial or error input for truncated detection and error handling /
1329	if(U_FAILURE(*err)) {
1330	length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331	if(length > `0`) {
1332	uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333	}
1334	} else {
1335	length = saveThis->toULength = myData->currentConverter->toULength;
1336	if(length > `0`) {
1337	uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338	if(args->source < mySourceLimit) {
1339	err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC /
1340	}
1341	}
1342	}
1343	return;
1344	}
1345	}
1346	}
1347
1348	sourceStart = args->source;
1349	changeState_2022(args->converter,
1350	&(args->source),
1351	realSourceLimit,
1352	ISO_2022,
1353	err);
1354	if (U_FAILURE(*err) \|\| (args->source != sourceStart && args->offsets != NULL)) {
1355	/ let the ucnv.c code update its current offset /
1356	return;
1357	}
1358	}
1359	}
1360
1361	#endif
1362
1363	/*
1364	* To Unicode Callback helper function
1365	*/
1366	static void
1367	toUnicodeCallback(UConverter *cnv,
1368	const uint32_t sourceChar, const uint32_t targetUniChar,
1369	UErrorCode* err){
1370	if(sourceChar>`0xff`){
1371	cnv->toUBytes[`0`] = (uint8_t)(sourceChar>>`8`);
1372	cnv->toUBytes[`1`] = (uint8_t)sourceChar;
1373	cnv->toULength = `2`;
1374	}
1375	else{
1376	cnv->toUBytes[`0`] =(char) sourceChar;
1377	cnv->toULength = `1`;
1378	}
1379
1380	if(targetUniChar == (missingCharMarker-`1`/0xfffe/)){
1381	*err = U_INVALID_CHAR_FOUND;
1382	}
1383	else{
1384	*err = U_ILLEGAL_CHAR_FOUND;
1385	}
1386	}
1387
1388	/***********************************ISO-2022-JP**********************************************/
1389
1390	/************************************ IMPORTANT ************************************************
1391	* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392	* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393	* The converter iterates over each Unicode codepoint
1394	* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395	* processed one char at a time it would make sense to reduce the extra processing a canned converter
1396	* would do as far as possible.
1397	*
1398	* If the implementation of these macros or structure of sharedData struct change in the future, make
1399	* sure that ISO-2022 is also changed.
1400	***************************************************************************************************
1401	*/
1402
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*       source : RFC-1554
*
*       JISX201, JISX208,JISX212 : new .cnv data files created
*       KSC5601 : alias to ibm-949 mapping table
*       GB2312 : alias to ibm-1386 mapping table
*       ISO-8859-1 : Algorithmic implemented as LATIN1 case
*       ISO-8859-7 : alias to ibm-9409 mapping table
*/
1429
1430	/ preference order of JP charsets /
1431	static const StateEnum jpCharsetPref[]={
1432	ASCII,
1433	JISX201,
1434	ISO8859_1,
1435	JISX208,
1436	ISO8859_7,
1437	JISX212,
1438	GB2312,
1439	KSC5601,
1440	HWKANA_7BIT
1441	};
1442
/*
 * Designation escape sequences for the ISO-2022-JP charsets.
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* Byte lengths of the escape sequences in escSeqChars[], in the same order. */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1470
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check whether it
*      is equal to the initial iteration state (initIterState)
*      Yes ->  The character cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1487
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only 5C (-> U+00A5) and 7E (-> U+203E) differ from ASCII;
 * every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}
1501
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values are taken by
 * U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
1516
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    value <<= 1;

    if(trail <= 0x9e) {
        /* first (odd) row of the pair */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        /* second (even) row of the pair */
        value |= trail - 0x7e;
    }
    return value;
}
1552
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first (row) byte of the JIS X 0208 pair
 * @param c2    second (cell) byte of the JIS X 0208 pair
 * @param bytes out: the Shift-JIS lead and trail bytes; 0 marks an invalid byte
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: first half of the Shift-JIS trail byte range */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* two JIS X 0208 rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1589
1590	/*
1591	* JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592	* Katakana.
1593	* Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594	* because Shift-JIS roundtrips half-width Katakana to single bytes.
1595	* These were the only fallbacks in ICU's jisx-208.ucm file.
1596	*/
1597	static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + `1`] = {
1598	`0x2123`, / U+FF61 /
1599	`0x2156`,
1600	`0x2157`,
1601	`0x2122`,
1602	`0x2126`,
1603	`0x2572`,
1604	`0x2521`,
1605	`0x2523`,
1606	`0x2525`,
1607	`0x2527`,
1608	`0x2529`,
1609	`0x2563`,
1610	`0x2565`,
1611	`0x2567`,
1612	`0x2543`,
1613	`0x213C`, / U+FF70 /
1614	`0x2522`,
1615	`0x2524`,
1616	`0x2526`,
1617	`0x2528`,
1618	`0x252A`,
1619	`0x252B`,
1620	`0x252D`,
1621	`0x252F`,
1622	`0x2531`,
1623	`0x2533`,
1624	`0x2535`,
1625	`0x2537`,
1626	`0x2539`,
1627	`0x253B`,
1628	`0x253D`,
1629	`0x253F`, / U+FF80 /
1630	`0x2541`,
1631	`0x2544`,
1632	`0x2546`,
1633	`0x2548`,
1634	`0x254A`,
1635	`0x254B`,
1636	`0x254C`,
1637	`0x254D`,
1638	`0x254E`,
1639	`0x254F`,
1640	`0x2552`,
1641	`0x2555`,
1642	`0x2558`,
1643	`0x255B`,
1644	`0x255E`,
1645	`0x255F`, / U+FF90 /
1646	`0x2560`,
1647	`0x2561`,
1648	`0x2562`,
1649	`0x2564`,
1650	`0x2566`,
1651	`0x2568`,
1652	`0x2569`,
1653	`0x256A`,
1654	`0x256B`,
1655	`0x256C`,
1656	`0x256D`,
1657	`0x256F`,
1658	`0x2573`,
1659	`0x212B`,
1660	`0x212C` / U+FF9F /
1661	};
1662
1663	static void U_CALLCONV
1664	UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1665	UConverter *cnv = args->converter;
1666	UConverterDataISO2022 *converterData;
1667	ISO2022State *pFromU2022State;
1668	uint8_t target = (uint8_t ) args->target;
1669	const uint8_t targetLimit = (const* uint8_t *) args->targetLimit;
1670	const UChar* source = args->source;
1671	const UChar* sourceLimit = args->sourceLimit;
1672	int32_t* offsets = args->offsets;
1673	UChar32 sourceChar;
1674	char buffer[`8`];
1675	int32_t len, outLen;
1676	int8_t choices[`10`];
1677	int32_t choiceCount;
1678	uint32_t targetValue = `0`;
1679	UBool useFallback;
1680
1681	int32_t i;
1682	int8_t cs, g;
1683
1684	/ set up the state /
1685	converterData = (UConverterDataISO2022*)cnv->extraInfo;
1686	pFromU2022State = &converterData->fromU2022State;
1687
1688	choiceCount = `0`;
1689
1690	/ check if the last codepoint of previous buffer was a lead surrogate/
1691	if((sourceChar = cnv->fromUChar32)!=`0` && target< targetLimit) {
1692	goto getTrail;
1693	}
1694
1695	while(source < sourceLimit) {
1696	if(target < targetLimit) {
1697
1698	sourceChar = *(source++);
1699	/check if the char is a First surrogate/
1700	if(U16_IS_SURROGATE(sourceChar)) {
1701	if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1702	getTrail:
1703	/look ahead to find the trail surrogate/
1704	if(source < sourceLimit) {
1705	/ test the following code unit /
1706	UChar trail=(UChar) *source;
1707	if(U16_IS_TRAIL(trail)) {
1708	source++;
1709	sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1710	cnv->fromUChar32=`0x00`;
1711	/ convert this supplementary code point /
1712	/ exit this condition tree /
1713	} else {
1714	/ this is an unmatched lead code unit (1st surrogate) /
1715	/ callback(illegal) /
1716	*err=U_ILLEGAL_CHAR_FOUND;
1717	cnv->fromUChar32=sourceChar;
1718	break;
1719	}
1720	} else {
1721	/ no more input /
1722	cnv->fromUChar32=sourceChar;
1723	break;
1724	}
1725	} else {
1726	/ this is an unmatched trail code unit (2nd surrogate) /
1727	/ callback(illegal) /
1728	*err=U_ILLEGAL_CHAR_FOUND;
1729	cnv->fromUChar32=sourceChar;
1730	break;
1731	}
1732	}
1733
1734	/ do not convert SO/SI/ESC /
1735	if(IS_2022_CONTROL(sourceChar)) {
1736	/ callback(illegal) /
1737	*err=U_ILLEGAL_CHAR_FOUND;
1738	cnv->fromUChar32=sourceChar;
1739	break;
1740	}
1741
1742	/ do the conversion /
1743
1744	if(choiceCount == `0`) {
1745	uint16_t csm;
1746
1747	/*
1748	* The csm variable keeps track of which charsets are allowed
1749	* and not used yet while building the choices[].
1750	*/
1751	csm = jpCharsetMasks[converterData->version];
1752	choiceCount = `0`;
1753
1754	/ JIS7/8: try single-byte half-width Katakana before JISX208 /
1755	if(converterData->version == `3` \|\| converterData->version == `4`) {
1756	choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1757	}
1758	/ Do not try single-byte half-width Katakana for other versions. /
1759	csm &= ~CSM(HWKANA_7BIT);
1760
1761	/ try the current G0 charset /
1762	choices[choiceCount++] = cs = pFromU2022State->cs[`0`];
1763	csm &= ~CSM(cs);
1764
1765	/ try the current G2 charset /
1766	if((cs = pFromU2022State->cs[`2`]) != `0`) {
1767	choices[choiceCount++] = cs;
1768	csm &= ~CSM(cs);
1769	}
1770
1771	/ try all the other possible charsets /
1772	for(i = `0`; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1773	cs = (int8_t)jpCharsetPref[i];
1774	if(CSM(cs) & csm) {
1775	choices[choiceCount++] = cs;
1776	csm &= ~CSM(cs);
1777	}
1778	}
1779	}
1780
1781	cs = g = `0`;
1782	/*
1783	* len==0: no mapping found yet
1784	* len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785	* len>0: found a roundtrip result, done
1786	*/
1787	len = `0`;
1788	/*
1789	* We will turn off useFallback after finding a fallback,
1790	* but we still get fallbacks from PUA code points as usual.
1791	* Therefore, we will also need to check that we don't overwrite
1792	* an early fallback with a later one.
1793	*/
1794	useFallback = cnv->useFallback;
1795
1796	for(i = `0`; i < choiceCount && len <= `0`; ++i) {
1797	uint32_t value;
1798	int32_t len2;
1799	int8_t cs0 = choices[i];
1800	switch(cs0) {
1801	case ASCII:
1802	if(sourceChar <= `0x7f`) {
1803	targetValue = (uint32_t)sourceChar;
1804	len = `1`;
1805	cs = cs0;
1806	g = `0`;
1807	}
1808	break;
1809	case ISO8859_1:
1810	if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1811	targetValue = (uint32_t)sourceChar - `0x80`;
1812	len = `1`;
1813	cs = cs0;
1814	g = `2`;
1815	}
1816	break;
1817	case HWKANA_7BIT:
1818	if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1819	if(converterData->version==`3`) {
1820	/ JIS7: use G1 (SO) /
1821	/ Shift U+FF61..U+FF9F to bytes 21..5F. /
1822	targetValue = (uint32_t)(sourceChar - (HWKANA_START - `0x21`));
1823	len = `1`;
1824	pFromU2022State->cs[`1`] = cs = cs0; / do not output an escape sequence /
1825	g = `1`;
1826	} else if(converterData->version==`4`) {
1827	/ JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below /
1828	/ Shift U+FF61..U+FF9F to bytes A1..DF. /
1829	targetValue = (uint32_t)(sourceChar - (HWKANA_START - `0xa1`));
1830	len = `1`;
1831
1832	cs = pFromU2022State->cs[`0`];
1833	if(IS_JP_DBCS(cs)) {
1834	/ switch from a DBCS charset to JISX201 /
1835	cs = (int8_t)JISX201;
1836	}
1837	/ else stay in the current G0 charset /
1838	g = `0`;
1839	}
1840	/ else do not use HWKANA_7BIT with other versions /
1841	}
1842	break;
1843	case JISX201:
1844	/ G0 SBCS /
1845	value = jisx201FromU(sourceChar);
1846	if(value <= `0x7f`) {
1847	targetValue = value;
1848	len = `1`;
1849	cs = cs0;
1850	g = `0`;
1851	useFallback = FALSE;
1852	}
1853	break;
1854	case JISX208:
1855	/ G0 DBCS from Shift-JIS table /
1856	len2 = MBCS_FROM_UCHAR32_ISO2022(
1857	converterData->myConverterArray[cs0],
1858	sourceChar, &value,
1859	useFallback, MBCS_OUTPUT_2);
1860	if(len2 == `2` \|\| (len2 == -`2` && len == `0`)) { / only accept DBCS: abs(len)==2 /
1861	value = _2022FromSJIS(value);
1862	if(value != `0`) {
1863	targetValue = value;
1864	len = len2;
1865	cs = cs0;
1866	g = `0`;
1867	useFallback = FALSE;
1868	}
1869	} else if(len == `0` && useFallback &&
1870	(uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1871	targetValue = hwkana_fb[sourceChar - HWKANA_START];
1872	len = -`2`;
1873	cs = cs0;
1874	g = `0`;
1875	useFallback = FALSE;
1876	}
1877	break;
1878	case ISO8859_7:
1879	/ G0 SBCS forced to 7-bit output /
1880	len2 = MBCS_SINGLE_FROM_UCHAR32(
1881	converterData->myConverterArray[cs0],
1882	sourceChar, &value,
1883	useFallback);
1884	if(len2 != `0` && !(len2 < `0` && len != `0`) && GR96_START <= value && value <= GR96_END) {
1885	targetValue = value - `0x80`;
1886	len = len2;
1887	cs = cs0;
1888	g = `2`;
1889	useFallback = FALSE;
1890	}
1891	break;
1892	default:
1893	/ G0 DBCS /
1894	len2 = MBCS_FROM_UCHAR32_ISO2022(
1895	converterData->myConverterArray[cs0],
1896	sourceChar, &value,
1897	useFallback, MBCS_OUTPUT_2);
1898	if(len2 == `2` \|\| (len2 == -`2` && len == `0`)) { / only accept DBCS: abs(len)==2 /
1899	if(cs0 == KSC5601) {
1900	/*
1901	* Check for valid bytes for the encoding scheme.
1902	* This is necessary because the sub-converter (windows-949)
1903	* has a broader encoding scheme than is valid for 2022.
1904	*/
1905	value = _2022FromGR94DBCS(value);
1906	if(value == `0`) {
1907	break;
1908	}
1909	}
1910	targetValue = value;
1911	len = len2;
1912	cs = cs0;
1913	g = `0`;
1914	useFallback = FALSE;
1915	}
1916	break;
1917	}
1918	}
1919
1920	if(len != `0`) {
1921	if(len < `0`) {
1922	len = -len; / fallback /
1923	}
1924	outLen = `0`; / count output bytes /
1925
1926	/ write SI if necessary (only for JIS7) /
1927	if(pFromU2022State->g == `1` && g == `0`) {
1928	buffer[outLen++] = UCNV_SI;
1929	pFromU2022State->g = `0`;
1930	}
1931
1932	/ write the designation sequence if necessary /
1933	if(cs != pFromU2022State->cs[g]) {
1934	int32_t escLen = escSeqCharsLen[cs];
1935	uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1936	outLen += escLen;
1937	pFromU2022State->cs[g] = cs;
1938
1939	/ invalidate the choices[] /
1940	choiceCount = `0`;
1941	}
1942
1943	/ write the shift sequence if necessary /
1944	if(g != pFromU2022State->g) {
1945	switch(g) {
1946	/ case 0 handled before writing escapes /
1947	case `1`:
1948	buffer[outLen++] = UCNV_SO;
1949	pFromU2022State->g = `1`;
1950	break;
1951	default: / case 2 /
1952	buffer[outLen++] = `0x1b`;
1953	buffer[outLen++] = `0x4e`;
1954	break;
1955	/ no case 3: no SS3 in ISO-2022-JP-x /
1956	}
1957	}
1958
1959	/ write the output bytes /
1960	if(len == `1`) {
1961	buffer[outLen++] = (char)targetValue;
1962	} else / len == 2 / {
1963	buffer[outLen++] = (char)(targetValue >> `8`);
1964	buffer[outLen++] = (char)targetValue;
1965	}
1966	} else {
1967	/*
1968	* if we cannot find the character after checking all codepages
1969	* then this is an error
1970	*/
1971	*err = U_INVALID_CHAR_FOUND;
1972	cnv->fromUChar32=sourceChar;
1973	break;
1974	}
1975
1976	if(sourceChar == CR \|\| sourceChar == LF) {
1977	/ reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) /
1978	pFromU2022State->cs[`2`] = `0`;
1979	choiceCount = `0`;
1980	}
1981
1982	/ output outLen>0 bytes in buffer[] /
1983	if(outLen == `1`) {
1984	*target++ = buffer[`0`];
1985	if(offsets) {
1986	offsets++ = (int32_t)(source - args->source - `1`); /* -1: known to be ASCII /
1987	}
1988	} else if(outLen == `2` && (target + `2`) <= targetLimit) {
1989	*target++ = buffer[`0`];
1990	*target++ = buffer[`1`];
1991	if(offsets) {
1992	int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1993	*offsets++ = sourceIndex;
1994	*offsets++ = sourceIndex;
1995	}
1996	} else {
1997	fromUWriteUInt8(
1998	cnv,
1999	buffer, outLen,
2000	&target, (const char *)targetLimit,
2001	&offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2002	err);
2003	if(U_FAILURE(*err)) {
2004	break;
2005	}
2006	}
2007	} / end if(myTargetIndex<myTargetLength) /
2008	else{
2009	*err =U_BUFFER_OVERFLOW_ERROR;
2010	break;
2011	}
2012
2013	}/ end while(mySourceIndex<mySourceLength) /
2014
2015	/*
2016	* the end of the input stream and detection of truncated input
2017	* are handled by the framework, but for ISO-2022-JP conversion
2018	* we need to be in ASCII mode at the very end
2019	*
2020	* conditions:
2021	* successful
2022	* in SO mode or not in ASCII mode
2023	* end of input and no truncated input
2024	*/
2025	if( U_SUCCESS(*err) &&
2026	(pFromU2022State->g!=`0` \|\| pFromU2022State->cs[`0`]!=ASCII) &&
2027	args->flush && source>=sourceLimit && cnv->fromUChar32==`0`
2028	) {
2029	int32_t sourceIndex;
2030
2031	outLen = `0`;
2032
2033	if(pFromU2022State->g != `0`) {
2034	buffer[outLen++] = UCNV_SI;
2035	pFromU2022State->g = `0`;
2036	}
2037
2038	if(pFromU2022State->cs[`0`] != ASCII) {
2039	int32_t escLen = escSeqCharsLen[ASCII];
2040	uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2041	outLen += escLen;
2042	pFromU2022State->cs[`0`] = (int8_t)ASCII;
2043	}
2044
2045	/ get the source index of the last input character /
2046	/*
2047	* TODO this would be simpler and more reliable if we used a pair
2048	* of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049	* so that we could simply use the prevSourceIndex here;
2050	* this code gives an incorrect result for the rare case of an unmatched
2051	* trail surrogate that is alone in the last buffer of the text stream
2052	*/
2053	sourceIndex=(int32_t)(source-args->source);
2054	if(sourceIndex>`0`) {
2055	--sourceIndex;
2056	if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2057	(sourceIndex==`0` \|\| U16_IS_LEAD(args->source[sourceIndex-`1`]))
2058	) {
2059	--sourceIndex;
2060	}
2061	} else {
2062	sourceIndex=-`1`;
2063	}
2064
2065	fromUWriteUInt8(
2066	cnv,
2067	buffer, outLen,
2068	&target, (const char *)targetLimit,
2069	&offsets, sourceIndex,
2070	err);
2071	}
2072
2073	/save the state and return /
2074	args->source = source;
2075	args->target = (char*)target;
2076	}
2077
2078	/************ to unicode ****************/
2079
2080	static void U_CALLCONV
2081	UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082	UErrorCode* err){
2083	char tempBuf[`2`];
2084	const char mySource = (char* *) args->source;
2085	UChar *myTarget = args->target;
2086	const char *mySourceLimit = args->sourceLimit;
2087	uint32_t targetUniChar = `0x0000`;
2088	uint32_t mySourceChar = `0x0000`;
2089	uint32_t tmpSourceChar = `0x0000`;
2090	UConverterDataISO2022* myData;
2091	ISO2022State *pToU2022State;
2092	StateEnum cs;
2093
2094	myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2095	pToU2022State = &myData->toU2022State;
2096
2097	if(myData->key != `0`) {
2098	/ continue with a partial escape sequence /
2099	goto escape;
2100	} else if(args->converter->toULength == `1` && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101	/ continue with a partial double-byte character /
2102	mySourceChar = args->converter->toUBytes[`0`];
2103	args->converter->toULength = `0`;
2104	cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2105	targetUniChar = missingCharMarker;
2106	goto getTrailByte;
2107	}
2108
2109	while(mySource < mySourceLimit){
2110
2111	targetUniChar =missingCharMarker;
2112
2113	if(myTarget < args->targetLimit){
2114
2115	mySourceChar= (unsigned char) *mySource++;
2116
2117	switch(mySourceChar) {
2118	case UCNV_SI:
2119	if(myData->version==`3`) {
2120	pToU2022State->g=`0`;
2121	continue;
2122	} else {
2123	/ only JIS7 uses SI/SO, not ISO-2022-JP-x /
2124	myData->isEmptySegment = FALSE; / reset this, we have a different error /
2125	break;
2126	}
2127
2128	case UCNV_SO:
2129	if(myData->version==`3`) {
2130	/ JIS7: switch to G1 half-width Katakana /
2131	pToU2022State->cs[`1`] = (int8_t)HWKANA_7BIT;
2132	pToU2022State->g=`1`;
2133	continue;
2134	} else {
2135	/ only JIS7 uses SI/SO, not ISO-2022-JP-x /
2136	myData->isEmptySegment = FALSE; / reset this, we have a different error /
2137	break;
2138	}
2139
2140	case ESC_2022:
2141	mySource--;
2142	escape:
2143	{
2144	const char * mySourceBefore = mySource;
2145	int8_t toULengthBefore = args->converter->toULength;
2146
2147	changeState_2022(args->converter,&(mySource),
2148	mySourceLimit, ISO_2022_JP,err);
2149
2150	/ If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error /
2151	if(myData->version==`0` && myData->key==`0` && U_SUCCESS(*err) && myData->isEmptySegment) {
2152	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153	args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154	args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2155	}
2156	}
2157
2158	/ invalid or illegal escape sequence /
2159	if(U_FAILURE(*err)){
2160	args->target = myTarget;
2161	args->source = mySource;
2162	myData->isEmptySegment = FALSE; / Reset to avoid future spurious errors /
2163	return;
2164	}
2165	/ If we successfully completed an escape sequence, we begin a new segment, empty so far /
2166	if(myData->key==`0`) {
2167	myData->isEmptySegment = TRUE;
2168	}
2169	continue;
2170
2171	/ ISO-2022-JP does not use single-byte (C1) SS2 and SS3 /
2172
2173	case CR:
2174	case LF:
2175	/ automatically reset to single-byte mode /
2176	if((StateEnum)pToU2022State->cs[`0`] != ASCII && (StateEnum)pToU2022State->cs[`0`] != JISX201) {
2177	pToU2022State->cs[`0`] = (int8_t)ASCII;
2178	}
2179	pToU2022State->cs[`2`] = `0`;
2180	pToU2022State->g = `0`;
2181	U_FALLTHROUGH;
2182	default:
2183	/ convert one or two bytes /
2184	myData->isEmptySegment = FALSE;
2185	cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2186	if( (uint8_t)(mySourceChar - `0xa1`) <= (`0xdf` - `0xa1`) && myData->version==`4` &&
2187	!IS_JP_DBCS(cs)
2188	) {
2189	/ 8-bit halfwidth katakana in any single-byte mode for JIS8 /
2190	targetUniChar = mySourceChar + (HWKANA_START - `0xa1`);
2191
2192	/ return from a single-shift state to the previous one /
2193	if(pToU2022State->g >= `2`) {
2194	pToU2022State->g=pToU2022State->prevG;
2195	}
2196	} else switch(cs) {
2197	case ASCII:
2198	if(mySourceChar <= `0x7f`) {
2199	targetUniChar = mySourceChar;
2200	}
2201	break;
2202	case ISO8859_1:
2203	if(mySourceChar <= `0x7f`) {
2204	targetUniChar = mySourceChar + `0x80`;
2205	}
2206	/ return from a single-shift state to the previous one /
2207	pToU2022State->g=pToU2022State->prevG;
2208	break;
2209	case ISO8859_7:
2210	if(mySourceChar <= `0x7f`) {
2211	/ convert mySourceChar+0x80 to use a normal 8-bit table /
2212	targetUniChar =
2213	_MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214	myData->myConverterArray[cs],
2215	mySourceChar + `0x80`);
2216	}
2217	/ return from a single-shift state to the previous one /
2218	pToU2022State->g=pToU2022State->prevG;
2219	break;
2220	case JISX201:
2221	if(mySourceChar <= `0x7f`) {
2222	targetUniChar = jisx201ToU(mySourceChar);
2223	}
2224	break;
2225	case HWKANA_7BIT:
2226	if((uint8_t)(mySourceChar - `0x21`) <= (`0x5f` - `0x21`)) {
2227	/ 7-bit halfwidth Katakana /
2228	targetUniChar = mySourceChar + (HWKANA_START - `0x21`);
2229	}
2230	break;
2231	default:
2232	/ G0 DBCS /
2233	if(mySource < mySourceLimit) {
2234	int leadIsOk, trailIsOk;
2235	uint8_t trailByte;
2236	getTrailByte:
2237	trailByte = (uint8_t)*mySource;
2238	/*
2239	* Ticket 5691: consistent illegal sequences:
2240	* - We include at least the first byte in the illegal sequence.
2241	* - If any of the non-initial bytes could be the start of a character,
2242	* we stop the illegal sequence before the first one of those.
2243	*
2244	* In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245	* an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246	* Otherwise we convert or report the pair of bytes.
2247	*/
2248	leadIsOk = (uint8_t)(mySourceChar - `0x21`) <= (`0x7e` - `0x21`);
2249	trailIsOk = (uint8_t)(trailByte - `0x21`) <= (`0x7e` - `0x21`);
2250	if (leadIsOk && trailIsOk) {
2251	++mySource;
2252	tmpSourceChar = (mySourceChar << `8`) \| trailByte;
2253	if(cs == JISX208) {
2254	_2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2255	mySourceChar = tmpSourceChar;
2256	} else {
2257	/ Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. /
2258	mySourceChar = tmpSourceChar;
2259	if (cs == KSC5601) {
2260	tmpSourceChar += `0x8080`; / = _2022ToGR94DBCS(tmpSourceChar) /
2261	}
2262	tempBuf[`0`] = (char)(tmpSourceChar >> `8`);
2263	tempBuf[`1`] = (char)(tmpSourceChar);
2264	}
2265	targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, `2`, FALSE);
2266	} else if (!(trailIsOk \|\| IS_2022_CONTROL(trailByte))) {
2267	/ report a pair of illegal bytes if the second byte is not a DBCS starter /
2268	++mySource;
2269	/ add another bit so that the code below writes 2 bytes in case of error /
2270	mySourceChar = `0x10000` \| (mySourceChar << `8`) \| trailByte;
2271	}
2272	} else {
2273	args->converter->toUBytes[`0`] = (uint8_t)mySourceChar;
2274	args->converter->toULength = `1`;
2275	goto endloop;
2276	}
2277	} / End of inner switch /
2278	break;
2279	} / End of outer switch /
2280	if(targetUniChar < (missingCharMarker-`1`/0xfffe/)){
2281	if(args->offsets){
2282	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
2283	}
2284	*(myTarget++)=(UChar)targetUniChar;
2285	}
2286	else if(targetUniChar > missingCharMarker){
2287	/ disassemble the surrogate pair and write to output/
2288	targetUniChar-=`0x0010000`;
2289	*myTarget = (UChar)(`0xd800`+(UChar)(targetUniChar>>`10`));
2290	if(args->offsets){
2291	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
2292	}
2293	++myTarget;
2294	if(myTarget< args->targetLimit){
2295	*myTarget = (UChar)(`0xdc00`+(UChar)(targetUniChar&`0x3ff`));
2296	if(args->offsets){
2297	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
2298	}
2299	++myTarget;
2300	}else{
2301	args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2302	(UChar)(`0xdc00`+(UChar)(targetUniChar&`0x3ff`));
2303	}
2304
2305	}
2306	else{
2307	/ Call the callback function/
2308	toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2309	break;
2310	}
2311	}
2312	else{ / goes with "if(myTarget < args->targetLimit)" way up near top of function /
2313	*err =U_BUFFER_OVERFLOW_ERROR;
2314	break;
2315	}
2316	}
2317	endloop:
2318	args->target = myTarget;
2319	args->source = mySource;
2320	}
2321
2322
2323	#if !UCONFIG_ONLY_HTML_CONVERSION
2324	/***************************************************************
2325	* Rules for ISO-2022-KR encoding
2326	* i) The KSC5601 designator sequence should appear only once in a file,
2327	* at the beginning of a line before any KSC5601 characters. This usually
2328	* means that it appears by itself on the first line of the file
2329	* ii) There are only 2 shifting sequences SO to shift into double byte mode
2330	* and SI to shift into single byte mode
2331	*/
2332	static void U_CALLCONV
2333	UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2334
2335	UConverter* saveConv = args->converter;
2336	UConverterDataISO2022 myConverterData=(UConverterDataISO2022)saveConv->extraInfo;
2337	args->converter=myConverterData->currentConverter;
2338
2339	myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2340	ucnv_MBCSFromUnicodeWithOffsets(args,err);
2341	saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2342
2343	if(*err == U_BUFFER_OVERFLOW_ERROR) {
2344	if(myConverterData->currentConverter->charErrorBufferLength > `0`) {
2345	uprv_memcpy(
2346	saveConv->charErrorBuffer,
2347	myConverterData->currentConverter->charErrorBuffer,
2348	myConverterData->currentConverter->charErrorBufferLength);
2349	}
2350	saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2351	myConverterData->currentConverter->charErrorBufferLength = `0`;
2352	}
2353	args->converter=saveConv;
2354	}
2355
2356	static void U_CALLCONV
2357	UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2358
2359	const UChar *source = args->source;
2360	const UChar *sourceLimit = args->sourceLimit;
2361	unsigned char target = (unsigned* char *) args->target;
2362	unsigned char targetLimit = (unsigned* char *) args->targetLimit;
2363	int32_t* offsets = args->offsets;
2364	uint32_t targetByteUnit = `0x0000`;
2365	UChar32 sourceChar = `0x0000`;
2366	UBool isTargetByteDBCS;
2367	UBool oldIsTargetByteDBCS;
2368	UConverterDataISO2022 *converterData;
2369	UConverterSharedData* sharedData;
2370	UBool useFallback;
2371	int32_t length =`0`;
2372
2373	converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2374	/ if the version is 1 then the user is requesting*
2375	* conversion with ibm-25546 pass the arguments to
2376	* MBCS converter and return
2377	*/
2378	if(converterData->version==`1`){
2379	UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2380	return;
2381	}
2382
2383	/ initialize data /
2384	sharedData = converterData->currentConverter->sharedData;
2385	useFallback = args->converter->useFallback;
2386	isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2387	oldIsTargetByteDBCS = isTargetByteDBCS;
2388
2389	isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2390	if((sourceChar = args->converter->fromUChar32)!=`0` && target <targetLimit) {
2391	goto getTrail;
2392	}
2393	while(source < sourceLimit){
2394
2395	targetByteUnit = missingCharMarker;
2396
2397	if(target < (unsigned char*) args->targetLimit){
2398	sourceChar = *source++;
2399
2400	/ do not convert SO/SI/ESC /
2401	if(IS_2022_CONTROL(sourceChar)) {
2402	/ callback(illegal) /
2403	*err=U_ILLEGAL_CHAR_FOUND;
2404	args->converter->fromUChar32=sourceChar;
2405	break;
2406	}
2407
2408	length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2409	if(length < `0`) {
2410	length = -length; / fallback /
2411	}
2412	/ only DBCS or SBCS characters are expected/
2413	/ DB characters with high bit set to 1 are expected /
2414	if( length > `2` \|\| length==`0` \|\|
2415	(length == `1` && targetByteUnit > `0x7f`) \|\|
2416	(length == `2` &&
2417	((uint16_t)(targetByteUnit - `0xa1a1`) > (`0xfefe` - `0xa1a1`) \|\|
2418	(uint8_t)(targetByteUnit - `0xa1`) > (`0xfe` - `0xa1`)))
2419	) {
2420	targetByteUnit=missingCharMarker;
2421	}
2422	if (targetByteUnit != missingCharMarker){
2423
2424	oldIsTargetByteDBCS = isTargetByteDBCS;
2425	isTargetByteDBCS = (UBool)(targetByteUnit>`0x00FF`);
2426	/ append the shift sequence /
2427	if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2428
2429	if (isTargetByteDBCS)
2430	*target++ = UCNV_SO;
2431	else
2432	*target++ = UCNV_SI;
2433	if(offsets)
2434	*(offsets++) = (int32_t)(source - args->source-`1`);
2435	}
2436	/ write the targetUniChar to target /
2437	if(targetByteUnit <= `0x00FF`){
2438	if( target < targetLimit){
2439	(target++) = (unsigned* char) targetByteUnit;
2440	if(offsets){
2441	*(offsets++) = (int32_t)(source - args->source-`1`);
2442	}
2443
2444	}else{
2445	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2446	*err = U_BUFFER_OVERFLOW_ERROR;
2447	}
2448	}else{
2449	if(target < targetLimit){
2450	(target++) =(unsigned* char) ((targetByteUnit>>`8`) -`0x80`);
2451	if(offsets){
2452	*(offsets++) = (int32_t)(source - args->source-`1`);
2453	}
2454	if(target < targetLimit){
2455	(target++) =(unsigned* char) (targetByteUnit -`0x80`);
2456	if(offsets){
2457	*(offsets++) = (int32_t)(source - args->source-`1`);
2458	}
2459	}else{
2460	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -`0x80`);
2461	*err = U_BUFFER_OVERFLOW_ERROR;
2462	}
2463	}else{
2464	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>`8`) -`0x80`);
2465	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-`0x80`);
2466	*err = U_BUFFER_OVERFLOW_ERROR;
2467	}
2468	}
2469
2470	}
2471	else{
2472	/ oops.. the code point is unassingned*
2473	* set the error and reason
2474	*/
2475
2476	/check if the char is a First surrogate/
2477	if(U16_IS_SURROGATE(sourceChar)) {
2478	if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2479	getTrail:
2480	/look ahead to find the trail surrogate/
2481	if(source < sourceLimit) {
2482	/ test the following code unit /
2483	UChar trail=(UChar) *source;
2484	if(U16_IS_TRAIL(trail)) {
2485	source++;
2486	sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2487	*err = U_INVALID_CHAR_FOUND;
2488	/ convert this surrogate code point /
2489	/ exit this condition tree /
2490	} else {
2491	/ this is an unmatched lead code unit (1st surrogate) /
2492	/ callback(illegal) /
2493	*err=U_ILLEGAL_CHAR_FOUND;
2494	}
2495	} else {
2496	/ no more input /
2497	*err = U_ZERO_ERROR;
2498	}
2499	} else {
2500	/ this is an unmatched trail code unit (2nd surrogate) /
2501	/ callback(illegal) /
2502	*err=U_ILLEGAL_CHAR_FOUND;
2503	}
2504	} else {
2505	/ callback(unassigned) for a BMP code point /
2506	*err = U_INVALID_CHAR_FOUND;
2507	}
2508
2509	args->converter->fromUChar32=sourceChar;
2510	break;
2511	}
2512	} / end if(myTargetIndex<myTargetLength) /
2513	else{
2514	*err =U_BUFFER_OVERFLOW_ERROR;
2515	break;
2516	}
2517
2518	}/ end while(mySourceIndex<mySourceLength) /
2519
2520	/*
2521	* the end of the input stream and detection of truncated input
2522	* are handled by the framework, but for ISO-2022-KR conversion
2523	* we need to be in ASCII mode at the very end
2524	*
2525	* conditions:
2526	* successful
2527	* not in ASCII mode
2528	* end of input and no truncated input
2529	*/
2530	if( U_SUCCESS(*err) &&
2531	isTargetByteDBCS &&
2532	args->flush && source>=sourceLimit && args->converter->fromUChar32==`0`
2533	) {
2534	int32_t sourceIndex;
2535
2536	/ we are switching to ASCII /
2537	isTargetByteDBCS=FALSE;
2538
2539	/ get the source index of the last input character /
2540	/*
2541	* TODO this would be simpler and more reliable if we used a pair
2542	* of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543	* so that we could simply use the prevSourceIndex here;
2544	* this code gives an incorrect result for the rare case of an unmatched
2545	* trail surrogate that is alone in the last buffer of the text stream
2546	*/
2547	sourceIndex=(int32_t)(source-args->source);
2548	if(sourceIndex>`0`) {
2549	--sourceIndex;
2550	if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2551	(sourceIndex==`0` \|\| U16_IS_LEAD(args->source[sourceIndex-`1`]))
2552	) {
2553	--sourceIndex;
2554	}
2555	} else {
2556	sourceIndex=-`1`;
2557	}
2558
2559	fromUWriteUInt8(
2560	args->converter,
2561	SHIFT_IN_STR, `1`,
2562	&target, (const char *)targetLimit,
2563	&offsets, sourceIndex,
2564	err);
2565	}
2566
2567	/save the state and return /
2568	args->source = source;
2569	args->target = (char*)target;
2570	args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2571	}
2572
2573	/********************* To Unicode ************************************/
2574
2575	static void U_CALLCONV
2576	UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2577	UErrorCode* err){
2578	char const* sourceStart;
2579	UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2580
2581	UConverterToUnicodeArgs subArgs;
2582	int32_t minArgsSize;
2583
2584	/ set up the subconverter arguments /
2585	if(args->size<sizeof(UConverterToUnicodeArgs)) {
2586	minArgsSize = args->size;
2587	} else {
2588	minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2589	}
2590
2591	uprv_memcpy(&subArgs, args, minArgsSize);
2592	subArgs.size = (uint16_t)minArgsSize;
2593	subArgs.converter = myData->currentConverter;
2594
2595	/ remember the original start of the input for offsets /
2596	sourceStart = args->source;
2597
2598	if(myData->key != `0`) {
2599	/ continue with a partial escape sequence /
2600	goto escape;
2601	}
2602
2603	while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2604	/Find the end of the buffer e.g : Next Escape Seq \| end of Buffer/
2605	subArgs.source = args->source;
2606	subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2607	if(subArgs.source != subArgs.sourceLimit) {
2608	/*
2609	* get the current partial byte sequence
2610	*
2611	* it needs to be moved between the public and the subconverter
2612	* so that the conversion framework, which only sees the public
2613	* converter, can handle truncated and illegal input etc.
2614	*/
2615	if(args->converter->toULength > `0`) {
2616	uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2617	}
2618	subArgs.converter->toULength = args->converter->toULength;
2619
2620	/*
2621	* Convert up to the end of the input, or to before the next escape character.
2622	* Does not handle conversion extensions because the preToU[] state etc.
2623	* is not copied.
2624	*/
2625	ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2626
2627	if(args->offsets != NULL && sourceStart != args->source) {
2628	/ update offsets to base them on the actual start of the input /
2629	int32_t *offsets = args->offsets;
2630	UChar *target = args->target;
2631	int32_t delta = (int32_t)(args->source - sourceStart);
2632	while(target < subArgs.target) {
2633	if(*offsets >= `0`) {
2634	*offsets += delta;
2635	}
2636	++offsets;
2637	++target;
2638	}
2639	}
2640	args->source = subArgs.source;
2641	args->target = subArgs.target;
2642	args->offsets = subArgs.offsets;
2643
2644	/ copy input/error/overflow buffers /
2645	if(subArgs.converter->toULength > `0`) {
2646	uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2647	}
2648	args->converter->toULength = subArgs.converter->toULength;
2649
2650	if(*err == U_BUFFER_OVERFLOW_ERROR) {
2651	if(subArgs.converter->UCharErrorBufferLength > `0`) {
2652	uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2653	subArgs.converter->UCharErrorBufferLength);
2654	}
2655	args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2656	subArgs.converter->UCharErrorBufferLength = `0`;
2657	}
2658	}
2659
2660	if (U_FAILURE(*err) \|\| (args->source == args->sourceLimit)) {
2661	return;
2662	}
2663
2664	escape:
2665	changeState_2022(args->converter,
2666	&(args->source),
2667	args->sourceLimit,
2668	ISO_2022_KR,
2669	err);
2670	}
2671	}
2672
2673	static void U_CALLCONV
2674	UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2675	UErrorCode* err){
2676	char tempBuf[`2`];
2677	const char mySource = ( char* *) args->source;
2678	UChar *myTarget = args->target;
2679	const char *mySourceLimit = args->sourceLimit;
2680	UChar32 targetUniChar = `0x0000`;
2681	UChar mySourceChar = `0x0000`;
2682	UConverterDataISO2022* myData;
2683	UConverterSharedData* sharedData ;
2684	UBool useFallback;
2685
2686	myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2687	if(myData->version==`1`){
2688	UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2689	return;
2690	}
2691
2692	/ initialize state /
2693	sharedData = myData->currentConverter->sharedData;
2694	useFallback = args->converter->useFallback;
2695
2696	if(myData->key != `0`) {
2697	/ continue with a partial escape sequence /
2698	goto escape;
2699	} else if(args->converter->toULength == `1` && mySource < mySourceLimit && myTarget < args->targetLimit) {
2700	/ continue with a partial double-byte character /
2701	mySourceChar = args->converter->toUBytes[`0`];
2702	args->converter->toULength = `0`;
2703	goto getTrailByte;
2704	}
2705
2706	while(mySource< mySourceLimit){
2707
2708	if(myTarget < args->targetLimit){
2709
2710	mySourceChar= (unsigned char) *mySource++;
2711
2712	if(mySourceChar==UCNV_SI){
2713	myData->toU2022State.g = `0`;
2714	if (myData->isEmptySegment) {
2715	myData->isEmptySegment = FALSE; / we are handling it, reset to avoid future spurious errors /
2716	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
2717	args->converter->toUCallbackReason = UCNV_IRREGULAR;
2718	args->converter->toUBytes[`0`] = (uint8_t)mySourceChar;
2719	args->converter->toULength = `1`;
2720	args->target = myTarget;
2721	args->source = mySource;
2722	return;
2723	}
2724	/consume the source /
2725	continue;
2726	}else if(mySourceChar==UCNV_SO){
2727	myData->toU2022State.g = `1`;
2728	myData->isEmptySegment = TRUE; / Begin a new segment, empty so far /
2729	/consume the source /
2730	continue;
2731	}else if(mySourceChar==ESC_2022){
2732	mySource--;
2733	escape:
2734	myData->isEmptySegment = FALSE; / Any invalid ESC sequences will be detected separately, so just reset this /
2735	changeState_2022(args->converter,&(mySource),
2736	mySourceLimit, ISO_2022_KR, err);
2737	if(U_FAILURE(*err)){
2738	args->target = myTarget;
2739	args->source = mySource;
2740	return;
2741	}
2742	continue;
2743	}
2744
2745	myData->isEmptySegment = FALSE; / Any invalid char errors will be detected separately, so just reset this /
2746	if(myData->toU2022State.g == `1`) {
2747	if(mySource < mySourceLimit) {
2748	int leadIsOk, trailIsOk;
2749	uint8_t trailByte;
2750	getTrailByte:
2751	targetUniChar = missingCharMarker;
2752	trailByte = (uint8_t)*mySource;
2753	/*
2754	* Ticket 5691: consistent illegal sequences:
2755	* - We include at least the first byte in the illegal sequence.
2756	* - If any of the non-initial bytes could be the start of a character,
2757	* we stop the illegal sequence before the first one of those.
2758	*
2759	* In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760	* an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761	* Otherwise we convert or report the pair of bytes.
2762	*/
2763	leadIsOk = (uint8_t)(mySourceChar - `0x21`) <= (`0x7e` - `0x21`);
2764	trailIsOk = (uint8_t)(trailByte - `0x21`) <= (`0x7e` - `0x21`);
2765	if (leadIsOk && trailIsOk) {
2766	++mySource;
2767	tempBuf[`0`] = (char)(mySourceChar + `0x80`);
2768	tempBuf[`1`] = (char)(trailByte + `0x80`);
2769	targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, `2`, useFallback);
2770	mySourceChar = (mySourceChar << `8`) \| trailByte;
2771	} else if (!(trailIsOk \|\| IS_2022_CONTROL(trailByte))) {
2772	/ report a pair of illegal bytes if the second byte is not a DBCS starter /
2773	++mySource;
2774	/ add another bit so that the code below writes 2 bytes in case of error /
2775	mySourceChar = static_cast<UChar>(`0x10000` \| (mySourceChar << `8`) \| trailByte);
2776	}
2777	} else {
2778	args->converter->toUBytes[`0`] = (uint8_t)mySourceChar;
2779	args->converter->toULength = `1`;
2780	break;
2781	}
2782	}
2783	else if(mySourceChar <= `0x7f`) {
2784	targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - `1`, `1`, useFallback);
2785	} else {
2786	targetUniChar = `0xffff`;
2787	}
2788	if(targetUniChar < `0xfffe`){
2789	if(args->offsets) {
2790	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
2791	}
2792	*(myTarget++)=(UChar)targetUniChar;
2793	}
2794	else {
2795	/ Call the callback function/
2796	toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2797	break;
2798	}
2799	}
2800	else{
2801	*err =U_BUFFER_OVERFLOW_ERROR;
2802	break;
2803	}
2804	}
2805	args->target = myTarget;
2806	args->source = mySource;
2807	}
2808
2809	/************************ END ISO2022-KR ******************************/
2810
2811	/************************* ISO-2022-CN *******************************
2812	*
2813	* Rules for ISO-2022-CN Encoding:
2814	* i) The designator sequence must appear once on a line before any instance
2815	* of character set it designates.
2816	* ii) If two lines contain characters from the same character set, both lines
2817	* must include the designator sequence.
2818	* iii) Once the designator sequence is known, a shifting sequence has to be found
2819	* to invoke the shifting
2820	* iv) All lines start in ASCII and end in ASCII.
2821	* v) Four shifting sequences are employed for this purpose:
2822	*
2823	* Sequence ASCII Eq Charsets
2824	* ---------- ------- ---------
2825	* SI <SI> US-ASCII
2826	* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827	* SS2 <ESC>N CNS-11643-1992 Plane 2
2828	* SS3 <ESC>O CNS-11643-1992 Planes 3-7
2829	*
2830	* vi)
2831	* SOdesignator : ESC "$" ")" finalchar_for_SO
2832	* SS2designator : ESC "$" "*" finalchar_for_SS2
2833	* SS3designator : ESC "$" "+" finalchar_for_SS3
2834	*
2835	* ESC $ ) A Indicates the bytes following SO are Chinese
2836	* characters as defined in GB 2312-80, until
2837	* another SOdesignation appears
2838	*
2839	*
2840	* ESC $ ) E Indicates the bytes following SO are as defined
2841	* in ISO-IR-165 (for details, see section 2.1),
2842	* until another SOdesignation appears
2843	*
2844	* ESC $ ) G Indicates the bytes following SO are as defined
2845	* in CNS 11643-plane-1, until another
2846	* SOdesignation appears
2847	*
2848	* ESC $ * H Indicates the two bytes immediately following
2849	* SS2 is a Chinese character as defined in CNS
2850	* 11643-plane-2, until another SS2designation
2851	* appears
2852	* (Meaning <ESC>N must precede every 2 byte
2853	* sequence.)
2854	*
2855	* ESC $ + I Indicates the immediate two bytes following SS3
2856	* is a Chinese character as defined in CNS
2857	* 11643-plane-3, until another SS3designation
2858	* appears
2859	* (Meaning <ESC>O must precede every 2 byte
2860	* sequence.)
2861	*
2862	* ESC $ + J Indicates the immediate two bytes following SS3
2863	* is a Chinese character as defined in CNS
2864	* 11643-plane-4, until another SS3designation
2865	* appears
2866	* (In English: <ESC>O must precede every 2 byte
2867	* sequence.)
2868	*
2869	* ESC $ + K Indicates the immediate two bytes following SS3
2870	* is a Chinese character as defined in CNS
2871	* 11643-plane-5, until another SS3designation
2872	* appears
2873	*
2874	* ESC $ + L Indicates the immediate two bytes following SS3
2875	* is a Chinese character as defined in CNS
2876	* 11643-plane-6, until another SS3designation
2877	* appears
2878	*
2879	* ESC $ + M Indicates the immediate two bytes following SS3
2880	* is a Chinese character as defined in CNS
2881	* 11643-plane-7, until another SS3designation
2882	* appears
2883	*
2884	* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885	* has its own designation information before any Chinese characters
2886	* appear
2887	*
2888	*/
2889
/*
 * ISO-2022-CN designator escape sequences, one per supported charset.
 * Each is the 4-byte sequence ESC '$' <intermediate> <final>:
 *   intermediate ')' (0x29) designates the SO (G1) set,
 *   intermediate '*' (0x2A) designates the SS2 (G2) set,
 *   intermediate '+' (0x2B) designates the SS3 (G3) set.
 *
 * The following are defined this way (as const arrays rather than pointers
 * to literals) to make the strings truly readonly.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */
2900
2901	/******************* ISO2022-CN Data ***********************/
2902	static const char* const escSeqCharsCN[`10`] ={
2903	SHIFT_IN_STR, / 0 ASCII /
2904	GB_2312_80_STR, / 1 GB2312_1 /
2905	ISO_IR_165_STR, / 2 ISO_IR_165 /
2906	CNS_11643_1992_Plane_1_STR,
2907	CNS_11643_1992_Plane_2_STR,
2908	CNS_11643_1992_Plane_3_STR,
2909	CNS_11643_1992_Plane_4_STR,
2910	CNS_11643_1992_Plane_5_STR,
2911	CNS_11643_1992_Plane_6_STR,
2912	CNS_11643_1992_Plane_7_STR
2913	};
2914
2915	static void U_CALLCONV
2916	UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2917	UConverter *cnv = args->converter;
2918	UConverterDataISO2022 *converterData;
2919	ISO2022State *pFromU2022State;
2920	uint8_t target = (uint8_t ) args->target;
2921	const uint8_t targetLimit = (const* uint8_t *) args->targetLimit;
2922	const UChar* source = args->source;
2923	const UChar* sourceLimit = args->sourceLimit;
2924	int32_t* offsets = args->offsets;
2925	UChar32 sourceChar;
2926	char buffer[`8`];
2927	int32_t len;
2928	int8_t choices[`3`];
2929	int32_t choiceCount;
2930	uint32_t targetValue = `0`;
2931	UBool useFallback;
2932
2933	/ set up the state /
2934	converterData = (UConverterDataISO2022*)cnv->extraInfo;
2935	pFromU2022State = &converterData->fromU2022State;
2936
2937	choiceCount = `0`;
2938
2939	/ check if the last codepoint of previous buffer was a lead surrogate/
2940	if((sourceChar = cnv->fromUChar32)!=`0` && target< targetLimit) {
2941	goto getTrail;
2942	}
2943
2944	while( source < sourceLimit){
2945	if(target < targetLimit){
2946
2947	sourceChar = *(source++);
2948	/check if the char is a First surrogate/
2949	if(U16_IS_SURROGATE(sourceChar)) {
2950	if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2951	getTrail:
2952	/look ahead to find the trail surrogate/
2953	if(source < sourceLimit) {
2954	/ test the following code unit /
2955	UChar trail=(UChar) *source;
2956	if(U16_IS_TRAIL(trail)) {
2957	source++;
2958	sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2959	cnv->fromUChar32=`0x00`;
2960	/ convert this supplementary code point /
2961	/ exit this condition tree /
2962	} else {
2963	/ this is an unmatched lead code unit (1st surrogate) /
2964	/ callback(illegal) /
2965	*err=U_ILLEGAL_CHAR_FOUND;
2966	cnv->fromUChar32=sourceChar;
2967	break;
2968	}
2969	} else {
2970	/ no more input /
2971	cnv->fromUChar32=sourceChar;
2972	break;
2973	}
2974	} else {
2975	/ this is an unmatched trail code unit (2nd surrogate) /
2976	/ callback(illegal) /
2977	*err=U_ILLEGAL_CHAR_FOUND;
2978	cnv->fromUChar32=sourceChar;
2979	break;
2980	}
2981	}
2982
2983	/ do the conversion /
2984	if(sourceChar <= `0x007f` ){
2985	/ do not convert SO/SI/ESC /
2986	if(IS_2022_CONTROL(sourceChar)) {
2987	/ callback(illegal) /
2988	*err=U_ILLEGAL_CHAR_FOUND;
2989	cnv->fromUChar32=sourceChar;
2990	break;
2991	}
2992
2993	/ US-ASCII /
2994	if(pFromU2022State->g == `0`) {
2995	buffer[`0`] = (char)sourceChar;
2996	len = `1`;
2997	} else {
2998	buffer[`0`] = UCNV_SI;
2999	buffer[`1`] = (char)sourceChar;
3000	len = `2`;
3001	pFromU2022State->g = `0`;
3002	choiceCount = `0`;
3003	}
3004	if(sourceChar == CR \|\| sourceChar == LF) {
3005	/ reset the state at the end of a line /
3006	uprv_memset(pFromU2022State, `0`, sizeof(ISO2022State));
3007	choiceCount = `0`;
3008	}
3009	}
3010	else{
3011	/ convert U+0080..U+10ffff /
3012	int32_t i;
3013	int8_t cs, g;
3014
3015	if(choiceCount == `0`) {
3016	/ try the current SO/G1 converter first /
3017	choices[`0`] = pFromU2022State->cs[`1`];
3018
3019	/ default to GB2312_1 if none is designated yet /
3020	if(choices[`0`] == `0`) {
3021	choices[`0`] = GB2312_1;
3022	}
3023
3024	if(converterData->version == `0`) {
3025	/ ISO-2022-CN /
3026
3027	/ try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane /
3028	if(choices[`0`] == GB2312_1) {
3029	choices[`1`] = (int8_t)CNS_11643_1;
3030	} else {
3031	choices[`1`] = (int8_t)GB2312_1;
3032	}
3033
3034	choiceCount = `2`;
3035	} else if (converterData->version == `1`) {
3036	/ ISO-2022-CN-EXT /
3037
3038	/ try one of the other converters /
3039	switch(choices[`0`]) {
3040	case GB2312_1:
3041	choices[`1`] = (int8_t)CNS_11643_1;
3042	choices[`2`] = (int8_t)ISO_IR_165;
3043	break;
3044	case ISO_IR_165:
3045	choices[`1`] = (int8_t)GB2312_1;
3046	choices[`2`] = (int8_t)CNS_11643_1;
3047	break;
3048	default: / CNS_11643_x /
3049	choices[`1`] = (int8_t)GB2312_1;
3050	choices[`2`] = (int8_t)ISO_IR_165;
3051	break;
3052	}
3053
3054	choiceCount = `3`;
3055	} else {
3056	choices[`0`] = (int8_t)CNS_11643_1;
3057	choices[`1`] = (int8_t)GB2312_1;
3058	}
3059	}
3060
3061	cs = g = `0`;
3062	/*
3063	* len==0: no mapping found yet
3064	* len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065	* len>0: found a roundtrip result, done
3066	*/
3067	len = `0`;
3068	/*
3069	* We will turn off useFallback after finding a fallback,
3070	* but we still get fallbacks from PUA code points as usual.
3071	* Therefore, we will also need to check that we don't overwrite
3072	* an early fallback with a later one.
3073	*/
3074	useFallback = cnv->useFallback;
3075
3076	for(i = `0`; i < choiceCount && len <= `0`; ++i) {
3077	int8_t cs0 = choices[i];
3078	if(cs0 > `0`) {
3079	uint32_t value;
3080	int32_t len2;
3081	if(cs0 >= CNS_11643_0) {
3082	len2 = MBCS_FROM_UCHAR32_ISO2022(
3083	converterData->myConverterArray[CNS_11643],
3084	sourceChar,
3085	&value,
3086	useFallback,
3087	MBCS_OUTPUT_3);
3088	if(len2 == `3` \|\| (len2 == -`3` && len == `0`)) {
3089	targetValue = value;
3090	cs = (int8_t)(CNS_11643_0 + (value >> `16`) - `0x80`);
3091	if(len2 >= `0`) {
3092	len = `2`;
3093	} else {
3094	len = -`2`;
3095	useFallback = FALSE;
3096	}
3097	if(cs == CNS_11643_1) {
3098	g = `1`;
3099	} else if(cs == CNS_11643_2) {
3100	g = `2`;
3101	} else / plane 3..7 / if(converterData->version == `1`) {
3102	g = `3`;
3103	} else {
3104	/ ISO-2022-CN (without -EXT) does not support plane 3..7 /
3105	len = `0`;
3106	}
3107	}
3108	} else {
3109	/ GB2312_1 or ISO-IR-165 /
3110	U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3111	len2 = MBCS_FROM_UCHAR32_ISO2022(
3112	converterData->myConverterArray[cs0],
3113	sourceChar,
3114	&value,
3115	useFallback,
3116	MBCS_OUTPUT_2);
3117	if(len2 == `2` \|\| (len2 == -`2` && len == `0`)) {
3118	targetValue = value;
3119	len = len2;
3120	cs = cs0;
3121	g = `1`;
3122	useFallback = FALSE;
3123	}
3124	}
3125	}
3126	}
3127
3128	if(len != `0`) {
3129	len = `0`; / count output bytes; it must have been abs(len) == 2 /
3130
3131	/ write the designation sequence if necessary /
3132	if(cs != pFromU2022State->cs[g]) {
3133	if(cs < CNS_11643) {
3134	uprv_memcpy(buffer, escSeqCharsCN[cs], `4`);
3135	} else {
3136	U_ASSERT(cs >= CNS_11643_1);
3137	uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], `4`);
3138	}
3139	len = `4`;
3140	pFromU2022State->cs[g] = cs;
3141	if(g == `1`) {
3142	/ changing the SO/G1 charset invalidates the choices[] /
3143	choiceCount = `0`;
3144	}
3145	}
3146
3147	/ write the shift sequence if necessary /
3148	if(g != pFromU2022State->g) {
3149	switch(g) {
3150	case `1`:
3151	buffer[len++] = UCNV_SO;
3152
3153	/ set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 /
3154	pFromU2022State->g = `1`;
3155	break;
3156	case `2`:
3157	buffer[len++] = `0x1b`;
3158	buffer[len++] = `0x4e`;
3159	break;
3160	default: / case 3 /
3161	buffer[len++] = `0x1b`;
3162	buffer[len++] = `0x4f`;
3163	break;
3164	}
3165	}
3166
3167	/ write the two output bytes /
3168	buffer[len++] = (char)(targetValue >> `8`);
3169	buffer[len++] = (char)targetValue;
3170	} else {
3171	/ if we cannot find the character after checking all codepages*
3172	* then this is an error
3173	*/
3174	*err = U_INVALID_CHAR_FOUND;
3175	cnv->fromUChar32=sourceChar;
3176	break;
3177	}
3178	}
3179
3180	/ output len>0 bytes in buffer[] /
3181	if(len == `1`) {
3182	*target++ = buffer[`0`];
3183	if(offsets) {
3184	offsets++ = (int32_t)(source - args->source - `1`); /* -1: known to be ASCII /
3185	}
3186	} else if(len == `2` && (target + `2`) <= targetLimit) {
3187	*target++ = buffer[`0`];
3188	*target++ = buffer[`1`];
3189	if(offsets) {
3190	int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3191	*offsets++ = sourceIndex;
3192	*offsets++ = sourceIndex;
3193	}
3194	} else {
3195	fromUWriteUInt8(
3196	cnv,
3197	buffer, len,
3198	&target, (const char *)targetLimit,
3199	&offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3200	err);
3201	if(U_FAILURE(*err)) {
3202	break;
3203	}
3204	}
3205	} / end if(myTargetIndex<myTargetLength) /
3206	else{
3207	*err =U_BUFFER_OVERFLOW_ERROR;
3208	break;
3209	}
3210
3211	}/ end while(mySourceIndex<mySourceLength) /
3212
3213	/*
3214	* the end of the input stream and detection of truncated input
3215	* are handled by the framework, but for ISO-2022-CN conversion
3216	* we need to be in ASCII mode at the very end
3217	*
3218	* conditions:
3219	* successful
3220	* not in ASCII mode
3221	* end of input and no truncated input
3222	*/
3223	if( U_SUCCESS(*err) &&
3224	pFromU2022State->g!=`0` &&
3225	args->flush && source>=sourceLimit && cnv->fromUChar32==`0`
3226	) {
3227	int32_t sourceIndex;
3228
3229	/ we are switching to ASCII /
3230	pFromU2022State->g=`0`;
3231
3232	/ get the source index of the last input character /
3233	/*
3234	* TODO this would be simpler and more reliable if we used a pair
3235	* of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236	* so that we could simply use the prevSourceIndex here;
3237	* this code gives an incorrect result for the rare case of an unmatched
3238	* trail surrogate that is alone in the last buffer of the text stream
3239	*/
3240	sourceIndex=(int32_t)(source-args->source);
3241	if(sourceIndex>`0`) {
3242	--sourceIndex;
3243	if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3244	(sourceIndex==`0` \|\| U16_IS_LEAD(args->source[sourceIndex-`1`]))
3245	) {
3246	--sourceIndex;
3247	}
3248	} else {
3249	sourceIndex=-`1`;
3250	}
3251
3252	fromUWriteUInt8(
3253	cnv,
3254	SHIFT_IN_STR, `1`,
3255	&target, (const char *)targetLimit,
3256	&offsets, sourceIndex,
3257	err);
3258	}
3259
3260	/save the state and return /
3261	args->source = source;
3262	args->target = (char*)target;
3263	}
3264
3265
3266	static void U_CALLCONV
3267	UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268	UErrorCode* err){
3269	char tempBuf[`3`];
3270	const char mySource = (char* *) args->source;
3271	UChar *myTarget = args->target;
3272	const char *mySourceLimit = args->sourceLimit;
3273	uint32_t targetUniChar = `0x0000`;
3274	uint32_t mySourceChar = `0x0000`;
3275	UConverterDataISO2022* myData;
3276	ISO2022State *pToU2022State;
3277
3278	myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279	pToU2022State = &myData->toU2022State;
3280
3281	if(myData->key != `0`) {
3282	/ continue with a partial escape sequence /
3283	goto escape;
3284	} else if(args->converter->toULength == `1` && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285	/ continue with a partial double-byte character /
3286	mySourceChar = args->converter->toUBytes[`0`];
3287	args->converter->toULength = `0`;
3288	targetUniChar = missingCharMarker;
3289	goto getTrailByte;
3290	}
3291
3292	while(mySource < mySourceLimit){
3293
3294	targetUniChar =missingCharMarker;
3295
3296	if(myTarget < args->targetLimit){
3297
3298	mySourceChar= (unsigned char) *mySource++;
3299
3300	switch(mySourceChar){
3301	case UCNV_SI:
3302	pToU2022State->g=`0`;
3303	if (myData->isEmptySegment) {
3304	myData->isEmptySegment = FALSE; / we are handling it, reset to avoid future spurious errors /
3305	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306	args->converter->toUCallbackReason = UCNV_IRREGULAR;
3307	args->converter->toUBytes[`0`] = static_cast<uint8_t>(mySourceChar);
3308	args->converter->toULength = `1`;
3309	args->target = myTarget;
3310	args->source = mySource;
3311	return;
3312	}
3313	continue;
3314
3315	case UCNV_SO:
3316	if(pToU2022State->cs[`1`] != `0`) {
3317	pToU2022State->g=`1`;
3318	myData->isEmptySegment = TRUE; / Begin a new segment, empty so far /
3319	continue;
3320	} else {
3321	/ illegal to have SO before a matching designator /
3322	myData->isEmptySegment = FALSE; / Handling a different error, reset this to avoid future spurious errs /
3323	break;
3324	}
3325
3326	case ESC_2022:
3327	mySource--;
3328	escape:
3329	{
3330	const char * mySourceBefore = mySource;
3331	int8_t toULengthBefore = args->converter->toULength;
3332
3333	changeState_2022(args->converter,&(mySource),
3334	mySourceLimit, ISO_2022_CN,err);
3335
3336	/ After SO there must be at least one character before a designator (designator error handled separately) /
3337	if(myData->key==`0` && U_SUCCESS(*err) && myData->isEmptySegment) {
3338	*err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339	args->converter->toUCallbackReason = UCNV_IRREGULAR;
3340	args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3341	}
3342	}
3343
3344	/ invalid or illegal escape sequence /
3345	if(U_FAILURE(*err)){
3346	args->target = myTarget;
3347	args->source = mySource;
3348	myData->isEmptySegment = FALSE; / Reset to avoid future spurious errors /
3349	return;
3350	}
3351	continue;
3352
3353	/ ISO-2022-CN does not use single-byte (C1) SS2 and SS3 /
3354
3355	case CR:
3356	case LF:
3357	uprv_memset(pToU2022State, `0`, sizeof(ISO2022State));
3358	U_FALLTHROUGH;
3359	default:
3360	/ convert one or two bytes /
3361	myData->isEmptySegment = FALSE;
3362	if(pToU2022State->g != `0`) {
3363	if(mySource < mySourceLimit) {
3364	UConverterSharedData *cnv;
3365	StateEnum tempState;
3366	int32_t tempBufLen;
3367	int leadIsOk, trailIsOk;
3368	uint8_t trailByte;
3369	getTrailByte:
3370	trailByte = (uint8_t)*mySource;
3371	/*
3372	* Ticket 5691: consistent illegal sequences:
3373	* - We include at least the first byte in the illegal sequence.
3374	* - If any of the non-initial bytes could be the start of a character,
3375	* we stop the illegal sequence before the first one of those.
3376	*
3377	* In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378	* an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379	* Otherwise we convert or report the pair of bytes.
3380	*/
3381	leadIsOk = (uint8_t)(mySourceChar - `0x21`) <= (`0x7e` - `0x21`);
3382	trailIsOk = (uint8_t)(trailByte - `0x21`) <= (`0x7e` - `0x21`);
3383	if (leadIsOk && trailIsOk) {
3384	++mySource;
3385	tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386	if(tempState >= CNS_11643_0) {
3387	cnv = myData->myConverterArray[CNS_11643];
3388	tempBuf[`0`] = (char) (`0x80`+(tempState-CNS_11643_0));
3389	tempBuf[`1`] = (char) (mySourceChar);
3390	tempBuf[`2`] = (char) trailByte;
3391	tempBufLen = `3`;
3392
3393	}else{
3394	U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3395	cnv = myData->myConverterArray[tempState];
3396	tempBuf[`0`] = (char) (mySourceChar);
3397	tempBuf[`1`] = (char) trailByte;
3398	tempBufLen = `2`;
3399	}
3400	targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3401	mySourceChar = (mySourceChar << `8`) \| trailByte;
3402	} else if (!(trailIsOk \|\| IS_2022_CONTROL(trailByte))) {
3403	/ report a pair of illegal bytes if the second byte is not a DBCS starter /
3404	++mySource;
3405	/ add another bit so that the code below writes 2 bytes in case of error /
3406	mySourceChar = `0x10000` \| (mySourceChar << `8`) \| trailByte;
3407	}
3408	if(pToU2022State->g>=`2`) {
3409	/ return from a single-shift state to the previous one /
3410	pToU2022State->g=pToU2022State->prevG;
3411	}
3412	} else {
3413	args->converter->toUBytes[`0`] = (uint8_t)mySourceChar;
3414	args->converter->toULength = `1`;
3415	goto endloop;
3416	}
3417	}
3418	else{
3419	if(mySourceChar <= `0x7f`) {
3420	targetUniChar = (UChar) mySourceChar;
3421	}
3422	}
3423	break;
3424	}
3425	if(targetUniChar < (missingCharMarker-`1`/0xfffe/)){
3426	if(args->offsets){
3427	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
3428	}
3429	*(myTarget++)=(UChar)targetUniChar;
3430	}
3431	else if(targetUniChar > missingCharMarker){
3432	/ disassemble the surrogate pair and write to output/
3433	targetUniChar-=`0x0010000`;
3434	*myTarget = (UChar)(`0xd800`+(UChar)(targetUniChar>>`10`));
3435	if(args->offsets){
3436	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
3437	}
3438	++myTarget;
3439	if(myTarget< args->targetLimit){
3440	*myTarget = (UChar)(`0xdc00`+(UChar)(targetUniChar&`0x3ff`));
3441	if(args->offsets){
3442	args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= `0xff` ? `1` : `2`));
3443	}
3444	++myTarget;
3445	}else{
3446	args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3447	(UChar)(`0xdc00`+(UChar)(targetUniChar&`0x3ff`));
3448	}
3449
3450	}
3451	else{
3452	/ Call the callback function/
3453	toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454	break;
3455	}
3456	}
3457	else{
3458	*err =U_BUFFER_OVERFLOW_ERROR;
3459	break;
3460	}
3461	}
3462	endloop:
3463	args->target = myTarget;
3464	args->source = mySource;
3465	}
3466	#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3467
3468	static void U_CALLCONV
3469	_ISO_2022_WriteSub(UConverterFromUnicodeArgs args, int32_t offsetIndex, UErrorCode err) {
3470	UConverter *cnv = args->converter;
3471	UConverterDataISO2022 myConverterData=(UConverterDataISO2022 ) cnv->extraInfo;
3472	ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473	char p, subchar;
3474	char buffer[`8`];
3475	int32_t length;
3476
3477	subchar=(char *)cnv->subChars;
3478	length=cnv->subCharLen; / assume length==1 for most variants /
3479
3480	p = buffer;
3481	switch(myConverterData->locale[`0`]){
3482	case `'j'`:
3483	{
3484	int8_t cs;
3485
3486	if(pFromU2022State->g == `1`) {
3487	/ JIS7: switch from G1 to G0 /
3488	pFromU2022State->g = `0`;
3489	*p++ = UCNV_SI;
3490	}
3491
3492	cs = pFromU2022State->cs[`0`];
3493	if(cs != ASCII && cs != JISX201) {
3494	/ not in ASCII or JIS X 0201: switch to ASCII /
3495	pFromU2022State->cs[`0`] = (int8_t)ASCII;
3496	*p++ = `'\x1b'`;
3497	*p++ = `'\x28'`;
3498	*p++ = `'\x42'`;
3499	}
3500
3501	*p++ = subchar[`0`];
3502	break;
3503	}
3504	case `'c'`:
3505	if(pFromU2022State->g != `0`) {
3506	/ not in ASCII mode: switch to ASCII /
3507	pFromU2022State->g = `0`;
3508	*p++ = UCNV_SI;
3509	}
3510	*p++ = subchar[`0`];
3511	break;
3512	case `'k'`:
3513	if(myConverterData->version == `0`) {
3514	if(length == `1`) {
3515	if(args->converter->fromUnicodeStatus) {
3516	/ in DBCS mode: switch to SBCS /
3517	args->converter->fromUnicodeStatus = `0`;
3518	*p++ = UCNV_SI;
3519	}
3520	*p++ = subchar[`0`];
3521	} else / length == 2/ {
3522	if(!args->converter->fromUnicodeStatus) {
3523	/ in SBCS mode: switch to DBCS /
3524	args->converter->fromUnicodeStatus = `1`;
3525	*p++ = UCNV_SO;
3526	}
3527	*p++ = subchar[`0`];
3528	*p++ = subchar[`1`];
3529	}
3530	break;
3531	} else {
3532	/ save the subconverter's substitution string /
3533	uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534	int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535
3536	/ set our substitution string into the subconverter /
3537	myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3538	myConverterData->currentConverter->subCharLen = (int8_t)length;
3539
3540	/ let the subconverter write the subchar, set/retrieve fromUChar32 state /
3541	args->converter = myConverterData->currentConverter;
3542	myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543	ucnv_cbFromUWriteSub(args, `0`, err);
3544	cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3545	args->converter = cnv;
3546
3547	/ restore the subconverter's substitution string /
3548	myConverterData->currentConverter->subChars = currentSubChars;
3549	myConverterData->currentConverter->subCharLen = currentSubCharLen;
3550
3551	if(*err == U_BUFFER_OVERFLOW_ERROR) {
3552	if(myConverterData->currentConverter->charErrorBufferLength > `0`) {
3553	uprv_memcpy(
3554	cnv->charErrorBuffer,
3555	myConverterData->currentConverter->charErrorBuffer,
3556	myConverterData->currentConverter->charErrorBufferLength);
3557	}
3558	cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559	myConverterData->currentConverter->charErrorBufferLength = `0`;
3560	}
3561	return;
3562	}
3563	default:
3564	/ not expected /
3565	break;
3566	}
3567	ucnv_cbFromUWriteBytes(args,
3568	buffer, (int32_t)(p - buffer),
3569	offsetIndex, err);
3570	}
3571
3572	/*
3573	* Structure for cloning an ISO 2022 converter into a single memory block.
3574	*/
3575	struct cloneStruct
3576	{
3577	UConverter cnv;
3578	UConverter currentConverter;
3579	UConverterDataISO2022 mydata;
3580	};
3581
3582
3583	U_CDECL_BEGIN
3584
3585	static UConverter * U_CALLCONV
3586	_ISO_2022_SafeClone(
3587	const UConverter *cnv,
3588	void *stackBuffer,
3589	int32_t *pBufferSize,
3590	UErrorCode *status)
3591	{
3592	struct cloneStruct * localClone;
3593	UConverterDataISO2022 *cnvData;
3594	int32_t i, size;
3595
3596	if (U_FAILURE(*status)){
3597	return nullptr;
3598	}
3599
3600	if (pBufferSize == `0`) { /* 'preflighting' request - set needed size into pBufferSize /*
3601	pBufferSize = (int32_t)sizeof(struct* cloneStruct);
3602	return NULL;
3603	}
3604
3605	cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3606	localClone = (struct cloneStruct *)stackBuffer;
3607
3608	/ ucnv.c/ucnv_safeClone() copied the main UConverter already /
3609
3610	uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3611	localClone->cnv.extraInfo = &localClone->mydata; / set pointer to extra data /
3612	localClone->cnv.isExtraLocal = TRUE;
3613
3614	/ share the subconverters /
3615
3616	if(cnvData->currentConverter != NULL) {
3617	size = (int32_t)sizeof(UConverter);
3618	localClone->mydata.currentConverter =
3619	ucnv_safeClone(cnvData->currentConverter,
3620	&localClone->currentConverter,
3621	&size, status);
3622	if(U_FAILURE(*status)) {
3623	return NULL;
3624	}
3625	}
3626
3627	for(i=`0`; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3628	if(cnvData->myConverterArray[i] != NULL) {
3629	ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3630	}
3631	}
3632
3633	return &localClone->cnv;
3634	}
3635
3636	U_CDECL_END
3637
3638	static void U_CALLCONV
3639	_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3640	const USetAdder *sa,
3641	UConverterUnicodeSet which,
3642	UErrorCode *pErrorCode)
3643	{
3644	int32_t i;
3645	UConverterDataISO2022* cnvData;
3646
3647	if (U_FAILURE(*pErrorCode)) {
3648	return;
3649	}
3650	#ifdef U_ENABLE_GENERIC_ISO_2022
3651	if (cnv->sharedData == &_ISO2022Data) {
3652	/ We use UTF-8 in this case /
3653	sa->addRange(sa->set, `0`, `0xd7FF`);
3654	sa->addRange(sa->set, `0xE000`, `0x10FFFF`);
3655	return;
3656	}
3657	#endif
3658
3659	cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3660
3661	/ open a set and initialize it with code points that are algorithmically round-tripped /
3662	switch(cnvData->locale[`0`]){
3663	case `'j'`:
3664	/ include JIS X 0201 which is hardcoded /
3665	sa->add(sa->set, `0xa5`);
3666	sa->add(sa->set, `0x203e`);
3667	if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3668	/ include Latin-1 for some variants of JP /
3669	sa->addRange(sa->set, `0`, `0xff`);
3670	} else {
3671	/ include ASCII for JP /
3672	sa->addRange(sa->set, `0`, `0x7f`);
3673	}
3674	if(cnvData->version==`3` \|\| cnvData->version==`4` \|\| which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3675	/*
3676	* Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3677	* because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3678	* use half-width Katakana.
3679	* This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3680	* half-width Katakana via the ESC ( I sequence.
3681	* However, we only emit (fromUnicode) half-width Katakana according to the
3682	* definition of each variant.
3683	*
3684	* When including fallbacks,
3685	* we need to include half-width Katakana Unicode code points for all JP variants because
3686	* JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3687	*/
3688	/ include half-width Katakana for JP /
3689	sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3690	}
3691	break;
3692	#if !UCONFIG_ONLY_HTML_CONVERSION
3693	case `'c'`:
3694	case `'z'`:
3695	/ include ASCII for CN /
3696	sa->addRange(sa->set, `0`, `0x7f`);
3697	break;
3698	case `'k'`:
3699	/ there is only one converter for KR, and it is not in the myConverterArray[] /
3700	cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3701	cnvData->currentConverter, sa, which, pErrorCode);
3702	/ the loop over myConverterArray[] will simply not find another converter /
3703	break;
3704	#endif
3705	default:
3706	break;
3707	}
3708
3709	#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3710	if( (cnvData->locale[`0`]==`'c'` \|\| cnvData->locale[`0`]==`'z'`) &&
3711	cnvData->version==`0` && i==CNS_11643
3712	) {
3713	/ special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 /
3714	ucnv_MBCSGetUnicodeSetForBytes(
3715	cnvData->myConverterArray[i],
3716	sa, UCNV_ROUNDTRIP_SET,
3717	`0`, `0x81`, `0x82`,
3718	pErrorCode);
3719	}
3720	#endif
3721
3722	for (i=`0`; i<UCNV_2022_MAX_CONVERTERS; i++) {
3723	UConverterSetFilter filter;
3724	if(cnvData->myConverterArray[i]!=NULL) {
3725	if(cnvData->locale[`0`]==`'j'` && i==JISX208) {
3726	/*
3727	* Only add code points that map to Shift-JIS codes
3728	* corresponding to JIS X 0208.
3729	*/
3730	filter=UCNV_SET_FILTER_SJIS;
3731	#if !UCONFIG_ONLY_HTML_CONVERSION
3732	} else if( (cnvData->locale[`0`]==`'c'` \|\| cnvData->locale[`0`]==`'z'`) &&
3733	cnvData->version==`0` && i==CNS_11643) {
3734	/*
3735	* Version-specific for CN:
3736	* CN version 0 does not map CNS planes 3..7 although
3737	* they are all available in the CNS conversion table;
3738	* CN version 1 (-EXT) does map them all.
3739	* The two versions create different Unicode sets.
3740	*/
3741	filter=UCNV_SET_FILTER_2022_CN;
3742	} else if(i==KSC5601) {
3743	/*
3744	* Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3745	* are broader than GR94.
3746	*/
3747	filter=UCNV_SET_FILTER_GR94DBCS;
3748	#endif
3749	} else {
3750	filter=UCNV_SET_FILTER_NONE;
3751	}
3752	ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3753	}
3754	}
3755
3756	/*
3757	* ISO 2022 converters must not convert SO/SI/ESC despite what
3758	* sub-converters do by themselves.
3759	* Remove these characters from the set.
3760	*/
3761	sa->remove(sa->set, `0x0e`);
3762	sa->remove(sa->set, `0x0f`);
3763	sa->remove(sa->set, `0x1b`);
3764
3765	/ ISO 2022 converters do not convert C1 controls either /
3766	sa->removeRange(sa->set, `0x80`, `0x9f`);
3767	}
3768
3769	static const UConverterImpl _ISO2022Impl={
3770	UCNV_ISO_2022,
3771
3772	NULL,
3773	NULL,
3774
3775	_ISO2022Open,
3776	_ISO2022Close,
3777	_ISO2022Reset,
3778
3779	#ifdef U_ENABLE_GENERIC_ISO_2022
3780	T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3781	T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3782	ucnv_fromUnicode_UTF8,
3783	ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3784	#else
3785	NULL,
3786	NULL,
3787	NULL,
3788	NULL,
3789	#endif
3790	NULL,
3791
3792	NULL,
3793	_ISO2022getName,
3794	_ISO_2022_WriteSub,
3795	_ISO_2022_SafeClone,
3796	_ISO_2022_GetUnicodeSet,
3797
3798	NULL,
3799	NULL
3800	};
3801	static const UConverterStaticData _ISO2022StaticData={
3802	sizeof(UConverterStaticData),
3803	"ISO_2022",
3804	`2022`,
3805	UCNV_IBM,
3806	UCNV_ISO_2022,
3807	`1`,
3808	`3`, / max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) /
3809	{ `0x1a`, `0`, `0`, `0` },
3810	`1`,
3811	FALSE,
3812	FALSE,
3813	`0`,
3814	`0`,
3815	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
3816	};
3817	const UConverterSharedData _ISO2022Data=
3818	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3819
3820	/**********JP*************/
3821	static const UConverterImpl _ISO2022JPImpl={
3822	UCNV_ISO_2022,
3823
3824	NULL,
3825	NULL,
3826
3827	_ISO2022Open,
3828	_ISO2022Close,
3829	_ISO2022Reset,
3830
3831	UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3832	UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3833	UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3834	UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3835	NULL,
3836
3837	NULL,
3838	_ISO2022getName,
3839	_ISO_2022_WriteSub,
3840	_ISO_2022_SafeClone,
3841	_ISO_2022_GetUnicodeSet,
3842
3843	NULL,
3844	NULL
3845	};
3846	static const UConverterStaticData _ISO2022JPStaticData={
3847	sizeof(UConverterStaticData),
3848	"ISO_2022_JP",
3849	`0`,
3850	UCNV_IBM,
3851	UCNV_ISO_2022,
3852	`1`,
3853	`6`, / max 6 bytes per UChar: 4-byte escape sequence + DBCS /
3854	{ `0x1a`, `0`, `0`, `0` },
3855	`1`,
3856	FALSE,
3857	FALSE,
3858	`0`,
3859	`0`,
3860	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
3861	};
3862
3863	namespace {
3864
3865	const UConverterSharedData _ISO2022JPData=
3866	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3867
3868	} // namespace
3869
3870	#if !UCONFIG_ONLY_HTML_CONVERSION
3871	/********** KR ************/
3872	static const UConverterImpl _ISO2022KRImpl={
3873	UCNV_ISO_2022,
3874
3875	NULL,
3876	NULL,
3877
3878	_ISO2022Open,
3879	_ISO2022Close,
3880	_ISO2022Reset,
3881
3882	UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3883	UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3884	UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3885	UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3886	NULL,
3887
3888	NULL,
3889	_ISO2022getName,
3890	_ISO_2022_WriteSub,
3891	_ISO_2022_SafeClone,
3892	_ISO_2022_GetUnicodeSet,
3893
3894	NULL,
3895	NULL
3896	};
3897	static const UConverterStaticData _ISO2022KRStaticData={
3898	sizeof(UConverterStaticData),
3899	"ISO_2022_KR",
3900	`0`,
3901	UCNV_IBM,
3902	UCNV_ISO_2022,
3903	`1`,
3904	`8`, / max 8 bytes per UChar /
3905	{ `0x1a`, `0`, `0`, `0` },
3906	`1`,
3907	FALSE,
3908	FALSE,
3909	`0`,
3910	`0`,
3911	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
3912	};
3913
3914	namespace {
3915
3916	const UConverterSharedData _ISO2022KRData=
3917	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3918
3919	} // namespace
3920
3921	/************ CN ************/
3922	static const UConverterImpl _ISO2022CNImpl={
3923
3924	UCNV_ISO_2022,
3925
3926	NULL,
3927	NULL,
3928
3929	_ISO2022Open,
3930	_ISO2022Close,
3931	_ISO2022Reset,
3932
3933	UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3934	UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3935	UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3936	UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3937	NULL,
3938
3939	NULL,
3940	_ISO2022getName,
3941	_ISO_2022_WriteSub,
3942	_ISO_2022_SafeClone,
3943	_ISO_2022_GetUnicodeSet,
3944
3945	NULL,
3946	NULL
3947	};
3948	static const UConverterStaticData _ISO2022CNStaticData={
3949	sizeof(UConverterStaticData),
3950	"ISO_2022_CN",
3951	`0`,
3952	UCNV_IBM,
3953	UCNV_ISO_2022,
3954	`1`,
3955	`8`, / max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS /
3956	{ `0x1a`, `0`, `0`, `0` },
3957	`1`,
3958	FALSE,
3959	FALSE,
3960	`0`,
3961	`0`,
3962	{ `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0` } / reserved /
3963	};
3964
3965	namespace {
3966
3967	const UConverterSharedData _ISO2022CNData=
3968	UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3969
3970	} // namespace
3971	#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3972
3973	#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3974

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnv2022.cpp