1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2000-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* file name: ucnv2022.cpp
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2000feb03
14* created by: Markus W. Scherer
15*
16* Change history:
17*
18* 06/29/2000 helena Major rewrite of the callback APIs.
19* 08/08/2000 Ram Included support for ISO-2022-JP-2
20* Changed implementation of toUnicode
21* function
22* 08/21/2000 Ram Added support for ISO-2022-KR
* 08/29/2000 Ram Separated implementation of EBCDIC to
24* ucnvebdc.c
25* 09/20/2000 Ram Added support for ISO-2022-CN
26* Added implementations for getNextUChar()
27* for specific 2022 country variants.
28* 10/31/2000 Ram Implemented offsets logic functions
29*/
30
31#include "unicode/utypes.h"
32
33#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35#include "unicode/ucnv.h"
36#include "unicode/uset.h"
37#include "unicode/ucnv_err.h"
38#include "unicode/ucnv_cb.h"
39#include "unicode/utf16.h"
40#include "ucnv_imp.h"
41#include "ucnv_bld.h"
42#include "ucnv_cnv.h"
43#include "ucnvmbcs.h"
44#include "cstring.h"
45#include "cmemory.h"
46#include "uassert.h"
47
48#ifdef U_ENABLE_GENERIC_ISO_2022
49/*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78#endif
79
80#if !UCONFIG_ONLY_HTML_CONVERSION
81static const char SHIFT_IN_STR[] = "\x0F";
82// static const char SHIFT_OUT_STR[] = "\x0E";
83#endif
84
85#define CR 0x0D
86#define LF 0x0A
87#define H_TAB 0x09
88#define V_TAB 0x0B
89#define SPACE 0x20
90
/* First and last code point of the range U+FF61..U+FF9F (halfwidth Katakana). */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
95
96/*
97 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98 * as bytes 21..7E. (Subtract 0x80.)
99 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100 * as bytes 20..7F. (Subtract 0x80.)
101 * Do not encode C1 control codes with native bytes 80..9F
102 * as bytes 00..1F (C0 control codes).
103 */
/* Native (GR) byte ranges of 94-character and 96-character sets. */
enum {
    /* 94-character sets: A1..FE */
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    /* 96-character sets: A0..FF */
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
110
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range check compares the argument as uint32_t so that a negative value
 * is rejected before it can be used as a shift count (shifting by a negative
 * amount is undefined behavior).
 * Note: the argument is evaluated twice; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the same small values.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* JP */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * The per-plane values below are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
157
/*
 * Is the StateEnum charset value for a DBCS charset?
 * With UCONFIG_ONLY_HTML_CONVERSION only JISX208 qualifies;
 * otherwise every value in the range JISX208..KSC5601 does.
 */
#if !UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
#else
#define IS_JP_DBCS(cs) ((cs)==JISX208)
#endif
164
165#define CSM(cs) ((uint16_t)1<<(cs))
166
167/*
168 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170 *
171 * Note: The converter uses some leniency:
172 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173 * all versions, not just JIS7 and JIS8.
174 * - ICU does not distinguish between different versions of JIS X 0208.
175 */
176#if UCONFIG_ONLY_HTML_CONVERSION
177enum { MAX_JA_VERSION=0 };
178#else
179enum { MAX_JA_VERSION=4 };
180#endif
181static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183#if !UCONFIG_ONLY_HTML_CONVERSION
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188#endif
189};
190
/* Type tags stored in UConverterDataISO2022::currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
}Cnv2022Type;
199
/* Designation/shift state; one instance each is kept for toUnicode and fromUnicode. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
205
206#define UCNV_OPTIONS_VERSION_MASK 0xf
207#define UCNV_2022_MAX_CONVERTERS 10
208
209typedef struct{
210 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211 UConverter *currentConverter;
212 Cnv2022Type currentType;
213 ISO2022State toU2022State, fromU2022State;
214 uint32_t key;
215 uint32_t version;
216#ifdef U_ENABLE_GENERIC_ISO_2022
217 UBool isFirstBuffer;
218#endif
219 UBool isEmptySegment;
220 char name[30];
221 char locale[3];
222}UConverterDataISO2022;
223
224/* Protos */
225/* ISO-2022 ----------------------------------------------------------------- */
226
227/*Forward declaration */
228U_CFUNC void U_CALLCONV
229ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230 UErrorCode * err);
231U_CFUNC void U_CALLCONV
232ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233 UErrorCode * err);
234
235#define ESC_2022 0x1B /*ESC*/
236
/* Result of matching the bytes collected so far against the escape sequence table. */
typedef enum
{
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244
245/*
246* The way these state transition arrays work is:
247* ex : ESC$B is the sequence for JISX208
248* a) First Iteration: char is ESC
249* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250* int x = normalize_esq_chars_2022[27] which is equal to 1
251* ii) Search for this value in escSeqStateTable_Key_2022[]
252* value of x is stored at escSeqStateTable_Key_2022[0]
253* iii) Save this index as offset
254* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256* b) Switch on this state and continue to next char
257* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258* which is normalize_esq_chars_2022[36] == 4
259* ii) x is currently 1(from above)
260* x<<=5 -- x is now 32
261* x+=normalize_esq_chars_2022[36]
262* now x is 36
263* iii) Search for this value in escSeqStateTable_Key_2022[]
264* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267* c) Switch on this state and continue to next char
268* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269* ii) x is currently 36 (from above)
270* x<<=5 -- x is now 1152
271* x+=normalize_esq_chars_2022[66]
272* now x is 1161
* iii) Search for this value in escSeqStateTable_Key_2022[]
* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278*/
279
280
281/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte to the small code (1..29) that getKey_2022() folds into the
 * escape sequence key; 0 marks a byte that cannot occur in any recognized
 * escape sequence.
 * Only bytes 0x00..0x5F are listed; all remaining elements are implicitly
 * zero-initialized.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 2x */     0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 3x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */     5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 5x */     0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0
};
312
313#ifdef U_ENABLE_GENERIC_ISO_2022
314/*
315 * When the generic ISO-2022 converter is completely removed, not just disabled
316 * per #ifdef, then the following state table and the associated tables that are
317 * dimensioned with MAX_STATES_2022 should be trimmed.
318 *
319 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320 * the associated escape sequences starting with ESC ( B should be removed.
321 * This includes the ones with key values 1097 and all of the ones above 1000000.
322 *
323 * For the latter, the tables can simply be truncated.
324 * For the former, since the tables must be kept parallel, it is probably best
325 * to simply duplicate an adjacent table cell, parallel in all tables.
326 *
327 * It may make sense to restructure the tables, especially by using small search
328 * tables for the variants instead of indexing them parallel to the table here.
329 */
330#endif
331
#define MAX_STATES_2022 74
/*
 * Sorted keys of all recognized escape sequences and their prefixes.
 * A key is built by getKey_2022() as (previous key << 5) + code of the next byte,
 * so the magnitude of a key reflects the number of bytes consumed so far.
 * The table must stay in ascending order because it is binary-searched, and
 * parallel to the other tables dimensioned with MAX_STATES_2022.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /* [0]       1 byte  */
    1,
    /* [1..7]    2 bytes */
    34, 36, 39, 55, 57, 60, 61,
    /* [8..32]   3 bytes */
    1093, 1096, 1097, 1098, 1099, 1100, 1101, 1102,
    1103, 1104, 1105, 1106, 1109, 1154, 1157, 1160,
    1161, 1176, 1178, 1179, 1254, 1257, 1768, 1773,
    1957,
    /* [33..62]  4 bytes */
    35105, 36933, 36936, 36937, 36938, 36939, 36940, 36942,
    36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642,
    37644, 37646, 37711, 37744, 37745, 37746, 37747, 37748,
    40133, 40136, 40138, 40139, 40140, 40141,
    /* [63]      5 bytes */
    1123363,
    /* [64..73]  6 bytes */
    35947624, 35947625, 35947626, 35947627, 35947629,
    35947630, 35947631, 35947635, 35947636, 35947638
};
345
346#ifdef U_ENABLE_GENERIC_ISO_2022
347
348static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349 /* 0 1 2 3 4 5 6 7 8 9 */
350
351 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
352 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
353 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
354 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
355 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
356 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
358 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
359};
360
361#endif
362
363static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364/* 0 1 2 3 4 5 6 7 8 9 */
365 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
366 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
372 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
373};
374
/* Identifies the ISO-2022 variant; passed to changeState_2022(). */
typedef enum{
#if defined(U_ENABLE_GENERIC_ISO_2022)
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386
387/*********** ISO 2022 Converter Protos ***********/
388static void U_CALLCONV
389_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391static void U_CALLCONV
392 _ISO2022Close(UConverter *converter);
393
394static void U_CALLCONV
395_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397U_CDECL_BEGIN
398static const char * U_CALLCONV
399_ISO2022getName(const UConverter* cnv);
400U_CDECL_END
401
402static void U_CALLCONV
403_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405U_CDECL_BEGIN
406static UConverter * U_CALLCONV
407_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409U_CDECL_END
410
411#ifdef U_ENABLE_GENERIC_ISO_2022
412static void U_CALLCONV
413T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414#endif
415
416namespace {
417
418/*const UConverterSharedData _ISO2022Data;*/
419extern const UConverterSharedData _ISO2022JPData;
420
421#if !UCONFIG_ONLY_HTML_CONVERSION
422extern const UConverterSharedData _ISO2022KRData;
423extern const UConverterSharedData _ISO2022CNData;
424#endif
425
426} // namespace
427
428/*************** Converter implementations ******************/
429
430/* The purpose of this function is to get around gcc compiler warnings. */
431static inline void
432fromUWriteUInt8(UConverter *cnv,
433 const char *bytes, int32_t length,
434 uint8_t **target, const char *targetLimit,
435 int32_t **offsets,
436 int32_t sourceIndex,
437 UErrorCode *pErrorCode)
438{
439 char *targetChars = (char *)*target;
440 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441 offsets, sourceIndex, pErrorCode);
442 *target = (uint8_t*)targetChars;
443
444}
445
446static inline void
447setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448 if(myConverterData->version == 1) {
449 UConverter *cnv = myConverterData->currentConverter;
450
451 cnv->toUnicodeStatus=0; /* offset */
452 cnv->mode=0; /* state */
453 cnv->toULength=0; /* byteIndex */
454 }
455}
456
457static inline void
458setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459 /* in ISO-2022-KR the designator sequence appears only once
460 * in a file so we append it only once
461 */
462 if( converter->charErrorBufferLength==0){
463
464 converter->charErrorBufferLength = 4;
465 converter->charErrorBuffer[0] = 0x1b;
466 converter->charErrorBuffer[1] = 0x24;
467 converter->charErrorBuffer[2] = 0x29;
468 converter->charErrorBuffer[3] = 0x43;
469 }
470 if(myConverterData->version == 1) {
471 UConverter *cnv = myConverterData->currentConverter;
472
473 cnv->fromUChar32=0;
474 cnv->fromUnicodeStatus=1; /* prevLength */
475 }
476}
477
478static void U_CALLCONV
479_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481 char myLocale[6]={' ',' ',' ',' ',' ',' '};
482
483 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484 if(cnv->extraInfo != NULL) {
485 UConverterNamePieces stackPieces;
486 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488 uint32_t version;
489
490 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493 myConverterData->currentType = ASCII1;
494 cnv->fromUnicodeStatus =FALSE;
495 if(pArgs->locale){
496 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
497 }
498 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499 myConverterData->version = version;
500 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501 (myLocale[2]=='_' || myLocale[2]=='\0'))
502 {
503 /* open the required converters and cache them */
504 if(version>MAX_JA_VERSION) {
505 // ICU 55 fails to open a converter for an unsupported version.
506 // Previously, it fell back to version 0, but that would yield
507 // unexpected behavior.
508 *errorCode = U_MISSING_RESOURCE_ERROR;
509 return;
510 }
511 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512 myConverterData->myConverterArray[ISO8859_7] =
513 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514 }
515 myConverterData->myConverterArray[JISX208] =
516 ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode);
517 if(jpCharsetMasks[version]&CSM(JISX212)) {
518 myConverterData->myConverterArray[JISX212] =
519 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520 }
521 if(jpCharsetMasks[version]&CSM(GB2312)) {
522 myConverterData->myConverterArray[GB2312] =
523 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
524 }
525 if(jpCharsetMasks[version]&CSM(KSC5601)) {
526 myConverterData->myConverterArray[KSC5601] =
527 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528 }
529
530 /* set the function pointers to appropriate funtions */
531 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532 uprv_strcpy(myConverterData->locale,"ja");
533
534 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535 size_t len = uprv_strlen(myConverterData->name);
536 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537 myConverterData->name[len+1]='\0';
538 }
539#if !UCONFIG_ONLY_HTML_CONVERSION
540 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541 (myLocale[2]=='_' || myLocale[2]=='\0'))
542 {
543 if(version>1) {
544 // ICU 55 fails to open a converter for an unsupported version.
545 // Previously, it fell back to version 0, but that would yield
546 // unexpected behavior.
547 *errorCode = U_MISSING_RESOURCE_ERROR;
548 return;
549 }
550 const char *cnvName;
551 if(version==1) {
552 cnvName="icu-internal-25546";
553 } else {
554 cnvName="ibm-949";
555 myConverterData->version=version=0;
556 }
557 if(pArgs->onlyTestIsLoadable) {
558 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
559 uprv_free(cnv->extraInfo);
560 cnv->extraInfo=NULL;
561 return;
562 } else {
563 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564 if (U_FAILURE(*errorCode)) {
565 _ISO2022Close(cnv);
566 return;
567 }
568
569 if(version==1) {
570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573 }else{
574 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575 }
576
577 /* initialize the state variables */
578 setInitialStateToUnicodeKR(cnv, myConverterData);
579 setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581 /* set the function pointers to appropriate funtions */
582 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583 uprv_strcpy(myConverterData->locale,"ko");
584 }
585 }
586 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587 (myLocale[2]=='_' || myLocale[2]=='\0'))
588 {
589 if(version>2) {
590 // ICU 55 fails to open a converter for an unsupported version.
591 // Previously, it fell back to version 0, but that would yield
592 // unexpected behavior.
593 *errorCode = U_MISSING_RESOURCE_ERROR;
594 return;
595 }
596
597 /* open the required converters and cache them */
598 myConverterData->myConverterArray[GB2312_1] =
599 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600 if(version==1) {
601 myConverterData->myConverterArray[ISO_IR_165] =
602 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603 }
604 myConverterData->myConverterArray[CNS_11643] =
605 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608 /* set the function pointers to appropriate funtions */
609 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610 uprv_strcpy(myConverterData->locale,"cn");
611
612 if (version==0){
613 myConverterData->version = 0;
614 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615 }else if (version==1){
616 myConverterData->version = 1;
617 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618 }else {
619 myConverterData->version = 2;
620 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621 }
622 }
623#endif // !UCONFIG_ONLY_HTML_CONVERSION
624 else{
625#ifdef U_ENABLE_GENERIC_ISO_2022
626 myConverterData->isFirstBuffer = TRUE;
627
628 /* append the UTF-8 escape sequence */
629 cnv->charErrorBufferLength = 3;
630 cnv->charErrorBuffer[0] = 0x1b;
631 cnv->charErrorBuffer[1] = 0x25;
632 cnv->charErrorBuffer[2] = 0x42;
633
634 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635 /* initialize the state variables */
636 uprv_strcpy(myConverterData->name,"ISO_2022");
637#else
638 *errorCode = U_MISSING_RESOURCE_ERROR;
639 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640 // data loading error code.
641 return;
642#endif
643 }
644
645 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648 _ISO2022Close(cnv);
649 }
650 } else {
651 *errorCode = U_MEMORY_ALLOCATION_ERROR;
652 }
653}
654
655
656static void U_CALLCONV
657_ISO2022Close(UConverter *converter) {
658 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659 UConverterSharedData **array = myData->myConverterArray;
660 int32_t i;
661
662 if (converter->extraInfo != NULL) {
663 /*close the array of converter pointers and free the memory*/
664 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665 if(array[i]!=NULL) {
666 ucnv_unloadSharedDataIfReady(array[i]);
667 }
668 }
669
670 ucnv_close(myData->currentConverter);
671
672 if(!converter->isExtraLocal){
673 uprv_free (converter->extraInfo);
674 converter->extraInfo = NULL;
675 }
676 }
677}
678
679static void U_CALLCONV
680_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
682 if(choice<=UCNV_RESET_TO_UNICODE) {
683 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684 myConverterData->key = 0;
685 myConverterData->isEmptySegment = FALSE;
686 }
687 if(choice!=UCNV_RESET_TO_UNICODE) {
688 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689 }
690#ifdef U_ENABLE_GENERIC_ISO_2022
691 if(myConverterData->locale[0] == 0){
692 if(choice<=UCNV_RESET_TO_UNICODE) {
693 myConverterData->isFirstBuffer = TRUE;
694 myConverterData->key = 0;
695 if (converter->mode == UCNV_SO){
696 ucnv_close (myConverterData->currentConverter);
697 myConverterData->currentConverter=NULL;
698 }
699 converter->mode = UCNV_SI;
700 }
701 if(choice!=UCNV_RESET_TO_UNICODE) {
702 /* re-append UTF-8 escape sequence */
703 converter->charErrorBufferLength = 3;
704 converter->charErrorBuffer[0] = 0x1b;
705 converter->charErrorBuffer[1] = 0x28;
706 converter->charErrorBuffer[2] = 0x42;
707 }
708 }
709 else
710#endif
711 {
712 /* reset the state variables */
713 if(myConverterData->locale[0] == 'k'){
714 if(choice<=UCNV_RESET_TO_UNICODE) {
715 setInitialStateToUnicodeKR(converter, myConverterData);
716 }
717 if(choice!=UCNV_RESET_TO_UNICODE) {
718 setInitialStateFromUnicodeKR(converter, myConverterData);
719 }
720 }
721 }
722}
723
724U_CDECL_BEGIN
725
726static const char * U_CALLCONV
727_ISO2022getName(const UConverter* cnv){
728 if(cnv->extraInfo){
729 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730 return myData->name;
731 }
732 return NULL;
733}
734
735U_CDECL_END
736
737
738/*************** to unicode *******************/
739/****************************************************************************
740 * Recognized escape sequences are
741 * <ESC>(B ASCII
742 * <ESC>.A ISO-8859-1
743 * <ESC>.F ISO-8859-7
744 * <ESC>(J JISX-201
745 * <ESC>(I JISX-201
746 * <ESC>$B JISX-208
747 * <ESC>$@ JISX-208
748 * <ESC>$(D JISX-212
749 * <ESC>$A GB2312
750 * <ESC>$(C KSC5601
751 */
752static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753/* 0 1 2 3 4 5 6 7 8 9 */
754 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
755 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
756 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
757 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
762};
763
764#if !UCONFIG_ONLY_HTML_CONVERSION
765/*************** to unicode *******************/
766static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767/* 0 1 2 3 4 5 6 7 8 9 */
768 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
769 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
770 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
771 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
772 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
773 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
774 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
775 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
776};
777#endif
778
779
780static UCNV_TableStates_2022
781getKey_2022(char c,int32_t* key,int32_t* offset){
782 int32_t togo;
783 int32_t low = 0;
784 int32_t hi = MAX_STATES_2022;
785 int32_t oldmid=0;
786
787 togo = normalize_esq_chars_2022[(uint8_t)c];
788 if(togo == 0) {
789 /* not a valid character anywhere in an escape sequence */
790 *key = 0;
791 *offset = 0;
792 return INVALID_2022;
793 }
794 togo = (*key << 5) + togo;
795
796 while (hi != low) /*binary search*/{
797
798 int32_t mid = (hi+low) >> 1; /*Finds median*/
799
800 if (mid == oldmid)
801 break;
802
803 if (escSeqStateTable_Key_2022[mid] > togo){
804 hi = mid;
805 }
806 else if (escSeqStateTable_Key_2022[mid] < togo){
807 low = mid;
808 }
809 else /*we found it*/{
810 *key = togo;
811 *offset = mid;
812 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
813 }
814 oldmid = mid;
815
816 }
817
818 *key = 0;
819 *offset = 0;
820 return INVALID_2022;
821}
822
/*runs through a state machine to determine the escape sequence - codepage correspondence
 */
825static void
826changeState_2022(UConverter* _this,
827 const char** source,
828 const char* sourceLimit,
829 Variant2022 var,
830 UErrorCode* err){
831 UCNV_TableStates_2022 value;
832 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833 uint32_t key = myData2022->key;
834 int32_t offset = 0;
835 int8_t initialToULength = _this->toULength;
836 char c;
837
838 value = VALID_NON_TERMINAL_2022;
839 while (*source < sourceLimit) {
840 c = *(*source)++;
841 _this->toUBytes[_this->toULength++]=(uint8_t)c;
842 value = getKey_2022(c,(int32_t *) &key, &offset);
843
844 switch (value){
845
846 case VALID_NON_TERMINAL_2022 :
847 /* continue with the loop */
848 break;
849
850 case VALID_TERMINAL_2022:
851 key = 0;
852 goto DONE;
853
854 case INVALID_2022:
855 goto DONE;
856
857 case VALID_MAYBE_TERMINAL_2022:
858#ifdef U_ENABLE_GENERIC_ISO_2022
859 /* ESC ( B is ambiguous only for ISO_2022 itself */
860 if(var == ISO_2022) {
861 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862 _this->toULength = 0;
863
864 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866 /* continue with the loop */
867 value = VALID_NON_TERMINAL_2022;
868 break;
869 } else
870#endif
871 {
872 /* not ISO_2022 itself, finish here */
873 value = VALID_TERMINAL_2022;
874 key = 0;
875 goto DONE;
876 }
877 }
878 }
879
880DONE:
881 myData2022->key = key;
882
883 if (value == VALID_NON_TERMINAL_2022) {
884 /* indicate that the escape sequence is incomplete: key!=0 */
885 return;
886 } else if (value == INVALID_2022 ) {
887 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888 } else /* value == VALID_TERMINAL_2022 */ {
889 switch(var){
890#ifdef U_ENABLE_GENERIC_ISO_2022
891 case ISO_2022:
892 {
893 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894 if(chosenConverterName == NULL) {
895 /* SS2 or SS3 */
896 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897 _this->toUCallbackReason = UCNV_UNASSIGNED;
898 return;
899 }
900
901 _this->mode = UCNV_SI;
902 ucnv_close(myData2022->currentConverter);
903 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904 if(U_SUCCESS(*err)) {
905 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906 _this->mode = UCNV_SO;
907 }
908 break;
909 }
910#endif
911 case ISO_2022_JP:
912 {
913 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914 switch(tempState) {
915 case INVALID_STATE:
916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917 break;
918 case SS2_STATE:
919 if(myData2022->toU2022State.cs[2]!=0) {
920 if(myData2022->toU2022State.g<2) {
921 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922 }
923 myData2022->toU2022State.g=2;
924 } else {
925 /* illegal to have SS2 before a matching designator */
926 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927 }
928 break;
929 /* case SS3_STATE: not used in ISO-2022-JP-x */
930 case ISO8859_1:
931 case ISO8859_7:
932 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934 } else {
935 /* G2 charset for SS2 */
936 myData2022->toU2022State.cs[2]=(int8_t)tempState;
937 }
938 break;
939 default:
940 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942 } else {
943 /* G0 charset */
944 myData2022->toU2022State.cs[0]=(int8_t)tempState;
945 }
946 break;
947 }
948 }
949 break;
950#if !UCONFIG_ONLY_HTML_CONVERSION
951 case ISO_2022_CN:
952 {
953 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954 switch(tempState) {
955 case INVALID_STATE:
956 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957 break;
958 case SS2_STATE:
959 if(myData2022->toU2022State.cs[2]!=0) {
960 if(myData2022->toU2022State.g<2) {
961 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962 }
963 myData2022->toU2022State.g=2;
964 } else {
965 /* illegal to have SS2 before a matching designator */
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967 }
968 break;
969 case SS3_STATE:
970 if(myData2022->toU2022State.cs[3]!=0) {
971 if(myData2022->toU2022State.g<2) {
972 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973 }
974 myData2022->toU2022State.g=3;
975 } else {
976 /* illegal to have SS3 before a matching designator */
977 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978 }
979 break;
980 case ISO_IR_165:
981 if(myData2022->version==0) {
982 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983 break;
984 }
985 U_FALLTHROUGH;
986 case GB2312_1:
987 U_FALLTHROUGH;
988 case CNS_11643_1:
989 myData2022->toU2022State.cs[1]=(int8_t)tempState;
990 break;
991 case CNS_11643_2:
992 myData2022->toU2022State.cs[2]=(int8_t)tempState;
993 break;
994 default:
995 /* other CNS 11643 planes */
996 if(myData2022->version==0) {
997 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998 } else {
999 myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000 }
1001 break;
1002 }
1003 }
1004 break;
1005 case ISO_2022_KR:
1006 if(offset==0x30){
1007 /* nothing to be done, just accept this one escape sequence */
1008 } else {
1009 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010 }
1011 break;
1012#endif // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014 default:
1015 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016 break;
1017 }
1018 }
1019 if(U_SUCCESS(*err)) {
1020 _this->toULength = 0;
1021 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022 if(_this->toULength>1) {
1023 /*
1024 * Ticket 5691: consistent illegal sequences:
1025 * - We include at least the first byte (ESC) in the illegal sequence.
1026 * - If any of the non-initial bytes could be the start of a character,
1027 * we stop the illegal sequence before the first one of those.
1028 * In escape sequences, all following bytes are "printable", that is,
1029 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030 * they are valid single/lead bytes.
1031 * For simplicity, we always only report the initial ESC byte as the
1032 * illegal sequence and back out all other bytes we looked at.
1033 */
1034 /* Back out some bytes. */
1035 int8_t backOutDistance=_this->toULength-1;
1036 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037 if(backOutDistance<=bytesFromThisBuffer) {
1038 /* same as initialToULength<=1 */
1039 *source-=backOutDistance;
1040 } else {
1041 /* Back out bytes from the previous buffer: Need to replay them. */
1042 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043 /* same as -(initialToULength-1) */
1044 /* preToULength is negative! */
1045 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046 *source-=bytesFromThisBuffer;
1047 }
1048 _this->toULength=1;
1049 }
1050 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051 _this->toUCallbackReason = UCNV_UNASSIGNED;
1052 }
1053}
1054
1055#if !UCONFIG_ONLY_HTML_CONVERSION
1056/*Checks the characters of the buffer against valid 2022 escape sequences
1057*if the match we return a pointer to the initial start of the sequence otherwise
1058*we return sourceLimit
1059*/
1060/*for 2022 looks ahead in the stream
1061 *to determine the longest possible convertible
1062 *data stream
1063 */
1064static inline const char*
1065getEndOfBuffer_2022(const char** source,
1066 const char* sourceLimit,
1067 UBool /*flush*/){
1068
1069 const char* mySource = *source;
1070
1071#ifdef U_ENABLE_GENERIC_ISO_2022
1072 if (*source >= sourceLimit)
1073 return sourceLimit;
1074
1075 do{
1076
1077 if (*mySource == ESC_2022){
1078 int8_t i;
1079 int32_t key = 0;
1080 int32_t offset;
1081 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083 /* Kludge: I could not
1084 * figure out the reason for validating an escape sequence
1085 * twice - once here and once in changeState_2022().
1086 * is it possible to have an ESC character in a ISO2022
1087 * byte stream which is valid in a code page? Is it legal?
1088 */
1089 for (i=0;
1090 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091 i++) {
1092 value = getKey_2022(*(mySource+i), &key, &offset);
1093 }
1094 if (value > 0 || *mySource==ESC_2022)
1095 return mySource;
1096
1097 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098 return sourceLimit;
1099 }
1100 }while (++mySource < sourceLimit);
1101
1102 return sourceLimit;
1103#else
1104 while(mySource < sourceLimit && *mySource != ESC_2022) {
1105 ++mySource;
1106 }
1107 return mySource;
1108#endif
1109}
1110#endif
1111
1112/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113 * any future change in _MBCSFromUChar32() function should be reflected here.
1114 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115 */
1116static inline int32_t
1117MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118 UChar32 c,
1119 uint32_t* value,
1120 UBool useFallback,
1121 int outputType)
1122{
1123 const int32_t *cx;
1124 const uint16_t *table;
1125 uint32_t stage2Entry;
1126 uint32_t myValue;
1127 int32_t length;
1128 const uint8_t *p;
1129 /*
1130 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131 * Use internal version of ucnv_open() that verifies that the new structures are available,
1132 * else U_INTERNAL_PROGRAM_ERROR.
1133 */
1134 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136 table=sharedData->mbcs.fromUnicodeTable;
1137 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138 /* get the bytes and the length for the output */
1139 if(outputType==MBCS_OUTPUT_2){
1140 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141 if(myValue<=0xff) {
1142 length=1;
1143 } else {
1144 length=2;
1145 }
1146 } else /* outputType==MBCS_OUTPUT_3 */ {
1147 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149 if(myValue<=0xff) {
1150 length=1;
1151 } else if(myValue<=0xffff) {
1152 length=2;
1153 } else {
1154 length=3;
1155 }
1156 }
1157 /* is this code point assigned, or do we use fallbacks? */
1158 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159 /* assigned */
1160 *value=myValue;
1161 return length;
1162 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163 /*
1164 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165 * There is no way with this data structure for fallback output
1166 * to be a zero byte.
1167 */
1168 *value=myValue;
1169 return -length;
1170 }
1171 }
1172
1173 cx=sharedData->mbcs.extIndexes;
1174 if(cx!=NULL) {
1175 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176 }
1177
1178 /* unassigned */
1179 return 0;
1180}
1181
1182/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184 * @param retval pointer to output byte
1185 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1186 */
1187static inline int32_t
1188MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189 UChar32 c,
1190 uint32_t* retval,
1191 UBool useFallback)
1192{
1193 const uint16_t *table;
1194 int32_t value;
1195 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197 return 0;
1198 }
1199 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200 table=sharedData->mbcs.fromUnicodeTable;
1201 /* get the byte for the output */
1202 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203 /* is this code point assigned, or do we use fallbacks? */
1204 *retval=(uint32_t)(value&0xff);
1205 if(value>=0xf00) {
1206 return 1; /* roundtrip */
1207 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208 return -1; /* fallback taken */
1209 } else {
1210 return 0; /* no mapping */
1211 }
1212}
1213
/*
 * Moves a strict EUC DBCS value into the ISO 2022 byte range.
 *
 * The value is accepted only if its lead byte and its trail byte are both in
 * the range A1..FE; 0x80 is then subtracted from each of the two bytes so
 * that they end up in the range 21..7E.
 * The range checks look at the low 16 bits of the value.
 *
 * @param value the 2-byte codepage value
 * @return value - 0x8080 if both bytes are in range, 0 otherwise
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance of the byte pair from A1A1, and of the trail byte from A1 */
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0; /* not valid for ISO 2022 */
    }
    return value - 0x8080; /* shift down to 21..7e byte range */
}
1230
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 *
 * This function is compiled out by the enclosing "#if 0".
 *
 * @param value the 2-byte ISO 2022 value
 * @return value + 0x8080 if that result has both bytes in A1..FE, else value itself
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* tentatively move both bytes up by 0x80, then validate the result */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1248
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter
 * (only compiled with U_ENABLE_GENERIC_ISO_2022).
 *
 * Loops over the input: while no escape sequence is in progress (key == 0),
 * the bytes up to the position returned by getEndOfBuffer_2022() are converted
 * with the current sub-converter (an "ASCII" converter is opened if there is
 * none yet); then changeState_2022() is called to process the escape sequence.
 * Overflow output and partial/invalid input bytes of the sub-converter are
 * copied into this converter before returning.
 *
 * @param args the conversion arguments (converter, source/target, offsets, flush)
 * @param err  receives the error code, if any
 */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    /* no sub-converter was selected yet: start with ASCII */
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in the arguments */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte or in the middle of an escape sequence: run the state machine */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1362
1363/*
1364 * To Unicode Callback helper function
1365 */
1366static void
1367toUnicodeCallback(UConverter *cnv,
1368 const uint32_t sourceChar, const uint32_t targetUniChar,
1369 UErrorCode* err){
1370 if(sourceChar>0xff){
1371 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372 cnv->toUBytes[1] = (uint8_t)sourceChar;
1373 cnv->toULength = 2;
1374 }
1375 else{
1376 cnv->toUBytes[0] =(char) sourceChar;
1377 cnv->toULength = 1;
1378 }
1379
1380 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381 *err = U_INVALID_CHAR_FOUND;
1382 }
1383 else{
1384 *err = U_ILLEGAL_CHAR_FOUND;
1385 }
1386}
1387
1388/**************************************ISO-2022-JP*************************************************/
1389
1390/************************************** IMPORTANT **************************************************
1391* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393* The converter iterates over each Unicode codepoint
1394* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395* processed one char at a time it would make sense to reduce the extra processing a canned converter
1396* would do as far as possible.
1397*
1398* If the implementation of these macros or structure of sharedData struct change in the future, make
1399* sure that ISO-2022 is also changed.
1400***************************************************************************************************
1401*/
1402
1403/***************************************************************************************************
1404* Rules for ISO-2022-jp encoding
1405* (i) Escape sequences must be fully contained within a line they should not
1406* span new lines or CRs
1407* (ii) If the last character on a line is represented by two bytes then an ASCII or
1408* JIS-Roman character escape sequence should follow before the line terminates
1409* (iii) If the first character on the line is represented by two bytes then a two
1410* byte character escape sequence should precede it
1411* (iv) If no escape sequence is encountered then the characters are ASCII
1412* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413* and invoked with SS2 (ESC N).
1414* (vi) If there is any G0 designation in text, there must be a switch to
1415* ASCII or to JIS X 0201-Roman before a space character (but not
1416* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417* characters such as tab or CRLF.
*   (vii) Supported encodings:
1419* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420*
1421* source : RFC-1554
1422*
1423* JISX201, JISX208,JISX212 : new .cnv data files created
1424* KSC5601 : alias to ibm-949 mapping table
1425* GB2312 : alias to ibm-1386 mapping table
1426* ISO-8859-1 : Algorithmic implemented as LATIN1 case
*     ISO-8859-7 : alias to ibm-9409 mapping table
1428*/
1429
/*
 * preference order of JP charsets
 *
 * The from-Unicode code first tries the charsets that are currently
 * designated and then walks this array in order, adding each charset that is
 * allowed for the converter version and has not been tried yet.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    JISX208,
    ISO8859_7,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1442
/*
 * Designation escape sequences, one row per charset.
 * The array is indexed with the charset's StateEnum value, therefore:
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * The number of bytes to copy from each row is in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Lengths in bytes of the escape sequences in escSeqChars[];
 * same order and same index (the charset's enum value) as that array.
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1470
1471/*
1472* The iteration over various code pages works this way:
1473* i) Get the currentState from myConverterData->currentState
1474* ii) Check if the character is mapped to a valid character in the currentState
1475* Yes -> a) set the initIterState to currentState
1476* b) remain in this state until an invalid character is found
1477* No -> a) go to the next code page and find the character
*   iii) Before changing the state, increment the current state and check if the current state
*        is equal to the initIterState
1480* Yes -> A character that cannot be represented in any of the supported encodings
1481* break and return a U_INVALID_CHARACTER error
1482* No -> Continue and find the character in next code page
1483*
1484*
1485* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486*/
1487
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 maps to 5C and U+203E maps to 7E; U+005C and U+007E themselves are
 * unmappable. All other code points up to U+007F map to themselves.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two positions are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1516
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed with (code point - HWKANA_START) and has one entry
 * for each code point from HWKANA_START to HWKANA_END; the code point
 * comments below mark every 16th entry. Each entry is a two-byte value
 * that the JISX208 case of the from-Unicode code emits as a fallback.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1589
1590static void U_CALLCONV
1591UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1592 UConverter *cnv = args->converter;
1593 UConverterDataISO2022 *converterData;
1594 ISO2022State *pFromU2022State;
1595 uint8_t *target = (uint8_t *) args->target;
1596 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1597 const UChar* source = args->source;
1598 const UChar* sourceLimit = args->sourceLimit;
1599 int32_t* offsets = args->offsets;
1600 UChar32 sourceChar;
1601 char buffer[8];
1602 int32_t len, outLen;
1603 int8_t choices[10];
1604 int32_t choiceCount;
1605 uint32_t targetValue = 0;
1606 UBool useFallback;
1607
1608 int32_t i;
1609 int8_t cs, g;
1610
1611 /* set up the state */
1612 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1613 pFromU2022State = &converterData->fromU2022State;
1614
1615 choiceCount = 0;
1616
1617 /* check if the last codepoint of previous buffer was a lead surrogate*/
1618 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1619 goto getTrail;
1620 }
1621
1622 while(source < sourceLimit) {
1623 if(target < targetLimit) {
1624
1625 sourceChar = *(source++);
1626 /*check if the char is a First surrogate*/
1627 if(U16_IS_SURROGATE(sourceChar)) {
1628 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1629getTrail:
1630 /*look ahead to find the trail surrogate*/
1631 if(source < sourceLimit) {
1632 /* test the following code unit */
1633 UChar trail=(UChar) *source;
1634 if(U16_IS_TRAIL(trail)) {
1635 source++;
1636 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1637 cnv->fromUChar32=0x00;
1638 /* convert this supplementary code point */
1639 /* exit this condition tree */
1640 } else {
1641 /* this is an unmatched lead code unit (1st surrogate) */
1642 /* callback(illegal) */
1643 *err=U_ILLEGAL_CHAR_FOUND;
1644 cnv->fromUChar32=sourceChar;
1645 break;
1646 }
1647 } else {
1648 /* no more input */
1649 cnv->fromUChar32=sourceChar;
1650 break;
1651 }
1652 } else {
1653 /* this is an unmatched trail code unit (2nd surrogate) */
1654 /* callback(illegal) */
1655 *err=U_ILLEGAL_CHAR_FOUND;
1656 cnv->fromUChar32=sourceChar;
1657 break;
1658 }
1659 }
1660
1661 /* do not convert SO/SI/ESC */
1662 if(IS_2022_CONTROL(sourceChar)) {
1663 /* callback(illegal) */
1664 *err=U_ILLEGAL_CHAR_FOUND;
1665 cnv->fromUChar32=sourceChar;
1666 break;
1667 }
1668
1669 /* do the conversion */
1670
1671 if(choiceCount == 0) {
1672 uint16_t csm;
1673
1674 /*
1675 * The csm variable keeps track of which charsets are allowed
1676 * and not used yet while building the choices[].
1677 */
1678 csm = jpCharsetMasks[converterData->version];
1679 choiceCount = 0;
1680
1681 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1682 if(converterData->version == 3 || converterData->version == 4) {
1683 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1684 }
1685 /* Do not try single-byte half-width Katakana for other versions. */
1686 csm &= ~CSM(HWKANA_7BIT);
1687
1688 /* try the current G0 charset */
1689 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1690 csm &= ~CSM(cs);
1691
1692 /* try the current G2 charset */
1693 if((cs = pFromU2022State->cs[2]) != 0) {
1694 choices[choiceCount++] = cs;
1695 csm &= ~CSM(cs);
1696 }
1697
1698 /* try all the other possible charsets */
1699 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1700 cs = (int8_t)jpCharsetPref[i];
1701 if(CSM(cs) & csm) {
1702 choices[choiceCount++] = cs;
1703 csm &= ~CSM(cs);
1704 }
1705 }
1706 }
1707
1708 cs = g = 0;
1709 /*
1710 * len==0: no mapping found yet
1711 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1712 * len>0: found a roundtrip result, done
1713 */
1714 len = 0;
1715 /*
1716 * We will turn off useFallback after finding a fallback,
1717 * but we still get fallbacks from PUA code points as usual.
1718 * Therefore, we will also need to check that we don't overwrite
1719 * an early fallback with a later one.
1720 */
1721 useFallback = cnv->useFallback;
1722
1723 for(i = 0; i < choiceCount && len <= 0; ++i) {
1724 uint32_t value;
1725 int32_t len2;
1726 int8_t cs0 = choices[i];
1727 switch(cs0) {
1728 case ASCII:
1729 if(sourceChar <= 0x7f) {
1730 targetValue = (uint32_t)sourceChar;
1731 len = 1;
1732 cs = cs0;
1733 g = 0;
1734 }
1735 break;
1736 case ISO8859_1:
1737 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1738 targetValue = (uint32_t)sourceChar - 0x80;
1739 len = 1;
1740 cs = cs0;
1741 g = 2;
1742 }
1743 break;
1744 case HWKANA_7BIT:
1745 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1746 if(converterData->version==3) {
1747 /* JIS7: use G1 (SO) */
1748 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1749 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1750 len = 1;
1751 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1752 g = 1;
1753 } else if(converterData->version==4) {
1754 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1755 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1756 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1757 len = 1;
1758
1759 cs = pFromU2022State->cs[0];
1760 if(IS_JP_DBCS(cs)) {
1761 /* switch from a DBCS charset to JISX201 */
1762 cs = (int8_t)JISX201;
1763 }
1764 /* else stay in the current G0 charset */
1765 g = 0;
1766 }
1767 /* else do not use HWKANA_7BIT with other versions */
1768 }
1769 break;
1770 case JISX201:
1771 /* G0 SBCS */
1772 value = jisx201FromU(sourceChar);
1773 if(value <= 0x7f) {
1774 targetValue = value;
1775 len = 1;
1776 cs = cs0;
1777 g = 0;
1778 useFallback = FALSE;
1779 }
1780 break;
1781 case JISX208:
1782 /* G0 DBCS from Shift-JIS table */
1783 len2 = MBCS_FROM_UCHAR32_ISO2022(
1784 converterData->myConverterArray[cs0],
1785 sourceChar, &value,
1786 useFallback, MBCS_OUTPUT_2);
1787 // Only accept DBCS char (abs(len2) == 2).
1788 // With EUC-JP table for JIS X 208, half-width Kana
1789 // represented with DBCS starting with 0x8E has to be
1790 // filtered out so that they can be converted with
1791 // hwkana_fb table.
1792 if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) {
1793 value &= 0x7F7F;
1794 if(value != 0) {
1795 targetValue = value;
1796 len = len2;
1797 cs = cs0;
1798 g = 0;
1799 useFallback = FALSE;
1800 }
1801 } else if(len == 0 && useFallback &&
1802 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1803 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1804 len = -2;
1805 cs = cs0;
1806 g = 0;
1807 useFallback = FALSE;
1808 }
1809 break;
1810 case ISO8859_7:
1811 /* G0 SBCS forced to 7-bit output */
1812 len2 = MBCS_SINGLE_FROM_UCHAR32(
1813 converterData->myConverterArray[cs0],
1814 sourceChar, &value,
1815 useFallback);
1816 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1817 targetValue = value - 0x80;
1818 len = len2;
1819 cs = cs0;
1820 g = 2;
1821 useFallback = FALSE;
1822 }
1823 break;
1824 default:
1825 /* G0 DBCS */
1826 len2 = MBCS_FROM_UCHAR32_ISO2022(
1827 converterData->myConverterArray[cs0],
1828 sourceChar, &value,
1829 useFallback, MBCS_OUTPUT_2);
1830 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1831 if(cs0 == KSC5601) {
1832 /*
1833 * Check for valid bytes for the encoding scheme.
1834 * This is necessary because the sub-converter (windows-949)
1835 * has a broader encoding scheme than is valid for 2022.
1836 */
1837 value = _2022FromGR94DBCS(value);
1838 if(value == 0) {
1839 break;
1840 }
1841 }
1842 targetValue = value;
1843 len = len2;
1844 cs = cs0;
1845 g = 0;
1846 useFallback = FALSE;
1847 }
1848 break;
1849 }
1850 }
1851
1852 if(len != 0) {
1853 if(len < 0) {
1854 len = -len; /* fallback */
1855 }
1856 outLen = 0; /* count output bytes */
1857
1858 /* write SI if necessary (only for JIS7) */
1859 if(pFromU2022State->g == 1 && g == 0) {
1860 buffer[outLen++] = UCNV_SI;
1861 pFromU2022State->g = 0;
1862 }
1863
1864 /* write the designation sequence if necessary */
1865 if(cs != pFromU2022State->cs[g]) {
1866 int32_t escLen = escSeqCharsLen[cs];
1867 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1868 outLen += escLen;
1869 pFromU2022State->cs[g] = cs;
1870
1871 /* invalidate the choices[] */
1872 choiceCount = 0;
1873 }
1874
1875 /* write the shift sequence if necessary */
1876 if(g != pFromU2022State->g) {
1877 switch(g) {
1878 /* case 0 handled before writing escapes */
1879 case 1:
1880 buffer[outLen++] = UCNV_SO;
1881 pFromU2022State->g = 1;
1882 break;
1883 default: /* case 2 */
1884 buffer[outLen++] = 0x1b;
1885 buffer[outLen++] = 0x4e;
1886 break;
1887 /* no case 3: no SS3 in ISO-2022-JP-x */
1888 }
1889 }
1890
1891 /* write the output bytes */
1892 if(len == 1) {
1893 buffer[outLen++] = (char)targetValue;
1894 } else /* len == 2 */ {
1895 buffer[outLen++] = (char)(targetValue >> 8);
1896 buffer[outLen++] = (char)targetValue;
1897 }
1898 } else {
1899 /*
1900 * if we cannot find the character after checking all codepages
1901 * then this is an error
1902 */
1903 *err = U_INVALID_CHAR_FOUND;
1904 cnv->fromUChar32=sourceChar;
1905 break;
1906 }
1907
1908 if(sourceChar == CR || sourceChar == LF) {
1909 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1910 pFromU2022State->cs[2] = 0;
1911 choiceCount = 0;
1912 }
1913
1914 /* output outLen>0 bytes in buffer[] */
1915 if(outLen == 1) {
1916 *target++ = buffer[0];
1917 if(offsets) {
1918 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1919 }
1920 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1921 *target++ = buffer[0];
1922 *target++ = buffer[1];
1923 if(offsets) {
1924 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1925 *offsets++ = sourceIndex;
1926 *offsets++ = sourceIndex;
1927 }
1928 } else {
1929 fromUWriteUInt8(
1930 cnv,
1931 buffer, outLen,
1932 &target, (const char *)targetLimit,
1933 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1934 err);
1935 if(U_FAILURE(*err)) {
1936 break;
1937 }
1938 }
1939 } /* end if(myTargetIndex<myTargetLength) */
1940 else{
1941 *err =U_BUFFER_OVERFLOW_ERROR;
1942 break;
1943 }
1944
1945 }/* end while(mySourceIndex<mySourceLength) */
1946
1947 /*
1948 * the end of the input stream and detection of truncated input
1949 * are handled by the framework, but for ISO-2022-JP conversion
1950 * we need to be in ASCII mode at the very end
1951 *
1952 * conditions:
1953 * successful
1954 * in SO mode or not in ASCII mode
1955 * end of input and no truncated input
1956 */
1957 if( U_SUCCESS(*err) &&
1958 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1959 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1960 ) {
1961 int32_t sourceIndex;
1962
1963 outLen = 0;
1964
1965 if(pFromU2022State->g != 0) {
1966 buffer[outLen++] = UCNV_SI;
1967 pFromU2022State->g = 0;
1968 }
1969
1970 if(pFromU2022State->cs[0] != ASCII) {
1971 int32_t escLen = escSeqCharsLen[ASCII];
1972 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1973 outLen += escLen;
1974 pFromU2022State->cs[0] = (int8_t)ASCII;
1975 }
1976
1977 /* get the source index of the last input character */
1978 /*
1979 * TODO this would be simpler and more reliable if we used a pair
1980 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1981 * so that we could simply use the prevSourceIndex here;
1982 * this code gives an incorrect result for the rare case of an unmatched
1983 * trail surrogate that is alone in the last buffer of the text stream
1984 */
1985 sourceIndex=(int32_t)(source-args->source);
1986 if(sourceIndex>0) {
1987 --sourceIndex;
1988 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1989 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1990 ) {
1991 --sourceIndex;
1992 }
1993 } else {
1994 sourceIndex=-1;
1995 }
1996
1997 fromUWriteUInt8(
1998 cnv,
1999 buffer, outLen,
2000 &target, (const char *)targetLimit,
2001 &offsets, sourceIndex,
2002 err);
2003 }
2004
2005 /*save the state and return */
2006 args->source = source;
2007 args->target = (char*)target;
2008}
2009
2010/*************** to unicode *******************/
2011
/*
 * Converts ISO-2022-JP family input (args->source .. args->sourceLimit) to
 * UTF-16 in args->target, writing one source offset per output code unit
 * when args->offsets is set.
 *
 * The designated charsets cs[] and the active G-set index g are read from and
 * written to myData->toU2022State. Conversion can resume from a previous call:
 * a non-zero myData->key means a partial escape sequence is pending, and
 * toULength == 1 means a DBCS lead byte was saved in toUBytes[0].
 *
 * On return, args->source and args->target are advanced past what was
 * consumed and produced. *err is set to U_BUFFER_OVERFLOW_ERROR when the
 * target is full, to U_ILLEGAL_ESCAPE_SEQUENCE for an empty segment in
 * version 0, or to whatever changeState_2022()/toUnicodeCallback() report.
 */
static void U_CALLCONV
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "no mapping" until one of the branches below finds one */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    /* JIS7: shift back in to G0 */
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                {
                    /* remember the position and pending length so the bytes of an empty-segment error can be counted */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* the CR/LF byte itself is converted by the default case below */
                U_FALLTHROUGH;
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    /* the 7-bit byte is mapped to the code point 0x80 higher */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek at the trail byte; it is consumed only in the branches below that ++mySource */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                            mySourceChar = tmpSourceChar;
                            if (cs == JISX208 || cs == KSC5601) {
                                tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                            }
                            tempBuf[0] = (char)(tmpSourceChar >> 8);
                            tempBuf[1] = (char)(tmpSourceChar);
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                    } else {
                        /* input ends after the lead byte: save it so the next call can resume at getTrailByte */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: the offset points at the first source byte of this character (1 or 2 bytes back) */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: store it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* save the positions reached so the caller can continue from here */
    args->target = myTarget;
    args->source = mySource;
}
2248
2249
2250#if !UCONFIG_ONLY_HTML_CONVERSION
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
2259static void U_CALLCONV
2260UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2261
2262 UConverter* saveConv = args->converter;
2263 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2264 args->converter=myConverterData->currentConverter;
2265
2266 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2267 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2268 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2269
2270 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2271 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2272 uprv_memcpy(
2273 saveConv->charErrorBuffer,
2274 myConverterData->currentConverter->charErrorBuffer,
2275 myConverterData->currentConverter->charErrorBufferLength);
2276 }
2277 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2278 myConverterData->currentConverter->charErrorBufferLength = 0;
2279 }
2280 args->converter=saveConv;
2281}
2282
/*
 * Converts UTF-16 input (args->source .. args->sourceLimit) to ISO-2022-KR
 * bytes in args->target, writing one source offset per output byte when
 * args->offsets is set.
 *
 * Version 1 is delegated to UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * Otherwise each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022(); SO or
 * SI is emitted whenever the output switches between double-byte and
 * single-byte, and double-byte results are written with 0x80 subtracted from
 * each byte. The single/double-byte mode is kept in
 * args->converter->fromUnicodeStatus between calls, and a pending lead
 * surrogate in args->converter->fromUChar32.
 *
 * On return, args->source and args->target are advanced. *err is set to
 * U_BUFFER_OVERFLOW_ERROR when the target is full, U_ILLEGAL_CHAR_FOUND for
 * SO/SI/ESC input or unmatched surrogates, and U_INVALID_CHAR_FOUND for
 * characters that cannot be mapped.
 */
static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): this repeats the read of fromUnicodeStatus three lines above and looks redundant */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* resume with a code point left over from the previous call, if there is room to write */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* append the shift sequence */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* room for this one byte was checked by the "target < targetLimit" test at the top of the loop */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the targetUniChar  to target */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        /* target full: keep the byte in the converter's overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte: each byte is written with 0x80 subtracted */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                /* the supplementary code point is not converted here; it is reported as unmappable */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the offending (or pending) code point and stop */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* emit the final shift-in (SHIFT_IN_STR) with the offset of the last input character */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
2499
2500/************************ To Unicode ***************************************/
2501
/*
 * To-Unicode conversion for the ISO-2022-KR variant that delegates the text
 * between escape sequences to the MBCS subconverter (myData->currentConverter).
 *
 * Each loop iteration converts the run of bytes up to the limit returned by
 * getEndOfBuffer_2022() with ucnv_MBCSToUnicodeWithOffsets(), copies the
 * partial-input bytes and overflow UChars between the public converter and the
 * subconverter, and then lets changeState_2022() process what follows. A
 * non-zero myData->key on entry means an escape sequence was left incomplete
 * by the previous call, so processing resumes at the escape handling.
 */
static void U_CALLCONV
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
                                                   UErrorCode* err){
    char const* sourceStart;
    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);

    UConverterToUnicodeArgs subArgs;
    int32_t minArgsSize;

    /* set up the subconverter arguments */
    /* copy no more than the smaller of the caller's declared struct size and our own */
    if(args->size<sizeof(UConverterToUnicodeArgs)) {
        minArgsSize = args->size;
    } else {
        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
    }

    uprv_memcpy(&subArgs, args, minArgsSize);
    subArgs.size = (uint16_t)minArgsSize;
    subArgs.converter = myData->currentConverter;

    /* remember the original start of the input for offsets */
    sourceStart = args->source;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    }

    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
        subArgs.source = args->source;
        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
        if(subArgs.source != subArgs.sourceLimit) {
            /*
             * get the current partial byte sequence
             *
             * it needs to be moved between the public and the subconverter
             * so that the conversion framework, which only sees the public
             * converter, can handle truncated and illegal input etc.
             */
            if(args->converter->toULength > 0) {
                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
            }
            subArgs.converter->toULength = args->converter->toULength;

            /*
             * Convert up to the end of the input, or to before the next escape character.
             * Does not handle conversion extensions because the preToU[] state etc.
             * is not copied.
             */
            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);

            if(args->offsets != NULL && sourceStart != args->source) {
                /* update offsets to base them on the actual start of the input */
                int32_t *offsets = args->offsets;
                UChar *target = args->target;
                int32_t delta = (int32_t)(args->source - sourceStart);
                while(target < subArgs.target) {
                    /* negative offsets are left unchanged */
                    if(*offsets >= 0) {
                        *offsets += delta;
                    }
                    ++offsets;
                    ++target;
                }
            }
            /* adopt the positions reached by the subconverter */
            args->source = subArgs.source;
            args->target = subArgs.target;
            args->offsets = subArgs.offsets;

            /* copy input/error/overflow buffers */
            if(subArgs.converter->toULength > 0) {
                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
            }
            args->converter->toULength = subArgs.converter->toULength;

            if(*err == U_BUFFER_OVERFLOW_ERROR) {
                if(subArgs.converter->UCharErrorBufferLength > 0) {
                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
                                subArgs.converter->UCharErrorBufferLength);
                }
                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
                subArgs.converter->UCharErrorBufferLength = 0;
            }
        }

        /* stop on any error or when all input has been consumed */
        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
            return;
        }

escape:
        changeState_2022(args->converter,
               &(args->source),
               args->sourceLimit,
               ISO_2022_KR,
               err);
    }
}
2599
2600static void U_CALLCONV
2601UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2602 UErrorCode* err){
2603 char tempBuf[2];
2604 const char *mySource = ( char *) args->source;
2605 UChar *myTarget = args->target;
2606 const char *mySourceLimit = args->sourceLimit;
2607 UChar32 targetUniChar = 0x0000;
2608 UChar mySourceChar = 0x0000;
2609 UConverterDataISO2022* myData;
2610 UConverterSharedData* sharedData ;
2611 UBool useFallback;
2612
2613 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2614 if(myData->version==1){
2615 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2616 return;
2617 }
2618
2619 /* initialize state */
2620 sharedData = myData->currentConverter->sharedData;
2621 useFallback = args->converter->useFallback;
2622
2623 if(myData->key != 0) {
2624 /* continue with a partial escape sequence */
2625 goto escape;
2626 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2627 /* continue with a partial double-byte character */
2628 mySourceChar = args->converter->toUBytes[0];
2629 args->converter->toULength = 0;
2630 goto getTrailByte;
2631 }
2632
2633 while(mySource< mySourceLimit){
2634
2635 if(myTarget < args->targetLimit){
2636
2637 mySourceChar= (unsigned char) *mySource++;
2638
2639 if(mySourceChar==UCNV_SI){
2640 myData->toU2022State.g = 0;
2641 if (myData->isEmptySegment) {
2642 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2643 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2644 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2645 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2646 args->converter->toULength = 1;
2647 args->target = myTarget;
2648 args->source = mySource;
2649 return;
2650 }
2651 /*consume the source */
2652 continue;
2653 }else if(mySourceChar==UCNV_SO){
2654 myData->toU2022State.g = 1;
2655 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2656 /*consume the source */
2657 continue;
2658 }else if(mySourceChar==ESC_2022){
2659 mySource--;
2660escape:
2661 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2662 changeState_2022(args->converter,&(mySource),
2663 mySourceLimit, ISO_2022_KR, err);
2664 if(U_FAILURE(*err)){
2665 args->target = myTarget;
2666 args->source = mySource;
2667 return;
2668 }
2669 continue;
2670 }
2671
2672 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2673 if(myData->toU2022State.g == 1) {
2674 if(mySource < mySourceLimit) {
2675 int leadIsOk, trailIsOk;
2676 uint8_t trailByte;
2677getTrailByte:
2678 targetUniChar = missingCharMarker;
2679 trailByte = (uint8_t)*mySource;
2680 /*
2681 * Ticket 5691: consistent illegal sequences:
2682 * - We include at least the first byte in the illegal sequence.
2683 * - If any of the non-initial bytes could be the start of a character,
2684 * we stop the illegal sequence before the first one of those.
2685 *
2686 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2687 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2688 * Otherwise we convert or report the pair of bytes.
2689 */
2690 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2691 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2692 if (leadIsOk && trailIsOk) {
2693 ++mySource;
2694 tempBuf[0] = (char)(mySourceChar + 0x80);
2695 tempBuf[1] = (char)(trailByte + 0x80);
2696 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2697 mySourceChar = (mySourceChar << 8) | trailByte;
2698 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2699 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2700 ++mySource;
2701 /* add another bit so that the code below writes 2 bytes in case of error */
2702 mySourceChar = static_cast<UChar>(0x10000 | (mySourceChar << 8) | trailByte);
2703 }
2704 } else {
2705 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2706 args->converter->toULength = 1;
2707 break;
2708 }
2709 }
2710 else if(mySourceChar <= 0x7f) {
2711 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2712 } else {
2713 targetUniChar = 0xffff;
2714 }
2715 if(targetUniChar < 0xfffe){
2716 if(args->offsets) {
2717 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2718 }
2719 *(myTarget++)=(UChar)targetUniChar;
2720 }
2721 else {
2722 /* Call the callback function*/
2723 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2724 break;
2725 }
2726 }
2727 else{
2728 *err =U_BUFFER_OVERFLOW_ERROR;
2729 break;
2730 }
2731 }
2732 args->target = myTarget;
2733 args->source = mySource;
2734}
2735
2736/*************************** END ISO2022-KR *********************************/
2737
2738/*************************** ISO-2022-CN *********************************
2739*
2740* Rules for ISO-2022-CN Encoding:
2741* i) The designator sequence must appear once on a line before any instance
2742* of character set it designates.
2743* ii) If two lines contain characters from the same character set, both lines
2744* must include the designator sequence.
2745* iii) Once the designator sequence is known, a shifting sequence has to be found
2746* to invoke the shifting
2747* iv) All lines start in ASCII and end in ASCII.
2748* v) Four shifting sequences are employed for this purpose:
2749*
*       Sequence   ASCII Eq    Charsets
2751* ---------- ------- ---------
2752* SI <SI> US-ASCII
2753* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2754* SS2 <ESC>N CNS-11643-1992 Plane 2
2755* SS3 <ESC>O CNS-11643-1992 Planes 3-7
2756*
2757* vi)
2758* SOdesignator : ESC "$" ")" finalchar_for_SO
2759* SS2designator : ESC "$" "*" finalchar_for_SS2
2760* SS3designator : ESC "$" "+" finalchar_for_SS3
2761*
2762* ESC $ ) A Indicates the bytes following SO are Chinese
2763* characters as defined in GB 2312-80, until
2764* another SOdesignation appears
2765*
2766*
2767* ESC $ ) E Indicates the bytes following SO are as defined
2768* in ISO-IR-165 (for details, see section 2.1),
2769* until another SOdesignation appears
2770*
2771* ESC $ ) G Indicates the bytes following SO are as defined
2772* in CNS 11643-plane-1, until another
2773* SOdesignation appears
2774*
*      ESC $ * H   Indicates the two bytes immediately following
*                  SS2 is a Chinese character as defined in CNS
*                  11643-plane-2, until another SS2designation
*                  appears
*                  (Meaning <ESC>N must precede every 2 byte
*                  sequence.)
2781*
*      ESC $ + I   Indicates the immediate two bytes following SS3
*                  is a Chinese character as defined in CNS
*                  11643-plane-3, until another SS3designation
*                  appears
*                  (Meaning <ESC>O must precede every 2 byte
*                  sequence.)
2788*
*      ESC $ + J   Indicates the immediate two bytes following SS3
*                  is a Chinese character as defined in CNS
*                  11643-plane-4, until another SS3designation
*                  appears
*                  (In English: <ESC>O must precede every 2 byte
*                  sequence.)
2795*
2796* ESC $ + K Indicates the immediate two bytes following SS3
2797* is a Chinese character as defined in CNS
2798* 11643-plane-5, until another SS3designation
2799* appears
2800*
2801* ESC $ + L Indicates the immediate two bytes following SS3
2802* is a Chinese character as defined in CNS
2803* 11643-plane-6, until another SS3designation
2804* appears
2805*
2806* ESC $ + M Indicates the immediate two bytes following SS3
2807* is a Chinese character as defined in CNS
2808* 11643-plane-7, until another SS3designation
2809* appears
2810*
2811* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2812* has its own designation information before any Chinese characters
2813* appear
2814*
2815*/
2816
/*
 * Four-byte designation sequences (ESC '$' intermediate final) for the
 * charsets used by ISO-2022-CN and ISO-2022-CN-EXT.
 * They are arrays rather than pointers so that the strings are truly read-only.
 * The ESC byte is kept as a separate literal so that the hex escape cannot
 * absorb a following character.
 */
static const char GB_2312_80_STR[]             = "\x1b" "$)A";  /* SO designator */
static const char ISO_IR_165_STR[]             = "\x1b" "$)E";  /* SO designator */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1b" "$)G";  /* SO designator */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1b" "$*H";  /* SS2 designator */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1b" "$+I";  /* SS3 designator */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1b" "$+J";  /* SS3 designator */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1b" "$+K";  /* SS3 designator */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1b" "$+L";  /* SS3 designator */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1b" "$+M";  /* SS3 designator */
2827
/********************** ISO2022-CN Data **************************/
/*
 * Designation sequences for the ISO-2022-CN fromUnicode path.
 * Entries 0..2 are indexed directly by the charset value; entries 3..9 hold
 * CNS 11643 planes 1..7 and are reached via
 * escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)].
 * The fromUnicode code copies exactly 4 bytes from the selected entry.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
};
2841
2842static void U_CALLCONV
2843UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2844 UConverter *cnv = args->converter;
2845 UConverterDataISO2022 *converterData;
2846 ISO2022State *pFromU2022State;
2847 uint8_t *target = (uint8_t *) args->target;
2848 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2849 const UChar* source = args->source;
2850 const UChar* sourceLimit = args->sourceLimit;
2851 int32_t* offsets = args->offsets;
2852 UChar32 sourceChar;
2853 char buffer[8];
2854 int32_t len;
2855 int8_t choices[3];
2856 int32_t choiceCount;
2857 uint32_t targetValue = 0;
2858 UBool useFallback;
2859
2860 /* set up the state */
2861 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2862 pFromU2022State = &converterData->fromU2022State;
2863
2864 choiceCount = 0;
2865
2866 /* check if the last codepoint of previous buffer was a lead surrogate*/
2867 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2868 goto getTrail;
2869 }
2870
2871 while( source < sourceLimit){
2872 if(target < targetLimit){
2873
2874 sourceChar = *(source++);
2875 /*check if the char is a First surrogate*/
2876 if(U16_IS_SURROGATE(sourceChar)) {
2877 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2878getTrail:
2879 /*look ahead to find the trail surrogate*/
2880 if(source < sourceLimit) {
2881 /* test the following code unit */
2882 UChar trail=(UChar) *source;
2883 if(U16_IS_TRAIL(trail)) {
2884 source++;
2885 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2886 cnv->fromUChar32=0x00;
2887 /* convert this supplementary code point */
2888 /* exit this condition tree */
2889 } else {
2890 /* this is an unmatched lead code unit (1st surrogate) */
2891 /* callback(illegal) */
2892 *err=U_ILLEGAL_CHAR_FOUND;
2893 cnv->fromUChar32=sourceChar;
2894 break;
2895 }
2896 } else {
2897 /* no more input */
2898 cnv->fromUChar32=sourceChar;
2899 break;
2900 }
2901 } else {
2902 /* this is an unmatched trail code unit (2nd surrogate) */
2903 /* callback(illegal) */
2904 *err=U_ILLEGAL_CHAR_FOUND;
2905 cnv->fromUChar32=sourceChar;
2906 break;
2907 }
2908 }
2909
2910 /* do the conversion */
2911 if(sourceChar <= 0x007f ){
2912 /* do not convert SO/SI/ESC */
2913 if(IS_2022_CONTROL(sourceChar)) {
2914 /* callback(illegal) */
2915 *err=U_ILLEGAL_CHAR_FOUND;
2916 cnv->fromUChar32=sourceChar;
2917 break;
2918 }
2919
2920 /* US-ASCII */
2921 if(pFromU2022State->g == 0) {
2922 buffer[0] = (char)sourceChar;
2923 len = 1;
2924 } else {
2925 buffer[0] = UCNV_SI;
2926 buffer[1] = (char)sourceChar;
2927 len = 2;
2928 pFromU2022State->g = 0;
2929 choiceCount = 0;
2930 }
2931 if(sourceChar == CR || sourceChar == LF) {
2932 /* reset the state at the end of a line */
2933 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2934 choiceCount = 0;
2935 }
2936 }
2937 else{
2938 /* convert U+0080..U+10ffff */
2939 int32_t i;
2940 int8_t cs, g;
2941
2942 if(choiceCount == 0) {
2943 /* try the current SO/G1 converter first */
2944 choices[0] = pFromU2022State->cs[1];
2945
2946 /* default to GB2312_1 if none is designated yet */
2947 if(choices[0] == 0) {
2948 choices[0] = GB2312_1;
2949 }
2950
2951 if(converterData->version == 0) {
2952 /* ISO-2022-CN */
2953
2954 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2955 if(choices[0] == GB2312_1) {
2956 choices[1] = (int8_t)CNS_11643_1;
2957 } else {
2958 choices[1] = (int8_t)GB2312_1;
2959 }
2960
2961 choiceCount = 2;
2962 } else if (converterData->version == 1) {
2963 /* ISO-2022-CN-EXT */
2964
2965 /* try one of the other converters */
2966 switch(choices[0]) {
2967 case GB2312_1:
2968 choices[1] = (int8_t)CNS_11643_1;
2969 choices[2] = (int8_t)ISO_IR_165;
2970 break;
2971 case ISO_IR_165:
2972 choices[1] = (int8_t)GB2312_1;
2973 choices[2] = (int8_t)CNS_11643_1;
2974 break;
2975 default: /* CNS_11643_x */
2976 choices[1] = (int8_t)GB2312_1;
2977 choices[2] = (int8_t)ISO_IR_165;
2978 break;
2979 }
2980
2981 choiceCount = 3;
2982 } else {
2983 choices[0] = (int8_t)CNS_11643_1;
2984 choices[1] = (int8_t)GB2312_1;
2985 }
2986 }
2987
2988 cs = g = 0;
2989 /*
2990 * len==0: no mapping found yet
2991 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2992 * len>0: found a roundtrip result, done
2993 */
2994 len = 0;
2995 /*
2996 * We will turn off useFallback after finding a fallback,
2997 * but we still get fallbacks from PUA code points as usual.
2998 * Therefore, we will also need to check that we don't overwrite
2999 * an early fallback with a later one.
3000 */
3001 useFallback = cnv->useFallback;
3002
3003 for(i = 0; i < choiceCount && len <= 0; ++i) {
3004 int8_t cs0 = choices[i];
3005 if(cs0 > 0) {
3006 uint32_t value;
3007 int32_t len2;
3008 if(cs0 >= CNS_11643_0) {
3009 len2 = MBCS_FROM_UCHAR32_ISO2022(
3010 converterData->myConverterArray[CNS_11643],
3011 sourceChar,
3012 &value,
3013 useFallback,
3014 MBCS_OUTPUT_3);
3015 if(len2 == 3 || (len2 == -3 && len == 0)) {
3016 targetValue = value;
3017 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3018 if(len2 >= 0) {
3019 len = 2;
3020 } else {
3021 len = -2;
3022 useFallback = FALSE;
3023 }
3024 if(cs == CNS_11643_1) {
3025 g = 1;
3026 } else if(cs == CNS_11643_2) {
3027 g = 2;
3028 } else /* plane 3..7 */ if(converterData->version == 1) {
3029 g = 3;
3030 } else {
3031 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3032 len = 0;
3033 }
3034 }
3035 } else {
3036 /* GB2312_1 or ISO-IR-165 */
3037 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3038 len2 = MBCS_FROM_UCHAR32_ISO2022(
3039 converterData->myConverterArray[cs0],
3040 sourceChar,
3041 &value,
3042 useFallback,
3043 MBCS_OUTPUT_2);
3044 if(len2 == 2 || (len2 == -2 && len == 0)) {
3045 targetValue = value;
3046 len = len2;
3047 cs = cs0;
3048 g = 1;
3049 useFallback = FALSE;
3050 }
3051 }
3052 }
3053 }
3054
3055 if(len != 0) {
3056 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3057
3058 /* write the designation sequence if necessary */
3059 if(cs != pFromU2022State->cs[g]) {
3060 if(cs < CNS_11643) {
3061 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3062 } else {
3063 U_ASSERT(cs >= CNS_11643_1);
3064 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3065 }
3066 len = 4;
3067 pFromU2022State->cs[g] = cs;
3068 if(g == 1) {
3069 /* changing the SO/G1 charset invalidates the choices[] */
3070 choiceCount = 0;
3071 }
3072 }
3073
3074 /* write the shift sequence if necessary */
3075 if(g != pFromU2022State->g) {
3076 switch(g) {
3077 case 1:
3078 buffer[len++] = UCNV_SO;
3079
3080 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3081 pFromU2022State->g = 1;
3082 break;
3083 case 2:
3084 buffer[len++] = 0x1b;
3085 buffer[len++] = 0x4e;
3086 break;
3087 default: /* case 3 */
3088 buffer[len++] = 0x1b;
3089 buffer[len++] = 0x4f;
3090 break;
3091 }
3092 }
3093
3094 /* write the two output bytes */
3095 buffer[len++] = (char)(targetValue >> 8);
3096 buffer[len++] = (char)targetValue;
3097 } else {
3098 /* if we cannot find the character after checking all codepages
3099 * then this is an error
3100 */
3101 *err = U_INVALID_CHAR_FOUND;
3102 cnv->fromUChar32=sourceChar;
3103 break;
3104 }
3105 }
3106
3107 /* output len>0 bytes in buffer[] */
3108 if(len == 1) {
3109 *target++ = buffer[0];
3110 if(offsets) {
3111 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3112 }
3113 } else if(len == 2 && (target + 2) <= targetLimit) {
3114 *target++ = buffer[0];
3115 *target++ = buffer[1];
3116 if(offsets) {
3117 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3118 *offsets++ = sourceIndex;
3119 *offsets++ = sourceIndex;
3120 }
3121 } else {
3122 fromUWriteUInt8(
3123 cnv,
3124 buffer, len,
3125 &target, (const char *)targetLimit,
3126 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3127 err);
3128 if(U_FAILURE(*err)) {
3129 break;
3130 }
3131 }
3132 } /* end if(myTargetIndex<myTargetLength) */
3133 else{
3134 *err =U_BUFFER_OVERFLOW_ERROR;
3135 break;
3136 }
3137
3138 }/* end while(mySourceIndex<mySourceLength) */
3139
3140 /*
3141 * the end of the input stream and detection of truncated input
3142 * are handled by the framework, but for ISO-2022-CN conversion
3143 * we need to be in ASCII mode at the very end
3144 *
3145 * conditions:
3146 * successful
3147 * not in ASCII mode
3148 * end of input and no truncated input
3149 */
3150 if( U_SUCCESS(*err) &&
3151 pFromU2022State->g!=0 &&
3152 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3153 ) {
3154 int32_t sourceIndex;
3155
3156 /* we are switching to ASCII */
3157 pFromU2022State->g=0;
3158
3159 /* get the source index of the last input character */
3160 /*
3161 * TODO this would be simpler and more reliable if we used a pair
3162 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3163 * so that we could simply use the prevSourceIndex here;
3164 * this code gives an incorrect result for the rare case of an unmatched
3165 * trail surrogate that is alone in the last buffer of the text stream
3166 */
3167 sourceIndex=(int32_t)(source-args->source);
3168 if(sourceIndex>0) {
3169 --sourceIndex;
3170 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3171 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3172 ) {
3173 --sourceIndex;
3174 }
3175 } else {
3176 sourceIndex=-1;
3177 }
3178
3179 fromUWriteUInt8(
3180 cnv,
3181 SHIFT_IN_STR, 1,
3182 &target, (const char *)targetLimit,
3183 &offsets, sourceIndex,
3184 err);
3185 }
3186
3187 /*save the state and return */
3188 args->source = source;
3189 args->target = (char*)target;
3190}
3191
3192
/*
 * Converts ISO-2022-CN family bytes from args->source into UTF-16 in
 * args->target, recording source offsets in args->offsets when it is set.
 *
 * The function can resume in the middle of an escape sequence
 * (myData->key != 0) or after the first byte of a double-byte character
 * (converter->toULength == 1).
 *
 * Control bytes:
 *   SI      switch to G0 (ASCII); an error if the preceding SO segment was empty
 *   SO      switch to G1; illegal when no G1 charset has been designated
 *   ESC     handed to changeState_2022() to parse the escape sequence
 *   CR, LF  reset the whole toUnicode state, then convert like any other byte
 *
 * Errors are reported through *err; args->source and args->target are
 * updated on every return path.
 */
static void U_CALLCONV
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    /* input for the sub-converter: optional plane byte + two data bytes */
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "unmappable" until a mapping is found */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* targetUniChar is still missingCharMarker, so the callback path below reports the SO byte */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* length of the reported sequence: bytes held before this call plus bytes consumed by it */
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
            case LF:
                /* a line end resets shift state and designations; the byte itself is then converted below */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                U_FALLTHROUGH;
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    /* shifted out or single-shifted: expect a double-byte character */
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        /* range checks via unsigned wrap-around: true only for 0x21..0x7e */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS planes share one table; prefix the pair with a plane byte */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            /* keep both bytes for offset calculation and error reporting */
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* input ends after the lead byte: save it and resume on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* G0: only 7-bit bytes map (to themselves); others stay unmappable */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}
3393#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3394
3395static void U_CALLCONV
3396_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3397 UConverter *cnv = args->converter;
3398 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3399 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3400 char *p, *subchar;
3401 char buffer[8];
3402 int32_t length;
3403
3404 subchar=(char *)cnv->subChars;
3405 length=cnv->subCharLen; /* assume length==1 for most variants */
3406
3407 p = buffer;
3408 switch(myConverterData->locale[0]){
3409 case 'j':
3410 {
3411 int8_t cs;
3412
3413 if(pFromU2022State->g == 1) {
3414 /* JIS7: switch from G1 to G0 */
3415 pFromU2022State->g = 0;
3416 *p++ = UCNV_SI;
3417 }
3418
3419 cs = pFromU2022State->cs[0];
3420 if(cs != ASCII && cs != JISX201) {
3421 /* not in ASCII or JIS X 0201: switch to ASCII */
3422 pFromU2022State->cs[0] = (int8_t)ASCII;
3423 *p++ = '\x1b';
3424 *p++ = '\x28';
3425 *p++ = '\x42';
3426 }
3427
3428 *p++ = subchar[0];
3429 break;
3430 }
3431 case 'c':
3432 if(pFromU2022State->g != 0) {
3433 /* not in ASCII mode: switch to ASCII */
3434 pFromU2022State->g = 0;
3435 *p++ = UCNV_SI;
3436 }
3437 *p++ = subchar[0];
3438 break;
3439 case 'k':
3440 if(myConverterData->version == 0) {
3441 if(length == 1) {
3442 if(args->converter->fromUnicodeStatus) {
3443 /* in DBCS mode: switch to SBCS */
3444 args->converter->fromUnicodeStatus = 0;
3445 *p++ = UCNV_SI;
3446 }
3447 *p++ = subchar[0];
3448 } else /* length == 2*/ {
3449 if(!args->converter->fromUnicodeStatus) {
3450 /* in SBCS mode: switch to DBCS */
3451 args->converter->fromUnicodeStatus = 1;
3452 *p++ = UCNV_SO;
3453 }
3454 *p++ = subchar[0];
3455 *p++ = subchar[1];
3456 }
3457 break;
3458 } else {
3459 /* save the subconverter's substitution string */
3460 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3461 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3462
3463 /* set our substitution string into the subconverter */
3464 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3465 myConverterData->currentConverter->subCharLen = (int8_t)length;
3466
3467 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3468 args->converter = myConverterData->currentConverter;
3469 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3470 ucnv_cbFromUWriteSub(args, 0, err);
3471 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3472 args->converter = cnv;
3473
3474 /* restore the subconverter's substitution string */
3475 myConverterData->currentConverter->subChars = currentSubChars;
3476 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3477
3478 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3479 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3480 uprv_memcpy(
3481 cnv->charErrorBuffer,
3482 myConverterData->currentConverter->charErrorBuffer,
3483 myConverterData->currentConverter->charErrorBufferLength);
3484 }
3485 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3486 myConverterData->currentConverter->charErrorBufferLength = 0;
3487 }
3488 return;
3489 }
3490 default:
3491 /* not expected */
3492 break;
3493 }
3494 ucnv_cbFromUWriteBytes(args,
3495 buffer, (int32_t)(p - buffer),
3496 offsetIndex, err);
3497}
3498
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * _ISO_2022_SafeClone() casts the caller's buffer to this layout and reports
 * sizeof(struct cloneStruct) as the required size when preflighting.
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned main converter; its address is what SafeClone returns */
    UConverter currentConverter;    /* storage for the clone of the source's currentConverter, if any */
    UConverterDataISO2022 mydata;   /* copy of the source's extra data; cnv.extraInfo points here */
};
3508
3509
3510U_CDECL_BEGIN
3511
3512static UConverter * U_CALLCONV
3513_ISO_2022_SafeClone(
3514 const UConverter *cnv,
3515 void *stackBuffer,
3516 int32_t *pBufferSize,
3517 UErrorCode *status)
3518{
3519 struct cloneStruct * localClone;
3520 UConverterDataISO2022 *cnvData;
3521 int32_t i, size;
3522
3523 if (U_FAILURE(*status)){
3524 return nullptr;
3525 }
3526
3527 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3528 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3529 return NULL;
3530 }
3531
3532 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3533 localClone = (struct cloneStruct *)stackBuffer;
3534
3535 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3536
3537 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3538 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3539 localClone->cnv.isExtraLocal = TRUE;
3540
3541 /* share the subconverters */
3542
3543 if(cnvData->currentConverter != NULL) {
3544 size = (int32_t)sizeof(UConverter);
3545 localClone->mydata.currentConverter =
3546 ucnv_safeClone(cnvData->currentConverter,
3547 &localClone->currentConverter,
3548 &size, status);
3549 if(U_FAILURE(*status)) {
3550 return NULL;
3551 }
3552 }
3553
3554 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3555 if(cnvData->myConverterArray[i] != NULL) {
3556 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3557 }
3558 }
3559
3560 return &localClone->cnv;
3561}
3562
3563U_CDECL_END
3564
/*
 * Adds to the set behind sa the Unicode code points that this ISO 2022
 * converter can convert.
 *
 * cnv         the converter whose repertoire is queried
 * sa          set adder; code points are added/removed through its callbacks
 * which       which set is requested (for example roundtrip-only versus
 *             roundtrip and fallback)
 * pErrorCode  in/out error code; nothing is done if it already indicates failure
 *
 * The set is built in three steps: hardcoded code points for the locale,
 * then the (filtered) sets of all loaded sub-converters, and finally the
 * removal of SO/SI/ESC and the C1 controls.
 */
static void U_CALLCONV
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
                    const USetAdder *sa,
                    UConverterUnicodeSet which,
                    UErrorCode *pErrorCode)
{
    int32_t i;
    UConverterDataISO2022* cnvData;

    if (U_FAILURE(*pErrorCode)) {
        return;
    }
#ifdef U_ENABLE_GENERIC_ISO_2022
    if (cnv->sharedData == &_ISO2022Data) {
        /* We use UTF-8 in this case */
        /* all code points except the surrogates */
        sa->addRange(sa->set, 0, 0xd7FF);
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
        return;
    }
#endif

    cnvData = (UConverterDataISO2022*)cnv->extraInfo;

    /* open a set and initialize it with code points that are algorithmically round-tripped */
    switch(cnvData->locale[0]){
    case 'j':
        /* include JIS X 0201 which is hardcoded */
        sa->add(sa->set, 0xa5);
        sa->add(sa->set, 0x203e);
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
            /* include Latin-1 for some variants of JP */
            sa->addRange(sa->set, 0, 0xff);
        } else {
            /* include ASCII for JP */
            sa->addRange(sa->set, 0, 0x7f);
        }
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
            /*
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
             * use half-width Katakana.
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
             * half-width Katakana via the ESC ( I sequence.
             * However, we only emit (fromUnicode) half-width Katakana according to the
             * definition of each variant.
             *
             * When including fallbacks,
             * we need to include half-width Katakana Unicode code points for all JP variants because
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
             */
            /* include half-width Katakana for JP */
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
        }
        break;
#if !UCONFIG_ONLY_HTML_CONVERSION
    case 'c':
    case 'z':
        /* include ASCII for CN */
        sa->addRange(sa->set, 0, 0x7f);
        break;
    case 'k':
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
                cnvData->currentConverter, sa, which, pErrorCode);
        /* the loop over myConverterArray[] will simply not find another converter */
        break;
#endif
    default:
        break;
    }

#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                cnvData->version==0 && i==CNS_11643
            ) {
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
                ucnv_MBCSGetUnicodeSetForBytes(
                        cnvData->myConverterArray[i],
                        sa, UCNV_ROUNDTRIP_SET,
                        0, 0x81, 0x82,
                        pErrorCode);
            }
#endif

    /* add the repertoire of every loaded sub-converter, narrowed by a filter where needed */
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
        UConverterSetFilter filter;
        if(cnvData->myConverterArray[i]!=NULL) {
            if(cnvData->locale[0]=='j' && i==JISX208) {
                /*
                 * Only add code points that map to Shift-JIS codes
                 * corresponding to JIS X 0208.
                 */
                filter=UCNV_SET_FILTER_SJIS;
#if !UCONFIG_ONLY_HTML_CONVERSION
            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                       cnvData->version==0 && i==CNS_11643) {
                /*
                 * Version-specific for CN:
                 * CN version 0 does not map CNS planes 3..7 although
                 * they are all available in the CNS conversion table;
                 * CN version 1 (-EXT) does map them all.
                 * The two versions create different Unicode sets.
                 */
                filter=UCNV_SET_FILTER_2022_CN;
            } else if(i==KSC5601) {
                /*
                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
                 * are broader than GR94.
                 */
                filter=UCNV_SET_FILTER_GR94DBCS;
#endif
            } else {
                filter=UCNV_SET_FILTER_NONE;
            }
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
        }
    }

    /*
     * ISO 2022 converters must not convert SO/SI/ESC despite what
     * sub-converters do by themselves.
     * Remove these characters from the set.
     */
    sa->remove(sa->set, 0x0e);
    sa->remove(sa->set, 0x0f);
    sa->remove(sa->set, 0x1b);

    /* ISO 2022 converters do not convert C1 controls either */
    sa->removeRange(sa->set, 0x80, 0x9f);
}
3695
/*
 * Implementation table for the generic ISO_2022 converter.
 * The four conversion entry points are only filled in when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * NOTE(review): the slot order follows the UConverterImpl declaration, which
 * is not visible in this file — confirm the slot names there.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    /* toUnicode (twice), then fromUnicode delegated to the UTF-8 implementation */
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the generic ISO_2022 converter.
 * NOTE(review): the initializers are positional; apart from the commented
 * ones, field meanings must be read off the UConverterStaticData declaration
 * (presumably name, codepage, platform, conversion type, min/max bytes per
 * char, substitution bytes and their length, ... — confirm).
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022",
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data object for the generic ISO_2022 converter, combining its static
 * data and implementation table. _ISO_2022_GetUnicodeSet() compares
 * cnv->sharedData against its address when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
const UConverterSharedData _ISO2022Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3746
3747/*************JP****************/
/*
 * Implementation table for the ISO-2022-JP converters.
 * The toUnicode and fromUnicode functions each fill two consecutive slots;
 * presumably these are the plain and the with-offsets entry points sharing
 * one implementation — confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the ISO_2022_JP converter.
 * NOTE(review): positional initializers; see the UConverterStaticData
 * declaration for the meaning of the uncommented fields.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_JP",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3789
namespace {

/* Shared data object for ISO-2022-JP: pairs its static data with its implementation table. */
const UConverterSharedData _ISO2022JPData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);

}  // namespace
3796
3797#if !UCONFIG_ONLY_HTML_CONVERSION
3798/************* KR ***************/
/*
 * Implementation table for the ISO-2022-KR converters.
 * The toUnicode and fromUnicode functions each fill two consecutive slots;
 * presumably these are the plain and the with-offsets entry points sharing
 * one implementation — confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the ISO_2022_KR converter.
 * NOTE(review): positional initializers; see the UConverterStaticData
 * declaration for the meaning of the uncommented fields.
 */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_KR",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3840
namespace {

/* Shared data object for ISO-2022-KR: pairs its static data with its implementation table. */
const UConverterSharedData _ISO2022KRData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);

}  // namespace
3847
3848/*************** CN ***************/
/*
 * Implementation table for the ISO-2022-CN converters.
 * The toUnicode and fromUnicode functions each fill two consecutive slots;
 * presumably these are the plain and the with-offsets entry points sharing
 * one implementation — confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the ISO_2022_CN converter.
 * The maximum of 8 bytes per UChar matches the size of the output buffer in
 * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC().
 * NOTE(review): positional initializers; see the UConverterStaticData
 * declaration for the meaning of the uncommented fields.
 */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_CN",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3891
namespace {

/* Shared data object for ISO-2022-CN: pairs its static data with its implementation table. */
const UConverterSharedData _ISO2022CNData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);

}  // namespace
3898#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3899
3900#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3901