1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 *****************************************************************************
5 *
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *****************************************************************************
10 *
11 * ucnv_err.c
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13 *
14 *
15* Change history:
16*
17* 06/29/2000 helena Major rewrite of the callback APIs.
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_CONVERSION
23
24#include "unicode/ucnv_err.h"
25#include "unicode/ucnv_cb.h"
26#include "ucnv_cnv.h"
27#include "cmemory.h"
28#include "unicode/ucnv.h"
29#include "ustrfmt.h"
30
/*
 * Capacity, in UTF-16 code units, of the scratch buffers that hold an escape
 * sequence before it is handed to the converter.
 * 48 = 6 (longest escape written per input unit in this file, e.g. "&#255;")
 *      * 8 (max number of bytes per char for any converter)
 */
#define VALUE_STRING_LENGTH 48

/* ASCII characters used to build the escape sequences. */
#define UNICODE_SPACE_CODEPOINT        0x0020   /* ' '  */
#define UNICODE_HASH_CODEPOINT         0x0023   /* '#'  */
#define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025   /* '%'  */
#define UNICODE_AMP_CODEPOINT          0x0026   /* '&'  */
#define UNICODE_PLUS_CODEPOINT         0x002B   /* '+'  */
#define UNICODE_SEMICOLON_CODEPOINT    0x003B   /* ';'  */
#define UNICODE_U_CODEPOINT            0x0055   /* 'U'  */
#define UNICODE_X_CODEPOINT            0x0058   /* 'X'  */
#define UNICODE_RS_CODEPOINT           0x005C   /* '\\' (reverse solidus) */
#define UNICODE_U_LOW_CODEPOINT        0x0075   /* 'u'  */
#define UNICODE_X_LOW_CODEPOINT        0x0078   /* 'x'  */
#define UNICODE_LEFT_CURLY_CODEPOINT   0x007B   /* '{'  */
#define UNICODE_RIGHT_CURLY_CODEPOINT  0x007D   /* '}'  */

/*
 * Option characters: the callbacks below compare the first char of their
 * "context" argument against these values to select a behaviour.
 */
#define UCNV_PRV_ESCAPE_ICU      0
#define UCNV_PRV_ESCAPE_C        'C'
#define UCNV_PRV_ESCAPE_XML_DEC  'D'
#define UCNV_PRV_ESCAPE_XML_HEX  'X'
#define UCNV_PRV_ESCAPE_JAVA     'J'
#define UCNV_PRV_ESCAPE_UNICODE  'U'
#define UCNV_PRV_ESCAPE_CSS2     'S'
#define UCNV_PRV_STOP_ON_ILLEGAL 'i'
54
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Checks whether a code point has the Default_Ignorable_Code_Point Unicode
 * property. The list is hard-coded here to avoid a dependency on other code,
 * so it must be updated whenever the set of default ignorable code points
 * changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks ignore it.
 * For a list of the default ignorable code points, use this link:
 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * The argument is fully parenthesized, so any expression may be passed, but
 * it is evaluated more than once: do not pass an expression with side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180F) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF))
87
88
89/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
90U_CAPI void U_EXPORT2
91UCNV_FROM_U_CALLBACK_STOP (
92 const void *context,
93 UConverterFromUnicodeArgs *fromUArgs,
94 const char16_t* codeUnits,
95 int32_t length,
96 UChar32 codePoint,
97 UConverterCallbackReason reason,
98 UErrorCode * err)
99{
100 (void)context;
101 (void)fromUArgs;
102 (void)codeUnits;
103 (void)length;
104 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
105 {
106 /*
107 * Skip if the codepoint has unicode property of default ignorable.
108 */
109 *err = U_ZERO_ERROR;
110 }
111 /* the caller must have set the error code accordingly */
112 return;
113}
114
115
116/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
117U_CAPI void U_EXPORT2
118UCNV_TO_U_CALLBACK_STOP (
119 const void *context,
120 UConverterToUnicodeArgs *toUArgs,
121 const char* codePoints,
122 int32_t length,
123 UConverterCallbackReason reason,
124 UErrorCode * err)
125{
126 /* the caller must have set the error code accordingly */
127 (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
128 return;
129}
130
131U_CAPI void U_EXPORT2
132UCNV_FROM_U_CALLBACK_SKIP (
133 const void *context,
134 UConverterFromUnicodeArgs *fromUArgs,
135 const char16_t* codeUnits,
136 int32_t length,
137 UChar32 codePoint,
138 UConverterCallbackReason reason,
139 UErrorCode * err)
140{
141 (void)fromUArgs;
142 (void)codeUnits;
143 (void)length;
144 if (reason <= UCNV_IRREGULAR)
145 {
146 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
147 {
148 /*
149 * Skip if the codepoint has unicode property of default ignorable.
150 */
151 *err = U_ZERO_ERROR;
152 }
153 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
154 {
155 *err = U_ZERO_ERROR;
156 }
157 /* else the caller must have set the error code accordingly. */
158 }
159 /* else ignore the reset, close and clone calls. */
160}
161
162U_CAPI void U_EXPORT2
163UCNV_FROM_U_CALLBACK_SUBSTITUTE (
164 const void *context,
165 UConverterFromUnicodeArgs *fromArgs,
166 const char16_t* codeUnits,
167 int32_t length,
168 UChar32 codePoint,
169 UConverterCallbackReason reason,
170 UErrorCode * err)
171{
172 (void)codeUnits;
173 (void)length;
174 if (reason <= UCNV_IRREGULAR)
175 {
176 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
177 {
178 /*
179 * Skip if the codepoint has unicode property of default ignorable.
180 */
181 *err = U_ZERO_ERROR;
182 }
183 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
184 {
185 *err = U_ZERO_ERROR;
186 ucnv_cbFromUWriteSub(fromArgs, 0, err);
187 }
188 /* else the caller must have set the error code accordingly. */
189 }
190 /* else ignore the reset, close and clone calls. */
191}
192
193/*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
194 *uses a clean copy (resetted) of the converter, to convert that unicode
195 *escape sequence to the target codepage (if conversion failure happens then
196 *we revert to substituting with subchar)
197 */
198U_CAPI void U_EXPORT2
199UCNV_FROM_U_CALLBACK_ESCAPE (
200 const void *context,
201 UConverterFromUnicodeArgs *fromArgs,
202 const char16_t *codeUnits,
203 int32_t length,
204 UChar32 codePoint,
205 UConverterCallbackReason reason,
206 UErrorCode * err)
207{
208
209 char16_t valueString[VALUE_STRING_LENGTH];
210 int32_t valueStringLength = 0;
211 int32_t i = 0;
212
213 const char16_t *myValueSource = nullptr;
214 UErrorCode err2 = U_ZERO_ERROR;
215 UConverterFromUCallback original = nullptr;
216 const void *originalContext;
217
218 UConverterFromUCallback ignoredCallback = nullptr;
219 const void *ignoredContext;
220
221 if (reason > UCNV_IRREGULAR)
222 {
223 return;
224 }
225 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
226 {
227 /*
228 * Skip if the codepoint has unicode property of default ignorable.
229 */
230 *err = U_ZERO_ERROR;
231 return;
232 }
233
234 ucnv_setFromUCallBack (fromArgs->converter,
235 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
236 nullptr,
237 &original,
238 &originalContext,
239 &err2);
240
241 if (U_FAILURE (err2))
242 {
243 *err = err2;
244 return;
245 }
246 if(context==nullptr)
247 {
248 while (i < length)
249 {
250 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
251 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
252 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
253 }
254 }
255 else
256 {
257 switch(*((char*)context))
258 {
259 case UCNV_PRV_ESCAPE_JAVA:
260 while (i < length)
261 {
262 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
263 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
264 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
265 }
266 break;
267
268 case UCNV_PRV_ESCAPE_C:
269 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
270
271 if(length==2){
272 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
273 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
274
275 }
276 else{
277 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
278 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
279 }
280 break;
281
282 case UCNV_PRV_ESCAPE_XML_DEC:
283
284 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
285 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
286 if(length==2){
287 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
288 }
289 else{
290 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
291 }
292 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
293 break;
294
295 case UCNV_PRV_ESCAPE_XML_HEX:
296
297 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
298 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
299 valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
300 if(length==2){
301 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
302 }
303 else{
304 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
305 }
306 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
307 break;
308
309 case UCNV_PRV_ESCAPE_UNICODE:
310 valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
311 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
312 valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
313 if (length == 2) {
314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
315 } else {
316 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
317 }
318 valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
319 break;
320
321 case UCNV_PRV_ESCAPE_CSS2:
322 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
323 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
324 /* Always add space character, because the next character might be whitespace,
325 which would erroneously be considered the termination of the escape sequence. */
326 valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
327 break;
328
329 default:
330 while (i < length)
331 {
332 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
333 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
334 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
335 }
336 }
337 }
338 myValueSource = valueString;
339
340 /* reset the error */
341 *err = U_ZERO_ERROR;
342
343 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
344
345 ucnv_setFromUCallBack (fromArgs->converter,
346 original,
347 originalContext,
348 &ignoredCallback,
349 &ignoredContext,
350 &err2);
351 if (U_FAILURE (err2))
352 {
353 *err = err2;
354 return;
355 }
356
357 return;
358}
359
360
361
362U_CAPI void U_EXPORT2
363UCNV_TO_U_CALLBACK_SKIP (
364 const void *context,
365 UConverterToUnicodeArgs *toArgs,
366 const char* codeUnits,
367 int32_t length,
368 UConverterCallbackReason reason,
369 UErrorCode * err)
370{
371 (void)toArgs;
372 (void)codeUnits;
373 (void)length;
374 if (reason <= UCNV_IRREGULAR)
375 {
376 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
377 {
378 *err = U_ZERO_ERROR;
379 }
380 /* else the caller must have set the error code accordingly. */
381 }
382 /* else ignore the reset, close and clone calls. */
383}
384
385U_CAPI void U_EXPORT2
386UCNV_TO_U_CALLBACK_SUBSTITUTE (
387 const void *context,
388 UConverterToUnicodeArgs *toArgs,
389 const char* codeUnits,
390 int32_t length,
391 UConverterCallbackReason reason,
392 UErrorCode * err)
393{
394 (void)codeUnits;
395 (void)length;
396 if (reason <= UCNV_IRREGULAR)
397 {
398 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
399 {
400 *err = U_ZERO_ERROR;
401 ucnv_cbToUWriteSub(toArgs,0,err);
402 }
403 /* else the caller must have set the error code accordingly. */
404 }
405 /* else ignore the reset, close and clone calls. */
406}
407
408/*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
409 *and uses that as the substitution sequence
410 */
411U_CAPI void U_EXPORT2
412UCNV_TO_U_CALLBACK_ESCAPE (
413 const void *context,
414 UConverterToUnicodeArgs *toArgs,
415 const char* codeUnits,
416 int32_t length,
417 UConverterCallbackReason reason,
418 UErrorCode * err)
419{
420 char16_t uniValueString[VALUE_STRING_LENGTH];
421 int32_t valueStringLength = 0;
422 int32_t i = 0;
423
424 if (reason > UCNV_IRREGULAR)
425 {
426 return;
427 }
428
429 if(context==nullptr)
430 {
431 while (i < length)
432 {
433 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
434 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
435 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
436 }
437 }
438 else
439 {
440 switch(*((char*)context))
441 {
442 case UCNV_PRV_ESCAPE_XML_DEC:
443 while (i < length)
444 {
445 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
446 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
447 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
448 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
449 }
450 break;
451
452 case UCNV_PRV_ESCAPE_XML_HEX:
453 while (i < length)
454 {
455 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
456 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
457 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
458 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
459 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
460 }
461 break;
462 case UCNV_PRV_ESCAPE_C:
463 while (i < length)
464 {
465 uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
466 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
467 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
468 }
469 break;
470 default:
471 while (i < length)
472 {
473 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
474 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
475 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
476 valueStringLength += 2;
477 }
478 }
479 }
480 /* reset the error */
481 *err = U_ZERO_ERROR;
482
483 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
484}
485
486#endif
487