1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ***************************************************************************** |
5 | * |
6 | * Copyright (C) 1998-2016, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ***************************************************************************** |
10 | * |
11 | * ucnv_err.c |
12 | * Implements error behaviour functions called by T_UConverter_{from,to}Unicode |
13 | * |
14 | * |
15 | * Change history: |
16 | * |
17 | * 06/29/2000 helena Major rewrite of the callback APIs. |
18 | */ |
19 | |
20 | #include "unicode/utypes.h" |
21 | |
22 | #if !UCONFIG_NO_CONVERSION |
23 | |
24 | #include "unicode/ucnv_err.h" |
25 | #include "unicode/ucnv_cb.h" |
26 | #include "ucnv_cnv.h" |
27 | #include "cmemory.h" |
28 | #include "unicode/ucnv.h" |
29 | #include "ustrfmt.h" |
30 | |
/*
 * Capacity, in char16_t units, of the local buffers in which the escape
 * callbacks below assemble their output.
 * 48 = 6 (longest escape emitted per code unit or byte, e.g. "%U" plus 4 hex
 *         digits, or "&#255;")
 *    * 8 (max number of bytes per char for any converter)
 */
#define VALUE_STRING_LENGTH 48
/* UTF-16 code units of the ASCII characters used to build escape sequences. */
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025
#define UNICODE_U_CODEPOINT             0x0055
#define UNICODE_X_CODEPOINT             0x0058
#define UNICODE_RS_CODEPOINT            0x005C
#define UNICODE_U_LOW_CODEPOINT         0x0075
#define UNICODE_X_LOW_CODEPOINT         0x0078
#define UNICODE_AMP_CODEPOINT           0x0026
#define UNICODE_HASH_CODEPOINT          0x0023
#define UNICODE_SEMICOLON_CODEPOINT     0x003B
#define UNICODE_PLUS_CODEPOINT          0x002B
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D
#define UNICODE_SPACE_CODEPOINT         0x0020
/*
 * Option characters: the callbacks below compare the first char that their
 * context argument points to against these values.
 * The UCNV_PRV_ESCAPE_* values select the escape format in the ESCAPE callbacks.
 * UCNV_PRV_ESCAPE_ICU is not referenced by name in the visible code; a null or
 * unrecognized context falls through to the %U / %X format there.
 */
#define UCNV_PRV_ESCAPE_ICU         0
#define UCNV_PRV_ESCAPE_C           'C'
#define UCNV_PRV_ESCAPE_XML_DEC     'D'
#define UCNV_PRV_ESCAPE_XML_HEX     'X'
#define UCNV_PRV_ESCAPE_JAVA        'J'
#define UCNV_PRV_ESCAPE_UNICODE     'U'
#define UCNV_PRV_ESCAPE_CSS2        'S'
/* With this option the SKIP and SUBSTITUTE callbacks act only on UCNV_UNASSIGNED. */
#define UCNV_PRV_STOP_ON_ILLEGAL    'i'
54 | |
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Checks whether a code point has the Default_Ignorable_Code_Point Unicode
 * property. The list is hard coded here to avoid a dependency on other code,
 * so it must be updated whenever the set of default ignorable code points
 * changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks ignore it.
 * For a list of the default ignorable code points, use this link:
 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * The argument is parenthesized at every use so that expressions containing
 * low-precedence operators (for example a | b) are tested as a whole.
 * Note that the argument is evaluated more than once: do not pass an
 * expression with side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180F) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF))
87 | |
88 | |
89 | /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ |
90 | U_CAPI void U_EXPORT2 |
91 | UCNV_FROM_U_CALLBACK_STOP ( |
92 | const void *context, |
93 | UConverterFromUnicodeArgs *fromUArgs, |
94 | const char16_t* codeUnits, |
95 | int32_t length, |
96 | UChar32 codePoint, |
97 | UConverterCallbackReason reason, |
98 | UErrorCode * err) |
99 | { |
100 | (void)context; |
101 | (void)fromUArgs; |
102 | (void)codeUnits; |
103 | (void)length; |
104 | if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) |
105 | { |
106 | /* |
107 | * Skip if the codepoint has unicode property of default ignorable. |
108 | */ |
109 | *err = U_ZERO_ERROR; |
110 | } |
111 | /* the caller must have set the error code accordingly */ |
112 | return; |
113 | } |
114 | |
115 | |
116 | /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ |
117 | U_CAPI void U_EXPORT2 |
118 | UCNV_TO_U_CALLBACK_STOP ( |
119 | const void *context, |
120 | UConverterToUnicodeArgs *toUArgs, |
121 | const char* codePoints, |
122 | int32_t length, |
123 | UConverterCallbackReason reason, |
124 | UErrorCode * err) |
125 | { |
126 | /* the caller must have set the error code accordingly */ |
127 | (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err; |
128 | return; |
129 | } |
130 | |
131 | U_CAPI void U_EXPORT2 |
132 | UCNV_FROM_U_CALLBACK_SKIP ( |
133 | const void *context, |
134 | UConverterFromUnicodeArgs *fromUArgs, |
135 | const char16_t* codeUnits, |
136 | int32_t length, |
137 | UChar32 codePoint, |
138 | UConverterCallbackReason reason, |
139 | UErrorCode * err) |
140 | { |
141 | (void)fromUArgs; |
142 | (void)codeUnits; |
143 | (void)length; |
144 | if (reason <= UCNV_IRREGULAR) |
145 | { |
146 | if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) |
147 | { |
148 | /* |
149 | * Skip if the codepoint has unicode property of default ignorable. |
150 | */ |
151 | *err = U_ZERO_ERROR; |
152 | } |
153 | else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) |
154 | { |
155 | *err = U_ZERO_ERROR; |
156 | } |
157 | /* else the caller must have set the error code accordingly. */ |
158 | } |
159 | /* else ignore the reset, close and clone calls. */ |
160 | } |
161 | |
162 | U_CAPI void U_EXPORT2 |
163 | UCNV_FROM_U_CALLBACK_SUBSTITUTE ( |
164 | const void *context, |
165 | UConverterFromUnicodeArgs *fromArgs, |
166 | const char16_t* codeUnits, |
167 | int32_t length, |
168 | UChar32 codePoint, |
169 | UConverterCallbackReason reason, |
170 | UErrorCode * err) |
171 | { |
172 | (void)codeUnits; |
173 | (void)length; |
174 | if (reason <= UCNV_IRREGULAR) |
175 | { |
176 | if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) |
177 | { |
178 | /* |
179 | * Skip if the codepoint has unicode property of default ignorable. |
180 | */ |
181 | *err = U_ZERO_ERROR; |
182 | } |
183 | else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) |
184 | { |
185 | *err = U_ZERO_ERROR; |
186 | ucnv_cbFromUWriteSub(fromArgs, 0, err); |
187 | } |
188 | /* else the caller must have set the error code accordingly. */ |
189 | } |
190 | /* else ignore the reset, close and clone calls. */ |
191 | } |
192 | |
193 | /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, |
194 | *uses a clean copy (resetted) of the converter, to convert that unicode |
195 | *escape sequence to the target codepage (if conversion failure happens then |
196 | *we revert to substituting with subchar) |
197 | */ |
198 | U_CAPI void U_EXPORT2 |
199 | UCNV_FROM_U_CALLBACK_ESCAPE ( |
200 | const void *context, |
201 | UConverterFromUnicodeArgs *fromArgs, |
202 | const char16_t *codeUnits, |
203 | int32_t length, |
204 | UChar32 codePoint, |
205 | UConverterCallbackReason reason, |
206 | UErrorCode * err) |
207 | { |
208 | |
209 | char16_t valueString[VALUE_STRING_LENGTH]; |
210 | int32_t valueStringLength = 0; |
211 | int32_t i = 0; |
212 | |
213 | const char16_t *myValueSource = nullptr; |
214 | UErrorCode err2 = U_ZERO_ERROR; |
215 | UConverterFromUCallback original = nullptr; |
216 | const void *originalContext; |
217 | |
218 | UConverterFromUCallback ignoredCallback = nullptr; |
219 | const void *ignoredContext; |
220 | |
221 | if (reason > UCNV_IRREGULAR) |
222 | { |
223 | return; |
224 | } |
225 | else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) |
226 | { |
227 | /* |
228 | * Skip if the codepoint has unicode property of default ignorable. |
229 | */ |
230 | *err = U_ZERO_ERROR; |
231 | return; |
232 | } |
233 | |
234 | ucnv_setFromUCallBack (fromArgs->converter, |
235 | (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE, |
236 | nullptr, |
237 | &original, |
238 | &originalContext, |
239 | &err2); |
240 | |
241 | if (U_FAILURE (err2)) |
242 | { |
243 | *err = err2; |
244 | return; |
245 | } |
246 | if(context==nullptr) |
247 | { |
248 | while (i < length) |
249 | { |
250 | valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ |
251 | valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ |
252 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); |
253 | } |
254 | } |
255 | else |
256 | { |
257 | switch(*((char*)context)) |
258 | { |
259 | case UCNV_PRV_ESCAPE_JAVA: |
260 | while (i < length) |
261 | { |
262 | valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ |
263 | valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */ |
264 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); |
265 | } |
266 | break; |
267 | |
268 | case UCNV_PRV_ESCAPE_C: |
269 | valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ |
270 | |
271 | if(length==2){ |
272 | valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ |
273 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8); |
274 | |
275 | } |
276 | else{ |
277 | valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */ |
278 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); |
279 | } |
280 | break; |
281 | |
282 | case UCNV_PRV_ESCAPE_XML_DEC: |
283 | |
284 | valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ |
285 | valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ |
286 | if(length==2){ |
287 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0); |
288 | } |
289 | else{ |
290 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0); |
291 | } |
292 | valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ |
293 | break; |
294 | |
295 | case UCNV_PRV_ESCAPE_XML_HEX: |
296 | |
297 | valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ |
298 | valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ |
299 | valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ |
300 | if(length==2){ |
301 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); |
302 | } |
303 | else{ |
304 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0); |
305 | } |
306 | valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ |
307 | break; |
308 | |
309 | case UCNV_PRV_ESCAPE_UNICODE: |
310 | valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ |
311 | valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ |
312 | valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */ |
313 | if (length == 2) { |
314 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4); |
315 | } else { |
316 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); |
317 | } |
318 | valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ |
319 | break; |
320 | |
321 | case UCNV_PRV_ESCAPE_CSS2: |
322 | valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ |
323 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); |
324 | /* Always add space character, because the next character might be whitespace, |
325 | which would erroneously be considered the termination of the escape sequence. */ |
326 | valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT; |
327 | break; |
328 | |
329 | default: |
330 | while (i < length) |
331 | { |
332 | valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ |
333 | valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ |
334 | valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); |
335 | } |
336 | } |
337 | } |
338 | myValueSource = valueString; |
339 | |
340 | /* reset the error */ |
341 | *err = U_ZERO_ERROR; |
342 | |
343 | ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err); |
344 | |
345 | ucnv_setFromUCallBack (fromArgs->converter, |
346 | original, |
347 | originalContext, |
348 | &ignoredCallback, |
349 | &ignoredContext, |
350 | &err2); |
351 | if (U_FAILURE (err2)) |
352 | { |
353 | *err = err2; |
354 | return; |
355 | } |
356 | |
357 | return; |
358 | } |
359 | |
360 | |
361 | |
362 | U_CAPI void U_EXPORT2 |
363 | UCNV_TO_U_CALLBACK_SKIP ( |
364 | const void *context, |
365 | UConverterToUnicodeArgs *toArgs, |
366 | const char* codeUnits, |
367 | int32_t length, |
368 | UConverterCallbackReason reason, |
369 | UErrorCode * err) |
370 | { |
371 | (void)toArgs; |
372 | (void)codeUnits; |
373 | (void)length; |
374 | if (reason <= UCNV_IRREGULAR) |
375 | { |
376 | if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) |
377 | { |
378 | *err = U_ZERO_ERROR; |
379 | } |
380 | /* else the caller must have set the error code accordingly. */ |
381 | } |
382 | /* else ignore the reset, close and clone calls. */ |
383 | } |
384 | |
385 | U_CAPI void U_EXPORT2 |
386 | UCNV_TO_U_CALLBACK_SUBSTITUTE ( |
387 | const void *context, |
388 | UConverterToUnicodeArgs *toArgs, |
389 | const char* codeUnits, |
390 | int32_t length, |
391 | UConverterCallbackReason reason, |
392 | UErrorCode * err) |
393 | { |
394 | (void)codeUnits; |
395 | (void)length; |
396 | if (reason <= UCNV_IRREGULAR) |
397 | { |
398 | if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) |
399 | { |
400 | *err = U_ZERO_ERROR; |
401 | ucnv_cbToUWriteSub(toArgs,0,err); |
402 | } |
403 | /* else the caller must have set the error code accordingly. */ |
404 | } |
405 | /* else ignore the reset, close and clone calls. */ |
406 | } |
407 | |
408 | /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, |
409 | *and uses that as the substitution sequence |
410 | */ |
411 | U_CAPI void U_EXPORT2 |
412 | UCNV_TO_U_CALLBACK_ESCAPE ( |
413 | const void *context, |
414 | UConverterToUnicodeArgs *toArgs, |
415 | const char* codeUnits, |
416 | int32_t length, |
417 | UConverterCallbackReason reason, |
418 | UErrorCode * err) |
419 | { |
420 | char16_t uniValueString[VALUE_STRING_LENGTH]; |
421 | int32_t valueStringLength = 0; |
422 | int32_t i = 0; |
423 | |
424 | if (reason > UCNV_IRREGULAR) |
425 | { |
426 | return; |
427 | } |
428 | |
429 | if(context==nullptr) |
430 | { |
431 | while (i < length) |
432 | { |
433 | uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ |
434 | uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */ |
435 | valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); |
436 | } |
437 | } |
438 | else |
439 | { |
440 | switch(*((char*)context)) |
441 | { |
442 | case UCNV_PRV_ESCAPE_XML_DEC: |
443 | while (i < length) |
444 | { |
445 | uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ |
446 | uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ |
447 | valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0); |
448 | uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ |
449 | } |
450 | break; |
451 | |
452 | case UCNV_PRV_ESCAPE_XML_HEX: |
453 | while (i < length) |
454 | { |
455 | uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ |
456 | uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ |
457 | uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ |
458 | valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0); |
459 | uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ |
460 | } |
461 | break; |
462 | case UCNV_PRV_ESCAPE_C: |
463 | while (i < length) |
464 | { |
465 | uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ |
466 | uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ |
467 | valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2); |
468 | } |
469 | break; |
470 | default: |
471 | while (i < length) |
472 | { |
473 | uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ |
474 | uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */ |
475 | uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); |
476 | valueStringLength += 2; |
477 | } |
478 | } |
479 | } |
480 | /* reset the error */ |
481 | *err = U_ZERO_ERROR; |
482 | |
483 | ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err); |
484 | } |
485 | |
486 | #endif |
487 | |