ucnv_err.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucnv_err.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*****************************************************************************
5	*
6	* Copyright (C) 1998-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*****************************************************************************
10	*
11	* ucnv_err.c
12	* Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13	*
14	*
15	* Change history:
16	*
17	* 06/29/2000 helena Major rewrite of the callback APIs.
18	*/
19
20	#include "unicode/utypes.h"
21
22	#if !UCONFIG_NO_CONVERSION
23
24	#include "unicode/ucnv_err.h"
25	#include "unicode/ucnv_cb.h"
26	#include "ucnv_cnv.h"
27	#include "cmemory.h"
28	#include "unicode/ucnv.h"
29	#include "ustrfmt.h"
30
/*
 * Capacity, in UChars, of the scratch buffers that hold a generated escape
 * sequence in the ESCAPE callbacks below.
 *
 * NOTE(review): the historical comment here read
 *   "Magic # 32 = 4 (number of char in value string) * 8 (max number of
 *    bytes per char for any converter)"
 * which no longer matches the value 48.  The longest per-byte escape that
 * this file emits is 6 UChars ("&#255;" or "&#xFF;"), and 6 * 8 = 48 --
 * confirm that 8 is still the maximum number of bytes per character.
 */
#define VALUE_STRING_LENGTH 48

/* Code points used to assemble the escape sequences. */
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025  /* % */
#define UNICODE_U_CODEPOINT             0x0055  /* U */
#define UNICODE_X_CODEPOINT             0x0058  /* X */
#define UNICODE_RS_CODEPOINT            0x005C  /* reverse solidus (backslash) */
#define UNICODE_U_LOW_CODEPOINT         0x0075  /* u */
#define UNICODE_X_LOW_CODEPOINT         0x0078  /* x */
#define UNICODE_AMP_CODEPOINT           0x0026  /* & */
#define UNICODE_HASH_CODEPOINT          0x0023  /* # */
#define UNICODE_SEMICOLON_CODEPOINT     0x003B  /* ; */
#define UNICODE_PLUS_CODEPOINT          0x002B  /* + */
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B  /* { */
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D  /* } */
#define UNICODE_SPACE_CODEPOINT         0x0020  /* space */

/*
 * Option characters: the callbacks below inspect the first char that their
 * 'context' pointer refers to and compare it against these values.
 */
#define UCNV_PRV_ESCAPE_ICU      0
#define UCNV_PRV_ESCAPE_C        'C'
#define UCNV_PRV_ESCAPE_XML_DEC  'D'
#define UCNV_PRV_ESCAPE_XML_HEX  'X'
#define UCNV_PRV_ESCAPE_JAVA     'J'
#define UCNV_PRV_ESCAPE_UNICODE  'U'
#define UCNV_PRV_ESCAPE_CSS2     'S'
#define UCNV_PRV_STOP_ON_ILLEGAL 'i'
54
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Checks whether a code point has the Default_Ignorable_Code_Point Unicode
 * property.  The list is hard-coded here to avoid a dependency on other
 * code, so it needs to be updated whenever the set of default ignorable
 * code points changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks will ignore it.
 * For a list of the default ignorable code points, use this link:
 * https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * The argument is parenthesized at every use so that an arbitrary
 * expression can be passed safely; note that it is still evaluated more
 * than once, so it must not have side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180E) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF))
87
88
89	/Function Pointer STOPS at the ILLEGAL_SEQUENCE /
90	U_CAPI void U_EXPORT2
91	UCNV_FROM_U_CALLBACK_STOP (
92	const void *context,
93	UConverterFromUnicodeArgs *fromUArgs,
94	const UChar* codeUnits,
95	int32_t length,
96	UChar32 codePoint,
97	UConverterCallbackReason reason,
98	UErrorCode * err)
99	{
100	(void)context;
101	(void)fromUArgs;
102	(void)codeUnits;
103	(void)length;
104	if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
105	{
106	/*
107	* Skip if the codepoint has unicode property of default ignorable.
108	*/
109	*err = U_ZERO_ERROR;
110	}
111	/ the caller must have set the error code accordingly /
112	return;
113	}
114
115
116	/Function Pointer STOPS at the ILLEGAL_SEQUENCE /
117	U_CAPI void U_EXPORT2
118	UCNV_TO_U_CALLBACK_STOP (
119	const void *context,
120	UConverterToUnicodeArgs *toUArgs,
121	const char* codePoints,
122	int32_t length,
123	UConverterCallbackReason reason,
124	UErrorCode * err)
125	{
126	/ the caller must have set the error code accordingly /
127	(void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
128	return;
129	}
130
131	U_CAPI void U_EXPORT2
132	UCNV_FROM_U_CALLBACK_SKIP (
133	const void *context,
134	UConverterFromUnicodeArgs *fromUArgs,
135	const UChar* codeUnits,
136	int32_t length,
137	UChar32 codePoint,
138	UConverterCallbackReason reason,
139	UErrorCode * err)
140	{
141	(void)fromUArgs;
142	(void)codeUnits;
143	(void)length;
144	if (reason <= UCNV_IRREGULAR)
145	{
146	if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
147	{
148	/*
149	* Skip if the codepoint has unicode property of default ignorable.
150	*/
151	*err = U_ZERO_ERROR;
152	}
153	else if (context == NULL \|\| (((char**)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
154	{
155	*err = U_ZERO_ERROR;
156	}
157	/ else the caller must have set the error code accordingly. /
158	}
159	/ else ignore the reset, close and clone calls. /
160	}
161
162	U_CAPI void U_EXPORT2
163	UCNV_FROM_U_CALLBACK_SUBSTITUTE (
164	const void *context,
165	UConverterFromUnicodeArgs *fromArgs,
166	const UChar* codeUnits,
167	int32_t length,
168	UChar32 codePoint,
169	UConverterCallbackReason reason,
170	UErrorCode * err)
171	{
172	(void)codeUnits;
173	(void)length;
174	if (reason <= UCNV_IRREGULAR)
175	{
176	if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
177	{
178	/*
179	* Skip if the codepoint has unicode property of default ignorable.
180	*/
181	*err = U_ZERO_ERROR;
182	}
183	else if (context == NULL \|\| (((char**)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
184	{
185	*err = U_ZERO_ERROR;
186	ucnv_cbFromUWriteSub(fromArgs, `0`, err);
187	}
188	/ else the caller must have set the error code accordingly. /
189	}
190	/ else ignore the reset, close and clone calls. /
191	}
192
193	/uses uprv_itou to get a unicode escape sequence of the offensive sequence,*
194	*uses a clean copy (resetted) of the converter, to convert that unicode
195	*escape sequence to the target codepage (if conversion failure happens then
196	*we revert to substituting with subchar)
197	*/
198	U_CAPI void U_EXPORT2
199	UCNV_FROM_U_CALLBACK_ESCAPE (
200	const void *context,
201	UConverterFromUnicodeArgs *fromArgs,
202	const UChar *codeUnits,
203	int32_t length,
204	UChar32 codePoint,
205	UConverterCallbackReason reason,
206	UErrorCode * err)
207	{
208
209	UChar valueString[VALUE_STRING_LENGTH];
210	int32_t valueStringLength = `0`;
211	int32_t i = `0`;
212
213	const UChar *myValueSource = NULL;
214	UErrorCode err2 = U_ZERO_ERROR;
215	UConverterFromUCallback original = NULL;
216	const void *originalContext;
217
218	UConverterFromUCallback ignoredCallback = NULL;
219	const void *ignoredContext;
220
221	if (reason > UCNV_IRREGULAR)
222	{
223	return;
224	}
225	else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
226	{
227	/*
228	* Skip if the codepoint has unicode property of default ignorable.
229	*/
230	*err = U_ZERO_ERROR;
231	return;
232	}
233
234	ucnv_setFromUCallBack (fromArgs->converter,
235	(UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
236	NULL,
237	&original,
238	&originalContext,
239	&err2);
240
241	if (U_FAILURE (err2))
242	{
243	*err = err2;
244	return;
245	}
246	if(context==NULL)
247	{
248	while (i < length)
249	{
250	valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; / adding % /
251	valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; / adding U /
252	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], `16`, `4`);
253	}
254	}
255	else
256	{
257	switch(((char**)context))
258	{
259	case UCNV_PRV_ESCAPE_JAVA:
260	while (i < length)
261	{
262	valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; / adding \ /
263	valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; / adding u /
264	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], `16`, `4`);
265	}
266	break;
267
268	case UCNV_PRV_ESCAPE_C:
269	valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; / adding \ /
270
271	if(length==`2`){
272	valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; / adding U /
273	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, `16`, `8`);
274
275	}
276	else{
277	valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; / adding u /
278	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[`0`], `16`, `4`);
279	}
280	break;
281
282	case UCNV_PRV_ESCAPE_XML_DEC:
283
284	valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; / adding & /
285	valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; / adding # /
286	if(length==`2`){
287	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, `10`, `0`);
288	}
289	else{
290	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[`0`], `10`, `0`);
291	}
292	valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; / adding ; /
293	break;
294
295	case UCNV_PRV_ESCAPE_XML_HEX:
296
297	valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; / adding & /
298	valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; / adding # /
299	valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; / adding x /
300	if(length==`2`){
301	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, `16`, `0`);
302	}
303	else{
304	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[`0`], `16`, `0`);
305	}
306	valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; / adding ; /
307	break;
308
309	case UCNV_PRV_ESCAPE_UNICODE:
310	valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; / adding { /
311	valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; / adding U /
312	valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; / adding + /
313	if (length == `2`) {
314	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, `16`, `4`);
315	} else {
316	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[`0`], `16`, `4`);
317	}
318	valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; / adding } /
319	break;
320
321	case UCNV_PRV_ESCAPE_CSS2:
322	valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; / adding \ /
323	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, `16`, `0`);
324	/ Always add space character, becase the next character might be whitespace,*
325	which would erroneously be considered the termination of the escape sequence. /*
326	valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
327	break;
328
329	default:
330	while (i < length)
331	{
332	valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; / adding % /
333	valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; / adding U /
334	valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], `16`, `4`);
335	}
336	}
337	}
338	myValueSource = valueString;
339
340	/ reset the error /
341	*err = U_ZERO_ERROR;
342
343	ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, `0`, err);
344
345	ucnv_setFromUCallBack (fromArgs->converter,
346	original,
347	originalContext,
348	&ignoredCallback,
349	&ignoredContext,
350	&err2);
351	if (U_FAILURE (err2))
352	{
353	*err = err2;
354	return;
355	}
356
357	return;
358	}
359
360
361
362	U_CAPI void U_EXPORT2
363	UCNV_TO_U_CALLBACK_SKIP (
364	const void *context,
365	UConverterToUnicodeArgs *toArgs,
366	const char* codeUnits,
367	int32_t length,
368	UConverterCallbackReason reason,
369	UErrorCode * err)
370	{
371	(void)toArgs;
372	(void)codeUnits;
373	(void)length;
374	if (reason <= UCNV_IRREGULAR)
375	{
376	if (context == NULL \|\| (((char**)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
377	{
378	*err = U_ZERO_ERROR;
379	}
380	/ else the caller must have set the error code accordingly. /
381	}
382	/ else ignore the reset, close and clone calls. /
383	}
384
385	U_CAPI void U_EXPORT2
386	UCNV_TO_U_CALLBACK_SUBSTITUTE (
387	const void *context,
388	UConverterToUnicodeArgs *toArgs,
389	const char* codeUnits,
390	int32_t length,
391	UConverterCallbackReason reason,
392	UErrorCode * err)
393	{
394	(void)codeUnits;
395	(void)length;
396	if (reason <= UCNV_IRREGULAR)
397	{
398	if (context == NULL \|\| (((char**)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
399	{
400	*err = U_ZERO_ERROR;
401	ucnv_cbToUWriteSub(toArgs,`0`,err);
402	}
403	/ else the caller must have set the error code accordingly. /
404	}
405	/ else ignore the reset, close and clone calls. /
406	}
407
408	/uses uprv_itou to get a unicode escape sequence of the offensive sequence,*
409	*and uses that as the substitution sequence
410	*/
411	U_CAPI void U_EXPORT2
412	UCNV_TO_U_CALLBACK_ESCAPE (
413	const void *context,
414	UConverterToUnicodeArgs *toArgs,
415	const char* codeUnits,
416	int32_t length,
417	UConverterCallbackReason reason,
418	UErrorCode * err)
419	{
420	UChar uniValueString[VALUE_STRING_LENGTH];
421	int32_t valueStringLength = `0`;
422	int32_t i = `0`;
423
424	if (reason > UCNV_IRREGULAR)
425	{
426	return;
427	}
428
429	if(context==NULL)
430	{
431	while (i < length)
432	{
433	uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; / adding % /
434	uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; / adding X /
435	valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], `16`, `2`);
436	}
437	}
438	else
439	{
440	switch(((char**)context))
441	{
442	case UCNV_PRV_ESCAPE_XML_DEC:
443	while (i < length)
444	{
445	uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; / adding & /
446	uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; / adding # /
447	valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], `10`, `0`);
448	uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; / adding ; /
449	}
450	break;
451
452	case UCNV_PRV_ESCAPE_XML_HEX:
453	while (i < length)
454	{
455	uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; / adding & /
456	uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; / adding # /
457	uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; / adding x /
458	valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], `16`, `0`);
459	uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; / adding ; /
460	}
461	break;
462	case UCNV_PRV_ESCAPE_C:
463	while (i < length)
464	{
465	uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; / adding \ /
466	uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; / adding x /
467	valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], `16`, `2`);
468	}
469	break;
470	default:
471	while (i < length)
472	{
473	uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; / adding % /
474	uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; / adding X /
475	uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], `16`, `2`);
476	valueStringLength += `2`;
477	}
478	}
479	}
480	/ reset the error /
481	*err = U_ZERO_ERROR;
482
483	ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, `0`, err);
484	}
485
486	#endif
487

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucnv_err.cpp