utf_impl.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/utf_impl.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 1999-2012, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	* file name: utf_impl.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 1999sep13
16	* created by: Markus W. Scherer
17	*
18	* This file provides implementation functions for macros in the utfXX.h
19	* that would otherwise be too long as macros.
20	*/
21
/* set import/export definitions */
23	#ifndef U_UTF8_IMPL
24	# define U_UTF8_IMPL
25	#endif
26
27	#include "unicode/utypes.h"
28	#include "unicode/utf.h"
29	#include "unicode/utf8.h"
30	#include "uassert.h"
31
32	/*
33	* Table of the number of utf8 trail bytes, indexed by the lead byte.
34	* Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
35	*
36	* The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
37	*
38	* Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
39	* changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
40	* may exist in old client code that must continue to run with newer icu library versions.
41	*
42	* This table could be replaced on many machines by
43	* a few lines of assembler code using an
44	* "index of first 0-bit from msb" instruction and
45	* one or two more integer instructions.
46	*
47	* For example, on an i386, do something like
48	* - MOV AL, leadByte
49	* - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)
50	* - MOV AH, 0
51	* - BSR BX, AX (16-bit)
52	* - MOV AX, 6 (result)
53	* - JZ finish (ZF==1 if leadByte==0xff)
54	* - SUB AX, BX (result)
55	* -finish:
56	* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
57	*/
58	extern "C" U_EXPORT const uint8_t
59	utf8_countTrailBytes[`256`]={
60	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
61	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
62	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
63	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
64
65	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
66	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
67	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
68	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
69
70	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
71	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
72	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
73	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
74
75	// illegal C0 & C1
76	// 2-byte lead bytes C2..DF
77	`0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
78	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
79
80	// 3-byte lead bytes E0..EF
81	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
82	// 4-byte lead bytes F0..F4
83	// illegal F5..FF
84	`3`, `3`, `3`, `3`, `3`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`
85	};
86
87	static const UChar32
88	utf8_errorValue[`6`]={
89	// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
90	// but without relying on the obsolete unicode/utf_old.h.
91	`0x15`, `0x9f`, `0xffff`,
92	`0x10ffff`
93	};
94
95	static UChar32
96	errorValue(int32_t count, int8_t strict) {
97	if(strict>=`0`) {
98	return utf8_errorValue[count];
99	} else if(strict==-`3`) {
100	return `0xfffd`;
101	} else {
102	return U_SENTINEL;
103	}
104	}
105
106	/*
107	* Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
108	* and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
109	*
110	* U8_NEXT() supports NUL-terminated strings indicated via length<0.
111	*
112	* The "strict" parameter controls the error behavior:
113	* <0 "Safe" behavior of U8_NEXT():
114	* -1: All illegal byte sequences yield U_SENTINEL=-1.
115	* -2: Same as -1, except for lenient treatment of surrogate code points as legal.
116	* Some implementations use this for roundtripping of
117	* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
118	* contain unpaired surrogates.
119	* -3: All illegal byte sequences yield U+FFFD.
120	* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
121	* All illegal byte sequences yield a positive code point such that this
122	* result code point would be encoded with the same number of bytes as
123	* the illegal sequence.
124	* >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
125	* Same as the obsolete "safe" behavior, but non-characters are also treated
126	* like illegal sequences.
127	*
128	* Note that a UBool is the same as an int8_t.
129	*/
130	U_CAPI UChar32 U_EXPORT2
131	utf8_nextCharSafeBody(const uint8_t s, int32_t pi, int32_t length, UChar32 c, UBool strict) {
132	// pi is one after byte c.*
133	int32_t i=*pi;
134	// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
135	if(i==length \|\| c>`0xf4`) {
136	// end of string, or not a lead byte
137	} else if(c>=`0xf0`) {
138	// Test for 4-byte sequences first because
139	// U8_NEXT() handles shorter valid sequences inline.
140	uint8_t t1=s[i], t2, t3;
141	c&=`7`;
142	if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
143	++i!=length && (t2=s[i]-`0x80`)<=`0x3f` &&
144	++i!=length && (t3=s[i]-`0x80`)<=`0x3f`) {
145	++i;
146	c=(c<<`18`)\|((t1&`0x3f`)<<`12`)\|(t2<<`6`)\|t3;
147	// strict: forbid non-characters like U+fffe
148	if(strict<=`0` \|\| !U_IS_UNICODE_NONCHAR(c)) {
149	*pi=i;
150	return c;
151	}
152	}
153	} else if(c>=`0xe0`) {
154	c&=`0xf`;
155	if(strict!=-`2`) {
156	uint8_t t1=s[i], t2;
157	if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
158	++i!=length && (t2=s[i]-`0x80`)<=`0x3f`) {
159	++i;
160	c=(c<<`12`)\|((t1&`0x3f`)<<`6`)\|t2;
161	// strict: forbid non-characters like U+fffe
162	if(strict<=`0` \|\| !U_IS_UNICODE_NONCHAR(c)) {
163	*pi=i;
164	return c;
165	}
166	}
167	} else {
168	// strict=-2 -> lenient: allow surrogates
169	uint8_t t1=s[i]-`0x80`, t2;
170	if(t1<=`0x3f` && (c>`0` \|\| t1>=`0x20`) &&
171	++i!=length && (t2=s[i]-`0x80`)<=`0x3f`) {
172	*pi=i+`1`;
173	return (c<<`12`)\|(t1<<`6`)\|t2;
174	}
175	}
176	} else if(c>=`0xc2`) {
177	uint8_t t1=s[i]-`0x80`;
178	if(t1<=`0x3f`) {
179	*pi=i+`1`;
180	return ((c-`0xc0`)<<`6`)\|t1;
181	}
182	} // else 0x80<=c<0xc2 is not a lead byte
183
184	/ error handling /
185	c=errorValue(i-*pi, strict);
186	*pi=i;
187	return c;
188	}
189
190	U_CAPI int32_t U_EXPORT2
191	utf8_appendCharSafeBody(uint8_t s, int32_t i, int32_t length, UChar32 c, UBool pIsError) {
192	if((uint32_t)(c)<=`0x7ff`) {
193	if((i)+`1`<(length)) {
194	(s)[(i)++]=(uint8_t)(((c)>>`6`)\|`0xc0`);
195	(s)[(i)++]=(uint8_t)(((c)&`0x3f`)\|`0x80`);
196	return i;
197	}
198	} else if((uint32_t)(c)<=`0xffff`) {
199	/ Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. /
200	if((i)+`2`<(length) && !U_IS_SURROGATE(c)) {
201	(s)[(i)++]=(uint8_t)(((c)>>`12`)\|`0xe0`);
202	(s)[(i)++]=(uint8_t)((((c)>>`6`)&`0x3f`)\|`0x80`);
203	(s)[(i)++]=(uint8_t)(((c)&`0x3f`)\|`0x80`);
204	return i;
205	}
206	} else if((uint32_t)(c)<=`0x10ffff`) {
207	if((i)+`3`<(length)) {
208	(s)[(i)++]=(uint8_t)(((c)>>`18`)\|`0xf0`);
209	(s)[(i)++]=(uint8_t)((((c)>>`12`)&`0x3f`)\|`0x80`);
210	(s)[(i)++]=(uint8_t)((((c)>>`6`)&`0x3f`)\|`0x80`);
211	(s)[(i)++]=(uint8_t)(((c)&`0x3f`)\|`0x80`);
212	return i;
213	}
214	}
215	/ c>0x10ffff or not enough space, write an error value /
216	if(pIsError!=NULL) {
217	*pIsError=TRUE;
218	} else {
219	length-=i;
220	if(length>`0`) {
221	int32_t offset;
222	if(length>`3`) {
223	length=`3`;
224	}
225	s+=i;
226	offset=`0`;
227	c=utf8_errorValue[length-`1`];
228	U8_APPEND_UNSAFE(s, offset, c);
229	i=i+offset;
230	}
231	}
232	return i;
233	}
234
235	U_CAPI UChar32 U_EXPORT2
236	utf8_prevCharSafeBody(const uint8_t s, int32_t start, int32_t pi, UChar32 c, UBool strict) {
237	// pi is the index of byte c.*
238	int32_t i=*pi;
239	if(U8_IS_TRAIL(c) && i>start) {
240	uint8_t b1=s[--i];
241	if(U8_IS_LEAD(b1)) {
242	if(b1<`0xe0`) {
243	*pi=i;
244	return ((b1-`0xc0`)<<`6`)\|(c&`0x3f`);
245	} else if(b1<`0xf0` ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
246	// Truncated 3- or 4-byte sequence.
247	*pi=i;
248	return errorValue(`1`, strict);
249	}
250	} else if(U8_IS_TRAIL(b1) && i>start) {
251	// Extract the value bits from the last trail byte.
252	c&=`0x3f`;
253	uint8_t b2=s[--i];
254	if(`0xe0`<=b2 && b2<=`0xf4`) {
255	if(b2<`0xf0`) {
256	b2&=`0xf`;
257	if(strict!=-`2`) {
258	if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
259	*pi=i;
260	c=(b2<<`12`)\|((b1&`0x3f`)<<`6`)\|c;
261	if(strict<=`0` \|\| !U_IS_UNICODE_NONCHAR(c)) {
262	return c;
263	} else {
264	// strict: forbid non-characters like U+fffe
265	return errorValue(`2`, strict);
266	}
267	}
268	} else {
269	// strict=-2 -> lenient: allow surrogates
270	b1-=`0x80`;
271	if((b2>`0` \|\| b1>=`0x20`)) {
272	*pi=i;
273	return (b2<<`12`)\|(b1<<`6`)\|c;
274	}
275	}
276	} else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
277	// Truncated 4-byte sequence.
278	*pi=i;
279	return errorValue(`2`, strict);
280	}
281	} else if(U8_IS_TRAIL(b2) && i>start) {
282	uint8_t b3=s[--i];
283	if(`0xf0`<=b3 && b3<=`0xf4`) {
284	b3&=`7`;
285	if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
286	*pi=i;
287	c=(b3<<`18`)\|((b2&`0x3f`)<<`12`)\|((b1&`0x3f`)<<`6`)\|c;
288	if(strict<=`0` \|\| !U_IS_UNICODE_NONCHAR(c)) {
289	return c;
290	} else {
291	// strict: forbid non-characters like U+fffe
292	return errorValue(`3`, strict);
293	}
294	}
295	}
296	}
297	}
298	}
299	return errorValue(`0`, strict);
300	}
301
302	U_CAPI int32_t U_EXPORT2
303	utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
304	// Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
305	int32_t orig_i=i;
306	uint8_t c=s[i];
307	if(U8_IS_TRAIL(c) && i>start) {
308	uint8_t b1=s[--i];
309	if(U8_IS_LEAD(b1)) {
310	if(b1<`0xe0` \|\|
311	(b1<`0xf0` ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
312	return i;
313	}
314	} else if(U8_IS_TRAIL(b1) && i>start) {
315	uint8_t b2=s[--i];
316	if(`0xe0`<=b2 && b2<=`0xf4`) {
317	if(b2<`0xf0` ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
318	return i;
319	}
320	} else if(U8_IS_TRAIL(b2) && i>start) {
321	uint8_t b3=s[--i];
322	if(`0xf0`<=b3 && b3<=`0xf4` && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
323	return i;
324	}
325	}
326	}
327	}
328	return orig_i;
329	}
330

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/utf_impl.cpp