UTF8Helpers.cpp source code [ClickHouse/dbms/src/Common/UTF8Helpers.cpp]

1	#include <Common/UTF8Helpers.h>
2
3	#include <widechar_width.h>
4
5	namespace DB
6	{
7	namespace UTF8
8	{
9
10	// based on https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
11	// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
12	// Permission is hereby granted, free of charge, to any person obtaining a copy
13	// of this software and associated documentation files (the "Software"), to deal
14	// in the Software without restriction, including without limitation the rights
15	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16	// copies of the Software, and to permit persons to whom the Software is
17	// furnished to do so, subject to the following conditions: The above copyright
18	// notice and this permission notice shall be included in all copies or
19	// substantial portions of the Software.
20	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26	// SOFTWARE.
27
28	static const UInt8 TABLE[] =
29	{
30	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, // 00..1f
31	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, // 20..3f
32	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, // 40..5f
33	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, // 60..7f
34	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`, // 80..9f
35	`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`, // a0..bf
36	`8`,`8`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`, // c0..df
37	`0xa`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x3`,`0x4`,`0x3`,`0x3`, // e0..ef
38	`0xb`,`0x6`,`0x6`,`0x6`,`0x5`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`,`0x8`, // f0..ff
39	`0x0`,`0x1`,`0x2`,`0x3`,`0x5`,`0x8`,`0x7`,`0x1`,`0x1`,`0x1`,`0x4`,`0x6`,`0x1`,`0x1`,`0x1`,`0x1`, // s0..s0
40	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`0`,`1`,`1`,`1`,`1`,`1`,`0`,`1`,`0`,`1`,`1`,`1`,`1`,`1`,`1`, // s1..s2
41	`1`,`2`,`1`,`1`,`1`,`1`,`1`,`2`,`1`,`2`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`2`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, // s3..s4
42	`1`,`2`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`2`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`3`,`1`,`3`,`1`,`1`,`1`,`1`,`1`,`1`, // s5..s6
43	`1`,`3`,`1`,`1`,`1`,`1`,`1`,`3`,`1`,`3`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`3`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, // s7..s8
44	};
45
46	struct UTF8Decoder
47	{
48	enum
49	{
50	ACCEPT = `0`,
51	REJECT = `1`
52	};
53
54	UInt32 decode(UInt8 byte)
55	{
56	UInt32 type = TABLE[byte];
57	codepoint = (state != ACCEPT) ? (byte & `0x3fu`) \| (codepoint << `6`) : (`0xff` >> type) & (byte);
58	state = TABLE[`256` + state * `16` + type];
59	return state;
60	}
61
62	void reset()
63	{
64	state = ACCEPT;
65	codepoint = `0xfffdU`;
66	}
67
68	UInt8 state {ACCEPT};
69	UInt32 codepoint {`0`};
70	};
71
72	static int wcwidth(wchar_t wc)
73	{
74	int width = widechar_wcwidth(wc);
75	switch (width)
76	{
77	case widechar_nonprint:
78	[[fallthrough]];
79	case widechar_combining:
80	[[fallthrough]];
81	case widechar_unassigned:
82	return `0`;
83	case widechar_ambiguous:
84	[[fallthrough]];
85	case widechar_private_use:
86	[[fallthrough]];
87	case widechar_widened_in_9:
88	return `1`;
89	default:
90	return width;
91	}
92	}
93
94	size_t computeWidth(const UInt8 * data, size_t size, size_t prefix) noexcept
95	{
96	UTF8Decoder decoder;
97	size_t width = `0`;
98	size_t rollback = `0`;
99	for (size_t i = `0`; i < size; ++i)
100	{
101	switch (decoder.decode(data[i]))
102	{
103	case UTF8Decoder::REJECT:
104	decoder.reset();
105	// invalid sequences seem to have zero width in modern terminals
106	// tested in libvte-based, alacritty, urxvt and xterm
107	i -= rollback;
108	rollback = `0`;
109	break;
110	case UTF8Decoder::ACCEPT:
111	// there are special control characters that manipulate the terminal output.
112	// (`0x08`, `0x09`, `0x0a`, `0x0b`, `0x0c`, `0x0d`, `0x1b`)
113	// Since we don't touch the original column data, there is no easy way to escape them.
114	// TODO: escape control characters
115	// TODO: multiline support for '\n'
116
117	// special treatment for '\t'
118	if (decoder.codepoint == `'\t'`)
119	width += `8` - (prefix + width) % `8`;
120	else
121	width += wcwidth(decoder.codepoint);
122	rollback = `0`;
123	break;
124	// continue if we meet other values here
125	default:
126	++rollback;
127	}
128	}
129
130	// no need to handle trailing sequence as they have zero width
131	return width;
132	}
133	}
134	}
135

Browse the source code of ClickHouse/dbms/src/Common/UTF8Helpers.cpp