1#include <Common/UTF8Helpers.h>
2
3#include <widechar_width.h>
4
5namespace DB
6{
7namespace UTF8
8{
9
10// based on https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
11// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
12// Permission is hereby granted, free of charge, to any person obtaining a copy
13// of this software and associated documentation files (the "Software"), to deal
14// in the Software without restriction, including without limitation the rights
15// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16// copies of the Software, and to permit persons to whom the Software is
17// furnished to do so, subject to the following conditions: The above copyright
18// notice and this permission notice shall be included in all copies or
19// substantial portions of the Software.
20// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26// SOFTWARE.
27
28static const UInt8 TABLE[] =
29{
30 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
31 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
32 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
33 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
34 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
35 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
36 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
37 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
38 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
39 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
40 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
41 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
42 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
43 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
44};
45
46struct UTF8Decoder
47{
48 enum
49 {
50 ACCEPT = 0,
51 REJECT = 1
52 };
53
54 UInt32 decode(UInt8 byte)
55 {
56 UInt32 type = TABLE[byte];
57 codepoint = (state != ACCEPT) ? (byte & 0x3fu) | (codepoint << 6) : (0xff >> type) & (byte);
58 state = TABLE[256 + state * 16 + type];
59 return state;
60 }
61
62 void reset()
63 {
64 state = ACCEPT;
65 codepoint = 0xfffdU;
66 }
67
68 UInt8 state {ACCEPT};
69 UInt32 codepoint {0};
70};
71
72static int wcwidth(wchar_t wc)
73{
74 int width = widechar_wcwidth(wc);
75 switch (width)
76 {
77 case widechar_nonprint:
78 [[fallthrough]];
79 case widechar_combining:
80 [[fallthrough]];
81 case widechar_unassigned:
82 return 0;
83 case widechar_ambiguous:
84 [[fallthrough]];
85 case widechar_private_use:
86 [[fallthrough]];
87 case widechar_widened_in_9:
88 return 1;
89 default:
90 return width;
91 }
92}
93
94size_t computeWidth(const UInt8 * data, size_t size, size_t prefix) noexcept
95{
96 UTF8Decoder decoder;
97 size_t width = 0;
98 size_t rollback = 0;
99 for (size_t i = 0; i < size; ++i)
100 {
101 switch (decoder.decode(data[i]))
102 {
103 case UTF8Decoder::REJECT:
104 decoder.reset();
105 // invalid sequences seem to have zero width in modern terminals
106 // tested in libvte-based, alacritty, urxvt and xterm
107 i -= rollback;
108 rollback = 0;
109 break;
110 case UTF8Decoder::ACCEPT:
111 // there are special control characters that manipulate the terminal output.
112 // (`0x08`, `0x09`, `0x0a`, `0x0b`, `0x0c`, `0x0d`, `0x1b`)
113 // Since we don't touch the original column data, there is no easy way to escape them.
114 // TODO: escape control characters
115 // TODO: multiline support for '\n'
116
117 // special treatment for '\t'
118 if (decoder.codepoint == '\t')
119 width += 8 - (prefix + width) % 8;
120 else
121 width += wcwidth(decoder.codepoint);
122 rollback = 0;
123 break;
124 // continue if we meet other values here
125 default:
126 ++rollback;
127 }
128 }
129
130 // no need to handle trailing sequence as they have zero width
131 return width;
132}
133}
134}
135