1#include <DataTypes/DataTypeString.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionStringOrArrayToT.h>
4
5#include <cstring>
6
7#ifdef __SSE4_1__
8# include <emmintrin.h>
9# include <smmintrin.h>
10# include <tmmintrin.h>
11#endif
12
13namespace DB
14{
15/// inspired by https://github.com/cyb70289/utf8/
16struct ValidUTF8Impl
17{
18 /*
19MIT License
20
21Copyright (c) 2019 Yibo Cai
22
23Permission is hereby granted, free of charge, to any person obtaining a copy
24of this software and associated documentation files (the "Software"), to deal
25in the Software without restriction, including without limitation the rights
26to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
27copies of the Software, and to permit persons to whom the Software is
28furnished to do so, subject to the following conditions:
29
30The above copyright notice and this permission notice shall be included in all
31copies or substantial portions of the Software.
32
33THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39SOFTWARE.
40*/
41
42 /*
43 * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
44 *
45 * Table 3-7. Well-Formed UTF-8 Byte Sequences
46 *
47 * +--------------------+------------+-------------+------------+-------------+
48 * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
49 * +--------------------+------------+-------------+------------+-------------+
50 * | U+0000..U+007F | 00..7F | | | |
51 * +--------------------+------------+-------------+------------+-------------+
52 * | U+0080..U+07FF | C2..DF | 80..BF | | |
53 * +--------------------+------------+-------------+------------+-------------+
54 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
55 * +--------------------+------------+-------------+------------+-------------+
56 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
57 * +--------------------+------------+-------------+------------+-------------+
58 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
59 * +--------------------+------------+-------------+------------+-------------+
60 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
61 * +--------------------+------------+-------------+------------+-------------+
62 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
63 * +--------------------+------------+-------------+------------+-------------+
64 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
65 * +--------------------+------------+-------------+------------+-------------+
66 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
67 * +--------------------+------------+-------------+------------+-------------+
68 */
69
70 static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len)
71 {
72 while (len)
73 {
74 int bytes;
75 const UInt8 byte1 = data[0];
76 /* 00..7F */
77 if (byte1 <= 0x7F)
78 {
79 bytes = 1;
80 }
81 /* C2..DF, 80..BF */
82 else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && static_cast<Int8>(data[1]) <= static_cast<Int8>(0xBF))
83 {
84 bytes = 2;
85 }
86 else if (len >= 3)
87 {
88 const UInt8 byte2 = data[1];
89 bool byte2_ok = static_cast<Int8>(byte2) <= static_cast<Int8>(0xBF);
90 bool byte3_ok = static_cast<Int8>(data[2]) <= static_cast<Int8>(0xBF);
91
92 if (byte2_ok && byte3_ok &&
93 /* E0, A0..BF, 80..BF */
94 ((byte1 == 0xE0 && byte2 >= 0xA0) ||
95 /* E1..EC, 80..BF, 80..BF */
96 (byte1 >= 0xE1 && byte1 <= 0xEC) ||
97 /* ED, 80..9F, 80..BF */
98 (byte1 == 0xED && byte2 <= 0x9F) ||
99 /* EE..EF, 80..BF, 80..BF */
100 (byte1 >= 0xEE && byte1 <= 0xEF)))
101 {
102 bytes = 3;
103 }
104 else if (len >= 4)
105 {
106 bool byte4_ok = static_cast<Int8>(data[3]) <= static_cast<Int8>(0xBF);
107 if (byte2_ok && byte3_ok && byte4_ok &&
108 /* F0, 90..BF, 80..BF, 80..BF */
109 ((byte1 == 0xF0 && byte2 >= 0x90) ||
110 /* F1..F3, 80..BF, 80..BF, 80..BF */
111 (byte1 >= 0xF1 && byte1 <= 0xF3) ||
112 /* F4, 80..8F, 80..BF, 80..BF */
113 (byte1 == 0xF4 && byte2 <= 0x8F)))
114 {
115 bytes = 4;
116 }
117 else
118 {
119 return false;
120 }
121 }
122 else
123 {
124 return false;
125 }
126 }
127 else
128 {
129 return false;
130 }
131 len -= bytes;
132 data += bytes;
133 }
134 return true;
135 }
136
137#ifndef __SSE4_1__
138 static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); }
139#else
140 static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len)
141 {
142 /*
143 * Map high nibble of "First Byte" to legal character length minus 1
144 * 0x00 ~ 0xBF --> 0
145 * 0xC0 ~ 0xDF --> 1
146 * 0xE0 ~ 0xEF --> 2
147 * 0xF0 ~ 0xFF --> 3
148 */
149 const __m128i first_len_tbl = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
150
151 /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
152 const __m128i first_range_tbl = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
153
154 /*
155 * Range table, map range index to min and max values
156 */
157 const __m128i range_min_tbl
158 = _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
159
160 const __m128i range_max_tbl
161 = _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
162
163 /*
164 * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
165 * which the Second Byte are not 80~BF. It contains "range index adjustment".
166 * +------------+---------------+------------------+----------------+
167 * | First Byte | original range| range adjustment | adjusted range |
168 * +------------+---------------+------------------+----------------+
169 * | E0 | 2 | 2 | 4 |
170 * +------------+---------------+------------------+----------------+
171 * | ED | 2 | 3 | 5 |
172 * +------------+---------------+------------------+----------------+
173 * | F0 | 3 | 3 | 6 |
174 * +------------+---------------+------------------+----------------+
175 * | F4 | 4 | 4 | 8 |
176 * +------------+---------------+------------------+----------------+
177 */
178
179 /* index1 -> E0, index14 -> ED */
180 const __m128i df_ee_tbl = _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
181
182 /* index1 -> F0, index5 -> F4 */
183 const __m128i ef_fe_tbl = _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
184
185 __m128i prev_input = _mm_set1_epi8(0);
186 __m128i prev_first_len = _mm_set1_epi8(0);
187 __m128i error = _mm_set1_epi8(0);
188
189 auto check_packed = [&](__m128i input) noexcept
190 {
191 /* high_nibbles = input >> 4 */
192 const __m128i high_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
193
194 /* first_len = legal character length minus 1 */
195 /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
196 /* first_len = first_len_tbl[high_nibbles] */
197 __m128i first_len = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
198
199 /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
200 /* range = first_range_tbl[high_nibbles] */
201 __m128i range = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
202
203 /* Second Byte: set range index to first_len */
204 /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
205 /* range |= (first_len, prev_first_len) << 1 byte */
206 range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
207
208 /* Third Byte: set range index to saturate_sub(first_len, 1) */
209 /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
210 __m128i tmp1;
211 __m128i tmp2;
212 /* tmp1 = saturate_sub(first_len, 1) */
213 tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
214 /* tmp2 = saturate_sub(prev_first_len, 1) */
215 tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
216 /* range |= (tmp1, tmp2) << 2 bytes */
217 range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
218
219 /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
220 /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
221 /* tmp1 = saturate_sub(first_len, 2) */
222 tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
223 /* tmp2 = saturate_sub(prev_first_len, 2) */
224 tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
225 /* range |= (tmp1, tmp2) << 3 bytes */
226 range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
227
228 /*
229 * Now we have below range indices caluclated
230 * Correct cases:
231 * - 8 for C0~FF
232 * - 3 for 1st byte after F0~FF
233 * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
234 * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
235 * 3rd byte after F0~FF
236 * - 0 for others
237 * Error cases:
238 * 9,10,11 if non ascii First Byte overlaps
239 * E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
240 */
241
242 /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
243 /* Overlaps lead to index 9~15, which are illegal in range table */
244 __m128i shift1, pos, range2;
245 /* shift1 = (input, prev_input) << 1 byte */
246 shift1 = _mm_alignr_epi8(input, prev_input, 15);
247 pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
248 /*
249 * shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE |
250 * pos: | 0 1 15 | 16 17 239| 240 241 255|
251 * pos-240: | 0 0 0 | 0 0 0 | 0 1 15 |
252 * pos+112: | 112 113 127| >= 128 | >= 128 |
253 */
254 tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
255 range2 = _mm_shuffle_epi8(df_ee_tbl, tmp1);
256 tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
257 range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp2));
258
259 range = _mm_add_epi8(range, range2);
260
261 /* Load min and max values per calculated range index */
262 __m128i minv = _mm_shuffle_epi8(range_min_tbl, range);
263 __m128i maxv = _mm_shuffle_epi8(range_max_tbl, range);
264
265 /* Check value range */
266 error = _mm_or_si128(error, _mm_cmplt_epi8(input, minv));
267 error = _mm_or_si128(error, _mm_cmpgt_epi8(input, maxv));
268
269 prev_input = input;
270 prev_first_len = first_len;
271
272 data += 16;
273 len -= 16;
274 };
275
276 while (len >= 16)
277 check_packed(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)));
278
279 /// 0 <= len <= 15 for now. Reading data from data - 1 because of right padding of 15 and left padding
280 /// Then zero some bytes from the unknown memory and check again.
281 alignas(16) char buf[32];
282 _mm_store_si128(reinterpret_cast<__m128i *>(buf), _mm_loadu_si128(reinterpret_cast<const __m128i *>(data - 1)));
283 memset(buf + len + 1, 0, 16);
284 check_packed(_mm_loadu_si128(reinterpret_cast<__m128i *>(buf + 1)));
285
286 return _mm_testz_si128(error, error);
287 }
288#endif
289
290 static constexpr bool is_fixed_to_constant = false;
291
292 static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
293 {
294 size_t size = offsets.size();
295 size_t prev_offset = 0;
296 for (size_t i = 0; i < size; ++i)
297 {
298 res[i] = isValidUTF8(data.data() + prev_offset, offsets[i] - 1 - prev_offset);
299 prev_offset = offsets[i];
300 }
301 }
302
303 static void vector_fixed_to_constant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {}
304
305 static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
306 {
307 size_t size = data.size() / n;
308 for (size_t i = 0; i < size; ++i)
309 res[i] = isValidUTF8(data.data() + i * n, n);
310 }
311
312 [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)
313 {
314 throw Exception("Cannot apply function isValidUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
315 }
316};
317
318struct NameIsValidUTF8
319{
320 static constexpr auto name = "isValidUTF8";
321};
322using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;
323
324void registerFunctionIsValidUTF8(FunctionFactory & factory)
325{
326 factory.registerFunction<FunctionValidUTF8>();
327}
328
329}
330