1 | #include <DataTypes/DataTypeString.h> |
2 | #include <Functions/FunctionFactory.h> |
3 | #include <Functions/FunctionStringOrArrayToT.h> |
4 | #include <Common/UTF8Helpers.h> |
5 | |
6 | |
7 | namespace DB |
8 | { |
9 | |
10 | namespace ErrorCodes |
11 | { |
12 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
13 | } |
14 | |
15 | |
16 | /** If the string is UTF-8 encoded text, it returns the length of the text in code points. |
17 | * (not in characters: the length of the text "ё" can be either 1 or 2, depending on the normalization) |
18 | * (not in characters: the length of the text "" can be either 1 or 2, depending on the normalization) |
19 | * Otherwise, the behavior is undefined. |
20 | */ |
21 | struct LengthUTF8Impl |
22 | { |
23 | static constexpr auto is_fixed_to_constant = false; |
24 | |
25 | static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt64> & res) |
26 | { |
27 | size_t size = offsets.size(); |
28 | |
29 | ColumnString::Offset prev_offset = 0; |
30 | for (size_t i = 0; i < size; ++i) |
31 | { |
32 | res[i] = UTF8::countCodePoints(&data[prev_offset], offsets[i] - prev_offset - 1); |
33 | prev_offset = offsets[i]; |
34 | } |
35 | } |
36 | |
37 | static void vector_fixed_to_constant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt64 & /*res*/) |
38 | { |
39 | } |
40 | |
41 | static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt64> & res) |
42 | { |
43 | size_t size = data.size() / n; |
44 | |
45 | for (size_t i = 0; i < size; ++i) |
46 | { |
47 | res[i] = UTF8::countCodePoints(&data[i * n], n); |
48 | } |
49 | } |
50 | |
51 | [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt64> &) |
52 | { |
53 | throw Exception("Cannot apply function lengthUTF8 to Array argument" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
54 | } |
55 | }; |
56 | |
/// Name tag type: exposes the function name as a compile-time string constant.
struct NameLengthUTF8
{
    static constexpr const char * name = "lengthUTF8";
};
61 | using FunctionLengthUTF8 = FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64>; |
62 | |
63 | void registerFunctionLengthUTF8(FunctionFactory & factory) |
64 | { |
65 | factory.registerFunction<FunctionLengthUTF8>(); |
66 | |
67 | /// Compatibility aliases. |
68 | factory.registerFunction<FunctionLengthUTF8>("CHAR_LENGTH" , FunctionFactory::CaseInsensitive); |
69 | factory.registerFunction<FunctionLengthUTF8>("CHARACTER_LENGTH" , FunctionFactory::CaseInsensitive); |
70 | } |
71 | |
72 | } |
73 | |