1#include <DataTypes/DataTypeString.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionStringOrArrayToT.h>
4#include <Common/UTF8Helpers.h>
5
6
7namespace DB
8{
9
10namespace ErrorCodes
11{
12 extern const int ILLEGAL_TYPE_OF_ARGUMENT;
13}
14
15
/** If the string is UTF-8 encoded text, returns the length of the text in code points
  * (not in characters: the length of the text "ё" can be either 1 or 2, depending on the normalization).
  * Otherwise, the behavior is undefined.
  */
21struct LengthUTF8Impl
22{
23 static constexpr auto is_fixed_to_constant = false;
24
25 static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt64> & res)
26 {
27 size_t size = offsets.size();
28
29 ColumnString::Offset prev_offset = 0;
30 for (size_t i = 0; i < size; ++i)
31 {
32 res[i] = UTF8::countCodePoints(&data[prev_offset], offsets[i] - prev_offset - 1);
33 prev_offset = offsets[i];
34 }
35 }
36
37 static void vector_fixed_to_constant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt64 & /*res*/)
38 {
39 }
40
41 static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt64> & res)
42 {
43 size_t size = data.size() / n;
44
45 for (size_t i = 0; i < size; ++i)
46 {
47 res[i] = UTF8::countCodePoints(&data[i * n], n);
48 }
49 }
50
51 [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt64> &)
52 {
53 throw Exception("Cannot apply function lengthUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
54 }
55};
56
/// Name tag: carries the identifier string for the lengthUTF8 function.
struct NameLengthUTF8
{
    /// Spelled out explicitly; this is the same type `auto` deduces from a string literal.
    static constexpr const char * name = "lengthUTF8";
};
61using FunctionLengthUTF8 = FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64>;
62
63void registerFunctionLengthUTF8(FunctionFactory & factory)
64{
65 factory.registerFunction<FunctionLengthUTF8>();
66
67 /// Compatibility aliases.
68 factory.registerFunction<FunctionLengthUTF8>("CHAR_LENGTH", FunctionFactory::CaseInsensitive);
69 factory.registerFunction<FunctionLengthUTF8>("CHARACTER_LENGTH", FunctionFactory::CaseInsensitive);
70}
71
72}
73