| 1 | #include <DataTypes/DataTypeString.h> |
| 2 | #include <Functions/FunctionFactory.h> |
| 3 | #include <Functions/FunctionStringOrArrayToT.h> |
| 4 | #include <Common/UTF8Helpers.h> |
| 5 | |
| 6 | |
| 7 | namespace DB |
| 8 | { |
| 9 | |
| 10 | namespace ErrorCodes |
| 11 | { |
| 12 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
| 13 | } |
| 14 | |
| 15 | |
| 16 | /** If the string is UTF-8 encoded text, it returns the length of the text in code points. |
  * (Not in characters: the text "ё" has length 1 when stored as the single code point U+0451,
  *  and length 2 when stored as U+0435 followed by the combining mark U+0308, depending on the normalization.)
| 19 | * Otherwise, the behavior is undefined. |
| 20 | */ |
| 21 | struct LengthUTF8Impl |
| 22 | { |
| 23 | static constexpr auto is_fixed_to_constant = false; |
| 24 | |
| 25 | static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt64> & res) |
| 26 | { |
| 27 | size_t size = offsets.size(); |
| 28 | |
| 29 | ColumnString::Offset prev_offset = 0; |
| 30 | for (size_t i = 0; i < size; ++i) |
| 31 | { |
| 32 | res[i] = UTF8::countCodePoints(&data[prev_offset], offsets[i] - prev_offset - 1); |
| 33 | prev_offset = offsets[i]; |
| 34 | } |
| 35 | } |
| 36 | |
| 37 | static void vector_fixed_to_constant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt64 & /*res*/) |
| 38 | { |
| 39 | } |
| 40 | |
| 41 | static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt64> & res) |
| 42 | { |
| 43 | size_t size = data.size() / n; |
| 44 | |
| 45 | for (size_t i = 0; i < size; ++i) |
| 46 | { |
| 47 | res[i] = UTF8::countCodePoints(&data[i * n], n); |
| 48 | } |
| 49 | } |
| 50 | |
| 51 | [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt64> &) |
| 52 | { |
| 53 | throw Exception("Cannot apply function lengthUTF8 to Array argument" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 54 | } |
| 55 | }; |
| 56 | |
/// Holds the function name as a compile-time string constant.
struct NameLengthUTF8
{
    static constexpr const char * name = "lengthUTF8";
};
| 61 | using FunctionLengthUTF8 = FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64>; |
| 62 | |
| 63 | void registerFunctionLengthUTF8(FunctionFactory & factory) |
| 64 | { |
| 65 | factory.registerFunction<FunctionLengthUTF8>(); |
| 66 | |
| 67 | /// Compatibility aliases. |
| 68 | factory.registerFunction<FunctionLengthUTF8>("CHAR_LENGTH" , FunctionFactory::CaseInsensitive); |
| 69 | factory.registerFunction<FunctionLengthUTF8>("CHARACTER_LENGTH" , FunctionFactory::CaseInsensitive); |
| 70 | } |
| 71 | |
| 72 | } |
| 73 | |