| 1 | #include "duckdb/function/scalar/string_functions.hpp" |
| 2 | |
| 3 | #include "duckdb/common/exception.hpp" |
| 4 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
| 5 | #include "duckdb/common/vector_operations/unary_executor.hpp" |
| 6 | #include "duckdb/planner/expression/bound_function_expression.hpp" |
| 7 | |
| 8 | #include "utf8proc.hpp" |
| 9 | |
| 10 | #include <string.h> |
| 11 | |
| 12 | namespace duckdb { |
| 13 | |
| 14 | uint8_t UpperFun::ascii_to_upper_map[] = { |
| 15 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, |
| 16 | 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, |
| 17 | 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, |
| 18 | 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, |
| 19 | 88, 89, 90, 91, 92, 93, 94, 95, 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, |
| 20 | 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, 128, 129, 130, 131, |
| 21 | 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, |
| 22 | 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, |
| 23 | 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, |
| 24 | 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, |
| 25 | 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, |
| 26 | 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254}; |
| 27 | uint8_t LowerFun::ascii_to_lower_map[] = { |
| 28 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, |
| 29 | 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, |
| 30 | 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, |
| 31 | 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, |
| 32 | 120, 121, 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, |
| 33 | 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, |
| 34 | 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, |
| 35 | 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, |
| 36 | 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, |
| 37 | 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, |
| 38 | 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, |
| 39 | 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254}; |
| 40 | |
| 41 | template <bool IS_UPPER> |
| 42 | static string_t ASCIICaseConvert(Vector &result, const char *input_data, idx_t input_length) { |
| 43 | idx_t output_length = input_length; |
| 44 | auto result_str = StringVector::EmptyString(vector&: result, len: output_length); |
| 45 | auto result_data = result_str.GetDataWriteable(); |
| 46 | for (idx_t i = 0; i < input_length; i++) { |
| 47 | result_data[i] = IS_UPPER ? UpperFun::ascii_to_upper_map[uint8_t(input_data[i])] |
| 48 | : LowerFun::ascii_to_lower_map[uint8_t(input_data[i])]; |
| 49 | } |
| 50 | result_str.Finalize(); |
| 51 | return result_str; |
| 52 | } |
| 53 | |
| 54 | template <bool IS_UPPER> |
| 55 | static idx_t GetResultLength(const char *input_data, idx_t input_length) { |
| 56 | idx_t output_length = 0; |
| 57 | for (idx_t i = 0; i < input_length;) { |
| 58 | if (input_data[i] & 0x80) { |
| 59 | // unicode |
| 60 | int sz = 0; |
| 61 | int codepoint = utf8proc_codepoint(u_input: input_data + i, sz); |
| 62 | int converted_codepoint = IS_UPPER ? utf8proc_toupper(c: codepoint) : utf8proc_tolower(c: codepoint); |
| 63 | int new_sz = utf8proc_codepoint_length(cp: converted_codepoint); |
| 64 | D_ASSERT(new_sz >= 0); |
| 65 | output_length += new_sz; |
| 66 | i += sz; |
| 67 | } else { |
| 68 | // ascii |
| 69 | output_length++; |
| 70 | i++; |
| 71 | } |
| 72 | } |
| 73 | return output_length; |
| 74 | } |
| 75 | |
| 76 | template <bool IS_UPPER> |
| 77 | static void CaseConvert(const char *input_data, idx_t input_length, char *result_data) { |
| 78 | for (idx_t i = 0; i < input_length;) { |
| 79 | if (input_data[i] & 0x80) { |
| 80 | // non-ascii character |
| 81 | int sz = 0, new_sz = 0; |
| 82 | int codepoint = utf8proc_codepoint(u_input: input_data + i, sz); |
| 83 | int converted_codepoint = IS_UPPER ? utf8proc_toupper(c: codepoint) : utf8proc_tolower(c: codepoint); |
| 84 | auto success = utf8proc_codepoint_to_utf8(cp: converted_codepoint, sz&: new_sz, c: result_data); |
| 85 | D_ASSERT(success); |
| 86 | (void)success; |
| 87 | result_data += new_sz; |
| 88 | i += sz; |
| 89 | } else { |
| 90 | // ascii |
| 91 | *result_data = IS_UPPER ? UpperFun::ascii_to_upper_map[uint8_t(input_data[i])] |
| 92 | : LowerFun::ascii_to_lower_map[uint8_t(input_data[i])]; |
| 93 | result_data++; |
| 94 | i++; |
| 95 | } |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | idx_t LowerFun::LowerLength(const char *input_data, idx_t input_length) { |
| 100 | return GetResultLength<false>(input_data, input_length); |
| 101 | } |
| 102 | |
| 103 | void LowerFun::LowerCase(const char *input_data, idx_t input_length, char *result_data) { |
| 104 | CaseConvert<false>(input_data, input_length, result_data); |
| 105 | } |
| 106 | |
| 107 | template <bool IS_UPPER> |
| 108 | static string_t UnicodeCaseConvert(Vector &result, const char *input_data, idx_t input_length) { |
| 109 | // first figure out the output length |
| 110 | idx_t output_length = GetResultLength<IS_UPPER>(input_data, input_length); |
| 111 | auto result_str = StringVector::EmptyString(vector&: result, len: output_length); |
| 112 | auto result_data = result_str.GetDataWriteable(); |
| 113 | |
| 114 | CaseConvert<IS_UPPER>(input_data, input_length, result_data); |
| 115 | result_str.Finalize(); |
| 116 | return result_str; |
| 117 | } |
| 118 | |
| 119 | template <bool IS_UPPER> |
| 120 | struct CaseConvertOperator { |
| 121 | template <class INPUT_TYPE, class RESULT_TYPE> |
| 122 | static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { |
| 123 | auto input_data = input.GetData(); |
| 124 | auto input_length = input.GetSize(); |
| 125 | return UnicodeCaseConvert<IS_UPPER>(result, input_data, input_length); |
| 126 | } |
| 127 | }; |
| 128 | |
| 129 | template <bool IS_UPPER> |
| 130 | static void CaseConvertFunction(DataChunk &args, ExpressionState &state, Vector &result) { |
| 131 | UnaryExecutor::ExecuteString<string_t, string_t, CaseConvertOperator<IS_UPPER>>(args.data[0], result, args.size()); |
| 132 | } |
| 133 | |
| 134 | template <bool IS_UPPER> |
| 135 | struct CaseConvertOperatorASCII { |
| 136 | template <class INPUT_TYPE, class RESULT_TYPE> |
| 137 | static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { |
| 138 | auto input_data = input.GetData(); |
| 139 | auto input_length = input.GetSize(); |
| 140 | return ASCIICaseConvert<IS_UPPER>(result, input_data, input_length); |
| 141 | } |
| 142 | }; |
| 143 | |
| 144 | template <bool IS_UPPER> |
| 145 | static void CaseConvertFunctionASCII(DataChunk &args, ExpressionState &state, Vector &result) { |
| 146 | UnaryExecutor::ExecuteString<string_t, string_t, CaseConvertOperatorASCII<IS_UPPER>>(args.data[0], result, |
| 147 | args.size()); |
| 148 | } |
| 149 | |
| 150 | template <bool IS_UPPER> |
| 151 | static unique_ptr<BaseStatistics> CaseConvertPropagateStats(ClientContext &context, FunctionStatisticsInput &input) { |
| 152 | auto &child_stats = input.child_stats; |
| 153 | auto &expr = input.expr; |
| 154 | D_ASSERT(child_stats.size() == 1); |
| 155 | // can only propagate stats if the children have stats |
| 156 | if (!StringStats::CanContainUnicode(stats: child_stats[0])) { |
| 157 | expr.function.function = CaseConvertFunctionASCII<IS_UPPER>; |
| 158 | } |
| 159 | return nullptr; |
| 160 | } |
| 161 | |
| 162 | ScalarFunction LowerFun::GetFunction() { |
| 163 | return ScalarFunction("lower" , {LogicalType::VARCHAR}, LogicalType::VARCHAR, CaseConvertFunction<false>, nullptr, |
| 164 | nullptr, CaseConvertPropagateStats<false>); |
| 165 | } |
| 166 | |
| 167 | void LowerFun::RegisterFunction(BuiltinFunctions &set) { |
| 168 | set.AddFunction(names: {"lower" , "lcase" }, function: LowerFun::GetFunction()); |
| 169 | } |
| 170 | |
| 171 | void UpperFun::RegisterFunction(BuiltinFunctions &set) { |
| 172 | set.AddFunction(names: {"upper" , "ucase" }, |
| 173 | function: ScalarFunction({LogicalType::VARCHAR}, LogicalType::VARCHAR, CaseConvertFunction<true>, nullptr, |
| 174 | nullptr, CaseConvertPropagateStats<true>)); |
| 175 | } |
| 176 | |
| 177 | } // namespace duckdb |
| 178 | |