1 | #include "duckdb/function/scalar/string_functions.hpp" |
2 | |
3 | #include "duckdb/common/exception.hpp" |
4 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
5 | #include "duckdb/common/vector_operations/unary_executor.hpp" |
6 | #include "duckdb/planner/expression/bound_function_expression.hpp" |
7 | |
8 | #include "utf8proc.hpp" |
9 | |
10 | #include <string.h> |
11 | |
12 | namespace duckdb { |
13 | |
14 | uint8_t UpperFun::ascii_to_upper_map[] = { |
15 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, |
16 | 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, |
17 | 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, |
18 | 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, |
19 | 88, 89, 90, 91, 92, 93, 94, 95, 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, |
20 | 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, 128, 129, 130, 131, |
21 | 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, |
22 | 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, |
23 | 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, |
24 | 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, |
25 | 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, |
26 | 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254}; |
27 | uint8_t LowerFun::ascii_to_lower_map[] = { |
28 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, |
29 | 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, |
30 | 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, |
31 | 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, |
32 | 120, 121, 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, |
33 | 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, |
34 | 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, |
35 | 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, |
36 | 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, |
37 | 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, |
38 | 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, |
39 | 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254}; |
40 | |
41 | template <bool IS_UPPER> |
42 | static string_t ASCIICaseConvert(Vector &result, const char *input_data, idx_t input_length) { |
43 | idx_t output_length = input_length; |
44 | auto result_str = StringVector::EmptyString(vector&: result, len: output_length); |
45 | auto result_data = result_str.GetDataWriteable(); |
46 | for (idx_t i = 0; i < input_length; i++) { |
47 | result_data[i] = IS_UPPER ? UpperFun::ascii_to_upper_map[uint8_t(input_data[i])] |
48 | : LowerFun::ascii_to_lower_map[uint8_t(input_data[i])]; |
49 | } |
50 | result_str.Finalize(); |
51 | return result_str; |
52 | } |
53 | |
54 | template <bool IS_UPPER> |
55 | static idx_t GetResultLength(const char *input_data, idx_t input_length) { |
56 | idx_t output_length = 0; |
57 | for (idx_t i = 0; i < input_length;) { |
58 | if (input_data[i] & 0x80) { |
59 | // unicode |
60 | int sz = 0; |
61 | int codepoint = utf8proc_codepoint(u_input: input_data + i, sz); |
62 | int converted_codepoint = IS_UPPER ? utf8proc_toupper(c: codepoint) : utf8proc_tolower(c: codepoint); |
63 | int new_sz = utf8proc_codepoint_length(cp: converted_codepoint); |
64 | D_ASSERT(new_sz >= 0); |
65 | output_length += new_sz; |
66 | i += sz; |
67 | } else { |
68 | // ascii |
69 | output_length++; |
70 | i++; |
71 | } |
72 | } |
73 | return output_length; |
74 | } |
75 | |
76 | template <bool IS_UPPER> |
77 | static void CaseConvert(const char *input_data, idx_t input_length, char *result_data) { |
78 | for (idx_t i = 0; i < input_length;) { |
79 | if (input_data[i] & 0x80) { |
80 | // non-ascii character |
81 | int sz = 0, new_sz = 0; |
82 | int codepoint = utf8proc_codepoint(u_input: input_data + i, sz); |
83 | int converted_codepoint = IS_UPPER ? utf8proc_toupper(c: codepoint) : utf8proc_tolower(c: codepoint); |
84 | auto success = utf8proc_codepoint_to_utf8(cp: converted_codepoint, sz&: new_sz, c: result_data); |
85 | D_ASSERT(success); |
86 | (void)success; |
87 | result_data += new_sz; |
88 | i += sz; |
89 | } else { |
90 | // ascii |
91 | *result_data = IS_UPPER ? UpperFun::ascii_to_upper_map[uint8_t(input_data[i])] |
92 | : LowerFun::ascii_to_lower_map[uint8_t(input_data[i])]; |
93 | result_data++; |
94 | i++; |
95 | } |
96 | } |
97 | } |
98 | |
99 | idx_t LowerFun::LowerLength(const char *input_data, idx_t input_length) { |
100 | return GetResultLength<false>(input_data, input_length); |
101 | } |
102 | |
103 | void LowerFun::LowerCase(const char *input_data, idx_t input_length, char *result_data) { |
104 | CaseConvert<false>(input_data, input_length, result_data); |
105 | } |
106 | |
107 | template <bool IS_UPPER> |
108 | static string_t UnicodeCaseConvert(Vector &result, const char *input_data, idx_t input_length) { |
109 | // first figure out the output length |
110 | idx_t output_length = GetResultLength<IS_UPPER>(input_data, input_length); |
111 | auto result_str = StringVector::EmptyString(vector&: result, len: output_length); |
112 | auto result_data = result_str.GetDataWriteable(); |
113 | |
114 | CaseConvert<IS_UPPER>(input_data, input_length, result_data); |
115 | result_str.Finalize(); |
116 | return result_str; |
117 | } |
118 | |
119 | template <bool IS_UPPER> |
120 | struct CaseConvertOperator { |
121 | template <class INPUT_TYPE, class RESULT_TYPE> |
122 | static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { |
123 | auto input_data = input.GetData(); |
124 | auto input_length = input.GetSize(); |
125 | return UnicodeCaseConvert<IS_UPPER>(result, input_data, input_length); |
126 | } |
127 | }; |
128 | |
129 | template <bool IS_UPPER> |
130 | static void CaseConvertFunction(DataChunk &args, ExpressionState &state, Vector &result) { |
131 | UnaryExecutor::ExecuteString<string_t, string_t, CaseConvertOperator<IS_UPPER>>(args.data[0], result, args.size()); |
132 | } |
133 | |
134 | template <bool IS_UPPER> |
135 | struct CaseConvertOperatorASCII { |
136 | template <class INPUT_TYPE, class RESULT_TYPE> |
137 | static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { |
138 | auto input_data = input.GetData(); |
139 | auto input_length = input.GetSize(); |
140 | return ASCIICaseConvert<IS_UPPER>(result, input_data, input_length); |
141 | } |
142 | }; |
143 | |
144 | template <bool IS_UPPER> |
145 | static void CaseConvertFunctionASCII(DataChunk &args, ExpressionState &state, Vector &result) { |
146 | UnaryExecutor::ExecuteString<string_t, string_t, CaseConvertOperatorASCII<IS_UPPER>>(args.data[0], result, |
147 | args.size()); |
148 | } |
149 | |
150 | template <bool IS_UPPER> |
151 | static unique_ptr<BaseStatistics> CaseConvertPropagateStats(ClientContext &context, FunctionStatisticsInput &input) { |
152 | auto &child_stats = input.child_stats; |
153 | auto &expr = input.expr; |
154 | D_ASSERT(child_stats.size() == 1); |
155 | // can only propagate stats if the children have stats |
156 | if (!StringStats::CanContainUnicode(stats: child_stats[0])) { |
157 | expr.function.function = CaseConvertFunctionASCII<IS_UPPER>; |
158 | } |
159 | return nullptr; |
160 | } |
161 | |
162 | ScalarFunction LowerFun::GetFunction() { |
163 | return ScalarFunction("lower" , {LogicalType::VARCHAR}, LogicalType::VARCHAR, CaseConvertFunction<false>, nullptr, |
164 | nullptr, CaseConvertPropagateStats<false>); |
165 | } |
166 | |
167 | void LowerFun::RegisterFunction(BuiltinFunctions &set) { |
168 | set.AddFunction(names: {"lower" , "lcase" }, function: LowerFun::GetFunction()); |
169 | } |
170 | |
171 | void UpperFun::RegisterFunction(BuiltinFunctions &set) { |
172 | set.AddFunction(names: {"upper" , "ucase" }, |
173 | function: ScalarFunction({LogicalType::VARCHAR}, LogicalType::VARCHAR, CaseConvertFunction<true>, nullptr, |
174 | nullptr, CaseConvertPropagateStats<true>)); |
175 | } |
176 | |
177 | } // namespace duckdb |
178 | |