1#include "duckdb/function/scalar/string_functions.hpp"
2
3#include "duckdb/common/exception.hpp"
4#include "duckdb/common/vector_operations/vector_operations.hpp"
5#include "duckdb/common/vector_operations/unary_executor.hpp"
6#include "duckdb/planner/expression/bound_function_expression.hpp"
7
8#include "utf8proc.hpp"
9
10#include <string.h>
11
12namespace duckdb {
13
14uint8_t UpperFun::ascii_to_upper_map[] = {
15 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
16 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
17 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
18 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
19 88, 89, 90, 91, 92, 93, 94, 95, 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
20 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, 128, 129, 130, 131,
21 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
22 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
23 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
24 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
25 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
26 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254};
27uint8_t LowerFun::ascii_to_lower_map[] = {
28 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
29 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
30 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97,
31 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
32 120, 121, 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
33 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
34 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
35 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
36 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
37 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
38 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
39 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254};
40
41template <bool IS_UPPER>
42static string_t ASCIICaseConvert(Vector &result, const char *input_data, idx_t input_length) {
43 idx_t output_length = input_length;
44 auto result_str = StringVector::EmptyString(vector&: result, len: output_length);
45 auto result_data = result_str.GetDataWriteable();
46 for (idx_t i = 0; i < input_length; i++) {
47 result_data[i] = IS_UPPER ? UpperFun::ascii_to_upper_map[uint8_t(input_data[i])]
48 : LowerFun::ascii_to_lower_map[uint8_t(input_data[i])];
49 }
50 result_str.Finalize();
51 return result_str;
52}
53
54template <bool IS_UPPER>
55static idx_t GetResultLength(const char *input_data, idx_t input_length) {
56 idx_t output_length = 0;
57 for (idx_t i = 0; i < input_length;) {
58 if (input_data[i] & 0x80) {
59 // unicode
60 int sz = 0;
61 int codepoint = utf8proc_codepoint(u_input: input_data + i, sz);
62 int converted_codepoint = IS_UPPER ? utf8proc_toupper(c: codepoint) : utf8proc_tolower(c: codepoint);
63 int new_sz = utf8proc_codepoint_length(cp: converted_codepoint);
64 D_ASSERT(new_sz >= 0);
65 output_length += new_sz;
66 i += sz;
67 } else {
68 // ascii
69 output_length++;
70 i++;
71 }
72 }
73 return output_length;
74}
75
76template <bool IS_UPPER>
77static void CaseConvert(const char *input_data, idx_t input_length, char *result_data) {
78 for (idx_t i = 0; i < input_length;) {
79 if (input_data[i] & 0x80) {
80 // non-ascii character
81 int sz = 0, new_sz = 0;
82 int codepoint = utf8proc_codepoint(u_input: input_data + i, sz);
83 int converted_codepoint = IS_UPPER ? utf8proc_toupper(c: codepoint) : utf8proc_tolower(c: codepoint);
84 auto success = utf8proc_codepoint_to_utf8(cp: converted_codepoint, sz&: new_sz, c: result_data);
85 D_ASSERT(success);
86 (void)success;
87 result_data += new_sz;
88 i += sz;
89 } else {
90 // ascii
91 *result_data = IS_UPPER ? UpperFun::ascii_to_upper_map[uint8_t(input_data[i])]
92 : LowerFun::ascii_to_lower_map[uint8_t(input_data[i])];
93 result_data++;
94 i++;
95 }
96 }
97}
98
99idx_t LowerFun::LowerLength(const char *input_data, idx_t input_length) {
100 return GetResultLength<false>(input_data, input_length);
101}
102
103void LowerFun::LowerCase(const char *input_data, idx_t input_length, char *result_data) {
104 CaseConvert<false>(input_data, input_length, result_data);
105}
106
107template <bool IS_UPPER>
108static string_t UnicodeCaseConvert(Vector &result, const char *input_data, idx_t input_length) {
109 // first figure out the output length
110 idx_t output_length = GetResultLength<IS_UPPER>(input_data, input_length);
111 auto result_str = StringVector::EmptyString(vector&: result, len: output_length);
112 auto result_data = result_str.GetDataWriteable();
113
114 CaseConvert<IS_UPPER>(input_data, input_length, result_data);
115 result_str.Finalize();
116 return result_str;
117}
118
119template <bool IS_UPPER>
120struct CaseConvertOperator {
121 template <class INPUT_TYPE, class RESULT_TYPE>
122 static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
123 auto input_data = input.GetData();
124 auto input_length = input.GetSize();
125 return UnicodeCaseConvert<IS_UPPER>(result, input_data, input_length);
126 }
127};
128
129template <bool IS_UPPER>
130static void CaseConvertFunction(DataChunk &args, ExpressionState &state, Vector &result) {
131 UnaryExecutor::ExecuteString<string_t, string_t, CaseConvertOperator<IS_UPPER>>(args.data[0], result, args.size());
132}
133
134template <bool IS_UPPER>
135struct CaseConvertOperatorASCII {
136 template <class INPUT_TYPE, class RESULT_TYPE>
137 static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
138 auto input_data = input.GetData();
139 auto input_length = input.GetSize();
140 return ASCIICaseConvert<IS_UPPER>(result, input_data, input_length);
141 }
142};
143
144template <bool IS_UPPER>
145static void CaseConvertFunctionASCII(DataChunk &args, ExpressionState &state, Vector &result) {
146 UnaryExecutor::ExecuteString<string_t, string_t, CaseConvertOperatorASCII<IS_UPPER>>(args.data[0], result,
147 args.size());
148}
149
150template <bool IS_UPPER>
151static unique_ptr<BaseStatistics> CaseConvertPropagateStats(ClientContext &context, FunctionStatisticsInput &input) {
152 auto &child_stats = input.child_stats;
153 auto &expr = input.expr;
154 D_ASSERT(child_stats.size() == 1);
155 // can only propagate stats if the children have stats
156 if (!StringStats::CanContainUnicode(stats: child_stats[0])) {
157 expr.function.function = CaseConvertFunctionASCII<IS_UPPER>;
158 }
159 return nullptr;
160}
161
162ScalarFunction LowerFun::GetFunction() {
163 return ScalarFunction("lower", {LogicalType::VARCHAR}, LogicalType::VARCHAR, CaseConvertFunction<false>, nullptr,
164 nullptr, CaseConvertPropagateStats<false>);
165}
166
167void LowerFun::RegisterFunction(BuiltinFunctions &set) {
168 set.AddFunction(names: {"lower", "lcase"}, function: LowerFun::GetFunction());
169}
170
171void UpperFun::RegisterFunction(BuiltinFunctions &set) {
172 set.AddFunction(names: {"upper", "ucase"},
173 function: ScalarFunction({LogicalType::VARCHAR}, LogicalType::VARCHAR, CaseConvertFunction<true>, nullptr,
174 nullptr, CaseConvertPropagateStats<true>));
175}
176
177} // namespace duckdb
178