1 | #include "duckdb/function/scalar/string_functions.hpp" |
2 | |
3 | #include "duckdb/common/algorithm.hpp" |
4 | #include "duckdb/common/exception.hpp" |
5 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
6 | #include "duckdb/common/vector_operations/ternary_executor.hpp" |
7 | |
8 | #include "duckdb/planner/expression/bound_function_expression.hpp" |
9 | #include "utf8proc.hpp" |
10 | #include "duckdb/common/types/blob.hpp" |
11 | |
12 | namespace duckdb { |
13 | |
14 | static const int64_t SUPPORTED_UPPER_BOUND = NumericLimits<uint32_t>::Maximum(); |
15 | static const int64_t SUPPORTED_LOWER_BOUND = -SUPPORTED_UPPER_BOUND - 1; |
16 | |
17 | static inline void AssertInSupportedRange(idx_t input_size, int64_t offset, int64_t length) { |
18 | |
19 | if (input_size > (uint64_t)SUPPORTED_UPPER_BOUND) { |
20 | throw OutOfRangeException("Substring input size is too large (> %d)" , SUPPORTED_UPPER_BOUND); |
21 | } |
22 | if (offset < SUPPORTED_LOWER_BOUND) { |
23 | throw OutOfRangeException("Substring offset outside of supported range (< %d)" , SUPPORTED_LOWER_BOUND); |
24 | } |
25 | if (offset > SUPPORTED_UPPER_BOUND) { |
26 | throw OutOfRangeException("Substring offset outside of supported range (> %d)" , SUPPORTED_UPPER_BOUND); |
27 | } |
28 | if (length < SUPPORTED_LOWER_BOUND) { |
29 | throw OutOfRangeException("Substring length outside of supported range (< %d)" , SUPPORTED_LOWER_BOUND); |
30 | } |
31 | if (length > SUPPORTED_UPPER_BOUND) { |
32 | throw OutOfRangeException("Substring length outside of supported range (> %d)" , SUPPORTED_UPPER_BOUND); |
33 | } |
34 | } |
35 | |
36 | string_t SubstringEmptyString(Vector &result) { |
37 | auto result_string = StringVector::EmptyString(vector&: result, len: 0); |
38 | result_string.Finalize(); |
39 | return result_string; |
40 | } |
41 | |
42 | string_t SubstringSlice(Vector &result, const char *input_data, int64_t offset, int64_t length) { |
43 | auto result_string = StringVector::EmptyString(vector&: result, len: length); |
44 | auto result_data = result_string.GetDataWriteable(); |
45 | memcpy(dest: result_data, src: input_data + offset, n: length); |
46 | result_string.Finalize(); |
47 | return result_string; |
48 | } |
49 | |
50 | // compute start and end characters from the given input size and offset/length |
51 | bool SubstringStartEnd(int64_t input_size, int64_t offset, int64_t length, int64_t &start, int64_t &end) { |
52 | if (length == 0) { |
53 | return false; |
54 | } |
55 | if (offset > 0) { |
56 | // positive offset: scan from start |
57 | start = MinValue<int64_t>(a: input_size, b: offset - 1); |
58 | } else if (offset < 0) { |
59 | // negative offset: scan from end (i.e. start = end + offset) |
60 | start = MaxValue<int64_t>(a: input_size + offset, b: 0); |
61 | } else { |
62 | // offset = 0: special case, we start 1 character BEHIND the first character |
63 | start = 0; |
64 | length--; |
65 | if (length <= 0) { |
66 | return false; |
67 | } |
68 | } |
69 | if (length > 0) { |
70 | // positive length: go forward (i.e. end = start + offset) |
71 | end = MinValue<int64_t>(a: input_size, b: start + length); |
72 | } else { |
73 | // negative length: go backwards (i.e. end = start, start = start + length) |
74 | end = start; |
75 | start = MaxValue<int64_t>(a: 0, b: start + length); |
76 | } |
77 | if (start == end) { |
78 | return false; |
79 | } |
80 | D_ASSERT(start < end); |
81 | return true; |
82 | } |
83 | |
84 | string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t length) { |
85 | auto input_data = input.GetData(); |
86 | auto input_size = input.GetSize(); |
87 | |
88 | AssertInSupportedRange(input_size, offset, length); |
89 | |
90 | int64_t start, end; |
91 | if (!SubstringStartEnd(input_size, offset, length, start, end)) { |
92 | return SubstringEmptyString(result); |
93 | } |
94 | return SubstringSlice(result, input_data, offset: start, length: end - start); |
95 | } |
96 | |
97 | string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) { |
98 | auto input_data = input.GetData(); |
99 | auto input_size = input.GetSize(); |
100 | |
101 | AssertInSupportedRange(input_size, offset, length); |
102 | |
103 | if (length == 0) { |
104 | return SubstringEmptyString(result); |
105 | } |
106 | // first figure out which direction we need to scan |
107 | idx_t start_pos; |
108 | idx_t end_pos; |
109 | if (offset < 0) { |
110 | start_pos = 0; |
111 | end_pos = DConstants::INVALID_INDEX; |
112 | |
113 | // negative offset: scan backwards |
114 | int64_t start, end; |
115 | |
116 | // we express start and end as unicode codepoints from the back |
117 | offset--; |
118 | if (length < 0) { |
119 | // negative length |
120 | start = -offset - length; |
121 | end = -offset; |
122 | } else { |
123 | // positive length |
124 | start = -offset; |
125 | end = -offset - length; |
126 | } |
127 | if (end <= 0) { |
128 | end_pos = input_size; |
129 | } |
130 | int64_t current_character = 0; |
131 | for (idx_t i = input_size; i > 0; i--) { |
132 | if (LengthFun::IsCharacter(c: input_data[i - 1])) { |
133 | current_character++; |
134 | if (current_character == start) { |
135 | start_pos = i; |
136 | break; |
137 | } else if (current_character == end) { |
138 | end_pos = i; |
139 | } |
140 | } |
141 | } |
142 | while (!LengthFun::IsCharacter(c: input_data[start_pos])) { |
143 | start_pos++; |
144 | } |
145 | while (end_pos < input_size && !LengthFun::IsCharacter(c: input_data[end_pos])) { |
146 | end_pos++; |
147 | } |
148 | |
149 | if (end_pos == DConstants::INVALID_INDEX) { |
150 | return SubstringEmptyString(result); |
151 | } |
152 | } else { |
153 | start_pos = DConstants::INVALID_INDEX; |
154 | end_pos = input_size; |
155 | |
156 | // positive offset: scan forwards |
157 | int64_t start, end; |
158 | |
159 | // we express start and end as unicode codepoints from the front |
160 | offset--; |
161 | if (length < 0) { |
162 | // negative length |
163 | start = MaxValue<int64_t>(a: 0, b: offset + length); |
164 | end = offset; |
165 | } else { |
166 | // positive length |
167 | start = MaxValue<int64_t>(a: 0, b: offset); |
168 | end = offset + length; |
169 | } |
170 | |
171 | int64_t current_character = 0; |
172 | for (idx_t i = 0; i < input_size; i++) { |
173 | if (LengthFun::IsCharacter(c: input_data[i])) { |
174 | if (current_character == start) { |
175 | start_pos = i; |
176 | } else if (current_character == end) { |
177 | end_pos = i; |
178 | break; |
179 | } |
180 | current_character++; |
181 | } |
182 | } |
183 | if (start_pos == DConstants::INVALID_INDEX || end == 0 || end <= start) { |
184 | return SubstringEmptyString(result); |
185 | } |
186 | } |
187 | D_ASSERT(end_pos >= start_pos); |
188 | // after we have found these, we can slice the substring |
189 | return SubstringSlice(result, input_data, offset: start_pos, length: end_pos - start_pos); |
190 | } |
191 | |
192 | string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) { |
193 | auto input_data = input.GetData(); |
194 | auto input_size = input.GetSize(); |
195 | |
196 | AssertInSupportedRange(input_size, offset, length); |
197 | |
198 | // we don't know yet if the substring is ascii, but we assume it is (for now) |
199 | // first get the start and end as if this was an ascii string |
200 | int64_t start, end; |
201 | if (!SubstringStartEnd(input_size, offset, length, start, end)) { |
202 | return SubstringEmptyString(result); |
203 | } |
204 | |
205 | // now check if all the characters between 0 and end are ascii characters |
206 | // note that we scan one further to check for a potential combining diacritics (e.g. i + diacritic is ï) |
207 | bool is_ascii = true; |
208 | idx_t ascii_end = MinValue<idx_t>(a: end + 1, b: input_size); |
209 | for (idx_t i = 0; i < ascii_end; i++) { |
210 | if (input_data[i] & 0x80) { |
211 | // found a non-ascii character: eek |
212 | is_ascii = false; |
213 | break; |
214 | } |
215 | } |
216 | if (is_ascii) { |
217 | // all characters are ascii, we can just slice the substring |
218 | return SubstringSlice(result, input_data, offset: start, length: end - start); |
219 | } |
220 | // if the characters are not ascii, we need to scan grapheme clusters |
221 | // first figure out which direction we need to scan |
222 | // offset = 0 case is taken care of in SubstringStartEnd |
223 | if (offset < 0) { |
224 | // negative offset, this case is more difficult |
225 | // we first need to count the number of characters in the string |
226 | idx_t num_characters = 0; |
227 | utf8proc_grapheme_callback(s: input_data, len: input_size, fun: [&](size_t start, size_t end) { |
228 | num_characters++; |
229 | return true; |
230 | }); |
231 | // now call substring start and end again, but with the number of unicode characters this time |
232 | SubstringStartEnd(input_size: num_characters, offset, length, start, end); |
233 | } |
234 | |
235 | // now scan the graphemes of the string to find the positions of the start and end characters |
236 | int64_t current_character = 0; |
237 | idx_t start_pos = DConstants::INVALID_INDEX, end_pos = input_size; |
238 | utf8proc_grapheme_callback(s: input_data, len: input_size, fun: [&](size_t gstart, size_t gend) { |
239 | if (current_character == start) { |
240 | start_pos = gstart; |
241 | } else if (current_character == end) { |
242 | end_pos = gstart; |
243 | return false; |
244 | } |
245 | current_character++; |
246 | return true; |
247 | }); |
248 | if (start_pos == DConstants::INVALID_INDEX) { |
249 | return SubstringEmptyString(result); |
250 | } |
251 | // after we have found these, we can slice the substring |
252 | return SubstringSlice(result, input_data, offset: start_pos, length: end_pos - start_pos); |
253 | } |
254 | |
255 | struct SubstringUnicodeOp { |
256 | static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) { |
257 | return SubstringFun::SubstringUnicode(result, input, offset, length); |
258 | } |
259 | }; |
260 | |
261 | struct SubstringGraphemeOp { |
262 | static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) { |
263 | return SubstringFun::SubstringGrapheme(result, input, offset, length); |
264 | } |
265 | }; |
266 | |
267 | template <class OP> |
268 | static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) { |
269 | auto &input_vector = args.data[0]; |
270 | auto &offset_vector = args.data[1]; |
271 | if (args.ColumnCount() == 3) { |
272 | auto &length_vector = args.data[2]; |
273 | |
274 | TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>( |
275 | input_vector, offset_vector, length_vector, result, args.size(), |
276 | [&](string_t input_string, int64_t offset, int64_t length) { |
277 | return OP::Substring(result, input_string, offset, length); |
278 | }); |
279 | } else { |
280 | BinaryExecutor::Execute<string_t, int64_t, string_t>( |
281 | input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) { |
282 | return OP::Substring(result, input_string, offset, NumericLimits<uint32_t>::Maximum()); |
283 | }); |
284 | } |
285 | } |
286 | |
287 | static void SubstringFunctionASCII(DataChunk &args, ExpressionState &state, Vector &result) { |
288 | auto &input_vector = args.data[0]; |
289 | auto &offset_vector = args.data[1]; |
290 | if (args.ColumnCount() == 3) { |
291 | auto &length_vector = args.data[2]; |
292 | |
293 | TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>( |
294 | a&: input_vector, b&: offset_vector, c&: length_vector, result, count: args.size(), |
295 | fun: [&](string_t input_string, int64_t offset, int64_t length) { |
296 | return SubstringASCII(result, input: input_string, offset, length); |
297 | }); |
298 | } else { |
299 | BinaryExecutor::Execute<string_t, int64_t, string_t>( |
300 | left&: input_vector, right&: offset_vector, result, count: args.size(), fun: [&](string_t input_string, int64_t offset) { |
301 | return SubstringASCII(result, input: input_string, offset, length: NumericLimits<uint32_t>::Maximum()); |
302 | }); |
303 | } |
304 | } |
305 | |
306 | static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context, FunctionStatisticsInput &input) { |
307 | auto &child_stats = input.child_stats; |
308 | auto &expr = input.expr; |
309 | // can only propagate stats if the children have stats |
310 | // we only care about the stats of the first child (i.e. the string) |
311 | if (!StringStats::CanContainUnicode(stats: child_stats[0])) { |
312 | expr.function.function = SubstringFunctionASCII; |
313 | } |
314 | return nullptr; |
315 | } |
316 | |
317 | void SubstringFun::RegisterFunction(BuiltinFunctions &set) { |
318 | ScalarFunctionSet substr("substring" ); |
319 | substr.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT}, |
320 | LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr, |
321 | SubstringPropagateStats)); |
322 | substr.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR, |
323 | SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr, |
324 | SubstringPropagateStats)); |
325 | set.AddFunction(set: substr); |
326 | substr.name = "substr" ; |
327 | set.AddFunction(set: substr); |
328 | |
329 | ScalarFunctionSet substr_grapheme("substring_grapheme" ); |
330 | substr_grapheme.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT}, |
331 | LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr, |
332 | nullptr, SubstringPropagateStats)); |
333 | substr_grapheme.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR, |
334 | SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr, |
335 | SubstringPropagateStats)); |
336 | set.AddFunction(set: substr_grapheme); |
337 | } |
338 | |
339 | } // namespace duckdb |
340 | |