1#include "duckdb/function/scalar/string_functions.hpp"
2
3#include "duckdb/common/algorithm.hpp"
4#include "duckdb/common/exception.hpp"
5#include "duckdb/common/vector_operations/vector_operations.hpp"
6#include "duckdb/common/vector_operations/ternary_executor.hpp"
7
8#include "duckdb/planner/expression/bound_function_expression.hpp"
9#include "utf8proc.hpp"
10#include "duckdb/common/types/blob.hpp"
11
12namespace duckdb {
13
14static const int64_t SUPPORTED_UPPER_BOUND = NumericLimits<uint32_t>::Maximum();
15static const int64_t SUPPORTED_LOWER_BOUND = -SUPPORTED_UPPER_BOUND - 1;
16
17static inline void AssertInSupportedRange(idx_t input_size, int64_t offset, int64_t length) {
18
19 if (input_size > (uint64_t)SUPPORTED_UPPER_BOUND) {
20 throw OutOfRangeException("Substring input size is too large (> %d)", SUPPORTED_UPPER_BOUND);
21 }
22 if (offset < SUPPORTED_LOWER_BOUND) {
23 throw OutOfRangeException("Substring offset outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
24 }
25 if (offset > SUPPORTED_UPPER_BOUND) {
26 throw OutOfRangeException("Substring offset outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
27 }
28 if (length < SUPPORTED_LOWER_BOUND) {
29 throw OutOfRangeException("Substring length outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
30 }
31 if (length > SUPPORTED_UPPER_BOUND) {
32 throw OutOfRangeException("Substring length outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
33 }
34}
35
36string_t SubstringEmptyString(Vector &result) {
37 auto result_string = StringVector::EmptyString(vector&: result, len: 0);
38 result_string.Finalize();
39 return result_string;
40}
41
42string_t SubstringSlice(Vector &result, const char *input_data, int64_t offset, int64_t length) {
43 auto result_string = StringVector::EmptyString(vector&: result, len: length);
44 auto result_data = result_string.GetDataWriteable();
45 memcpy(dest: result_data, src: input_data + offset, n: length);
46 result_string.Finalize();
47 return result_string;
48}
49
50// compute start and end characters from the given input size and offset/length
51bool SubstringStartEnd(int64_t input_size, int64_t offset, int64_t length, int64_t &start, int64_t &end) {
52 if (length == 0) {
53 return false;
54 }
55 if (offset > 0) {
56 // positive offset: scan from start
57 start = MinValue<int64_t>(a: input_size, b: offset - 1);
58 } else if (offset < 0) {
59 // negative offset: scan from end (i.e. start = end + offset)
60 start = MaxValue<int64_t>(a: input_size + offset, b: 0);
61 } else {
62 // offset = 0: special case, we start 1 character BEHIND the first character
63 start = 0;
64 length--;
65 if (length <= 0) {
66 return false;
67 }
68 }
69 if (length > 0) {
70 // positive length: go forward (i.e. end = start + offset)
71 end = MinValue<int64_t>(a: input_size, b: start + length);
72 } else {
73 // negative length: go backwards (i.e. end = start, start = start + length)
74 end = start;
75 start = MaxValue<int64_t>(a: 0, b: start + length);
76 }
77 if (start == end) {
78 return false;
79 }
80 D_ASSERT(start < end);
81 return true;
82}
83
84string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t length) {
85 auto input_data = input.GetData();
86 auto input_size = input.GetSize();
87
88 AssertInSupportedRange(input_size, offset, length);
89
90 int64_t start, end;
91 if (!SubstringStartEnd(input_size, offset, length, start, end)) {
92 return SubstringEmptyString(result);
93 }
94 return SubstringSlice(result, input_data, offset: start, length: end - start);
95}
96
97string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) {
98 auto input_data = input.GetData();
99 auto input_size = input.GetSize();
100
101 AssertInSupportedRange(input_size, offset, length);
102
103 if (length == 0) {
104 return SubstringEmptyString(result);
105 }
106 // first figure out which direction we need to scan
107 idx_t start_pos;
108 idx_t end_pos;
109 if (offset < 0) {
110 start_pos = 0;
111 end_pos = DConstants::INVALID_INDEX;
112
113 // negative offset: scan backwards
114 int64_t start, end;
115
116 // we express start and end as unicode codepoints from the back
117 offset--;
118 if (length < 0) {
119 // negative length
120 start = -offset - length;
121 end = -offset;
122 } else {
123 // positive length
124 start = -offset;
125 end = -offset - length;
126 }
127 if (end <= 0) {
128 end_pos = input_size;
129 }
130 int64_t current_character = 0;
131 for (idx_t i = input_size; i > 0; i--) {
132 if (LengthFun::IsCharacter(c: input_data[i - 1])) {
133 current_character++;
134 if (current_character == start) {
135 start_pos = i;
136 break;
137 } else if (current_character == end) {
138 end_pos = i;
139 }
140 }
141 }
142 while (!LengthFun::IsCharacter(c: input_data[start_pos])) {
143 start_pos++;
144 }
145 while (end_pos < input_size && !LengthFun::IsCharacter(c: input_data[end_pos])) {
146 end_pos++;
147 }
148
149 if (end_pos == DConstants::INVALID_INDEX) {
150 return SubstringEmptyString(result);
151 }
152 } else {
153 start_pos = DConstants::INVALID_INDEX;
154 end_pos = input_size;
155
156 // positive offset: scan forwards
157 int64_t start, end;
158
159 // we express start and end as unicode codepoints from the front
160 offset--;
161 if (length < 0) {
162 // negative length
163 start = MaxValue<int64_t>(a: 0, b: offset + length);
164 end = offset;
165 } else {
166 // positive length
167 start = MaxValue<int64_t>(a: 0, b: offset);
168 end = offset + length;
169 }
170
171 int64_t current_character = 0;
172 for (idx_t i = 0; i < input_size; i++) {
173 if (LengthFun::IsCharacter(c: input_data[i])) {
174 if (current_character == start) {
175 start_pos = i;
176 } else if (current_character == end) {
177 end_pos = i;
178 break;
179 }
180 current_character++;
181 }
182 }
183 if (start_pos == DConstants::INVALID_INDEX || end == 0 || end <= start) {
184 return SubstringEmptyString(result);
185 }
186 }
187 D_ASSERT(end_pos >= start_pos);
188 // after we have found these, we can slice the substring
189 return SubstringSlice(result, input_data, offset: start_pos, length: end_pos - start_pos);
190}
191
192string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) {
193 auto input_data = input.GetData();
194 auto input_size = input.GetSize();
195
196 AssertInSupportedRange(input_size, offset, length);
197
198 // we don't know yet if the substring is ascii, but we assume it is (for now)
199 // first get the start and end as if this was an ascii string
200 int64_t start, end;
201 if (!SubstringStartEnd(input_size, offset, length, start, end)) {
202 return SubstringEmptyString(result);
203 }
204
205 // now check if all the characters between 0 and end are ascii characters
206 // note that we scan one further to check for a potential combining diacritics (e.g. i + diacritic is ï)
207 bool is_ascii = true;
208 idx_t ascii_end = MinValue<idx_t>(a: end + 1, b: input_size);
209 for (idx_t i = 0; i < ascii_end; i++) {
210 if (input_data[i] & 0x80) {
211 // found a non-ascii character: eek
212 is_ascii = false;
213 break;
214 }
215 }
216 if (is_ascii) {
217 // all characters are ascii, we can just slice the substring
218 return SubstringSlice(result, input_data, offset: start, length: end - start);
219 }
220 // if the characters are not ascii, we need to scan grapheme clusters
221 // first figure out which direction we need to scan
222 // offset = 0 case is taken care of in SubstringStartEnd
223 if (offset < 0) {
224 // negative offset, this case is more difficult
225 // we first need to count the number of characters in the string
226 idx_t num_characters = 0;
227 utf8proc_grapheme_callback(s: input_data, len: input_size, fun: [&](size_t start, size_t end) {
228 num_characters++;
229 return true;
230 });
231 // now call substring start and end again, but with the number of unicode characters this time
232 SubstringStartEnd(input_size: num_characters, offset, length, start, end);
233 }
234
235 // now scan the graphemes of the string to find the positions of the start and end characters
236 int64_t current_character = 0;
237 idx_t start_pos = DConstants::INVALID_INDEX, end_pos = input_size;
238 utf8proc_grapheme_callback(s: input_data, len: input_size, fun: [&](size_t gstart, size_t gend) {
239 if (current_character == start) {
240 start_pos = gstart;
241 } else if (current_character == end) {
242 end_pos = gstart;
243 return false;
244 }
245 current_character++;
246 return true;
247 });
248 if (start_pos == DConstants::INVALID_INDEX) {
249 return SubstringEmptyString(result);
250 }
251 // after we have found these, we can slice the substring
252 return SubstringSlice(result, input_data, offset: start_pos, length: end_pos - start_pos);
253}
254
255struct SubstringUnicodeOp {
256 static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
257 return SubstringFun::SubstringUnicode(result, input, offset, length);
258 }
259};
260
261struct SubstringGraphemeOp {
262 static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
263 return SubstringFun::SubstringGrapheme(result, input, offset, length);
264 }
265};
266
267template <class OP>
268static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
269 auto &input_vector = args.data[0];
270 auto &offset_vector = args.data[1];
271 if (args.ColumnCount() == 3) {
272 auto &length_vector = args.data[2];
273
274 TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
275 input_vector, offset_vector, length_vector, result, args.size(),
276 [&](string_t input_string, int64_t offset, int64_t length) {
277 return OP::Substring(result, input_string, offset, length);
278 });
279 } else {
280 BinaryExecutor::Execute<string_t, int64_t, string_t>(
281 input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
282 return OP::Substring(result, input_string, offset, NumericLimits<uint32_t>::Maximum());
283 });
284 }
285}
286
287static void SubstringFunctionASCII(DataChunk &args, ExpressionState &state, Vector &result) {
288 auto &input_vector = args.data[0];
289 auto &offset_vector = args.data[1];
290 if (args.ColumnCount() == 3) {
291 auto &length_vector = args.data[2];
292
293 TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
294 a&: input_vector, b&: offset_vector, c&: length_vector, result, count: args.size(),
295 fun: [&](string_t input_string, int64_t offset, int64_t length) {
296 return SubstringASCII(result, input: input_string, offset, length);
297 });
298 } else {
299 BinaryExecutor::Execute<string_t, int64_t, string_t>(
300 left&: input_vector, right&: offset_vector, result, count: args.size(), fun: [&](string_t input_string, int64_t offset) {
301 return SubstringASCII(result, input: input_string, offset, length: NumericLimits<uint32_t>::Maximum());
302 });
303 }
304}
305
306static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context, FunctionStatisticsInput &input) {
307 auto &child_stats = input.child_stats;
308 auto &expr = input.expr;
309 // can only propagate stats if the children have stats
310 // we only care about the stats of the first child (i.e. the string)
311 if (!StringStats::CanContainUnicode(stats: child_stats[0])) {
312 expr.function.function = SubstringFunctionASCII;
313 }
314 return nullptr;
315}
316
317void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
318 ScalarFunctionSet substr("substring");
319 substr.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
320 LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
321 SubstringPropagateStats));
322 substr.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
323 SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
324 SubstringPropagateStats));
325 set.AddFunction(set: substr);
326 substr.name = "substr";
327 set.AddFunction(set: substr);
328
329 ScalarFunctionSet substr_grapheme("substring_grapheme");
330 substr_grapheme.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
331 LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr,
332 nullptr, SubstringPropagateStats));
333 substr_grapheme.AddFunction(function: ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
334 SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr,
335 SubstringPropagateStats));
336 set.AddFunction(set: substr_grapheme);
337}
338
339} // namespace duckdb
340