substring.cpp source code [Velox/build/_deps/duckdb-src/src/function/scalar/string/substring.cpp]

1	#include "duckdb/function/scalar/string_functions.hpp"
2
3	#include "duckdb/common/algorithm.hpp"
4	#include "duckdb/common/exception.hpp"
5	#include "duckdb/common/vector_operations/vector_operations.hpp"
6	#include "duckdb/common/vector_operations/ternary_executor.hpp"
7
8	#include "duckdb/planner/expression/bound_function_expression.hpp"
9	#include "utf8proc.hpp"
10	#include "duckdb/common/types/blob.hpp"
11
12	namespace duckdb {
13
14	static const int64_t SUPPORTED_UPPER_BOUND = NumericLimits<uint32_t>::Maximum();
15	static const int64_t SUPPORTED_LOWER_BOUND = -SUPPORTED_UPPER_BOUND - `1`;
16
17	static inline void AssertInSupportedRange(idx_t input_size, int64_t offset, int64_t length) {
18
19	if (input_size > (uint64_t)SUPPORTED_UPPER_BOUND) {
20	throw OutOfRangeException ("Substring input size is too large (> %d)", SUPPORTED_UPPER_BOUND);
21	}
22	if (offset < SUPPORTED_LOWER_BOUND) {
23	throw OutOfRangeException ("Substring offset outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
24	}
25	if (offset > SUPPORTED_UPPER_BOUND) {
26	throw OutOfRangeException ("Substring offset outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
27	}
28	if (length < SUPPORTED_LOWER_BOUND) {
29	throw OutOfRangeException ("Substring length outside of supported range (< %d)", SUPPORTED_LOWER_BOUND);
30	}
31	if (length > SUPPORTED_UPPER_BOUND) {
32	throw OutOfRangeException ("Substring length outside of supported range (> %d)", SUPPORTED_UPPER_BOUND);
33	}
34	}
35
36	string_t SubstringEmptyString(Vector &result) {
37	auto result_string = StringVector::EmptyString(vector&: result, len: `0`);
38	result_string.Finalize();
39	return result_string;
40	}
41
42	string_t SubstringSlice(Vector &result, const char *input_data, int64_t offset, int64_t length) {
43	auto result_string = StringVector::EmptyString(vector&: result, len: length);
44	auto result_data = result_string.GetDataWriteable();
45	memcpy(dest: result_data, src: input_data + offset, n: length);
46	result_string.Finalize();
47	return result_string;
48	}
49
50	// compute start and end characters from the given input size and offset/length
51	bool SubstringStartEnd(int64_t input_size, int64_t offset, int64_t length, int64_t &start, int64_t &end) {
52	if (length == `0`) {
53	return false;
54	}
55	if (offset > `0`) {
56	// positive offset: scan from start
57	start = MinValue<int64_t>(a: input_size, b: offset - `1`);
58	} else if (offset < `0`) {
59	// negative offset: scan from end (i.e. start = end + offset)
60	start = MaxValue<int64_t>(a: input_size + offset, b: `0`);
61	} else {
62	// offset = 0: special case, we start 1 character BEHIND the first character
63	start = `0`;
64	length--;
65	if (length <= `0`) {
66	return false;
67	}
68	}
69	if (length > `0`) {
70	// positive length: go forward (i.e. end = start + offset)
71	end = MinValue<int64_t>(a: input_size, b: start + length);
72	} else {
73	// negative length: go backwards (i.e. end = start, start = start + length)
74	end = start;
75	start = MaxValue<int64_t>(a: `0`, b: start + length);
76	}
77	if (start == end) {
78	return false;
79	}
80	D_ASSERT(start < end);
81	return true;
82	}
83
84	string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t length) {
85	auto input_data = input.GetData();
86	auto input_size = input.GetSize();
87
88	AssertInSupportedRange(input_size, offset, length);
89
90	int64_t start, end;
91	if (!SubstringStartEnd(input_size, offset, length, start, end)) {
92	return SubstringEmptyString(result);
93	}
94	return SubstringSlice(result, input_data, offset: start, length: end - start);
95	}
96
97	string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) {
98	auto input_data = input.GetData();
99	auto input_size = input.GetSize();
100
101	AssertInSupportedRange(input_size, offset, length);
102
103	if (length == `0`) {
104	return SubstringEmptyString(result);
105	}
106	// first figure out which direction we need to scan
107	idx_t start_pos;
108	idx_t end_pos;
109	if (offset < `0`) {
110	start_pos = `0`;
111	end_pos = DConstants::INVALID_INDEX;
112
113	// negative offset: scan backwards
114	int64_t start, end;
115
116	// we express start and end as unicode codepoints from the back
117	offset--;
118	if (length < `0`) {
119	// negative length
120	start = -offset - length;
121	end = -offset;
122	} else {
123	// positive length
124	start = -offset;
125	end = -offset - length;
126	}
127	if (end <= `0`) {
128	end_pos = input_size;
129	}
130	int64_t current_character = `0`;
131	for (idx_t i = input_size; i > `0`; i--) {
132	if (LengthFun::IsCharacter(c: input_data[i - `1`])) {
133	current_character++;
134	if (current_character == start) {
135	start_pos = i;
136	break;
137	} else if (current_character == end) {
138	end_pos = i;
139	}
140	}
141	}
142	while (!LengthFun::IsCharacter(c: input_data[start_pos])) {
143	start_pos++;
144	}
145	while (end_pos < input_size && !LengthFun::IsCharacter(c: input_data[end_pos])) {
146	end_pos++;
147	}
148
149	if (end_pos == DConstants::INVALID_INDEX) {
150	return SubstringEmptyString(result);
151	}
152	} else {
153	start_pos = DConstants::INVALID_INDEX;
154	end_pos = input_size;
155
156	// positive offset: scan forwards
157	int64_t start, end;
158
159	// we express start and end as unicode codepoints from the front
160	offset--;
161	if (length < `0`) {
162	// negative length
163	start = MaxValue<int64_t>(a: `0`, b: offset + length);
164	end = offset;
165	} else {
166	// positive length
167	start = MaxValue<int64_t>(a: `0`, b: offset);
168	end = offset + length;
169	}
170
171	int64_t current_character = `0`;
172	for (idx_t i = `0`; i < input_size; i++) {
173	if (LengthFun::IsCharacter(c: input_data[i])) {
174	if (current_character == start) {
175	start_pos = i;
176	} else if (current_character == end) {
177	end_pos = i;
178	break;
179	}
180	current_character++;
181	}
182	}
183	if (start_pos == DConstants::INVALID_INDEX \|\| end == `0` \|\| end <= start) {
184	return SubstringEmptyString(result);
185	}
186	}
187	D_ASSERT(end_pos >= start_pos);
188	// after we have found these, we can slice the substring
189	return SubstringSlice(result, input_data, offset: start_pos, length: end_pos - start_pos);
190	}
191
192	string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) {
193	auto input_data = input.GetData();
194	auto input_size = input.GetSize();
195
196	AssertInSupportedRange(input_size, offset, length);
197
198	// we don't know yet if the substring is ascii, but we assume it is (for now)
199	// first get the start and end as if this was an ascii string
200	int64_t start, end;
201	if (!SubstringStartEnd(input_size, offset, length, start, end)) {
202	return SubstringEmptyString(result);
203	}
204
205	// now check if all the characters between 0 and end are ascii characters
206	// note that we scan one further to check for a potential combining diacritics (e.g. i + diacritic is ï)
207	bool is_ascii = true;
208	idx_t ascii_end = MinValue<idx_t>(a: end + `1`, b: input_size);
209	for (idx_t i = `0`; i < ascii_end; i++) {
210	if (input_data[i] & `0x80`) {
211	// found a non-ascii character: eek
212	is_ascii = false;
213	break;
214	}
215	}
216	if (is_ascii) {
217	// all characters are ascii, we can just slice the substring
218	return SubstringSlice(result, input_data, offset: start, length: end - start);
219	}
220	// if the characters are not ascii, we need to scan grapheme clusters
221	// first figure out which direction we need to scan
222	// offset = 0 case is taken care of in SubstringStartEnd
223	if (offset < `0`) {
224	// negative offset, this case is more difficult
225	// we first need to count the number of characters in the string
226	idx_t num_characters = `0`;
227	utf8proc_grapheme_callback(s: input_data, len: input_size, fun: [&](size_t start, size_t end) {
228	num_characters++;
229	return true;
230	});
231	// now call substring start and end again, but with the number of unicode characters this time
232	SubstringStartEnd(input_size: num_characters, offset, length, start, end);
233	}
234
235	// now scan the graphemes of the string to find the positions of the start and end characters
236	int64_t current_character = `0`;
237	idx_t start_pos = DConstants::INVALID_INDEX, end_pos = input_size;
238	utf8proc_grapheme_callback(s: input_data, len: input_size, fun: [&](size_t gstart, size_t gend) {
239	if (current_character == start) {
240	start_pos = gstart;
241	} else if (current_character == end) {
242	end_pos = gstart;
243	return false;
244	}
245	current_character++;
246	return true;
247	});
248	if (start_pos == DConstants::INVALID_INDEX) {
249	return SubstringEmptyString(result);
250	}
251	// after we have found these, we can slice the substring
252	return SubstringSlice(result, input_data, offset: start_pos, length: end_pos - start_pos);
253	}
254
255	struct SubstringUnicodeOp {
256	static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
257	return SubstringFun::SubstringUnicode(result, input, offset, length);
258	}
259	};
260
261	struct SubstringGraphemeOp {
262	static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
263	return SubstringFun::SubstringGrapheme(result, input, offset, length);
264	}
265	};
266
267	template <class OP>
268	static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
269	auto &input_vector = args.data [`0`];
270	auto &offset_vector = args.data [`1`];
271	if (args.ColumnCount() == `3`) {
272	auto &length_vector = args.data [`2`];
273
274	TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
275	input_vector, offset_vector, length_vector, result, args.size(),
276	[&](string_t input_string, int64_t offset, int64_t length) {
277	return OP::Substring(result, input_string, offset, length);
278	});
279	} else {
280	BinaryExecutor::Execute<string_t, int64_t, string_t>(
281	input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
282	return OP::Substring(result, input_string, offset, NumericLimits<uint32_t>::Maximum());
283	});
284	}
285	}
286
287	static void SubstringFunctionASCII(DataChunk &args, ExpressionState &state, Vector &result) {
288	auto &input_vector = args.data [`0`];
289	auto &offset_vector = args.data [`1`];
290	if (args.ColumnCount() == `3`) {
291	auto &length_vector = args.data [`2`];
292
293	TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
294	a&: input_vector, b&: offset_vector, c&: length_vector, result, count: args.size(),
295	fun: [&](string_t input_string, int64_t offset, int64_t length) {
296	return SubstringASCII(result, input: input_string, offset, length);
297	});
298	} else {
299	BinaryExecutor::Execute<string_t, int64_t, string_t>(
300	left&: input_vector, right&: offset_vector, result, count: args.size(), fun: [&](string_t input_string, int64_t offset) {
301	return SubstringASCII(result, input: input_string, offset, length: NumericLimits<uint32_t>::Maximum());
302	});
303	}
304	}
305
306	static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context, FunctionStatisticsInput &input) {
307	auto &child_stats = input.child_stats;
308	auto &expr = input.expr;
309	// can only propagate stats if the children have stats
310	// we only care about the stats of the first child (i.e. the string)
311	if (!StringStats::CanContainUnicode(stats: child_stats [`0`])) {
312	expr.function.function = SubstringFunctionASCII;
313	}
314	return nullptr;
315	}
316
317	void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
318	ScalarFunctionSet substr("substring");
319	substr.AddFunction(function: ScalarFunction ({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
320	LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
321	SubstringPropagateStats));
322	substr.AddFunction(function: ScalarFunction ({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
323	SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
324	SubstringPropagateStats));
325	set.AddFunction(set: substr);
326	substr.name = "substr";
327	set.AddFunction(set: substr);
328
329	ScalarFunctionSet substr_grapheme("substring_grapheme");
330	substr_grapheme.AddFunction(function: ScalarFunction ({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
331	LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr,
332	nullptr, SubstringPropagateStats));
333	substr_grapheme.AddFunction(function: ScalarFunction ({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
334	SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr,
335	SubstringPropagateStats));
336	set.AddFunction(set: substr_grapheme);
337	}
338
339	} // namespace duckdb
340

Browse the source code of Velox/build/_deps/duckdb-src/src/function/scalar/string/substring.cpp