substring.cpp source code [DuckDB/src/function/scalar/string/substring.cpp]

1	#include "duckdb/function/scalar/string_functions.hpp"
2
3	#include "duckdb/common/exception.hpp"
4	#include "duckdb/common/vector_operations/vector_operations.hpp"
5	#include "duckdb/common/vector_operations/ternary_executor.hpp"
6	#include "utf8proc.hpp"
7
8	using namespace std;
9
10	namespace duckdb {
11
12	string_t SubstringFun::substring_ascii_only(Vector &result, const char input_data, int* offset, int length) {
13	auto result_string = StringVector::EmptyString(result, length);
14	auto result_data = result_string.GetData();
15	memcpy(result_data, input_data + offset, length);
16	result_string.Finalize();
17	return result_string;
18	}
19
20	string_t SubstringFun::substring_scalar_function(Vector &result, string_t input, int offset, int length,
21	unique_ptr<char[]> &output, idx_t &current_len) {
22	// reduce offset by one because SQL starts counting at 1
23	offset--;
24	if (offset < `0` \|\| length < `0`) {
25	throw Exception ("SUBSTRING cannot handle negative offsets");
26	}
27	auto input_data = input.GetData();
28	auto input_size = input.GetSize();
29
30	// check if there is any non-ascii
31	bool ascii_only = true;
32	int ascii_end = std::min(offset + length + `1`, (int)input_size);
33	for (int i = `0`; i < ascii_end; i++) {
34	if (input_data[i] & `0x80`) {
35	ascii_only = false;
36	break;
37	}
38	}
39
40	if (length == `0`) {
41	auto result_string = StringVector::EmptyString(result, `0`);
42	result_string.Finalize();
43	return result_string;
44	}
45
46	if (ascii_only) {
47	// ascii only
48	length = std::min(offset + length, (int)input_size);
49	if (offset >= length) {
50	return string_t ((uint32_t)`0`);
51	}
52	return SubstringFun::substring_ascii_only(result, input_data, offset, length - offset);
53	}
54
55	// size is at most the input size: alloc it
56	idx_t required_len = input_size + `1`;
57	if (required_len > current_len) {
58	// need a resize
59	current_len = required_len;
60	output = unique_ptr<char[]>{new char[required_len]};
61	}
62
63	// use grapheme iterator to iterate over the characters
64	int current_offset = `0`;
65	int output_size = `0`;
66	utf8proc_grapheme_callback(input_data, input_size, [&](size_t start, size_t end) {
67	if (current_offset >= offset) {
68	// this character belongs to the output: copy it there
69	memcpy(output.get() + output_size, input_data + start, end - start);
70	output_size += end - start;
71	}
72	current_offset++;
73	// stop iterating after we have exceeded the required characters
74	return current_offset < offset + length;
75	});
76	output [output_size] = `'\0'`;
77	return StringVector::AddString(result, output.get(), output_size);
78	}
79
80	static void substring_function(DataChunk &args, ExpressionState &state, Vector &result) {
81	assert(args.column_count() == `3` && args.data[`0`].type == TypeId::VARCHAR && args.data[`1`].type == TypeId::INT32 &&
82	args.data[`2`].type == TypeId::INT32);
83	auto &input_vector = args.data [`0`];
84	auto &offset_vector = args.data [`1`];
85	auto &length_vector = args.data [`2`];
86
87	idx_t current_len = `0`;
88	unique_ptr<char[]> output;
89	TernaryExecutor::Execute<string_t, int, int, string_t>(
90	input_vector, offset_vector, length_vector, result, args.size(),
91	[&](string_t input_string, int offset, int length) {
92	return SubstringFun::substring_scalar_function(result, input_string, offset, length, output, current_len);
93	});
94	}
95
96	void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
97	set.AddFunction({"substring", "substr"}, ScalarFunction ({SQLType::VARCHAR, SQLType::INTEGER, SQLType::INTEGER},
98	SQLType::VARCHAR, substring_function));
99	}
100
101	} // namespace duckdb
102

Browse the source code of DuckDB/src/function/scalar/string/substring.cpp