1#include "duckdb/function/scalar/string_functions.hpp"
2
3#include "duckdb/common/exception.hpp"
4#include "duckdb/common/vector_operations/vector_operations.hpp"
5#include "duckdb/common/vector_operations/ternary_executor.hpp"
6#include "utf8proc.hpp"
7
8using namespace std;
9
10namespace duckdb {
11
12string_t SubstringFun::substring_ascii_only(Vector &result, const char *input_data, int offset, int length) {
13 auto result_string = StringVector::EmptyString(result, length);
14 auto result_data = result_string.GetData();
15 memcpy(result_data, input_data + offset, length);
16 result_string.Finalize();
17 return result_string;
18}
19
20string_t SubstringFun::substring_scalar_function(Vector &result, string_t input, int offset, int length,
21 unique_ptr<char[]> &output, idx_t &current_len) {
22 // reduce offset by one because SQL starts counting at 1
23 offset--;
24 if (offset < 0 || length < 0) {
25 throw Exception("SUBSTRING cannot handle negative offsets");
26 }
27 auto input_data = input.GetData();
28 auto input_size = input.GetSize();
29
30 // check if there is any non-ascii
31 bool ascii_only = true;
32 int ascii_end = std::min(offset + length + 1, (int)input_size);
33 for (int i = 0; i < ascii_end; i++) {
34 if (input_data[i] & 0x80) {
35 ascii_only = false;
36 break;
37 }
38 }
39
40 if (length == 0) {
41 auto result_string = StringVector::EmptyString(result, 0);
42 result_string.Finalize();
43 return result_string;
44 }
45
46 if (ascii_only) {
47 // ascii only
48 length = std::min(offset + length, (int)input_size);
49 if (offset >= length) {
50 return string_t((uint32_t)0);
51 }
52 return SubstringFun::substring_ascii_only(result, input_data, offset, length - offset);
53 }
54
55 // size is at most the input size: alloc it
56 idx_t required_len = input_size + 1;
57 if (required_len > current_len) {
58 // need a resize
59 current_len = required_len;
60 output = unique_ptr<char[]>{new char[required_len]};
61 }
62
63 // use grapheme iterator to iterate over the characters
64 int current_offset = 0;
65 int output_size = 0;
66 utf8proc_grapheme_callback(input_data, input_size, [&](size_t start, size_t end) {
67 if (current_offset >= offset) {
68 // this character belongs to the output: copy it there
69 memcpy(output.get() + output_size, input_data + start, end - start);
70 output_size += end - start;
71 }
72 current_offset++;
73 // stop iterating after we have exceeded the required characters
74 return current_offset < offset + length;
75 });
76 output[output_size] = '\0';
77 return StringVector::AddString(result, output.get(), output_size);
78}
79
80static void substring_function(DataChunk &args, ExpressionState &state, Vector &result) {
81 assert(args.column_count() == 3 && args.data[0].type == TypeId::VARCHAR && args.data[1].type == TypeId::INT32 &&
82 args.data[2].type == TypeId::INT32);
83 auto &input_vector = args.data[0];
84 auto &offset_vector = args.data[1];
85 auto &length_vector = args.data[2];
86
87 idx_t current_len = 0;
88 unique_ptr<char[]> output;
89 TernaryExecutor::Execute<string_t, int, int, string_t>(
90 input_vector, offset_vector, length_vector, result, args.size(),
91 [&](string_t input_string, int offset, int length) {
92 return SubstringFun::substring_scalar_function(result, input_string, offset, length, output, current_len);
93 });
94}
95
96void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
97 set.AddFunction({"substring", "substr"}, ScalarFunction({SQLType::VARCHAR, SQLType::INTEGER, SQLType::INTEGER},
98 SQLType::VARCHAR, substring_function));
99}
100
101} // namespace duckdb
102