1#include "duckdb/function/scalar/string_functions.hpp"
2
3#include "duckdb/common/exception.hpp"
4#include "duckdb/common/types/date.hpp"
5#include "duckdb/common/vector_operations/vector_operations.hpp"
6#include "duckdb/common/vector_operations/binary_executor.hpp"
7
8#include <string.h>
9
10using namespace std;
11
12namespace duckdb {
13
14static void concat_function(DataChunk &args, ExpressionState &state, Vector &result) {
15 result.vector_type = VectorType::CONSTANT_VECTOR;
16 // iterate over the vectors to count how large the final string will be
17 idx_t constant_lengths = 0;
18 vector<idx_t> result_lengths(args.size(), 0);
19 for (idx_t col_idx = 0; col_idx < args.column_count(); col_idx++) {
20 auto &input = args.data[col_idx];
21 assert(input.type == TypeId::VARCHAR);
22 if (input.vector_type == VectorType::CONSTANT_VECTOR) {
23 if (ConstantVector::IsNull(input)) {
24 // constant null, skip
25 continue;
26 }
27 auto input_data = ConstantVector::GetData<string_t>(input);
28 constant_lengths += input_data->GetSize();
29 } else {
30 // non-constant vector: set the result type to a flat vector
31 result.vector_type = VectorType::FLAT_VECTOR;
32 // now get the lengths of each of the input elements
33 VectorData vdata;
34 input.Orrify(args.size(), vdata);
35
36 auto input_data = (string_t *)vdata.data;
37 // now add the length of each vector to the result length
38 for (idx_t i = 0; i < args.size(); i++) {
39 auto idx = vdata.sel->get_index(i);
40 if ((*vdata.nullmask)[idx]) {
41 continue;
42 }
43 result_lengths[i] += input_data[idx].GetSize();
44 }
45 }
46 }
47
48 // first we allocate the empty strings for each of the values
49 auto result_data = FlatVector::GetData<string_t>(result);
50 for (idx_t i = 0; i < args.size(); i++) {
51 // allocate an empty string of the required size
52 idx_t str_length = constant_lengths + result_lengths[i];
53 result_data[i] = StringVector::EmptyString(result, str_length);
54 // we reuse the result_lengths vector to store the currently appended size
55 result_lengths[i] = 0;
56 }
57
58 // now that the empty space for the strings has been allocated, perform the concatenation
59 for (idx_t col_idx = 0; col_idx < args.column_count(); col_idx++) {
60 auto &input = args.data[col_idx];
61
62 // loop over the vector and concat to all results
63 if (input.vector_type == VectorType::CONSTANT_VECTOR) {
64 // constant vector
65 if (ConstantVector::IsNull(input)) {
66 // constant null, skip
67 continue;
68 }
69 // append the constant vector to each of the strings
70 auto input_data = ConstantVector::GetData<string_t>(input);
71 auto input_ptr = input_data->GetData();
72 auto input_len = input_data->GetSize();
73 for (idx_t i = 0; i < args.size(); i++) {
74 memcpy(result_data[i].GetData() + result_lengths[i], input_ptr, input_len);
75 result_lengths[i] += input_len;
76 }
77 } else {
78 // standard vector
79 VectorData idata;
80 input.Orrify(args.size(), idata);
81
82 auto input_data = (string_t *)idata.data;
83 for (idx_t i = 0; i < args.size(); i++) {
84 auto idx = idata.sel->get_index(i);
85 if ((*idata.nullmask)[idx]) {
86 continue;
87 }
88 auto input_ptr = input_data[idx].GetData();
89 auto input_len = input_data[idx].GetSize();
90 memcpy(result_data[i].GetData() + result_lengths[i], input_ptr, input_len);
91 result_lengths[i] += input_len;
92 }
93 }
94 }
95 for (idx_t i = 0; i < args.size(); i++) {
96 result_data[i].Finalize();
97 }
98}
99
100static void concat_operator(DataChunk &args, ExpressionState &state, Vector &result) {
101 BinaryExecutor::Execute<string_t, string_t, string_t, true>(
102 args.data[0], args.data[1], result, args.size(), [&](string_t a, string_t b) {
103 auto a_data = a.GetData();
104 auto b_data = b.GetData();
105 auto a_length = a.GetSize();
106 auto b_length = b.GetSize();
107
108 auto target_length = a_length + b_length;
109 auto target = StringVector::EmptyString(result, target_length);
110 auto target_data = target.GetData();
111
112 memcpy(target_data, a_data, a_length);
113 memcpy(target_data + a_length, b_data, b_length);
114 target.Finalize();
115 return target;
116 });
117}
118
119static void templated_concat_ws(DataChunk &args, string_t *sep_data, const SelectionVector &sep_sel,
120 const SelectionVector &rsel, idx_t count, Vector &result) {
121 vector<idx_t> result_lengths(args.size(), 0);
122 vector<bool> has_results(args.size(), false);
123 auto orrified_data = unique_ptr<VectorData[]>(new VectorData[args.column_count() - 1]);
124 for (idx_t col_idx = 1; col_idx < args.column_count(); col_idx++) {
125 args.data[col_idx].Orrify(args.size(), orrified_data[col_idx - 1]);
126 }
127
128 // first figure out the lengths
129 for (idx_t col_idx = 1; col_idx < args.column_count(); col_idx++) {
130 auto &idata = orrified_data[col_idx - 1];
131
132 auto input_data = (string_t *)idata.data;
133 for (idx_t i = 0; i < count; i++) {
134 auto ridx = rsel.get_index(i);
135 auto sep_idx = sep_sel.get_index(ridx);
136 auto idx = idata.sel->get_index(ridx);
137 if ((*idata.nullmask)[idx]) {
138 continue;
139 }
140 if (has_results[ridx]) {
141 result_lengths[ridx] += sep_data[sep_idx].GetSize();
142 }
143 result_lengths[ridx] += input_data[idx].GetSize();
144 has_results[ridx] = true;
145 }
146 }
147
148 // first we allocate the empty strings for each of the values
149 auto result_data = FlatVector::GetData<string_t>(result);
150 for (idx_t i = 0; i < count; i++) {
151 auto ridx = rsel.get_index(i);
152 // allocate an empty string of the required size
153 result_data[ridx] = StringVector::EmptyString(result, result_lengths[ridx]);
154 // we reuse the result_lengths vector to store the currently appended size
155 result_lengths[ridx] = 0;
156 has_results[ridx] = false;
157 }
158
159 // now that the empty space for the strings has been allocated, perform the concatenation
160 for (idx_t col_idx = 1; col_idx < args.column_count(); col_idx++) {
161 auto &idata = orrified_data[col_idx - 1];
162 auto input_data = (string_t *)idata.data;
163 for (idx_t i = 0; i < count; i++) {
164 auto ridx = rsel.get_index(i);
165 auto sep_idx = sep_sel.get_index(ridx);
166 auto idx = idata.sel->get_index(ridx);
167 if ((*idata.nullmask)[idx]) {
168 continue;
169 }
170 if (has_results[ridx]) {
171 auto sep_size = sep_data[sep_idx].GetSize();
172 auto sep_ptr = sep_data[sep_idx].GetData();
173 memcpy(result_data[ridx].GetData() + result_lengths[ridx], sep_ptr, sep_size);
174 result_lengths[ridx] += sep_size;
175 }
176 auto input_ptr = input_data[idx].GetData();
177 auto input_len = input_data[idx].GetSize();
178 memcpy(result_data[ridx].GetData() + result_lengths[ridx], input_ptr, input_len);
179 result_lengths[ridx] += input_len;
180 has_results[ridx] = true;
181 }
182 }
183 for (idx_t i = 0; i < count; i++) {
184 auto ridx = rsel.get_index(i);
185 result_data[ridx].Finalize();
186 }
187}
188
189static void concat_ws_function(DataChunk &args, ExpressionState &state, Vector &result) {
190 auto &separator = args.data[0];
191 VectorData vdata;
192 separator.Orrify(args.size(), vdata);
193
194 result.vector_type = VectorType::CONSTANT_VECTOR;
195 for (idx_t col_idx = 0; col_idx < args.column_count(); col_idx++) {
196 if (args.data[col_idx].vector_type != VectorType::CONSTANT_VECTOR) {
197 result.vector_type = VectorType::FLAT_VECTOR;
198 break;
199 }
200 }
201 switch (separator.vector_type) {
202 case VectorType::CONSTANT_VECTOR:
203 if (ConstantVector::IsNull(separator)) {
204 // constant NULL as separator: return constant NULL vector
205 result.vector_type = VectorType::CONSTANT_VECTOR;
206 ConstantVector::SetNull(result, true);
207 return;
208 }
209 // no null values
210 templated_concat_ws(args, (string_t *)vdata.data, *vdata.sel, FlatVector::IncrementalSelectionVector,
211 args.size(), result);
212 return;
213 default: {
214 // default case: loop over nullmask and create a non-null selection vector
215 idx_t not_null_count = 0;
216 SelectionVector not_null_vector(STANDARD_VECTOR_SIZE);
217 auto &result_nullmask = FlatVector::Nullmask(result);
218 for (idx_t i = 0; i < args.size(); i++) {
219 if ((*vdata.nullmask)[vdata.sel->get_index(i)]) {
220 result_nullmask[i] = true;
221 } else {
222 not_null_vector.set_index(not_null_count++, i);
223 }
224 }
225 templated_concat_ws(args, (string_t *)vdata.data, *vdata.sel, not_null_vector, not_null_count, result);
226 return;
227 }
228 }
229}
230
231void ConcatFun::RegisterFunction(BuiltinFunctions &set) {
232 // the concat operator and concat function have different behavior regarding NULLs
233 // this is strange but seems consistent with postgresql and mysql
234 // (sqlite does not support the concat function, only the concat operator)
235
236 // the concat operator behaves as one would expect: any NULL value present results in a NULL
237 // i.e. NULL || 'hello' = NULL
238 // the concat function, however, treats NULL values as an empty string
239 // i.e. concat(NULL, 'hello') = 'hello'
240 // concat_ws functions similarly to the concat function, except the result is NULL if the separator is NULL
241 // if the separator is not NULL, however, NULL values are counted as empty string
242 // there is one separate rule: there are no separators added between NULL values
243 // so the NULL value and empty string are different!
244 // e.g.:
245 // concat_ws(',', NULL, NULL) = ""
246 // concat_ws(',', '', '') = ","
247 ScalarFunction concat = ScalarFunction("concat", {SQLType::VARCHAR}, SQLType::VARCHAR, concat_function);
248 concat.varargs = SQLType::VARCHAR;
249 set.AddFunction(concat);
250
251 ScalarFunctionSet concat_op("||");
252 concat_op.AddFunction(ScalarFunction({SQLType::VARCHAR, SQLType::VARCHAR}, SQLType::VARCHAR, concat_operator));
253 concat_op.AddFunction(ScalarFunction({SQLType::BLOB, SQLType::BLOB}, SQLType::BLOB, concat_operator));
254 set.AddFunction(concat_op);
255
256
257 ScalarFunction concat_ws =
258 ScalarFunction("concat_ws", {SQLType::VARCHAR, SQLType::VARCHAR}, SQLType::VARCHAR, concat_ws_function);
259 concat_ws.varargs = SQLType::VARCHAR;
260 set.AddFunction(concat_ws);
261}
262
263} // namespace duckdb
264