1#include "duckdb/common/exception.hpp"
2#include "duckdb/common/types/date.hpp"
3#include "duckdb/common/vector_operations/binary_executor.hpp"
4#include "duckdb/common/vector_operations/vector_operations.hpp"
5#include "duckdb/function/scalar/nested_functions.hpp"
6#include "duckdb/function/scalar/string_functions.hpp"
7
8#include <string.h>
9
10namespace duckdb {
11
12static void ConcatFunction(DataChunk &args, ExpressionState &state, Vector &result) {
13 result.SetVectorType(VectorType::CONSTANT_VECTOR);
14 // iterate over the vectors to count how large the final string will be
15 idx_t constant_lengths = 0;
16 vector<idx_t> result_lengths(args.size(), 0);
17 for (idx_t col_idx = 0; col_idx < args.ColumnCount(); col_idx++) {
18 auto &input = args.data[col_idx];
19 D_ASSERT(input.GetType().id() == LogicalTypeId::VARCHAR);
20 if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
21 if (ConstantVector::IsNull(vector: input)) {
22 // constant null, skip
23 continue;
24 }
25 auto input_data = ConstantVector::GetData<string_t>(vector&: input);
26 constant_lengths += input_data->GetSize();
27 } else {
28 // non-constant vector: set the result type to a flat vector
29 result.SetVectorType(VectorType::FLAT_VECTOR);
30 // now get the lengths of each of the input elements
31 UnifiedVectorFormat vdata;
32 input.ToUnifiedFormat(count: args.size(), data&: vdata);
33
34 auto input_data = UnifiedVectorFormat::GetData<string_t>(format: vdata);
35 // now add the length of each vector to the result length
36 for (idx_t i = 0; i < args.size(); i++) {
37 auto idx = vdata.sel->get_index(idx: i);
38 if (!vdata.validity.RowIsValid(row_idx: idx)) {
39 continue;
40 }
41 result_lengths[i] += input_data[idx].GetSize();
42 }
43 }
44 }
45
46 // first we allocate the empty strings for each of the values
47 auto result_data = FlatVector::GetData<string_t>(vector&: result);
48 for (idx_t i = 0; i < args.size(); i++) {
49 // allocate an empty string of the required size
50 idx_t str_length = constant_lengths + result_lengths[i];
51 result_data[i] = StringVector::EmptyString(vector&: result, len: str_length);
52 // we reuse the result_lengths vector to store the currently appended size
53 result_lengths[i] = 0;
54 }
55
56 // now that the empty space for the strings has been allocated, perform the concatenation
57 for (idx_t col_idx = 0; col_idx < args.ColumnCount(); col_idx++) {
58 auto &input = args.data[col_idx];
59
60 // loop over the vector and concat to all results
61 if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
62 // constant vector
63 if (ConstantVector::IsNull(vector: input)) {
64 // constant null, skip
65 continue;
66 }
67 // append the constant vector to each of the strings
68 auto input_data = ConstantVector::GetData<string_t>(vector&: input);
69 auto input_ptr = input_data->GetData();
70 auto input_len = input_data->GetSize();
71 for (idx_t i = 0; i < args.size(); i++) {
72 memcpy(dest: result_data[i].GetDataWriteable() + result_lengths[i], src: input_ptr, n: input_len);
73 result_lengths[i] += input_len;
74 }
75 } else {
76 // standard vector
77 UnifiedVectorFormat idata;
78 input.ToUnifiedFormat(count: args.size(), data&: idata);
79
80 auto input_data = UnifiedVectorFormat::GetData<string_t>(format: idata);
81 for (idx_t i = 0; i < args.size(); i++) {
82 auto idx = idata.sel->get_index(idx: i);
83 if (!idata.validity.RowIsValid(row_idx: idx)) {
84 continue;
85 }
86 auto input_ptr = input_data[idx].GetData();
87 auto input_len = input_data[idx].GetSize();
88 memcpy(dest: result_data[i].GetDataWriteable() + result_lengths[i], src: input_ptr, n: input_len);
89 result_lengths[i] += input_len;
90 }
91 }
92 }
93 for (idx_t i = 0; i < args.size(); i++) {
94 result_data[i].Finalize();
95 }
96}
97
98static void ConcatOperator(DataChunk &args, ExpressionState &state, Vector &result) {
99 BinaryExecutor::Execute<string_t, string_t, string_t>(
100 left&: args.data[0], right&: args.data[1], result, count: args.size(), fun: [&](string_t a, string_t b) {
101 auto a_data = a.GetData();
102 auto b_data = b.GetData();
103 auto a_length = a.GetSize();
104 auto b_length = b.GetSize();
105
106 auto target_length = a_length + b_length;
107 auto target = StringVector::EmptyString(vector&: result, len: target_length);
108 auto target_data = target.GetDataWriteable();
109
110 memcpy(dest: target_data, src: a_data, n: a_length);
111 memcpy(dest: target_data + a_length, src: b_data, n: b_length);
112 target.Finalize();
113 return target;
114 });
115}
116
117static void TemplatedConcatWS(DataChunk &args, const string_t *sep_data, const SelectionVector &sep_sel,
118 const SelectionVector &rsel, idx_t count, Vector &result) {
119 vector<idx_t> result_lengths(args.size(), 0);
120 vector<bool> has_results(args.size(), false);
121 auto orrified_data = make_unsafe_uniq_array<UnifiedVectorFormat>(n: args.ColumnCount() - 1);
122 for (idx_t col_idx = 1; col_idx < args.ColumnCount(); col_idx++) {
123 args.data[col_idx].ToUnifiedFormat(count: args.size(), data&: orrified_data[col_idx - 1]);
124 }
125
126 // first figure out the lengths
127 for (idx_t col_idx = 1; col_idx < args.ColumnCount(); col_idx++) {
128 auto &idata = orrified_data[col_idx - 1];
129
130 auto input_data = UnifiedVectorFormat::GetData<string_t>(format: idata);
131 for (idx_t i = 0; i < count; i++) {
132 auto ridx = rsel.get_index(idx: i);
133 auto sep_idx = sep_sel.get_index(idx: ridx);
134 auto idx = idata.sel->get_index(idx: ridx);
135 if (!idata.validity.RowIsValid(row_idx: idx)) {
136 continue;
137 }
138 if (has_results[ridx]) {
139 result_lengths[ridx] += sep_data[sep_idx].GetSize();
140 }
141 result_lengths[ridx] += input_data[idx].GetSize();
142 has_results[ridx] = true;
143 }
144 }
145
146 // first we allocate the empty strings for each of the values
147 auto result_data = FlatVector::GetData<string_t>(vector&: result);
148 for (idx_t i = 0; i < count; i++) {
149 auto ridx = rsel.get_index(idx: i);
150 // allocate an empty string of the required size
151 result_data[ridx] = StringVector::EmptyString(vector&: result, len: result_lengths[ridx]);
152 // we reuse the result_lengths vector to store the currently appended size
153 result_lengths[ridx] = 0;
154 has_results[ridx] = false;
155 }
156
157 // now that the empty space for the strings has been allocated, perform the concatenation
158 for (idx_t col_idx = 1; col_idx < args.ColumnCount(); col_idx++) {
159 auto &idata = orrified_data[col_idx - 1];
160 auto input_data = UnifiedVectorFormat::GetData<string_t>(format: idata);
161 for (idx_t i = 0; i < count; i++) {
162 auto ridx = rsel.get_index(idx: i);
163 auto sep_idx = sep_sel.get_index(idx: ridx);
164 auto idx = idata.sel->get_index(idx: ridx);
165 if (!idata.validity.RowIsValid(row_idx: idx)) {
166 continue;
167 }
168 if (has_results[ridx]) {
169 auto sep_size = sep_data[sep_idx].GetSize();
170 auto sep_ptr = sep_data[sep_idx].GetData();
171 memcpy(dest: result_data[ridx].GetDataWriteable() + result_lengths[ridx], src: sep_ptr, n: sep_size);
172 result_lengths[ridx] += sep_size;
173 }
174 auto input_ptr = input_data[idx].GetData();
175 auto input_len = input_data[idx].GetSize();
176 memcpy(dest: result_data[ridx].GetDataWriteable() + result_lengths[ridx], src: input_ptr, n: input_len);
177 result_lengths[ridx] += input_len;
178 has_results[ridx] = true;
179 }
180 }
181 for (idx_t i = 0; i < count; i++) {
182 auto ridx = rsel.get_index(idx: i);
183 result_data[ridx].Finalize();
184 }
185}
186
187static void ConcatWSFunction(DataChunk &args, ExpressionState &state, Vector &result) {
188 auto &separator = args.data[0];
189 UnifiedVectorFormat vdata;
190 separator.ToUnifiedFormat(count: args.size(), data&: vdata);
191
192 result.SetVectorType(VectorType::CONSTANT_VECTOR);
193 for (idx_t col_idx = 0; col_idx < args.ColumnCount(); col_idx++) {
194 if (args.data[col_idx].GetVectorType() != VectorType::CONSTANT_VECTOR) {
195 result.SetVectorType(VectorType::FLAT_VECTOR);
196 break;
197 }
198 }
199 switch (separator.GetVectorType()) {
200 case VectorType::CONSTANT_VECTOR: {
201 if (ConstantVector::IsNull(vector: separator)) {
202 // constant NULL as separator: return constant NULL vector
203 result.SetVectorType(VectorType::CONSTANT_VECTOR);
204 ConstantVector::SetNull(vector&: result, is_null: true);
205 return;
206 }
207 // no null values
208 auto sel = FlatVector::IncrementalSelectionVector();
209 TemplatedConcatWS(args, sep_data: UnifiedVectorFormat::GetData<string_t>(format: vdata), sep_sel: *vdata.sel, rsel: *sel, count: args.size(), result);
210 return;
211 }
212 default: {
213 // default case: loop over nullmask and create a non-null selection vector
214 idx_t not_null_count = 0;
215 SelectionVector not_null_vector(STANDARD_VECTOR_SIZE);
216 auto &result_mask = FlatVector::Validity(vector&: result);
217 for (idx_t i = 0; i < args.size(); i++) {
218 if (!vdata.validity.RowIsValid(row_idx: vdata.sel->get_index(idx: i))) {
219 result_mask.SetInvalid(i);
220 } else {
221 not_null_vector.set_index(idx: not_null_count++, loc: i);
222 }
223 }
224 TemplatedConcatWS(args, sep_data: UnifiedVectorFormat::GetData<string_t>(format: vdata), sep_sel: *vdata.sel, rsel: not_null_vector,
225 count: not_null_count, result);
226 return;
227 }
228 }
229}
230
231void ConcatFun::RegisterFunction(BuiltinFunctions &set) {
232 // the concat operator and concat function have different behavior regarding NULLs
233 // this is strange but seems consistent with postgresql and mysql
234 // (sqlite does not support the concat function, only the concat operator)
235
236 // the concat operator behaves as one would expect: any NULL value present results in a NULL
237 // i.e. NULL || 'hello' = NULL
238 // the concat function, however, treats NULL values as an empty string
239 // i.e. concat(NULL, 'hello') = 'hello'
240 // concat_ws functions similarly to the concat function, except the result is NULL if the separator is NULL
241 // if the separator is not NULL, however, NULL values are counted as empty string
242 // there is one separate rule: there are no separators added between NULL values
243 // so the NULL value and empty string are different!
244 // e.g.:
245 // concat_ws(',', NULL, NULL) = ""
246 // concat_ws(',', '', '') = ","
247 ScalarFunction concat = ScalarFunction("concat", {LogicalType::VARCHAR}, LogicalType::VARCHAR, ConcatFunction);
248 concat.varargs = LogicalType::VARCHAR;
249 concat.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
250 set.AddFunction(function: concat);
251
252 ScalarFunctionSet concat_op("||");
253 concat_op.AddFunction(
254 function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::VARCHAR, ConcatOperator));
255 concat_op.AddFunction(function: ScalarFunction({LogicalType::BLOB, LogicalType::BLOB}, LogicalType::BLOB, ConcatOperator));
256 concat_op.AddFunction(function: ListConcatFun::GetFunction());
257 for (auto &fun : concat_op.functions) {
258 fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
259 }
260 set.AddFunction(set: concat_op);
261
262 ScalarFunction concat_ws = ScalarFunction("concat_ws", {LogicalType::VARCHAR, LogicalType::VARCHAR},
263 LogicalType::VARCHAR, ConcatWSFunction);
264 concat_ws.varargs = LogicalType::VARCHAR;
265 concat_ws.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
266 set.AddFunction(function: concat_ws);
267}
268
269} // namespace duckdb
270