// concat.cpp source code [Velox/build/_deps/duckdb-src/src/function/scalar/string/concat.cpp]

1	#include "duckdb/common/exception.hpp"
2	#include "duckdb/common/types/date.hpp"
3	#include "duckdb/common/vector_operations/binary_executor.hpp"
4	#include "duckdb/common/vector_operations/vector_operations.hpp"
5	#include "duckdb/function/scalar/nested_functions.hpp"
6	#include "duckdb/function/scalar/string_functions.hpp"
7
8	#include <string.h>
9
10	namespace duckdb {
11
12	static void ConcatFunction(DataChunk &args, ExpressionState &state, Vector &result) {
13	result.SetVectorType(VectorType::CONSTANT_VECTOR);
14	// iterate over the vectors to count how large the final string will be
15	idx_t constant_lengths = `0`;
16	vector<idx_t> result_lengths(args.size(), `0`);
17	for (idx_t col_idx = `0`; col_idx < args.ColumnCount(); col_idx++) {
18	auto &input = args.data [col_idx];
19	D_ASSERT(input.GetType().id() == LogicalTypeId::VARCHAR);
20	if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
21	if (ConstantVector::IsNull(vector: input)) {
22	// constant null, skip
23	continue;
24	}
25	auto input_data = ConstantVector::GetData<string_t>(vector&: input);
26	constant_lengths += input_data->GetSize();
27	} else {
28	// non-constant vector: set the result type to a flat vector
29	result.SetVectorType(VectorType::FLAT_VECTOR);
30	// now get the lengths of each of the input elements
31	UnifiedVectorFormat vdata;
32	input.ToUnifiedFormat(count: args.size(), data&: vdata);
33
34	auto input_data = UnifiedVectorFormat::GetData<string_t>(format: vdata);
35	// now add the length of each vector to the result length
36	for (idx_t i = `0`; i < args.size(); i++) {
37	auto idx = vdata.sel->get_index(idx: i);
38	if (!vdata.validity.RowIsValid(row_idx: idx)) {
39	continue;
40	}
41	result_lengths [i] += input_data[idx].GetSize();
42	}
43	}
44	}
45
46	// first we allocate the empty strings for each of the values
47	auto result_data = FlatVector::GetData<string_t>(vector&: result);
48	for (idx_t i = `0`; i < args.size(); i++) {
49	// allocate an empty string of the required size
50	idx_t str_length = constant_lengths + result_lengths [i];
51	result_data[i] = StringVector::EmptyString(vector&: result, len: str_length);
52	// we reuse the result_lengths vector to store the currently appended size
53	result_lengths [i] = `0`;
54	}
55
56	// now that the empty space for the strings has been allocated, perform the concatenation
57	for (idx_t col_idx = `0`; col_idx < args.ColumnCount(); col_idx++) {
58	auto &input = args.data [col_idx];
59
60	// loop over the vector and concat to all results
61	if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
62	// constant vector
63	if (ConstantVector::IsNull(vector: input)) {
64	// constant null, skip
65	continue;
66	}
67	// append the constant vector to each of the strings
68	auto input_data = ConstantVector::GetData<string_t>(vector&: input);
69	auto input_ptr = input_data->GetData();
70	auto input_len = input_data->GetSize();
71	for (idx_t i = `0`; i < args.size(); i++) {
72	memcpy(dest: result_data[i].GetDataWriteable() + result_lengths [i], src: input_ptr, n: input_len);
73	result_lengths [i] += input_len;
74	}
75	} else {
76	// standard vector
77	UnifiedVectorFormat idata;
78	input.ToUnifiedFormat(count: args.size(), data&: idata);
79
80	auto input_data = UnifiedVectorFormat::GetData<string_t>(format: idata);
81	for (idx_t i = `0`; i < args.size(); i++) {
82	auto idx = idata.sel->get_index(idx: i);
83	if (!idata.validity.RowIsValid(row_idx: idx)) {
84	continue;
85	}
86	auto input_ptr = input_data[idx].GetData();
87	auto input_len = input_data[idx].GetSize();
88	memcpy(dest: result_data[i].GetDataWriteable() + result_lengths [i], src: input_ptr, n: input_len);
89	result_lengths [i] += input_len;
90	}
91	}
92	}
93	for (idx_t i = `0`; i < args.size(); i++) {
94	result_data[i].Finalize();
95	}
96	}
97
98	static void ConcatOperator(DataChunk &args, ExpressionState &state, Vector &result) {
99	BinaryExecutor::Execute<string_t, string_t, string_t>(
100	left&: args.data [`0`], right&: args.data [`1`], result, count: args.size(), fun: [&](string_t a, string_t b) {
101	auto a_data = a.GetData();
102	auto b_data = b.GetData();
103	auto a_length = a.GetSize();
104	auto b_length = b.GetSize();
105
106	auto target_length = a_length + b_length;
107	auto target = StringVector::EmptyString(vector&: result, len: target_length);
108	auto target_data = target.GetDataWriteable();
109
110	memcpy(dest: target_data, src: a_data, n: a_length);
111	memcpy(dest: target_data + a_length, src: b_data, n: b_length);
112	target.Finalize();
113	return target;
114	});
115	}
116
117	static void TemplatedConcatWS(DataChunk &args, const string_t sep_data, const* SelectionVector &sep_sel,
118	const SelectionVector &rsel, idx_t count, Vector &result) {
119	vector<idx_t> result_lengths(args.size(), `0`);
120	vector<bool> has_results(args.size(), false);
121	auto orrified_data = make_unsafe_uniq_array<UnifiedVectorFormat>(n: args.ColumnCount() - `1`);
122	for (idx_t col_idx = `1`; col_idx < args.ColumnCount(); col_idx++) {
123	args.data [col_idx].ToUnifiedFormat(count: args.size(), data&: orrified_data [col_idx - `1`]);
124	}
125
126	// first figure out the lengths
127	for (idx_t col_idx = `1`; col_idx < args.ColumnCount(); col_idx++) {
128	auto &idata = orrified_data [col_idx - `1`];
129
130	auto input_data = UnifiedVectorFormat::GetData<string_t>(format: idata);
131	for (idx_t i = `0`; i < count; i++) {
132	auto ridx = rsel.get_index(idx: i);
133	auto sep_idx = sep_sel.get_index(idx: ridx);
134	auto idx = idata.sel->get_index(idx: ridx);
135	if (!idata.validity.RowIsValid(row_idx: idx)) {
136	continue;
137	}
138	if (has_results [ridx]) {
139	result_lengths [ridx] += sep_data[sep_idx].GetSize();
140	}
141	result_lengths [ridx] += input_data[idx].GetSize();
142	has_results [ridx] = true;
143	}
144	}
145
146	// first we allocate the empty strings for each of the values
147	auto result_data = FlatVector::GetData<string_t>(vector&: result);
148	for (idx_t i = `0`; i < count; i++) {
149	auto ridx = rsel.get_index(idx: i);
150	// allocate an empty string of the required size
151	result_data[ridx] = StringVector::EmptyString(vector&: result, len: result_lengths [ridx]);
152	// we reuse the result_lengths vector to store the currently appended size
153	result_lengths [ridx] = `0`;
154	has_results [ridx] = false;
155	}
156
157	// now that the empty space for the strings has been allocated, perform the concatenation
158	for (idx_t col_idx = `1`; col_idx < args.ColumnCount(); col_idx++) {
159	auto &idata = orrified_data [col_idx - `1`];
160	auto input_data = UnifiedVectorFormat::GetData<string_t>(format: idata);
161	for (idx_t i = `0`; i < count; i++) {
162	auto ridx = rsel.get_index(idx: i);
163	auto sep_idx = sep_sel.get_index(idx: ridx);
164	auto idx = idata.sel->get_index(idx: ridx);
165	if (!idata.validity.RowIsValid(row_idx: idx)) {
166	continue;
167	}
168	if (has_results [ridx]) {
169	auto sep_size = sep_data[sep_idx].GetSize();
170	auto sep_ptr = sep_data[sep_idx].GetData();
171	memcpy(dest: result_data[ridx].GetDataWriteable() + result_lengths [ridx], src: sep_ptr, n: sep_size);
172	result_lengths [ridx] += sep_size;
173	}
174	auto input_ptr = input_data[idx].GetData();
175	auto input_len = input_data[idx].GetSize();
176	memcpy(dest: result_data[ridx].GetDataWriteable() + result_lengths [ridx], src: input_ptr, n: input_len);
177	result_lengths [ridx] += input_len;
178	has_results [ridx] = true;
179	}
180	}
181	for (idx_t i = `0`; i < count; i++) {
182	auto ridx = rsel.get_index(idx: i);
183	result_data[ridx].Finalize();
184	}
185	}
186
187	static void ConcatWSFunction(DataChunk &args, ExpressionState &state, Vector &result) {
188	auto &separator = args.data [`0`];
189	UnifiedVectorFormat vdata;
190	separator.ToUnifiedFormat(count: args.size(), data&: vdata);
191
192	result.SetVectorType(VectorType::CONSTANT_VECTOR);
193	for (idx_t col_idx = `0`; col_idx < args.ColumnCount(); col_idx++) {
194	if (args.data [col_idx].GetVectorType() != VectorType::CONSTANT_VECTOR) {
195	result.SetVectorType(VectorType::FLAT_VECTOR);
196	break;
197	}
198	}
199	switch (separator.GetVectorType()) {
200	case VectorType::CONSTANT_VECTOR: {
201	if (ConstantVector::IsNull(vector: separator)) {
202	// constant NULL as separator: return constant NULL vector
203	result.SetVectorType(VectorType::CONSTANT_VECTOR);
204	ConstantVector::SetNull(vector&: result, is_null: true);
205	return;
206	}
207	// no null values
208	auto sel = FlatVector::IncrementalSelectionVector();
209	TemplatedConcatWS(args, sep_data: UnifiedVectorFormat::GetData<string_t>(format: vdata), sep_sel: vdata.sel, rsel: sel, count: args.size(), result);
210	return;
211	}
212	default: {
213	// default case: loop over nullmask and create a non-null selection vector
214	idx_t not_null_count = `0`;
215	SelectionVector not_null_vector(STANDARD_VECTOR_SIZE);
216	auto &result_mask = FlatVector::Validity(vector&: result);
217	for (idx_t i = `0`; i < args.size(); i++) {
218	if (!vdata.validity.RowIsValid(row_idx: vdata.sel->get_index(idx: i))) {
219	result_mask.SetInvalid(i);
220	} else {
221	not_null_vector.set_index(idx: not_null_count++, loc: i);
222	}
223	}
224	TemplatedConcatWS(args, sep_data: UnifiedVectorFormat::GetData<string_t>(format: vdata), sep_sel: *vdata.sel, rsel: not_null_vector,
225	count: not_null_count, result);
226	return;
227	}
228	}
229	}
230
231	void ConcatFun::RegisterFunction(BuiltinFunctions &set) {
232	// the concat operator and concat function have different behavior regarding NULLs
233	// this is strange but seems consistent with postgresql and mysql
234	// (sqlite does not support the concat function, only the concat operator)
235
236	// the concat operator behaves as one would expect: any NULL value present results in a NULL
237	// i.e. NULL \|\| 'hello' = NULL
238	// the concat function, however, treats NULL values as an empty string
239	// i.e. concat(NULL, 'hello') = 'hello'
240	// concat_ws functions similarly to the concat function, except the result is NULL if the separator is NULL
241	// if the separator is not NULL, however, NULL values are counted as empty string
242	// there is one separate rule: there are no separators added between NULL values
243	// so the NULL value and empty string are different!
244	// e.g.:
245	// concat_ws(',', NULL, NULL) = ""
246	// concat_ws(',', '', '') = ","
247	ScalarFunction concat = ScalarFunction ("concat", {LogicalType::VARCHAR}, LogicalType::VARCHAR, ConcatFunction);
248	concat.varargs = LogicalType::VARCHAR;
249	concat.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
250	set.AddFunction(function: concat);
251
252	ScalarFunctionSet concat_op("\|\|");
253	concat_op.AddFunction(
254	function: ScalarFunction ({LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::VARCHAR, ConcatOperator));
255	concat_op.AddFunction(function: ScalarFunction ({LogicalType::BLOB, LogicalType::BLOB}, LogicalType::BLOB, ConcatOperator));
256	concat_op.AddFunction(function: ListConcatFun::GetFunction());
257	for (auto &fun : concat_op.functions) {
258	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
259	}
260	set.AddFunction(set: concat_op);
261
262	ScalarFunction concat_ws = ScalarFunction ("concat_ws", {LogicalType::VARCHAR, LogicalType::VARCHAR},
263	LogicalType::VARCHAR, ConcatWSFunction);
264	concat_ws.varargs = LogicalType::VARCHAR;
265	concat_ws.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
266	set.AddFunction(function: concat_ws);
267	}
268
269	} // namespace duckdb
270

// Source: Velox/build/_deps/duckdb-src/src/function/scalar/string/concat.cpp