1#include "duckdb/common/vector_operations/vector_operations.hpp"
2#include "duckdb/common/operator/comparison_operators.hpp"
3
4namespace duckdb {
5
6struct DistinctBinaryLambdaWrapper {
7 template <class OP, class LEFT_TYPE, class RIGHT_TYPE, class RESULT_TYPE>
8 static inline RESULT_TYPE Operation(LEFT_TYPE left, RIGHT_TYPE right, bool is_left_null, bool is_right_null) {
9 return OP::template Operation<LEFT_TYPE>(left, right, is_left_null, is_right_null);
10 }
11};
12
13template <class LEFT_TYPE, class RIGHT_TYPE, class RESULT_TYPE, class OP>
14static void DistinctExecuteGenericLoop(const LEFT_TYPE *__restrict ldata, const RIGHT_TYPE *__restrict rdata,
15 RESULT_TYPE *__restrict result_data, const SelectionVector *__restrict lsel,
16 const SelectionVector *__restrict rsel, idx_t count, ValidityMask &lmask,
17 ValidityMask &rmask, ValidityMask &result_mask) {
18 for (idx_t i = 0; i < count; i++) {
19 auto lindex = lsel->get_index(idx: i);
20 auto rindex = rsel->get_index(idx: i);
21 auto lentry = ldata[lindex];
22 auto rentry = rdata[rindex];
23 result_data[i] =
24 OP::template Operation<LEFT_TYPE>(lentry, rentry, !lmask.RowIsValid(row_idx: lindex), !rmask.RowIsValid(row_idx: rindex));
25 }
26}
27
28template <class LEFT_TYPE, class RIGHT_TYPE, class RESULT_TYPE, class OP>
29static void DistinctExecuteConstant(Vector &left, Vector &right, Vector &result) {
30 result.SetVectorType(VectorType::CONSTANT_VECTOR);
31
32 auto ldata = ConstantVector::GetData<LEFT_TYPE>(left);
33 auto rdata = ConstantVector::GetData<RIGHT_TYPE>(right);
34 auto result_data = ConstantVector::GetData<RESULT_TYPE>(result);
35 *result_data =
36 OP::template Operation<LEFT_TYPE>(*ldata, *rdata, ConstantVector::IsNull(vector: left), ConstantVector::IsNull(vector: right));
37}
38
39template <class LEFT_TYPE, class RIGHT_TYPE, class RESULT_TYPE, class OP>
40static void DistinctExecuteGeneric(Vector &left, Vector &right, Vector &result, idx_t count) {
41 if (left.GetVectorType() == VectorType::CONSTANT_VECTOR && right.GetVectorType() == VectorType::CONSTANT_VECTOR) {
42 DistinctExecuteConstant<LEFT_TYPE, RIGHT_TYPE, RESULT_TYPE, OP>(left, right, result);
43 } else {
44 UnifiedVectorFormat ldata, rdata;
45
46 left.ToUnifiedFormat(count, data&: ldata);
47 right.ToUnifiedFormat(count, data&: rdata);
48
49 result.SetVectorType(VectorType::FLAT_VECTOR);
50 auto result_data = FlatVector::GetData<RESULT_TYPE>(result);
51 DistinctExecuteGenericLoop<LEFT_TYPE, RIGHT_TYPE, RESULT_TYPE, OP>(
52 UnifiedVectorFormat::GetData<LEFT_TYPE>(ldata), UnifiedVectorFormat::GetData<RIGHT_TYPE>(rdata),
53 result_data, ldata.sel, rdata.sel, count, ldata.validity, rdata.validity, FlatVector::Validity(vector&: result));
54 }
55}
56
57template <class LEFT_TYPE, class RIGHT_TYPE, class RESULT_TYPE, class OP>
58static void DistinctExecuteSwitch(Vector &left, Vector &right, Vector &result, idx_t count) {
59 DistinctExecuteGeneric<LEFT_TYPE, RIGHT_TYPE, RESULT_TYPE, OP>(left, right, result, count);
60}
61
62template <class LEFT_TYPE, class RIGHT_TYPE, class RESULT_TYPE, class OP>
63static void DistinctExecute(Vector &left, Vector &right, Vector &result, idx_t count) {
64 DistinctExecuteSwitch<LEFT_TYPE, RIGHT_TYPE, RESULT_TYPE, OP>(left, right, result, count);
65}
66
67template <class LEFT_TYPE, class RIGHT_TYPE, class OP, bool NO_NULL, bool HAS_TRUE_SEL, bool HAS_FALSE_SEL>
68static inline idx_t
69DistinctSelectGenericLoop(const LEFT_TYPE *__restrict ldata, const RIGHT_TYPE *__restrict rdata,
70 const SelectionVector *__restrict lsel, const SelectionVector *__restrict rsel,
71 const SelectionVector *__restrict result_sel, idx_t count, ValidityMask &lmask,
72 ValidityMask &rmask, SelectionVector *true_sel, SelectionVector *false_sel) {
73 idx_t true_count = 0, false_count = 0;
74 for (idx_t i = 0; i < count; i++) {
75 auto result_idx = result_sel->get_index(idx: i);
76 auto lindex = lsel->get_index(idx: i);
77 auto rindex = rsel->get_index(idx: i);
78 if (NO_NULL) {
79 if (OP::Operation(ldata[lindex], rdata[rindex], false, false)) {
80 if (HAS_TRUE_SEL) {
81 true_sel->set_index(idx: true_count++, loc: result_idx);
82 }
83 } else {
84 if (HAS_FALSE_SEL) {
85 false_sel->set_index(idx: false_count++, loc: result_idx);
86 }
87 }
88 } else {
89 if (OP::Operation(ldata[lindex], rdata[rindex], !lmask.RowIsValid(row_idx: lindex), !rmask.RowIsValid(row_idx: rindex))) {
90 if (HAS_TRUE_SEL) {
91 true_sel->set_index(idx: true_count++, loc: result_idx);
92 }
93 } else {
94 if (HAS_FALSE_SEL) {
95 false_sel->set_index(idx: false_count++, loc: result_idx);
96 }
97 }
98 }
99 }
100 if (HAS_TRUE_SEL) {
101 return true_count;
102 } else {
103 return count - false_count;
104 }
105}
106template <class LEFT_TYPE, class RIGHT_TYPE, class OP, bool NO_NULL>
107static inline idx_t
108DistinctSelectGenericLoopSelSwitch(const LEFT_TYPE *__restrict ldata, const RIGHT_TYPE *__restrict rdata,
109 const SelectionVector *__restrict lsel, const SelectionVector *__restrict rsel,
110 const SelectionVector *__restrict result_sel, idx_t count, ValidityMask &lmask,
111 ValidityMask &rmask, SelectionVector *true_sel, SelectionVector *false_sel) {
112 if (true_sel && false_sel) {
113 return DistinctSelectGenericLoop<LEFT_TYPE, RIGHT_TYPE, OP, NO_NULL, true, true>(
114 ldata, rdata, lsel, rsel, result_sel, count, lmask, rmask, true_sel, false_sel);
115 } else if (true_sel) {
116 return DistinctSelectGenericLoop<LEFT_TYPE, RIGHT_TYPE, OP, NO_NULL, true, false>(
117 ldata, rdata, lsel, rsel, result_sel, count, lmask, rmask, true_sel, false_sel);
118 } else {
119 D_ASSERT(false_sel);
120 return DistinctSelectGenericLoop<LEFT_TYPE, RIGHT_TYPE, OP, NO_NULL, false, true>(
121 ldata, rdata, lsel, rsel, result_sel, count, lmask, rmask, true_sel, false_sel);
122 }
123}
124
125template <class LEFT_TYPE, class RIGHT_TYPE, class OP>
126static inline idx_t
127DistinctSelectGenericLoopSwitch(const LEFT_TYPE *__restrict ldata, const RIGHT_TYPE *__restrict rdata,
128 const SelectionVector *__restrict lsel, const SelectionVector *__restrict rsel,
129 const SelectionVector *__restrict result_sel, idx_t count, ValidityMask &lmask,
130 ValidityMask &rmask, SelectionVector *true_sel, SelectionVector *false_sel) {
131 if (!lmask.AllValid() || !rmask.AllValid()) {
132 return DistinctSelectGenericLoopSelSwitch<LEFT_TYPE, RIGHT_TYPE, OP, false>(
133 ldata, rdata, lsel, rsel, result_sel, count, lmask, rmask, true_sel, false_sel);
134 } else {
135 return DistinctSelectGenericLoopSelSwitch<LEFT_TYPE, RIGHT_TYPE, OP, true>(
136 ldata, rdata, lsel, rsel, result_sel, count, lmask, rmask, true_sel, false_sel);
137 }
138}
139
140template <class LEFT_TYPE, class RIGHT_TYPE, class OP>
141static idx_t DistinctSelectGeneric(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
142 SelectionVector *true_sel, SelectionVector *false_sel) {
143 UnifiedVectorFormat ldata, rdata;
144
145 left.ToUnifiedFormat(count, data&: ldata);
146 right.ToUnifiedFormat(count, data&: rdata);
147
148 return DistinctSelectGenericLoopSwitch<LEFT_TYPE, RIGHT_TYPE, OP>(
149 UnifiedVectorFormat::GetData<LEFT_TYPE>(ldata), UnifiedVectorFormat::GetData<RIGHT_TYPE>(rdata), ldata.sel,
150 rdata.sel, sel, count, ldata.validity, rdata.validity, true_sel, false_sel);
151}
152template <class LEFT_TYPE, class RIGHT_TYPE, class OP, bool LEFT_CONSTANT, bool RIGHT_CONSTANT, bool NO_NULL,
153 bool HAS_TRUE_SEL, bool HAS_FALSE_SEL>
154static inline idx_t DistinctSelectFlatLoop(LEFT_TYPE *__restrict ldata, RIGHT_TYPE *__restrict rdata,
155 const SelectionVector *sel, idx_t count, ValidityMask &lmask,
156 ValidityMask &rmask, SelectionVector *true_sel, SelectionVector *false_sel) {
157 idx_t true_count = 0, false_count = 0;
158 for (idx_t i = 0; i < count; i++) {
159 idx_t result_idx = sel->get_index(idx: i);
160 idx_t lidx = LEFT_CONSTANT ? 0 : i;
161 idx_t ridx = RIGHT_CONSTANT ? 0 : i;
162 const bool lnull = !lmask.RowIsValid(row_idx: lidx);
163 const bool rnull = !rmask.RowIsValid(row_idx: ridx);
164 bool comparison_result = OP::Operation(ldata[lidx], rdata[ridx], lnull, rnull);
165 if (HAS_TRUE_SEL) {
166 true_sel->set_index(idx: true_count, loc: result_idx);
167 true_count += comparison_result;
168 }
169 if (HAS_FALSE_SEL) {
170 false_sel->set_index(idx: false_count, loc: result_idx);
171 false_count += !comparison_result;
172 }
173 }
174 if (HAS_TRUE_SEL) {
175 return true_count;
176 } else {
177 return count - false_count;
178 }
179}
180
181template <class LEFT_TYPE, class RIGHT_TYPE, class OP, bool LEFT_CONSTANT, bool RIGHT_CONSTANT, bool NO_NULL>
182static inline idx_t DistinctSelectFlatLoopSelSwitch(LEFT_TYPE *__restrict ldata, RIGHT_TYPE *__restrict rdata,
183 const SelectionVector *sel, idx_t count, ValidityMask &lmask,
184 ValidityMask &rmask, SelectionVector *true_sel,
185 SelectionVector *false_sel) {
186 if (true_sel && false_sel) {
187 return DistinctSelectFlatLoop<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT, NO_NULL, true, true>(
188 ldata, rdata, sel, count, lmask, rmask, true_sel, false_sel);
189 } else if (true_sel) {
190 return DistinctSelectFlatLoop<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT, NO_NULL, true, false>(
191 ldata, rdata, sel, count, lmask, rmask, true_sel, false_sel);
192 } else {
193 D_ASSERT(false_sel);
194 return DistinctSelectFlatLoop<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT, NO_NULL, false, true>(
195 ldata, rdata, sel, count, lmask, rmask, true_sel, false_sel);
196 }
197}
198
199template <class LEFT_TYPE, class RIGHT_TYPE, class OP, bool LEFT_CONSTANT, bool RIGHT_CONSTANT>
200static inline idx_t DistinctSelectFlatLoopSwitch(LEFT_TYPE *__restrict ldata, RIGHT_TYPE *__restrict rdata,
201 const SelectionVector *sel, idx_t count, ValidityMask &lmask,
202 ValidityMask &rmask, SelectionVector *true_sel,
203 SelectionVector *false_sel) {
204 return DistinctSelectFlatLoopSelSwitch<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT, true>(
205 ldata, rdata, sel, count, lmask, rmask, true_sel, false_sel);
206}
207template <class LEFT_TYPE, class RIGHT_TYPE, class OP, bool LEFT_CONSTANT, bool RIGHT_CONSTANT>
208static idx_t DistinctSelectFlat(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
209 SelectionVector *true_sel, SelectionVector *false_sel) {
210 auto ldata = FlatVector::GetData<LEFT_TYPE>(left);
211 auto rdata = FlatVector::GetData<RIGHT_TYPE>(right);
212 if (LEFT_CONSTANT) {
213 ValidityMask validity;
214 if (ConstantVector::IsNull(vector: left)) {
215 validity.SetAllInvalid(1);
216 }
217 return DistinctSelectFlatLoopSwitch<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT>(
218 ldata, rdata, sel, count, validity, FlatVector::Validity(vector&: right), true_sel, false_sel);
219 } else if (RIGHT_CONSTANT) {
220 ValidityMask validity;
221 if (ConstantVector::IsNull(vector: right)) {
222 validity.SetAllInvalid(1);
223 }
224 return DistinctSelectFlatLoopSwitch<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT>(
225 ldata, rdata, sel, count, FlatVector::Validity(vector&: left), validity, true_sel, false_sel);
226 } else {
227 return DistinctSelectFlatLoopSwitch<LEFT_TYPE, RIGHT_TYPE, OP, LEFT_CONSTANT, RIGHT_CONSTANT>(
228 ldata, rdata, sel, count, FlatVector::Validity(vector&: left), FlatVector::Validity(vector&: right), true_sel, false_sel);
229 }
230}
231template <class LEFT_TYPE, class RIGHT_TYPE, class OP>
232static idx_t DistinctSelectConstant(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
233 SelectionVector *true_sel, SelectionVector *false_sel) {
234 auto ldata = ConstantVector::GetData<LEFT_TYPE>(left);
235 auto rdata = ConstantVector::GetData<RIGHT_TYPE>(right);
236
237 // both sides are constant, return either 0 or the count
238 // in this case we do not fill in the result selection vector at all
239 if (!OP::Operation(*ldata, *rdata, ConstantVector::IsNull(vector: left), ConstantVector::IsNull(vector: right))) {
240 if (false_sel) {
241 for (idx_t i = 0; i < count; i++) {
242 false_sel->set_index(idx: i, loc: sel->get_index(idx: i));
243 }
244 }
245 return 0;
246 } else {
247 if (true_sel) {
248 for (idx_t i = 0; i < count; i++) {
249 true_sel->set_index(idx: i, loc: sel->get_index(idx: i));
250 }
251 }
252 return count;
253 }
254}
255
256template <class LEFT_TYPE, class RIGHT_TYPE, class OP>
257static idx_t DistinctSelect(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
258 SelectionVector *true_sel, SelectionVector *false_sel) {
259 if (!sel) {
260 sel = FlatVector::IncrementalSelectionVector();
261 }
262 if (left.GetVectorType() == VectorType::CONSTANT_VECTOR && right.GetVectorType() == VectorType::CONSTANT_VECTOR) {
263 return DistinctSelectConstant<LEFT_TYPE, RIGHT_TYPE, OP>(left, right, sel, count, true_sel, false_sel);
264 } else if (left.GetVectorType() == VectorType::CONSTANT_VECTOR &&
265 right.GetVectorType() == VectorType::FLAT_VECTOR) {
266 return DistinctSelectFlat<LEFT_TYPE, RIGHT_TYPE, OP, true, false>(left, right, sel, count, true_sel, false_sel);
267 } else if (left.GetVectorType() == VectorType::FLAT_VECTOR &&
268 right.GetVectorType() == VectorType::CONSTANT_VECTOR) {
269 return DistinctSelectFlat<LEFT_TYPE, RIGHT_TYPE, OP, false, true>(left, right, sel, count, true_sel, false_sel);
270 } else if (left.GetVectorType() == VectorType::FLAT_VECTOR && right.GetVectorType() == VectorType::FLAT_VECTOR) {
271 return DistinctSelectFlat<LEFT_TYPE, RIGHT_TYPE, OP, false, false>(left, right, sel, count, true_sel,
272 false_sel);
273 } else {
274 return DistinctSelectGeneric<LEFT_TYPE, RIGHT_TYPE, OP>(left, right, sel, count, true_sel, false_sel);
275 }
276}
277
278template <class OP>
279static idx_t DistinctSelectNotNull(Vector &left, Vector &right, const idx_t count, idx_t &true_count,
280 const SelectionVector &sel, SelectionVector &maybe_vec, OptionalSelection &true_opt,
281 OptionalSelection &false_opt) {
282 UnifiedVectorFormat lvdata, rvdata;
283 left.ToUnifiedFormat(count, data&: lvdata);
284 right.ToUnifiedFormat(count, data&: rvdata);
285
286 auto &lmask = lvdata.validity;
287 auto &rmask = rvdata.validity;
288
289 idx_t remaining = 0;
290 if (lmask.AllValid() && rmask.AllValid()) {
291 // None are NULL, distinguish values.
292 for (idx_t i = 0; i < count; ++i) {
293 const auto idx = sel.get_index(idx: i);
294 maybe_vec.set_index(idx: remaining++, loc: idx);
295 }
296 return remaining;
297 }
298
299 // Slice the Vectors down to the rows that are not determined (i.e., neither is NULL)
300 SelectionVector slicer(count);
301 true_count = 0;
302 idx_t false_count = 0;
303 for (idx_t i = 0; i < count; ++i) {
304 const auto result_idx = sel.get_index(idx: i);
305 const auto lidx = lvdata.sel->get_index(idx: i);
306 const auto ridx = rvdata.sel->get_index(idx: i);
307 const auto lnull = !lmask.RowIsValid(row_idx: lidx);
308 const auto rnull = !rmask.RowIsValid(row_idx: ridx);
309 if (lnull || rnull) {
310 // If either is NULL then we can major distinguish them
311 if (!OP::Operation(false, false, lnull, rnull)) {
312 false_opt.Append(count&: false_count, idx: result_idx);
313 } else {
314 true_opt.Append(count&: true_count, idx: result_idx);
315 }
316 } else {
317 // Neither is NULL, distinguish values.
318 slicer.set_index(idx: remaining, loc: i);
319 maybe_vec.set_index(idx: remaining++, loc: result_idx);
320 }
321 }
322
323 true_opt.Advance(completed: true_count);
324 false_opt.Advance(completed: false_count);
325
326 if (remaining && remaining < count) {
327 left.Slice(sel: slicer, count: remaining);
328 right.Slice(sel: slicer, count: remaining);
329 }
330
331 return remaining;
332}
333
334struct PositionComparator {
335 // Select the rows that definitely match.
336 // Default to the same as the final row
337 template <typename OP>
338 static idx_t Definite(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
339 SelectionVector *true_sel, SelectionVector &false_sel) {
340 return Final<OP>(left, right, sel, count, true_sel, &false_sel);
341 }
342
343 // Select the possible rows that need further testing.
344 // Usually this means Is Not Distinct, as those are the semantics used by Postges
345 template <typename OP>
346 static idx_t Possible(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
347 SelectionVector &true_sel, SelectionVector *false_sel) {
348 return VectorOperations::NestedEquals(left, right, sel, count, true_sel: &true_sel, false_sel);
349 }
350
351 // Select the matching rows for the final position.
352 // This needs to be specialised.
353 template <typename OP>
354 static idx_t Final(Vector &left, Vector &right, const SelectionVector &sel, idx_t count, SelectionVector *true_sel,
355 SelectionVector *false_sel) {
356 return 0;
357 }
358
359 // Tie-break based on length when one of the sides has been exhausted, returning true if the LHS matches.
360 // This essentially means that the existing positions compare equal.
361 // Default to the same semantics as the OP for idx_t. This works in most cases.
362 template <typename OP>
363 static bool TieBreak(const idx_t lpos, const idx_t rpos) {
364 return OP::Operation(lpos, rpos, false, false);
365 }
366};
367
368// NotDistinctFrom must always check every column
369template <>
370idx_t PositionComparator::Definite<duckdb::NotDistinctFrom>(Vector &left, Vector &right, const SelectionVector &sel,
371 idx_t count, SelectionVector *true_sel,
372 SelectionVector &false_sel) {
373 return 0;
374}
375
376template <>
377idx_t PositionComparator::Final<duckdb::NotDistinctFrom>(Vector &left, Vector &right, const SelectionVector &sel,
378 idx_t count, SelectionVector *true_sel,
379 SelectionVector *false_sel) {
380 return VectorOperations::NestedEquals(left, right, sel, count, true_sel, false_sel);
381}
382
383// DistinctFrom must check everything that matched
384template <>
385idx_t PositionComparator::Possible<duckdb::DistinctFrom>(Vector &left, Vector &right, const SelectionVector &sel,
386 idx_t count, SelectionVector &true_sel,
387 SelectionVector *false_sel) {
388 return count;
389}
390
391template <>
392idx_t PositionComparator::Final<duckdb::DistinctFrom>(Vector &left, Vector &right, const SelectionVector &sel,
393 idx_t count, SelectionVector *true_sel,
394 SelectionVector *false_sel) {
395 return VectorOperations::NestedNotEquals(left, right, sel, count, true_sel, false_sel);
396}
397
398// Non-strict inequalities must use strict comparisons for Definite
399template <>
400idx_t PositionComparator::Definite<duckdb::DistinctLessThanEquals>(Vector &left, Vector &right,
401 const SelectionVector &sel, idx_t count,
402 SelectionVector *true_sel,
403 SelectionVector &false_sel) {
404 return VectorOperations::DistinctGreaterThan(left&: right, right&: left, sel: &sel, count, true_sel, false_sel: &false_sel);
405}
406
407template <>
408idx_t PositionComparator::Final<duckdb::DistinctLessThanEquals>(Vector &left, Vector &right, const SelectionVector &sel,
409 idx_t count, SelectionVector *true_sel,
410 SelectionVector *false_sel) {
411 return VectorOperations::DistinctGreaterThanEquals(left&: right, right&: left, sel: &sel, count, true_sel, false_sel);
412}
413
414template <>
415idx_t PositionComparator::Definite<duckdb::DistinctGreaterThanEquals>(Vector &left, Vector &right,
416 const SelectionVector &sel, idx_t count,
417 SelectionVector *true_sel,
418 SelectionVector &false_sel) {
419 return VectorOperations::DistinctGreaterThan(left, right, sel: &sel, count, true_sel, false_sel: &false_sel);
420}
421
422template <>
423idx_t PositionComparator::Final<duckdb::DistinctGreaterThanEquals>(Vector &left, Vector &right,
424 const SelectionVector &sel, idx_t count,
425 SelectionVector *true_sel,
426 SelectionVector *false_sel) {
427 return VectorOperations::DistinctGreaterThanEquals(left, right, sel: &sel, count, true_sel, false_sel);
428}
429
430// Strict inequalities just use strict for both Definite and Final
431template <>
432idx_t PositionComparator::Final<duckdb::DistinctLessThan>(Vector &left, Vector &right, const SelectionVector &sel,
433 idx_t count, SelectionVector *true_sel,
434 SelectionVector *false_sel) {
435 return VectorOperations::DistinctGreaterThan(left&: right, right&: left, sel: &sel, count, true_sel, false_sel);
436}
437
438template <>
439idx_t PositionComparator::Final<duckdb::DistinctGreaterThan>(Vector &left, Vector &right, const SelectionVector &sel,
440 idx_t count, SelectionVector *true_sel,
441 SelectionVector *false_sel) {
442 return VectorOperations::DistinctGreaterThan(left, right, sel: &sel, count, true_sel, false_sel);
443}
444
445using StructEntries = vector<unique_ptr<Vector>>;
446
447static void ExtractNestedSelection(const SelectionVector &slice_sel, const idx_t count, const SelectionVector &sel,
448 OptionalSelection &opt) {
449
450 for (idx_t i = 0; i < count;) {
451 const auto slice_idx = slice_sel.get_index(idx: i);
452 const auto result_idx = sel.get_index(idx: slice_idx);
453 opt.Append(count&: i, idx: result_idx);
454 }
455 opt.Advance(completed: count);
456}
457
458static void DensifyNestedSelection(const SelectionVector &dense_sel, const idx_t count, SelectionVector &slice_sel) {
459 for (idx_t i = 0; i < count; ++i) {
460 slice_sel.set_index(idx: i, loc: dense_sel.get_index(idx: i));
461 }
462}
463
464template <class OP>
465static idx_t DistinctSelectStruct(Vector &left, Vector &right, idx_t count, const SelectionVector &sel,
466 OptionalSelection &true_opt, OptionalSelection &false_opt) {
467 if (count == 0) {
468 return 0;
469 }
470
471 // Avoid allocating in the 99% of the cases where we don't need to.
472 StructEntries lsliced, rsliced;
473 auto &lchildren = StructVector::GetEntries(vector&: left);
474 auto &rchildren = StructVector::GetEntries(vector&: right);
475 D_ASSERT(lchildren.size() == rchildren.size());
476
477 // In order to reuse the comparators, we have to track what passed and failed internally.
478 // To do that, we need local SVs that we then merge back into the real ones after every pass.
479 const auto vcount = count;
480 SelectionVector slice_sel(count);
481 for (idx_t i = 0; i < count; ++i) {
482 slice_sel.set_index(idx: i, loc: i);
483 }
484
485 SelectionVector true_sel(count);
486 SelectionVector false_sel(count);
487
488 idx_t match_count = 0;
489 for (idx_t col_no = 0; col_no < lchildren.size(); ++col_no) {
490 // Slice the children to maintain density
491 Vector lchild(*lchildren[col_no]);
492 lchild.Flatten(count: vcount);
493 lchild.Slice(sel: slice_sel, count);
494
495 Vector rchild(*rchildren[col_no]);
496 rchild.Flatten(count: vcount);
497 rchild.Slice(sel: slice_sel, count);
498
499 // Find everything that definitely matches
500 auto true_count = PositionComparator::Definite<OP>(lchild, rchild, slice_sel, count, &true_sel, false_sel);
501 if (true_count > 0) {
502 auto false_count = count - true_count;
503
504 // Extract the definite matches into the true result
505 ExtractNestedSelection(false_count ? true_sel : slice_sel, true_count, sel, true_opt);
506
507 // Remove the definite matches from the slicing vector
508 DensifyNestedSelection(false_sel, false_count, slice_sel);
509
510 match_count += true_count;
511 count -= true_count;
512 }
513
514 if (col_no != lchildren.size() - 1) {
515 // Find what might match on the next position
516 true_count = PositionComparator::Possible<OP>(lchild, rchild, slice_sel, count, true_sel, &false_sel);
517 auto false_count = count - true_count;
518
519 // Extract the definite failures into the false result
520 ExtractNestedSelection(true_count ? false_sel : slice_sel, false_count, sel, false_opt);
521
522 // Remove any definite failures from the slicing vector
523 if (false_count) {
524 DensifyNestedSelection(true_sel, true_count, slice_sel);
525 }
526
527 count = true_count;
528 } else {
529 true_count = PositionComparator::Final<OP>(lchild, rchild, slice_sel, count, &true_sel, &false_sel);
530 auto false_count = count - true_count;
531
532 // Extract the definite matches into the true result
533 ExtractNestedSelection(false_count ? true_sel : slice_sel, true_count, sel, true_opt);
534
535 // Extract the definite failures into the false result
536 ExtractNestedSelection(true_count ? false_sel : slice_sel, false_count, sel, false_opt);
537
538 match_count += true_count;
539 }
540 }
541 return match_count;
542}
543
544static void PositionListCursor(SelectionVector &cursor, UnifiedVectorFormat &vdata, const idx_t pos,
545 const SelectionVector &slice_sel, const idx_t count) {
546 const auto data = UnifiedVectorFormat::GetData<list_entry_t>(format: vdata);
547 for (idx_t i = 0; i < count; ++i) {
548 const auto slice_idx = slice_sel.get_index(idx: i);
549
550 const auto lidx = vdata.sel->get_index(idx: slice_idx);
551 const auto &entry = data[lidx];
552 cursor.set_index(idx: i, loc: entry.offset + pos);
553 }
554}
555
556template <class OP>
557static idx_t DistinctSelectList(Vector &left, Vector &right, idx_t count, const SelectionVector &sel,
558 OptionalSelection &true_opt, OptionalSelection &false_opt) {
559 if (count == 0) {
560 return count;
561 }
562
563 // Create dictionary views of the children so we can vectorise the positional comparisons.
564 SelectionVector lcursor(count);
565 SelectionVector rcursor(count);
566
567 ListVector::GetEntry(vector&: left).Flatten(count: ListVector::GetListSize(vector: left));
568 ListVector::GetEntry(vector&: right).Flatten(count: ListVector::GetListSize(vector: right));
569 Vector lchild(ListVector::GetEntry(vector&: left), lcursor, count);
570 Vector rchild(ListVector::GetEntry(vector&: right), rcursor, count);
571
572 // To perform the positional comparison, we use a vectorisation of the following algorithm:
573 // bool CompareLists(T *left, idx_t nleft, T *right, nright) {
574 // for (idx_t pos = 0; ; ++pos) {
575 // if (nleft == pos || nright == pos)
576 // return OP::TieBreak(nleft, nright);
577 // if (OP::Definite(*left, *right))
578 // return true;
579 // if (!OP::Maybe(*left, *right))
580 // return false;
581 // }
582 // ++left, ++right;
583 // }
584 // }
585
586 // Get pointers to the list entries
587 UnifiedVectorFormat lvdata;
588 left.ToUnifiedFormat(count, data&: lvdata);
589 const auto ldata = UnifiedVectorFormat::GetData<list_entry_t>(format: lvdata);
590
591 UnifiedVectorFormat rvdata;
592 right.ToUnifiedFormat(count, data&: rvdata);
593 const auto rdata = UnifiedVectorFormat::GetData<list_entry_t>(format: rvdata);
594
595 // In order to reuse the comparators, we have to track what passed and failed internally.
596 // To do that, we need local SVs that we then merge back into the real ones after every pass.
597 SelectionVector slice_sel(count);
598 for (idx_t i = 0; i < count; ++i) {
599 slice_sel.set_index(idx: i, loc: i);
600 }
601
602 SelectionVector true_sel(count);
603 SelectionVector false_sel(count);
604
605 idx_t match_count = 0;
606 for (idx_t pos = 0; count > 0; ++pos) {
607 // Set up the cursors for the current position
608 PositionListCursor(cursor&: lcursor, vdata&: lvdata, pos, slice_sel, count);
609 PositionListCursor(cursor&: rcursor, vdata&: rvdata, pos, slice_sel, count);
610
611 // Tie-break the pairs where one of the LISTs is exhausted.
612 idx_t true_count = 0;
613 idx_t false_count = 0;
614 idx_t maybe_count = 0;
615 for (idx_t i = 0; i < count; ++i) {
616 const auto slice_idx = slice_sel.get_index(idx: i);
617 const auto lidx = lvdata.sel->get_index(idx: slice_idx);
618 const auto &lentry = ldata[lidx];
619 const auto ridx = rvdata.sel->get_index(idx: slice_idx);
620 const auto &rentry = rdata[ridx];
621 if (lentry.length == pos || rentry.length == pos) {
622 const auto idx = sel.get_index(idx: slice_idx);
623 if (PositionComparator::TieBreak<OP>(lentry.length, rentry.length)) {
624 true_opt.Append(count&: true_count, idx);
625 } else {
626 false_opt.Append(count&: false_count, idx);
627 }
628 } else {
629 true_sel.set_index(idx: maybe_count++, loc: slice_idx);
630 }
631 }
632 true_opt.Advance(completed: true_count);
633 false_opt.Advance(completed: false_count);
634 match_count += true_count;
635
636 // Redensify the list cursors
637 if (maybe_count < count) {
638 count = maybe_count;
639 DensifyNestedSelection(dense_sel: true_sel, count, slice_sel);
640 PositionListCursor(cursor&: lcursor, vdata&: lvdata, pos, slice_sel, count);
641 PositionListCursor(cursor&: rcursor, vdata&: rvdata, pos, slice_sel, count);
642 }
643
644 // Find everything that definitely matches
645 true_count = PositionComparator::Definite<OP>(lchild, rchild, slice_sel, count, &true_sel, false_sel);
646 if (true_count) {
647 false_count = count - true_count;
648 ExtractNestedSelection(slice_sel: false_count ? true_sel : slice_sel, count: true_count, sel, opt&: true_opt);
649 match_count += true_count;
650
651 // Redensify the list cursors
652 count -= true_count;
653 DensifyNestedSelection(dense_sel: false_sel, count, slice_sel);
654 PositionListCursor(cursor&: lcursor, vdata&: lvdata, pos, slice_sel, count);
655 PositionListCursor(cursor&: rcursor, vdata&: rvdata, pos, slice_sel, count);
656 }
657
658 // Find what might match on the next position
659 true_count = PositionComparator::Possible<OP>(lchild, rchild, slice_sel, count, true_sel, &false_sel);
660 false_count = count - true_count;
661 ExtractNestedSelection(slice_sel: true_count ? false_sel : slice_sel, count: false_count, sel, opt&: false_opt);
662
663 if (false_count) {
664 DensifyNestedSelection(dense_sel: true_sel, count: true_count, slice_sel);
665 }
666 count = true_count;
667 }
668
669 return match_count;
670}
671
672template <class OP, class OPNESTED>
673static idx_t DistinctSelectNested(Vector &left, Vector &right, const SelectionVector *sel, const idx_t count,
674 SelectionVector *true_sel, SelectionVector *false_sel) {
675 // The Select operations all use a dense pair of input vectors to partition
676 // a selection vector in a single pass. But to implement progressive comparisons,
677 // we have to make multiple passes, so we need to keep track of the original input positions
678 // and then scatter the output selections when we are done.
679 if (!sel) {
680 sel = FlatVector::IncrementalSelectionVector();
681 }
682
683 // Make buffered selections for progressive comparisons
684 // TODO: Remove unnecessary allocations
685 SelectionVector true_vec(count);
686 OptionalSelection true_opt(&true_vec);
687
688 SelectionVector false_vec(count);
689 OptionalSelection false_opt(&false_vec);
690
691 SelectionVector maybe_vec(count);
692
693 // Handle NULL nested values
694 Vector l_not_null(left);
695 Vector r_not_null(right);
696
697 idx_t match_count = 0;
698 auto unknown =
699 DistinctSelectNotNull<OP>(l_not_null, r_not_null, count, match_count, *sel, maybe_vec, true_opt, false_opt);
700
701 if (PhysicalType::LIST == left.GetType().InternalType()) {
702 match_count += DistinctSelectList<OPNESTED>(l_not_null, r_not_null, unknown, maybe_vec, true_opt, false_opt);
703 } else {
704 match_count += DistinctSelectStruct<OPNESTED>(l_not_null, r_not_null, unknown, maybe_vec, true_opt, false_opt);
705 }
706
707 // Copy the buffered selections to the output selections
708 if (true_sel) {
709 DensifyNestedSelection(dense_sel: true_vec, count: match_count, slice_sel&: *true_sel);
710 }
711
712 if (false_sel) {
713 DensifyNestedSelection(dense_sel: false_vec, count: count - match_count, slice_sel&: *false_sel);
714 }
715
716 return match_count;
717}
718
719template <typename OP>
720static void NestedDistinctExecute(Vector &left, Vector &right, Vector &result, idx_t count);
721
722template <class T, class OP>
723static inline void TemplatedDistinctExecute(Vector &left, Vector &right, Vector &result, idx_t count) {
724 DistinctExecute<T, T, bool, OP>(left, right, result, count);
725}
726template <class OP>
727static void ExecuteDistinct(Vector &left, Vector &right, Vector &result, idx_t count) {
728 D_ASSERT(left.GetType() == right.GetType() && result.GetType() == LogicalType::BOOLEAN);
729 // the inplace loops take the result as the last parameter
730 switch (left.GetType().InternalType()) {
731 case PhysicalType::BOOL:
732 case PhysicalType::INT8:
733 TemplatedDistinctExecute<int8_t, OP>(left, right, result, count);
734 break;
735 case PhysicalType::INT16:
736 TemplatedDistinctExecute<int16_t, OP>(left, right, result, count);
737 break;
738 case PhysicalType::INT32:
739 TemplatedDistinctExecute<int32_t, OP>(left, right, result, count);
740 break;
741 case PhysicalType::INT64:
742 TemplatedDistinctExecute<int64_t, OP>(left, right, result, count);
743 break;
744 case PhysicalType::UINT8:
745 TemplatedDistinctExecute<uint8_t, OP>(left, right, result, count);
746 break;
747 case PhysicalType::UINT16:
748 TemplatedDistinctExecute<uint16_t, OP>(left, right, result, count);
749 break;
750 case PhysicalType::UINT32:
751 TemplatedDistinctExecute<uint32_t, OP>(left, right, result, count);
752 break;
753 case PhysicalType::UINT64:
754 TemplatedDistinctExecute<uint64_t, OP>(left, right, result, count);
755 break;
756 case PhysicalType::INT128:
757 TemplatedDistinctExecute<hugeint_t, OP>(left, right, result, count);
758 break;
759 case PhysicalType::FLOAT:
760 TemplatedDistinctExecute<float, OP>(left, right, result, count);
761 break;
762 case PhysicalType::DOUBLE:
763 TemplatedDistinctExecute<double, OP>(left, right, result, count);
764 break;
765 case PhysicalType::INTERVAL:
766 TemplatedDistinctExecute<interval_t, OP>(left, right, result, count);
767 break;
768 case PhysicalType::VARCHAR:
769 TemplatedDistinctExecute<string_t, OP>(left, right, result, count);
770 break;
771 case PhysicalType::LIST:
772 case PhysicalType::STRUCT:
773 NestedDistinctExecute<OP>(left, right, result, count);
774 break;
775 default:
776 throw InternalException("Invalid type for distinct comparison");
777 }
778}
779
780template <class OP, class OPNESTED = OP>
781static idx_t TemplatedDistinctSelectOperation(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
782 SelectionVector *true_sel, SelectionVector *false_sel) {
783 // the inplace loops take the result as the last parameter
784 switch (left.GetType().InternalType()) {
785 case PhysicalType::BOOL:
786 case PhysicalType::INT8:
787 return DistinctSelect<int8_t, int8_t, OP>(left, right, sel, count, true_sel, false_sel);
788 case PhysicalType::INT16:
789 return DistinctSelect<int16_t, int16_t, OP>(left, right, sel, count, true_sel, false_sel);
790 case PhysicalType::INT32:
791 return DistinctSelect<int32_t, int32_t, OP>(left, right, sel, count, true_sel, false_sel);
792 case PhysicalType::INT64:
793 return DistinctSelect<int64_t, int64_t, OP>(left, right, sel, count, true_sel, false_sel);
794 case PhysicalType::UINT8:
795 return DistinctSelect<uint8_t, uint8_t, OP>(left, right, sel, count, true_sel, false_sel);
796 case PhysicalType::UINT16:
797 return DistinctSelect<uint16_t, uint16_t, OP>(left, right, sel, count, true_sel, false_sel);
798 case PhysicalType::UINT32:
799 return DistinctSelect<uint32_t, uint32_t, OP>(left, right, sel, count, true_sel, false_sel);
800 case PhysicalType::UINT64:
801 return DistinctSelect<uint64_t, uint64_t, OP>(left, right, sel, count, true_sel, false_sel);
802 case PhysicalType::INT128:
803 return DistinctSelect<hugeint_t, hugeint_t, OP>(left, right, sel, count, true_sel, false_sel);
804 case PhysicalType::FLOAT:
805 return DistinctSelect<float, float, OP>(left, right, sel, count, true_sel, false_sel);
806 case PhysicalType::DOUBLE:
807 return DistinctSelect<double, double, OP>(left, right, sel, count, true_sel, false_sel);
808 case PhysicalType::INTERVAL:
809 return DistinctSelect<interval_t, interval_t, OP>(left, right, sel, count, true_sel, false_sel);
810 case PhysicalType::VARCHAR:
811 return DistinctSelect<string_t, string_t, OP>(left, right, sel, count, true_sel, false_sel);
812 case PhysicalType::STRUCT:
813 case PhysicalType::LIST:
814 return DistinctSelectNested<OP, OPNESTED>(left, right, sel, count, true_sel, false_sel);
815 default:
816 throw InternalException("Invalid type for distinct selection");
817 }
818}
819
820template <typename OP>
821static void NestedDistinctExecute(Vector &left, Vector &right, Vector &result, idx_t count) {
822 const auto left_constant = left.GetVectorType() == VectorType::CONSTANT_VECTOR;
823 const auto right_constant = right.GetVectorType() == VectorType::CONSTANT_VECTOR;
824
825 if (left_constant && right_constant) {
826 // both sides are constant, so just compare one element.
827 result.SetVectorType(VectorType::CONSTANT_VECTOR);
828 auto result_data = ConstantVector::GetData<bool>(vector&: result);
829 SelectionVector true_sel(1);
830 auto match_count = TemplatedDistinctSelectOperation<OP>(left, right, nullptr, 1, &true_sel, nullptr);
831 result_data[0] = match_count > 0;
832 return;
833 }
834
835 SelectionVector true_sel(count);
836 SelectionVector false_sel(count);
837
838 // DISTINCT is either true or false
839 idx_t match_count = TemplatedDistinctSelectOperation<OP>(left, right, nullptr, count, &true_sel, &false_sel);
840
841 result.SetVectorType(VectorType::FLAT_VECTOR);
842 auto result_data = FlatVector::GetData<bool>(vector&: result);
843
844 for (idx_t i = 0; i < match_count; ++i) {
845 const auto idx = true_sel.get_index(idx: i);
846 result_data[idx] = true;
847 }
848
849 const idx_t no_match_count = count - match_count;
850 for (idx_t i = 0; i < no_match_count; ++i) {
851 const auto idx = false_sel.get_index(idx: i);
852 result_data[idx] = false;
853 }
854}
855
856void VectorOperations::DistinctFrom(Vector &left, Vector &right, Vector &result, idx_t count) {
857 ExecuteDistinct<duckdb::DistinctFrom>(left, right, result, count);
858}
859
860void VectorOperations::NotDistinctFrom(Vector &left, Vector &right, Vector &result, idx_t count) {
861 ExecuteDistinct<duckdb::NotDistinctFrom>(left, right, result, count);
862}
863
864// true := A != B with nulls being equal
865idx_t VectorOperations::DistinctFrom(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
866 SelectionVector *true_sel, SelectionVector *false_sel) {
867 return TemplatedDistinctSelectOperation<duckdb::DistinctFrom>(left, right, sel, count, true_sel, false_sel);
868}
869// true := A == B with nulls being equal
870idx_t VectorOperations::NotDistinctFrom(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
871 SelectionVector *true_sel, SelectionVector *false_sel) {
872 return count - TemplatedDistinctSelectOperation<duckdb::DistinctFrom>(left, right, sel, count, true_sel: false_sel, false_sel: true_sel);
873}
874
875// true := A > B with nulls being maximal
876idx_t VectorOperations::DistinctGreaterThan(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
877 SelectionVector *true_sel, SelectionVector *false_sel) {
878 return TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThan>(left, right, sel, count, true_sel, false_sel);
879}
880
881// true := A > B with nulls being minimal
882idx_t VectorOperations::DistinctGreaterThanNullsFirst(Vector &left, Vector &right, const SelectionVector *sel,
883 idx_t count, SelectionVector *true_sel,
884 SelectionVector *false_sel) {
885 return TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThanNullsFirst, duckdb::DistinctGreaterThan>(
886 left, right, sel, count, true_sel, false_sel);
887}
888// true := A >= B with nulls being maximal
889idx_t VectorOperations::DistinctGreaterThanEquals(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
890 SelectionVector *true_sel, SelectionVector *false_sel) {
891 return count -
892 TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThan>(left&: right, right&: left, sel, count, true_sel: false_sel, false_sel: true_sel);
893}
894// true := A < B with nulls being maximal
895idx_t VectorOperations::DistinctLessThan(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
896 SelectionVector *true_sel, SelectionVector *false_sel) {
897 return TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThan>(left&: right, right&: left, sel, count, true_sel, false_sel);
898}
899
900// true := A < B with nulls being minimal
901idx_t VectorOperations::DistinctLessThanNullsFirst(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
902 SelectionVector *true_sel, SelectionVector *false_sel) {
903 return TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThanNullsFirst, duckdb::DistinctGreaterThan>(
904 left&: right, right&: left, sel, count, true_sel, false_sel);
905}
906
907// true := A <= B with nulls being maximal
908idx_t VectorOperations::DistinctLessThanEquals(Vector &left, Vector &right, const SelectionVector *sel, idx_t count,
909 SelectionVector *true_sel, SelectionVector *false_sel) {
910 return count -
911 TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThan>(left, right, sel, count, true_sel: false_sel, false_sel: true_sel);
912}
913
914// true := A != B with nulls being equal, inputs selected
915idx_t VectorOperations::NestedNotEquals(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
916 SelectionVector *true_sel, SelectionVector *false_sel) {
917 return TemplatedDistinctSelectOperation<duckdb::DistinctFrom>(left, right, sel: &sel, count, true_sel, false_sel);
918}
919// true := A == B with nulls being equal, inputs selected
920idx_t VectorOperations::NestedEquals(Vector &left, Vector &right, const SelectionVector &sel, idx_t count,
921 SelectionVector *true_sel, SelectionVector *false_sel) {
922 return count -
923 TemplatedDistinctSelectOperation<duckdb::DistinctFrom>(left, right, sel: &sel, count, true_sel: false_sel, false_sel: true_sel);
924}
925
926} // namespace duckdb
927