1 | #include "duckdb/common/sort/comparators.hpp" |
2 | |
3 | #include "duckdb/common/fast_mem.hpp" |
4 | #include "duckdb/common/sort/sort.hpp" |
5 | |
6 | namespace duckdb { |
7 | |
8 | bool Comparators::TieIsBreakable(const idx_t &tie_col, const data_ptr_t &row_ptr, const SortLayout &sort_layout) { |
9 | const auto &col_idx = sort_layout.sorting_to_blob_col.at(k: tie_col); |
10 | // Check if the blob is NULL |
11 | ValidityBytes row_mask(row_ptr); |
12 | idx_t entry_idx; |
13 | idx_t idx_in_entry; |
14 | ValidityBytes::GetEntryIndex(row_idx: col_idx, entry_idx, idx_in_entry); |
15 | if (!row_mask.RowIsValid(entry: row_mask.GetValidityEntry(entry_idx), idx_in_entry)) { |
16 | // Can't break a NULL tie |
17 | return false; |
18 | } |
19 | auto &row_layout = sort_layout.blob_layout; |
20 | if (row_layout.GetTypes()[col_idx].InternalType() != PhysicalType::VARCHAR) { |
21 | // Nested type, must be broken |
22 | return true; |
23 | } |
24 | const auto &tie_col_offset = row_layout.GetOffsets()[col_idx]; |
25 | auto tie_string = Load<string_t>(ptr: row_ptr + tie_col_offset); |
26 | if (tie_string.GetSize() < sort_layout.prefix_lengths[tie_col]) { |
27 | // No need to break the tie - we already compared the full string |
28 | return false; |
29 | } |
30 | return true; |
31 | } |
32 | |
33 | int Comparators::CompareTuple(const SBScanState &left, const SBScanState &right, const data_ptr_t &l_ptr, |
34 | const data_ptr_t &r_ptr, const SortLayout &sort_layout, const bool &external_sort) { |
35 | // Compare the sorting columns one by one |
36 | int comp_res = 0; |
37 | data_ptr_t l_ptr_offset = l_ptr; |
38 | data_ptr_t r_ptr_offset = r_ptr; |
39 | for (idx_t col_idx = 0; col_idx < sort_layout.column_count; col_idx++) { |
40 | comp_res = FastMemcmp(str1: l_ptr_offset, str2: r_ptr_offset, size: sort_layout.column_sizes[col_idx]); |
41 | if (comp_res == 0 && !sort_layout.constant_size[col_idx]) { |
42 | comp_res = BreakBlobTie(tie_col: col_idx, left, right, sort_layout, external: external_sort); |
43 | } |
44 | if (comp_res != 0) { |
45 | break; |
46 | } |
47 | l_ptr_offset += sort_layout.column_sizes[col_idx]; |
48 | r_ptr_offset += sort_layout.column_sizes[col_idx]; |
49 | } |
50 | return comp_res; |
51 | } |
52 | |
53 | int Comparators::CompareVal(const data_ptr_t l_ptr, const data_ptr_t r_ptr, const LogicalType &type) { |
54 | switch (type.InternalType()) { |
55 | case PhysicalType::VARCHAR: |
56 | return TemplatedCompareVal<string_t>(left_ptr: l_ptr, right_ptr: r_ptr); |
57 | case PhysicalType::LIST: |
58 | case PhysicalType::STRUCT: { |
59 | auto l_nested_ptr = Load<data_ptr_t>(ptr: l_ptr); |
60 | auto r_nested_ptr = Load<data_ptr_t>(ptr: r_ptr); |
61 | return CompareValAndAdvance(l_ptr&: l_nested_ptr, r_ptr&: r_nested_ptr, type, valid: true); |
62 | } |
63 | default: |
64 | throw NotImplementedException("Unimplemented CompareVal for type %s" , type.ToString()); |
65 | } |
66 | } |
67 | |
68 | int Comparators::BreakBlobTie(const idx_t &tie_col, const SBScanState &left, const SBScanState &right, |
69 | const SortLayout &sort_layout, const bool &external) { |
70 | data_ptr_t l_data_ptr = left.DataPtr(sd&: *left.sb->blob_sorting_data); |
71 | data_ptr_t r_data_ptr = right.DataPtr(sd&: *right.sb->blob_sorting_data); |
72 | if (!TieIsBreakable(tie_col, row_ptr: l_data_ptr, sort_layout)) { |
73 | // Quick check to see if ties can be broken |
74 | return 0; |
75 | } |
76 | // Align the pointers |
77 | const idx_t &col_idx = sort_layout.sorting_to_blob_col.at(k: tie_col); |
78 | const auto &tie_col_offset = sort_layout.blob_layout.GetOffsets()[col_idx]; |
79 | l_data_ptr += tie_col_offset; |
80 | r_data_ptr += tie_col_offset; |
81 | // Do the comparison |
82 | const int order = sort_layout.order_types[tie_col] == OrderType::DESCENDING ? -1 : 1; |
83 | const auto &type = sort_layout.blob_layout.GetTypes()[col_idx]; |
84 | int result; |
85 | if (external) { |
86 | // Store heap pointers |
87 | data_ptr_t l_heap_ptr = left.HeapPtr(sd&: *left.sb->blob_sorting_data); |
88 | data_ptr_t r_heap_ptr = right.HeapPtr(sd&: *right.sb->blob_sorting_data); |
89 | // Unswizzle offset to pointer |
90 | UnswizzleSingleValue(data_ptr: l_data_ptr, heap_ptr: l_heap_ptr, type); |
91 | UnswizzleSingleValue(data_ptr: r_data_ptr, heap_ptr: r_heap_ptr, type); |
92 | // Compare |
93 | result = CompareVal(l_ptr: l_data_ptr, r_ptr: r_data_ptr, type); |
94 | // Swizzle the pointers back to offsets |
95 | SwizzleSingleValue(data_ptr: l_data_ptr, heap_ptr: l_heap_ptr, type); |
96 | SwizzleSingleValue(data_ptr: r_data_ptr, heap_ptr: r_heap_ptr, type); |
97 | } else { |
98 | result = CompareVal(l_ptr: l_data_ptr, r_ptr: r_data_ptr, type); |
99 | } |
100 | return order * result; |
101 | } |
102 | |
103 | template <class T> |
104 | int Comparators::TemplatedCompareVal(const data_ptr_t &left_ptr, const data_ptr_t &right_ptr) { |
105 | const auto left_val = Load<T>(left_ptr); |
106 | const auto right_val = Load<T>(right_ptr); |
107 | if (Equals::Operation<T>(left_val, right_val)) { |
108 | return 0; |
109 | } else if (LessThan::Operation<T>(left_val, right_val)) { |
110 | return -1; |
111 | } else { |
112 | return 1; |
113 | } |
114 | } |
115 | |
116 | int Comparators::CompareValAndAdvance(data_ptr_t &l_ptr, data_ptr_t &r_ptr, const LogicalType &type, bool valid) { |
117 | switch (type.InternalType()) { |
118 | case PhysicalType::BOOL: |
119 | case PhysicalType::INT8: |
120 | return TemplatedCompareAndAdvance<int8_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
121 | case PhysicalType::INT16: |
122 | return TemplatedCompareAndAdvance<int16_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
123 | case PhysicalType::INT32: |
124 | return TemplatedCompareAndAdvance<int32_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
125 | case PhysicalType::INT64: |
126 | return TemplatedCompareAndAdvance<int64_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
127 | case PhysicalType::UINT8: |
128 | return TemplatedCompareAndAdvance<uint8_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
129 | case PhysicalType::UINT16: |
130 | return TemplatedCompareAndAdvance<uint16_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
131 | case PhysicalType::UINT32: |
132 | return TemplatedCompareAndAdvance<uint32_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
133 | case PhysicalType::UINT64: |
134 | return TemplatedCompareAndAdvance<uint64_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
135 | case PhysicalType::INT128: |
136 | return TemplatedCompareAndAdvance<hugeint_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
137 | case PhysicalType::FLOAT: |
138 | return TemplatedCompareAndAdvance<float>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
139 | case PhysicalType::DOUBLE: |
140 | return TemplatedCompareAndAdvance<double>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
141 | case PhysicalType::INTERVAL: |
142 | return TemplatedCompareAndAdvance<interval_t>(left_ptr&: l_ptr, right_ptr&: r_ptr); |
143 | case PhysicalType::VARCHAR: |
144 | return CompareStringAndAdvance(left_ptr&: l_ptr, right_ptr&: r_ptr, valid); |
145 | case PhysicalType::LIST: |
146 | return CompareListAndAdvance(left_ptr&: l_ptr, right_ptr&: r_ptr, type: ListType::GetChildType(type), valid); |
147 | case PhysicalType::STRUCT: |
148 | return CompareStructAndAdvance(left_ptr&: l_ptr, right_ptr&: r_ptr, types: StructType::GetChildTypes(type), valid); |
149 | default: |
150 | throw NotImplementedException("Unimplemented CompareValAndAdvance for type %s" , type.ToString()); |
151 | } |
152 | } |
153 | |
154 | template <class T> |
155 | int Comparators::TemplatedCompareAndAdvance(data_ptr_t &left_ptr, data_ptr_t &right_ptr) { |
156 | auto result = TemplatedCompareVal<T>(left_ptr, right_ptr); |
157 | left_ptr += sizeof(T); |
158 | right_ptr += sizeof(T); |
159 | return result; |
160 | } |
161 | |
162 | int Comparators::CompareStringAndAdvance(data_ptr_t &left_ptr, data_ptr_t &right_ptr, bool valid) { |
163 | if (!valid) { |
164 | return 0; |
165 | } |
166 | uint32_t left_string_size = Load<uint32_t>(ptr: left_ptr); |
167 | uint32_t right_string_size = Load<uint32_t>(ptr: right_ptr); |
168 | left_ptr += sizeof(uint32_t); |
169 | right_ptr += sizeof(uint32_t); |
170 | auto memcmp_res = memcmp(s1: const_char_ptr_cast(src: left_ptr), s2: const_char_ptr_cast(src: right_ptr), |
171 | n: std::min<uint32_t>(left_string_size, right_string_size)); |
172 | |
173 | left_ptr += left_string_size; |
174 | right_ptr += right_string_size; |
175 | |
176 | if (memcmp_res != 0) { |
177 | return memcmp_res; |
178 | } |
179 | if (left_string_size == right_string_size) { |
180 | return 0; |
181 | } |
182 | if (left_string_size < right_string_size) { |
183 | return -1; |
184 | } |
185 | return 1; |
186 | } |
187 | |
188 | int Comparators::CompareStructAndAdvance(data_ptr_t &left_ptr, data_ptr_t &right_ptr, |
189 | const child_list_t<LogicalType> &types, bool valid) { |
190 | idx_t count = types.size(); |
191 | // Load validity masks |
192 | ValidityBytes left_validity(left_ptr); |
193 | ValidityBytes right_validity(right_ptr); |
194 | left_ptr += (count + 7) / 8; |
195 | right_ptr += (count + 7) / 8; |
196 | // Initialize variables |
197 | bool left_valid; |
198 | bool right_valid; |
199 | idx_t entry_idx; |
200 | idx_t idx_in_entry; |
201 | // Compare |
202 | int comp_res = 0; |
203 | for (idx_t i = 0; i < count; i++) { |
204 | ValidityBytes::GetEntryIndex(row_idx: i, entry_idx, idx_in_entry); |
205 | left_valid = left_validity.RowIsValid(entry: left_validity.GetValidityEntry(entry_idx), idx_in_entry); |
206 | right_valid = right_validity.RowIsValid(entry: right_validity.GetValidityEntry(entry_idx), idx_in_entry); |
207 | auto &type = types[i].second; |
208 | if ((left_valid == right_valid) || TypeIsConstantSize(type: type.InternalType())) { |
209 | comp_res = CompareValAndAdvance(l_ptr&: left_ptr, r_ptr&: right_ptr, type: types[i].second, valid: left_valid && valid); |
210 | } |
211 | if (!left_valid && !right_valid) { |
212 | comp_res = 0; |
213 | } else if (!left_valid) { |
214 | comp_res = 1; |
215 | } else if (!right_valid) { |
216 | comp_res = -1; |
217 | } |
218 | if (comp_res != 0) { |
219 | break; |
220 | } |
221 | } |
222 | return comp_res; |
223 | } |
224 | |
225 | int Comparators::CompareListAndAdvance(data_ptr_t &left_ptr, data_ptr_t &right_ptr, const LogicalType &type, |
226 | bool valid) { |
227 | if (!valid) { |
228 | return 0; |
229 | } |
230 | // Load list lengths |
231 | auto left_len = Load<idx_t>(ptr: left_ptr); |
232 | auto right_len = Load<idx_t>(ptr: right_ptr); |
233 | left_ptr += sizeof(idx_t); |
234 | right_ptr += sizeof(idx_t); |
235 | // Load list validity masks |
236 | ValidityBytes left_validity(left_ptr); |
237 | ValidityBytes right_validity(right_ptr); |
238 | left_ptr += (left_len + 7) / 8; |
239 | right_ptr += (right_len + 7) / 8; |
240 | // Compare |
241 | int comp_res = 0; |
242 | idx_t count = MinValue(a: left_len, b: right_len); |
243 | if (TypeIsConstantSize(type: type.InternalType())) { |
244 | // Templated code for fixed-size types |
245 | switch (type.InternalType()) { |
246 | case PhysicalType::BOOL: |
247 | case PhysicalType::INT8: |
248 | comp_res = TemplatedCompareListLoop<int8_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
249 | break; |
250 | case PhysicalType::INT16: |
251 | comp_res = TemplatedCompareListLoop<int16_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
252 | break; |
253 | case PhysicalType::INT32: |
254 | comp_res = TemplatedCompareListLoop<int32_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
255 | break; |
256 | case PhysicalType::INT64: |
257 | comp_res = TemplatedCompareListLoop<int64_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
258 | break; |
259 | case PhysicalType::UINT8: |
260 | comp_res = TemplatedCompareListLoop<uint8_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
261 | break; |
262 | case PhysicalType::UINT16: |
263 | comp_res = TemplatedCompareListLoop<uint16_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
264 | break; |
265 | case PhysicalType::UINT32: |
266 | comp_res = TemplatedCompareListLoop<uint32_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
267 | break; |
268 | case PhysicalType::UINT64: |
269 | comp_res = TemplatedCompareListLoop<uint64_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
270 | break; |
271 | case PhysicalType::INT128: |
272 | comp_res = TemplatedCompareListLoop<hugeint_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
273 | break; |
274 | case PhysicalType::FLOAT: |
275 | comp_res = TemplatedCompareListLoop<float>(left_ptr, right_ptr, left_validity, right_validity, count); |
276 | break; |
277 | case PhysicalType::DOUBLE: |
278 | comp_res = TemplatedCompareListLoop<double>(left_ptr, right_ptr, left_validity, right_validity, count); |
279 | break; |
280 | case PhysicalType::INTERVAL: |
281 | comp_res = TemplatedCompareListLoop<interval_t>(left_ptr, right_ptr, left_validity, right_validity, count); |
282 | break; |
283 | default: |
284 | throw NotImplementedException("CompareListAndAdvance for fixed-size type %s" , type.ToString()); |
285 | } |
286 | } else { |
287 | // Variable-sized list entries |
288 | bool left_valid; |
289 | bool right_valid; |
290 | idx_t entry_idx; |
291 | idx_t idx_in_entry; |
292 | // Size (in bytes) of all variable-sizes entries is stored before the entries begin, |
293 | // to make deserialization easier. We need to skip over them |
294 | left_ptr += left_len * sizeof(idx_t); |
295 | right_ptr += right_len * sizeof(idx_t); |
296 | for (idx_t i = 0; i < count; i++) { |
297 | ValidityBytes::GetEntryIndex(row_idx: i, entry_idx, idx_in_entry); |
298 | left_valid = left_validity.RowIsValid(entry: left_validity.GetValidityEntry(entry_idx), idx_in_entry); |
299 | right_valid = right_validity.RowIsValid(entry: right_validity.GetValidityEntry(entry_idx), idx_in_entry); |
300 | if (left_valid && right_valid) { |
301 | switch (type.InternalType()) { |
302 | case PhysicalType::LIST: |
303 | comp_res = CompareListAndAdvance(left_ptr, right_ptr, type: ListType::GetChildType(type), valid: left_valid); |
304 | break; |
305 | case PhysicalType::VARCHAR: |
306 | comp_res = CompareStringAndAdvance(left_ptr, right_ptr, valid: left_valid); |
307 | break; |
308 | case PhysicalType::STRUCT: |
309 | comp_res = |
310 | CompareStructAndAdvance(left_ptr, right_ptr, types: StructType::GetChildTypes(type), valid: left_valid); |
311 | break; |
312 | default: |
313 | throw NotImplementedException("CompareListAndAdvance for variable-size type %s" , type.ToString()); |
314 | } |
315 | } else if (!left_valid && !right_valid) { |
316 | comp_res = 0; |
317 | } else if (left_valid) { |
318 | comp_res = -1; |
319 | } else { |
320 | comp_res = 1; |
321 | } |
322 | if (comp_res != 0) { |
323 | break; |
324 | } |
325 | } |
326 | } |
327 | // All values that we looped over were equal |
328 | if (comp_res == 0 && left_len != right_len) { |
329 | // Smaller lists first |
330 | if (left_len < right_len) { |
331 | comp_res = -1; |
332 | } else { |
333 | comp_res = 1; |
334 | } |
335 | } |
336 | return comp_res; |
337 | } |
338 | |
339 | template <class T> |
340 | int Comparators::TemplatedCompareListLoop(data_ptr_t &left_ptr, data_ptr_t &right_ptr, |
341 | const ValidityBytes &left_validity, const ValidityBytes &right_validity, |
342 | const idx_t &count) { |
343 | int comp_res = 0; |
344 | bool left_valid; |
345 | bool right_valid; |
346 | idx_t entry_idx; |
347 | idx_t idx_in_entry; |
348 | for (idx_t i = 0; i < count; i++) { |
349 | ValidityBytes::GetEntryIndex(row_idx: i, entry_idx, idx_in_entry); |
350 | left_valid = left_validity.RowIsValid(entry: left_validity.GetValidityEntry(entry_idx), idx_in_entry); |
351 | right_valid = right_validity.RowIsValid(entry: right_validity.GetValidityEntry(entry_idx), idx_in_entry); |
352 | comp_res = TemplatedCompareAndAdvance<T>(left_ptr, right_ptr); |
353 | if (!left_valid && !right_valid) { |
354 | comp_res = 0; |
355 | } else if (!left_valid) { |
356 | comp_res = 1; |
357 | } else if (!right_valid) { |
358 | comp_res = -1; |
359 | } |
360 | if (comp_res != 0) { |
361 | break; |
362 | } |
363 | } |
364 | return comp_res; |
365 | } |
366 | |
367 | void Comparators::UnswizzleSingleValue(data_ptr_t data_ptr, const data_ptr_t &heap_ptr, const LogicalType &type) { |
368 | if (type.InternalType() == PhysicalType::VARCHAR) { |
369 | data_ptr += string_t::HEADER_SIZE; |
370 | } |
371 | Store<data_ptr_t>(val: heap_ptr + Load<idx_t>(ptr: data_ptr), ptr: data_ptr); |
372 | } |
373 | |
374 | void Comparators::SwizzleSingleValue(data_ptr_t data_ptr, const data_ptr_t &heap_ptr, const LogicalType &type) { |
375 | if (type.InternalType() == PhysicalType::VARCHAR) { |
376 | data_ptr += string_t::HEADER_SIZE; |
377 | } |
378 | Store<idx_t>(val: Load<data_ptr_t>(ptr: data_ptr) - heap_ptr, ptr: data_ptr); |
379 | } |
380 | |
381 | } // namespace duckdb |
382 | |