1/*
2 pdqsort.h - Pattern-defeating quicksort.
3
4 Copyright (c) 2015 Orson Peters
5
6 This software is provided 'as-is', without any express or implied warranty. In no event will the
7 authors be held liable for any damages arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose, including commercial
10 applications, and to alter it and redistribute it freely, subject to the following restrictions:
11
12 1. The origin of this software must not be misrepresented; you must not claim that you wrote the
13 original software. If you use this software in a product, an acknowledgment in the product
14 documentation would be appreciated but is not required.
15
16 2. Altered source versions must be plainly marked as such, and must not be misrepresented as
17 being the original software.
18
19 3. This notice may not be removed or altered from any source distribution.
20*/
21
22
23#ifndef PDQSORT_H
24#define PDQSORT_H
25
26#include <algorithm>
27#include <cstddef>
28#include <functional>
29#include <utility>
30#include <iterator>
31
32#if __cplusplus >= 201103L
33 #include <cstdint>
34 #include <type_traits>
35 #define PDQSORT_PREFER_MOVE(x) std::move(x)
36#else
37 #define PDQSORT_PREFER_MOVE(x) (x)
38#endif
39
40
41namespace pdqsort_detail {
42 enum {
43 // Partitions below this size are sorted using insertion sort.
44 insertion_sort_threshold = 24,
45
46 // Partitions above this size use Tukey's ninther to select the pivot.
47 ninther_threshold = 128,
48
49 // When we detect an already sorted partition, attempt an insertion sort that allows this
50 // amount of element moves before giving up.
51 partial_insertion_sort_limit = 8,
52
53 // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
54 block_size = 64,
55
56 // Cacheline size, assumes power of two.
57 cacheline_size = 64
58
59 };
60
61#if __cplusplus >= 201103L
62 template<class T> struct is_default_compare : std::false_type { };
63 template<class T> struct is_default_compare<std::less<T>> : std::true_type { };
64 template<class T> struct is_default_compare<std::greater<T>> : std::true_type { };
65#endif
66
67 // Returns floor(log2(n)), assumes n > 0.
68 template<class T>
69 inline int log2(T n) {
70 int log = 0;
71 while (n >>= 1) ++log;
72 return log;
73 }
74
75 // Sorts [begin, end) using insertion sort with the given comparison function.
76 template<class Iter, class Compare>
77 inline void insertion_sort(Iter begin, Iter end, Compare comp) {
78 typedef typename std::iterator_traits<Iter>::value_type T;
79 if (begin == end) return;
80
81 for (Iter cur = begin + 1; cur != end; ++cur) {
82 Iter sift = cur;
83 Iter sift_1 = cur - 1;
84
85 // Compare first so we can avoid 2 moves for an element already positioned correctly.
86 if (comp(*sift, *sift_1)) {
87 T tmp = PDQSORT_PREFER_MOVE(*sift);
88
89 do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
90 while (sift != begin && comp(tmp, *--sift_1));
91
92 *sift = PDQSORT_PREFER_MOVE(tmp);
93 }
94 }
95 }
96
97 // Sorts [begin, end) using insertion sort with the given comparison function. Assumes
98 // *(begin - 1) is an element smaller than or equal to any element in [begin, end).
99 template<class Iter, class Compare>
100 inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) {
101 typedef typename std::iterator_traits<Iter>::value_type T;
102 if (begin == end) return;
103
104 for (Iter cur = begin + 1; cur != end; ++cur) {
105 Iter sift = cur;
106 Iter sift_1 = cur - 1;
107
108 // Compare first so we can avoid 2 moves for an element already positioned correctly.
109 if (comp(*sift, *sift_1)) {
110 T tmp = PDQSORT_PREFER_MOVE(*sift);
111
112 do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
113 while (comp(tmp, *--sift_1));
114
115 *sift = PDQSORT_PREFER_MOVE(tmp);
116 }
117 }
118 }
119
120 // Attempts to use insertion sort on [begin, end). Will return false if more than
121 // partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
122 // successfully sort and return true.
123 template<class Iter, class Compare>
124 inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) {
125 typedef typename std::iterator_traits<Iter>::value_type T;
126 if (begin == end) return true;
127
128 int limit = 0;
129 for (Iter cur = begin + 1; cur != end; ++cur) {
130 if (limit > partial_insertion_sort_limit) return false;
131
132 Iter sift = cur;
133 Iter sift_1 = cur - 1;
134
135 // Compare first so we can avoid 2 moves for an element already positioned correctly.
136 if (comp(*sift, *sift_1)) {
137 T tmp = PDQSORT_PREFER_MOVE(*sift);
138
139 do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
140 while (sift != begin && comp(tmp, *--sift_1));
141
142 *sift = PDQSORT_PREFER_MOVE(tmp);
143 limit += cur - sift;
144 }
145 }
146
147 return true;
148 }
149
150 template<class Iter, class Compare>
151 inline void sort2(Iter a, Iter b, Compare comp) {
152 if (comp(*b, *a)) std::iter_swap(a, b);
153 }
154
155 // Sorts the elements *a, *b and *c using comparison function comp.
156 template<class Iter, class Compare>
157 inline void sort3(Iter a, Iter b, Iter c, Compare comp) {
158 sort2(a, b, comp);
159 sort2(b, c, comp);
160 sort2(a, b, comp);
161 }
162
163 template<class T>
164 inline T* align_cacheline(T* p) {
165#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
166 std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
167#else
168 std::size_t ip = reinterpret_cast<std::size_t>(p);
169#endif
170 ip = (ip + cacheline_size - 1) & -cacheline_size;
171 return reinterpret_cast<T*>(ip);
172 }
173
174 template<class Iter>
175 inline void swap_offsets(Iter first, Iter last,
176 unsigned char* offsets_l, unsigned char* offsets_r,
177 int num, bool use_swaps) {
178 typedef typename std::iterator_traits<Iter>::value_type T;
179 if (use_swaps) {
180 // This case is needed for the descending distribution, where we need
181 // to have proper swapping for pdqsort to remain O(n).
182 for (int i = 0; i < num; ++i) {
183 std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
184 }
185 } else if (num > 0) {
186 Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
187 T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r);
188 for (int i = 1; i < num; ++i) {
189 l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l);
190 r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r);
191 }
192 *r = PDQSORT_PREFER_MOVE(tmp);
193 }
194 }
195
196 // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
197 // to the pivot are put in the right-hand partition. Returns the position of the pivot after
198 // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
199 // pivot is a median of at least 3 elements and that [begin, end) is at least
200 // insertion_sort_threshold long. Uses branchless partitioning.
201 template<class Iter, class Compare>
202 inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
203 typedef typename std::iterator_traits<Iter>::value_type T;
204
205 // Move pivot into local for speed.
206 T pivot(PDQSORT_PREFER_MOVE(*begin));
207 Iter first = begin;
208 Iter last = end;
209
210 // Find the first element greater than or equal than the pivot (the median of 3 guarantees
211 // this exists).
212 while (comp(*++first, pivot));
213
214 // Find the first element strictly smaller than the pivot. We have to guard this search if
215 // there was no element before *first.
216 if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
217 else while ( !comp(*--last, pivot));
218
219 // If the first pair of elements that should be swapped to partition are the same element,
220 // the passed in sequence already was correctly partitioned.
221 bool already_partitioned = first >= last;
222 if (!already_partitioned) {
223 std::iter_swap(first, last);
224 ++first;
225 }
226
227 // The following branchless partitioning is derived from "BlockQuicksort: How Branch
228 // Mispredictions don’t affect Quicksort" by Stefan Edelkamp and Armin Weiss.
229 unsigned char offsets_l_storage[block_size + cacheline_size];
230 unsigned char offsets_r_storage[block_size + cacheline_size];
231 unsigned char* offsets_l = align_cacheline(offsets_l_storage);
232 unsigned char* offsets_r = align_cacheline(offsets_r_storage);
233 int num_l, num_r, start_l, start_r;
234 num_l = num_r = start_l = start_r = 0;
235
236 while (last - first > 2 * block_size) {
237 // Fill up offset blocks with elements that are on the wrong side.
238 if (num_l == 0) {
239 start_l = 0;
240 Iter it = first;
241 for (unsigned char i = 0; i < block_size;) {
242 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
243 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
244 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
245 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
246 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
247 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
248 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
249 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
250 }
251 }
252 if (num_r == 0) {
253 start_r = 0;
254 Iter it = last;
255 for (unsigned char i = 0; i < block_size;) {
256 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
257 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
258 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
259 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
260 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
261 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
262 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
263 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
264 }
265 }
266
267 // Swap elements and update block sizes and first/last boundaries.
268 int num = std::min(num_l, num_r);
269 swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r,
270 num, num_l == num_r);
271 num_l -= num; num_r -= num;
272 start_l += num; start_r += num;
273 if (num_l == 0) first += block_size;
274 if (num_r == 0) last -= block_size;
275 }
276
277 int l_size = 0, r_size = 0;
278 int unknown_left = (last - first) - ((num_r || num_l) ? block_size : 0);
279 if (num_r) {
280 // Handle leftover block by assigning the unknown elements to the other block.
281 l_size = unknown_left;
282 r_size = block_size;
283 } else if (num_l) {
284 l_size = block_size;
285 r_size = unknown_left;
286 } else {
287 // No leftover block, split the unknown elements in two blocks.
288 l_size = unknown_left/2;
289 r_size = unknown_left - l_size;
290 }
291
292 // Fill offset buffers if needed.
293 if (unknown_left && !num_l) {
294 start_l = 0;
295 Iter it = first;
296 for (unsigned char i = 0; i < l_size;) {
297 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
298 }
299 }
300 if (unknown_left && !num_r) {
301 start_r = 0;
302 Iter it = last;
303 for (unsigned char i = 0; i < r_size;) {
304 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
305 }
306 }
307
308 int num = std::min(num_l, num_r);
309 swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r);
310 num_l -= num; num_r -= num;
311 start_l += num; start_r += num;
312 if (num_l == 0) first += l_size;
313 if (num_r == 0) last -= r_size;
314
315 // We have now fully identified [first, last)'s proper position. Swap the last elements.
316 if (num_l) {
317 offsets_l += start_l;
318 while (num_l--) std::iter_swap(first + offsets_l[num_l], --last);
319 first = last;
320 }
321 if (num_r) {
322 offsets_r += start_r;
323 while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first;
324 last = first;
325 }
326
327 // Put the pivot in the right place.
328 Iter pivot_pos = first - 1;
329 *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
330 *pivot_pos = PDQSORT_PREFER_MOVE(pivot);
331
332 return std::make_pair(pivot_pos, already_partitioned);
333 }
334
335 // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
336 // to the pivot are put in the right-hand partition. Returns the position of the pivot after
337 // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
338 // pivot is a median of at least 3 elements and that [begin, end) is at least
339 // insertion_sort_threshold long.
340 template<class Iter, class Compare>
341 inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
342 typedef typename std::iterator_traits<Iter>::value_type T;
343
344 // Move pivot into local for speed.
345 T pivot(PDQSORT_PREFER_MOVE(*begin));
346
347 Iter first = begin;
348 Iter last = end;
349
350 // Find the first element greater than or equal than the pivot (the median of 3 guarantees
351 // this exists).
352 while (comp(*++first, pivot));
353
354 // Find the first element strictly smaller than the pivot. We have to guard this search if
355 // there was no element before *first.
356 if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
357 else while ( !comp(*--last, pivot));
358
359 // If the first pair of elements that should be swapped to partition are the same element,
360 // the passed in sequence already was correctly partitioned.
361 bool already_partitioned = first >= last;
362
363 // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
364 // swapped pairs guard the searches, which is why the first iteration is special-cased
365 // above.
366 while (first < last) {
367 std::iter_swap(first, last);
368 while (comp(*++first, pivot));
369 while (!comp(*--last, pivot));
370 }
371
372 // Put the pivot in the right place.
373 Iter pivot_pos = first - 1;
374 *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
375 *pivot_pos = PDQSORT_PREFER_MOVE(pivot);
376
377 return std::make_pair(pivot_pos, already_partitioned);
378 }
379
380 // Similar function to the one above, except elements equal to the pivot are put to the left of
381 // the pivot and it doesn't check or return if the passed sequence already was partitioned.
382 // Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
383 // performance, no block quicksort is applied here for simplicity.
384 template<class Iter, class Compare>
385 inline Iter partition_left(Iter begin, Iter end, Compare comp) {
386 typedef typename std::iterator_traits<Iter>::value_type T;
387
388 T pivot(PDQSORT_PREFER_MOVE(*begin));
389 Iter first = begin;
390 Iter last = end;
391
392 while (comp(pivot, *--last));
393
394 if (last + 1 == end) while (first < last && !comp(pivot, *++first));
395 else while ( !comp(pivot, *++first));
396
397 while (first < last) {
398 std::iter_swap(first, last);
399 while (comp(pivot, *--last));
400 while (!comp(pivot, *++first));
401 }
402
403 Iter pivot_pos = last;
404 *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
405 *pivot_pos = PDQSORT_PREFER_MOVE(pivot);
406
407 return pivot_pos;
408 }
409
410
411 template<class Iter, class Compare, bool Branchless>
412 inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) {
413 typedef typename std::iterator_traits<Iter>::difference_type diff_t;
414
415 // Use a while loop for tail recursion elimination.
416 while (true) {
417 diff_t size = end - begin;
418
419 // Insertion sort is faster for small arrays.
420 if (size < insertion_sort_threshold) {
421 if (leftmost) insertion_sort(begin, end, comp);
422 else unguarded_insertion_sort(begin, end, comp);
423 return;
424 }
425
426 // Choose pivot as median of 3 or pseudomedian of 9.
427 diff_t s2 = size / 2;
428 if (size > ninther_threshold) {
429 sort3(begin, begin + s2, end - 1, comp);
430 sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
431 sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
432 sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
433 std::iter_swap(begin, begin + s2);
434 } else sort3(begin + s2, begin, end - 1, comp);
435
436 // If *(begin - 1) is the end of the right partition of a previous partition operation
437 // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
438 // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
439 // the left partition, greater elements in the right partition. We do not have to
440 // recurse on the left partition, since it's sorted (all equal).
441 if (!leftmost && !comp(*(begin - 1), *begin)) {
442 begin = partition_left(begin, end, comp) + 1;
443 continue;
444 }
445
446 // Partition and get results.
447 std::pair<Iter, bool> part_result =
448 Branchless ? partition_right_branchless(begin, end, comp)
449 : partition_right(begin, end, comp);
450 Iter pivot_pos = part_result.first;
451 bool already_partitioned = part_result.second;
452
453 // Check for a highly unbalanced partition.
454 diff_t l_size = pivot_pos - begin;
455 diff_t r_size = end - (pivot_pos + 1);
456 bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
457
458 // If we got a highly unbalanced partition we shuffle elements to break many patterns.
459 if (highly_unbalanced) {
460 // If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
461 if (--bad_allowed == 0) {
462 std::make_heap(begin, end, comp);
463 std::sort_heap(begin, end, comp);
464 return;
465 }
466
467 if (l_size >= insertion_sort_threshold) {
468 std::iter_swap(begin, begin + l_size / 4);
469 std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);
470
471 if (l_size > ninther_threshold) {
472 std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
473 std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
474 std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
475 std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
476 }
477 }
478
479 if (r_size >= insertion_sort_threshold) {
480 std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
481 std::iter_swap(end - 1, end - r_size / 4);
482
483 if (r_size > ninther_threshold) {
484 std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
485 std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
486 std::iter_swap(end - 2, end - (1 + r_size / 4));
487 std::iter_swap(end - 3, end - (2 + r_size / 4));
488 }
489 }
490 } else {
491 // If we were decently balanced and we tried to sort an already partitioned
492 // sequence try to use insertion sort.
493 if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp)
494 && partial_insertion_sort(pivot_pos + 1, end, comp)) return;
495 }
496
497 // Sort the left partition first using recursion and do tail recursion elimination for
498 // the right-hand partition.
499 pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
500 begin = pivot_pos + 1;
501 leftmost = false;
502 }
503 }
504}
505
506
507template<class Iter, class Compare>
508inline void pdqsort(Iter begin, Iter end, Compare comp) {
509 if (begin == end) return;
510
511#if __cplusplus >= 201103L
512 pdqsort_detail::pdqsort_loop<Iter, Compare,
513 pdqsort_detail::is_default_compare<typename std::decay<Compare>::type>::value &&
514 std::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
515 begin, end, comp, pdqsort_detail::log2(end - begin));
516#else
517 pdqsort_detail::pdqsort_loop<Iter, Compare, false>(
518 begin, end, comp, pdqsort_detail::log2(end - begin));
519#endif
520}
521
522template<class Iter>
523inline void pdqsort(Iter begin, Iter end) {
524 typedef typename std::iterator_traits<Iter>::value_type T;
525 pdqsort(begin, end, std::less<T>());
526}
527
528template<class Iter, class Compare>
529inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) {
530 if (begin == end) return;
531 pdqsort_detail::pdqsort_loop<Iter, Compare, true>(
532 begin, end, comp, pdqsort_detail::log2(end - begin));
533}
534
535template<class Iter>
536inline void pdqsort_branchless(Iter begin, Iter end) {
537 typedef typename std::iterator_traits<Iter>::value_type T;
538 pdqsort_branchless(begin, end, std::less<T>());
539}
540
541
542#undef PDQSORT_PREFER_MOVE
543
544#endif
545