1/**
2* This code is released under the
3* Apache License Version 2.0 http://www.apache.org/licenses/.
4*
5* (c) Daniel Lemire, http://lemire.me/en/
6*/
7#pragma once
8#include "bitpacking.h"
9
10
11#include <stdexcept>
12
13namespace duckdb_fastpforlib {
14
15namespace internal {
16
17// Note that this only packs 8 values
18inline void fastunpack_quarter(const uint8_t *__restrict in, uint8_t *__restrict out, const uint32_t bit) {
19 // Could have used function pointers instead of switch.
20 // Switch calls do offer the compiler more opportunities for optimization in
21 // theory. In this case, it makes no difference with a good compiler.
22 switch (bit) {
23 case 0:
24 internal::__fastunpack0(in, out);
25 break;
26 case 1:
27 internal::__fastunpack1(in, out);
28 break;
29 case 2:
30 internal::__fastunpack2(in, out);
31 break;
32 case 3:
33 internal::__fastunpack3(in, out);
34 break;
35 case 4:
36 internal::__fastunpack4(in, out);
37 break;
38 case 5:
39 internal::__fastunpack5(in, out);
40 break;
41 case 6:
42 internal::__fastunpack6(in, out);
43 break;
44 case 7:
45 internal::__fastunpack7(in, out);
46 break;
47 case 8:
48 internal::__fastunpack8(in, out);
49 break;
50 default:
51 throw std::logic_error("Invalid bit width for bitpacking");
52 }
53}
54
55// Note that this only packs 8 values
56inline void fastpack_quarter(const uint8_t *__restrict in, uint8_t *__restrict out, const uint32_t bit) {
57 // Could have used function pointers instead of switch.
58 // Switch calls do offer the compiler more opportunities for optimization in
59 // theory. In this case, it makes no difference with a good compiler.
60 switch (bit) {
61 case 0:
62 internal::__fastpack0(in, out);
63 break;
64 case 1:
65 internal::__fastpack1(in, out);
66 break;
67 case 2:
68 internal::__fastpack2(in, out);
69 break;
70 case 3:
71 internal::__fastpack3(in, out);
72 break;
73 case 4:
74 internal::__fastpack4(in, out);
75 break;
76 case 5:
77 internal::__fastpack5(in, out);
78 break;
79 case 6:
80 internal::__fastpack6(in, out);
81 break;
82 case 7:
83 internal::__fastpack7(in, out);
84 break;
85 case 8:
86 internal::__fastpack8(in, out);
87 break;
88 default:
89 throw std::logic_error("Invalid bit width for bitpacking");
90 }
91}
92
93// Note that this only packs 16 values
94inline void fastunpack_half(const uint16_t *__restrict in, uint16_t *__restrict out, const uint32_t bit) {
95 // Could have used function pointers instead of switch.
96 // Switch calls do offer the compiler more opportunities for optimization in
97 // theory. In this case, it makes no difference with a good compiler.
98 switch (bit) {
99 case 0:
100 internal::__fastunpack0(in, out);
101 break;
102 case 1:
103 internal::__fastunpack1(in, out);
104 break;
105 case 2:
106 internal::__fastunpack2(in, out);
107 break;
108 case 3:
109 internal::__fastunpack3(in, out);
110 break;
111 case 4:
112 internal::__fastunpack4(in, out);
113 break;
114 case 5:
115 internal::__fastunpack5(in, out);
116 break;
117 case 6:
118 internal::__fastunpack6(in, out);
119 break;
120 case 7:
121 internal::__fastunpack7(in, out);
122 break;
123 case 8:
124 internal::__fastunpack8(in, out);
125 break;
126 case 9:
127 internal::__fastunpack9(in, out);
128 break;
129 case 10:
130 internal::__fastunpack10(in, out);
131 break;
132 case 11:
133 internal::__fastunpack11(in, out);
134 break;
135 case 12:
136 internal::__fastunpack12(in, out);
137 break;
138 case 13:
139 internal::__fastunpack13(in, out);
140 break;
141 case 14:
142 internal::__fastunpack14(in, out);
143 break;
144 case 15:
145 internal::__fastunpack15(in, out);
146 break;
147 case 16:
148 internal::__fastunpack16(in, out);
149 break;
150 default:
151 throw std::logic_error("Invalid bit width for bitpacking");
152 }
153}
154
155// Note that this only packs 16 values
156inline void fastpack_half(const uint16_t *__restrict in, uint16_t *__restrict out, const uint32_t bit) {
157 // Could have used function pointers instead of switch.
158 // Switch calls do offer the compiler more opportunities for optimization in
159 // theory. In this case, it makes no difference with a good compiler.
160 switch (bit) {
161 case 0:
162 internal::__fastpack0(in, out);
163 break;
164 case 1:
165 internal::__fastpack1(in, out);
166 break;
167 case 2:
168 internal::__fastpack2(in, out);
169 break;
170 case 3:
171 internal::__fastpack3(in, out);
172 break;
173 case 4:
174 internal::__fastpack4(in, out);
175 break;
176 case 5:
177 internal::__fastpack5(in, out);
178 break;
179 case 6:
180 internal::__fastpack6(in, out);
181 break;
182 case 7:
183 internal::__fastpack7(in, out);
184 break;
185 case 8:
186 internal::__fastpack8(in, out);
187 break;
188 case 9:
189 internal::__fastpack9(in, out);
190 break;
191 case 10:
192 internal::__fastpack10(in, out);
193 break;
194 case 11:
195 internal::__fastpack11(in, out);
196 break;
197 case 12:
198 internal::__fastpack12(in, out);
199 break;
200 case 13:
201 internal::__fastpack13(in, out);
202 break;
203 case 14:
204 internal::__fastpack14(in, out);
205 break;
206 case 15:
207 internal::__fastpack15(in, out);
208 break;
209 case 16:
210 internal::__fastpack16(in, out);
211 break;
212 default:
213 throw std::logic_error("Invalid bit width for bitpacking");
214 }
215}
216}
217
218inline void fastunpack(const uint8_t *__restrict in, uint8_t *__restrict out, const uint32_t bit) {
219 for (uint8_t i = 0; i < 4; i++) {
220 internal::fastunpack_quarter(in: in + (i*bit), out: out+(i*8), bit);
221 }
222}
223
224inline void fastunpack(const uint16_t *__restrict in, uint16_t *__restrict out, const uint32_t bit) {
225 internal::fastunpack_half(in, out, bit);
226 internal::fastunpack_half(in: in + bit, out: out+16, bit);
227}
228
229inline void fastunpack(const uint32_t *__restrict in,
230 uint32_t *__restrict out, const uint32_t bit) {
231 // Could have used function pointers instead of switch.
232 // Switch calls do offer the compiler more opportunities for optimization in
233 // theory. In this case, it makes no difference with a good compiler.
234 switch (bit) {
235 case 0:
236 internal::__fastunpack0(in, out);
237 break;
238 case 1:
239 internal::__fastunpack1(in, out);
240 break;
241 case 2:
242 internal::__fastunpack2(in, out);
243 break;
244 case 3:
245 internal::__fastunpack3(in, out);
246 break;
247 case 4:
248 internal::__fastunpack4(in, out);
249 break;
250 case 5:
251 internal::__fastunpack5(in, out);
252 break;
253 case 6:
254 internal::__fastunpack6(in, out);
255 break;
256 case 7:
257 internal::__fastunpack7(in, out);
258 break;
259 case 8:
260 internal::__fastunpack8(in, out);
261 break;
262 case 9:
263 internal::__fastunpack9(in, out);
264 break;
265 case 10:
266 internal::__fastunpack10(in, out);
267 break;
268 case 11:
269 internal::__fastunpack11(in, out);
270 break;
271 case 12:
272 internal::__fastunpack12(in, out);
273 break;
274 case 13:
275 internal::__fastunpack13(in, out);
276 break;
277 case 14:
278 internal::__fastunpack14(in, out);
279 break;
280 case 15:
281 internal::__fastunpack15(in, out);
282 break;
283 case 16:
284 internal::__fastunpack16(in, out);
285 break;
286 case 17:
287 internal::__fastunpack17(in, out);
288 break;
289 case 18:
290 internal::__fastunpack18(in, out);
291 break;
292 case 19:
293 internal::__fastunpack19(in, out);
294 break;
295 case 20:
296 internal::__fastunpack20(in, out);
297 break;
298 case 21:
299 internal::__fastunpack21(in, out);
300 break;
301 case 22:
302 internal::__fastunpack22(in, out);
303 break;
304 case 23:
305 internal::__fastunpack23(in, out);
306 break;
307 case 24:
308 internal::__fastunpack24(in, out);
309 break;
310 case 25:
311 internal::__fastunpack25(in, out);
312 break;
313 case 26:
314 internal::__fastunpack26(in, out);
315 break;
316 case 27:
317 internal::__fastunpack27(in, out);
318 break;
319 case 28:
320 internal::__fastunpack28(in, out);
321 break;
322 case 29:
323 internal::__fastunpack29(in, out);
324 break;
325 case 30:
326 internal::__fastunpack30(in, out);
327 break;
328 case 31:
329 internal::__fastunpack31(in, out);
330 break;
331 case 32:
332 internal::__fastunpack32(in, out);
333 break;
334 default:
335 throw std::logic_error("Invalid bit width for bitpacking");
336 }
337}
338
339inline void fastunpack(const uint32_t *__restrict in,
340 uint64_t *__restrict out, const uint32_t bit) {
341 // Could have used function pointers instead of switch.
342 // Switch calls do offer the compiler more opportunities for optimization in
343 // theory. In this case, it makes no difference with a good compiler.
344 switch (bit) {
345 case 0:
346 internal::__fastunpack0(in, out);
347 break;
348 case 1:
349 internal::__fastunpack1(in, out);
350 break;
351 case 2:
352 internal::__fastunpack2(in, out);
353 break;
354 case 3:
355 internal::__fastunpack3(in, out);
356 break;
357 case 4:
358 internal::__fastunpack4(in, out);
359 break;
360 case 5:
361 internal::__fastunpack5(in, out);
362 break;
363 case 6:
364 internal::__fastunpack6(in, out);
365 break;
366 case 7:
367 internal::__fastunpack7(in, out);
368 break;
369 case 8:
370 internal::__fastunpack8(in, out);
371 break;
372 case 9:
373 internal::__fastunpack9(in, out);
374 break;
375 case 10:
376 internal::__fastunpack10(in, out);
377 break;
378 case 11:
379 internal::__fastunpack11(in, out);
380 break;
381 case 12:
382 internal::__fastunpack12(in, out);
383 break;
384 case 13:
385 internal::__fastunpack13(in, out);
386 break;
387 case 14:
388 internal::__fastunpack14(in, out);
389 break;
390 case 15:
391 internal::__fastunpack15(in, out);
392 break;
393 case 16:
394 internal::__fastunpack16(in, out);
395 break;
396 case 17:
397 internal::__fastunpack17(in, out);
398 break;
399 case 18:
400 internal::__fastunpack18(in, out);
401 break;
402 case 19:
403 internal::__fastunpack19(in, out);
404 break;
405 case 20:
406 internal::__fastunpack20(in, out);
407 break;
408 case 21:
409 internal::__fastunpack21(in, out);
410 break;
411 case 22:
412 internal::__fastunpack22(in, out);
413 break;
414 case 23:
415 internal::__fastunpack23(in, out);
416 break;
417 case 24:
418 internal::__fastunpack24(in, out);
419 break;
420 case 25:
421 internal::__fastunpack25(in, out);
422 break;
423 case 26:
424 internal::__fastunpack26(in, out);
425 break;
426 case 27:
427 internal::__fastunpack27(in, out);
428 break;
429 case 28:
430 internal::__fastunpack28(in, out);
431 break;
432 case 29:
433 internal::__fastunpack29(in, out);
434 break;
435 case 30:
436 internal::__fastunpack30(in, out);
437 break;
438 case 31:
439 internal::__fastunpack31(in, out);
440 break;
441 case 32:
442 internal::__fastunpack32(in, out);
443 break;
444 case 33:
445 internal::__fastunpack33(in, out);
446 break;
447 case 34:
448 internal::__fastunpack34(in, out);
449 break;
450 case 35:
451 internal::__fastunpack35(in, out);
452 break;
453 case 36:
454 internal::__fastunpack36(in, out);
455 break;
456 case 37:
457 internal::__fastunpack37(in, out);
458 break;
459 case 38:
460 internal::__fastunpack38(in, out);
461 break;
462 case 39:
463 internal::__fastunpack39(in, out);
464 break;
465 case 40:
466 internal::__fastunpack40(in, out);
467 break;
468 case 41:
469 internal::__fastunpack41(in, out);
470 break;
471 case 42:
472 internal::__fastunpack42(in, out);
473 break;
474 case 43:
475 internal::__fastunpack43(in, out);
476 break;
477 case 44:
478 internal::__fastunpack44(in, out);
479 break;
480 case 45:
481 internal::__fastunpack45(in, out);
482 break;
483 case 46:
484 internal::__fastunpack46(in, out);
485 break;
486 case 47:
487 internal::__fastunpack47(in, out);
488 break;
489 case 48:
490 internal::__fastunpack48(in, out);
491 break;
492 case 49:
493 internal::__fastunpack49(in, out);
494 break;
495 case 50:
496 internal::__fastunpack50(in, out);
497 break;
498 case 51:
499 internal::__fastunpack51(in, out);
500 break;
501 case 52:
502 internal::__fastunpack52(in, out);
503 break;
504 case 53:
505 internal::__fastunpack53(in, out);
506 break;
507 case 54:
508 internal::__fastunpack54(in, out);
509 break;
510 case 55:
511 internal::__fastunpack55(in, out);
512 break;
513 case 56:
514 internal::__fastunpack56(in, out);
515 break;
516 case 57:
517 internal::__fastunpack57(in, out);
518 break;
519 case 58:
520 internal::__fastunpack58(in, out);
521 break;
522 case 59:
523 internal::__fastunpack59(in, out);
524 break;
525 case 60:
526 internal::__fastunpack60(in, out);
527 break;
528 case 61:
529 internal::__fastunpack61(in, out);
530 break;
531 case 62:
532 internal::__fastunpack62(in, out);
533 break;
534 case 63:
535 internal::__fastunpack63(in, out);
536 break;
537 case 64:
538 internal::__fastunpack64(in, out);
539 break;
540 default:
541 throw std::logic_error("Invalid bit width for bitpacking");
542 }
543}
544
545inline void fastpack(const uint8_t *__restrict in, uint8_t *__restrict out, const uint32_t bit) {
546
547 for (uint8_t i = 0; i < 4; i++) {
548 internal::fastpack_quarter(in: in+(i*8), out: out + (i*bit), bit);
549 }
550}
551
552inline void fastpack(const uint16_t *__restrict in, uint16_t *__restrict out, const uint32_t bit) {
553 internal::fastpack_half(in, out, bit);
554 internal::fastpack_half(in: in+16, out: out + bit, bit);
555}
556
557inline void fastpack(const uint32_t *__restrict in,
558 uint32_t *__restrict out, const uint32_t bit) {
559 // Could have used function pointers instead of switch.
560 // Switch calls do offer the compiler more opportunities for optimization in
561 // theory. In this case, it makes no difference with a good compiler.
562 switch (bit) {
563 case 0:
564 internal::__fastpack0(in, out);
565 break;
566 case 1:
567 internal::__fastpack1(in, out);
568 break;
569 case 2:
570 internal::__fastpack2(in, out);
571 break;
572 case 3:
573 internal::__fastpack3(in, out);
574 break;
575 case 4:
576 internal::__fastpack4(in, out);
577 break;
578 case 5:
579 internal::__fastpack5(in, out);
580 break;
581 case 6:
582 internal::__fastpack6(in, out);
583 break;
584 case 7:
585 internal::__fastpack7(in, out);
586 break;
587 case 8:
588 internal::__fastpack8(in, out);
589 break;
590 case 9:
591 internal::__fastpack9(in, out);
592 break;
593 case 10:
594 internal::__fastpack10(in, out);
595 break;
596 case 11:
597 internal::__fastpack11(in, out);
598 break;
599 case 12:
600 internal::__fastpack12(in, out);
601 break;
602 case 13:
603 internal::__fastpack13(in, out);
604 break;
605 case 14:
606 internal::__fastpack14(in, out);
607 break;
608 case 15:
609 internal::__fastpack15(in, out);
610 break;
611 case 16:
612 internal::__fastpack16(in, out);
613 break;
614 case 17:
615 internal::__fastpack17(in, out);
616 break;
617 case 18:
618 internal::__fastpack18(in, out);
619 break;
620 case 19:
621 internal::__fastpack19(in, out);
622 break;
623 case 20:
624 internal::__fastpack20(in, out);
625 break;
626 case 21:
627 internal::__fastpack21(in, out);
628 break;
629 case 22:
630 internal::__fastpack22(in, out);
631 break;
632 case 23:
633 internal::__fastpack23(in, out);
634 break;
635 case 24:
636 internal::__fastpack24(in, out);
637 break;
638 case 25:
639 internal::__fastpack25(in, out);
640 break;
641 case 26:
642 internal::__fastpack26(in, out);
643 break;
644 case 27:
645 internal::__fastpack27(in, out);
646 break;
647 case 28:
648 internal::__fastpack28(in, out);
649 break;
650 case 29:
651 internal::__fastpack29(in, out);
652 break;
653 case 30:
654 internal::__fastpack30(in, out);
655 break;
656 case 31:
657 internal::__fastpack31(in, out);
658 break;
659 case 32:
660 internal::__fastpack32(in, out);
661 break;
662 default:
663 throw std::logic_error("Invalid bit width for bitpacking");
664 }
665}
666
667inline void fastpack(const uint64_t *__restrict in,
668 uint32_t *__restrict out, const uint32_t bit) {
669 switch (bit) {
670 case 0:
671 internal::__fastpack0(in, out);
672 break;
673 case 1:
674 internal::__fastpack1(in, out);
675 break;
676 case 2:
677 internal::__fastpack2(in, out);
678 break;
679 case 3:
680 internal::__fastpack3(in, out);
681 break;
682 case 4:
683 internal::__fastpack4(in, out);
684 break;
685 case 5:
686 internal::__fastpack5(in, out);
687 break;
688 case 6:
689 internal::__fastpack6(in, out);
690 break;
691 case 7:
692 internal::__fastpack7(in, out);
693 break;
694 case 8:
695 internal::__fastpack8(in, out);
696 break;
697 case 9:
698 internal::__fastpack9(in, out);
699 break;
700 case 10:
701 internal::__fastpack10(in, out);
702 break;
703 case 11:
704 internal::__fastpack11(in, out);
705 break;
706 case 12:
707 internal::__fastpack12(in, out);
708 break;
709 case 13:
710 internal::__fastpack13(in, out);
711 break;
712 case 14:
713 internal::__fastpack14(in, out);
714 break;
715 case 15:
716 internal::__fastpack15(in, out);
717 break;
718 case 16:
719 internal::__fastpack16(in, out);
720 break;
721 case 17:
722 internal::__fastpack17(in, out);
723 break;
724 case 18:
725 internal::__fastpack18(in, out);
726 break;
727 case 19:
728 internal::__fastpack19(in, out);
729 break;
730 case 20:
731 internal::__fastpack20(in, out);
732 break;
733 case 21:
734 internal::__fastpack21(in, out);
735 break;
736 case 22:
737 internal::__fastpack22(in, out);
738 break;
739 case 23:
740 internal::__fastpack23(in, out);
741 break;
742 case 24:
743 internal::__fastpack24(in, out);
744 break;
745 case 25:
746 internal::__fastpack25(in, out);
747 break;
748 case 26:
749 internal::__fastpack26(in, out);
750 break;
751 case 27:
752 internal::__fastpack27(in, out);
753 break;
754 case 28:
755 internal::__fastpack28(in, out);
756 break;
757 case 29:
758 internal::__fastpack29(in, out);
759 break;
760 case 30:
761 internal::__fastpack30(in, out);
762 break;
763 case 31:
764 internal::__fastpack31(in, out);
765 break;
766 case 32:
767 internal::__fastpack32(in, out);
768 break;
769 case 33:
770 internal::__fastpack33(in, out);
771 break;
772 case 34:
773 internal::__fastpack34(in, out);
774 break;
775 case 35:
776 internal::__fastpack35(in, out);
777 break;
778 case 36:
779 internal::__fastpack36(in, out);
780 break;
781 case 37:
782 internal::__fastpack37(in, out);
783 break;
784 case 38:
785 internal::__fastpack38(in, out);
786 break;
787 case 39:
788 internal::__fastpack39(in, out);
789 break;
790 case 40:
791 internal::__fastpack40(in, out);
792 break;
793 case 41:
794 internal::__fastpack41(in, out);
795 break;
796 case 42:
797 internal::__fastpack42(in, out);
798 break;
799 case 43:
800 internal::__fastpack43(in, out);
801 break;
802 case 44:
803 internal::__fastpack44(in, out);
804 break;
805 case 45:
806 internal::__fastpack45(in, out);
807 break;
808 case 46:
809 internal::__fastpack46(in, out);
810 break;
811 case 47:
812 internal::__fastpack47(in, out);
813 break;
814 case 48:
815 internal::__fastpack48(in, out);
816 break;
817 case 49:
818 internal::__fastpack49(in, out);
819 break;
820 case 50:
821 internal::__fastpack50(in, out);
822 break;
823 case 51:
824 internal::__fastpack51(in, out);
825 break;
826 case 52:
827 internal::__fastpack52(in, out);
828 break;
829 case 53:
830 internal::__fastpack53(in, out);
831 break;
832 case 54:
833 internal::__fastpack54(in, out);
834 break;
835 case 55:
836 internal::__fastpack55(in, out);
837 break;
838 case 56:
839 internal::__fastpack56(in, out);
840 break;
841 case 57:
842 internal::__fastpack57(in, out);
843 break;
844 case 58:
845 internal::__fastpack58(in, out);
846 break;
847 case 59:
848 internal::__fastpack59(in, out);
849 break;
850 case 60:
851 internal::__fastpack60(in, out);
852 break;
853 case 61:
854 internal::__fastpack61(in, out);
855 break;
856 case 62:
857 internal::__fastpack62(in, out);
858 break;
859 case 63:
860 internal::__fastpack63(in, out);
861 break;
862 case 64:
863 internal::__fastpack64(in, out);
864 break;
865 default:
866 throw std::logic_error("Invalid bit width for bitpacking");
867 }
868}
} // namespace duckdb_fastpforlib
870