1#include <assert.h>
2#include <stdint.h>
3#include <stdio.h>
4#include <stdlib.h>
5#include <string.h>
6
7#include <roaring/bitset_util.h>
8
9#ifdef IS_X64
10static uint8_t lengthTable[256] = {
11 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
12 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
13 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
14 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
15 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
16 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
17 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
18 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
19 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
20 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
21 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
22#endif
23
24#ifdef USEAVX
25ALIGNED(32)
26static uint32_t vecDecodeTable[256][8] = {
27 {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
28 {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
29 {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
30 {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
31 {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
32 {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
33 {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
34 {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
35 {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
36 {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
37 {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
38 {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
39 {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
40 {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
41 {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
42 {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
43 {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
44 {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
45 {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
46 {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
47 {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
48 {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
49 {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
50 {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
51 {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
52 {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
53 {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
54 {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
55 {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
56 {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
57 {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
58 {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
59 {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
60 {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
61 {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
62 {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
63 {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
64 {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
65 {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
66 {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
67 {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
68 {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
69 {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
70 {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
71 {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
72 {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
73 {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
74 {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
75 {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
76 {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
77 {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
78 {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
79 {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
80 {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
81 {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
82 {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
83 {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
84 {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
85 {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
86 {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
87 {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
88 {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
89 {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
90 {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
91 {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
92 {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
93 {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
94 {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
95 {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
96 {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
97 {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
98 {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
99 {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
100 {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
101 {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
102 {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
103 {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
104 {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
105 {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
106 {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
107 {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
108 {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
109 {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
110 {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
111 {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
112 {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
113 {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
114 {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
115 {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
116 {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
117 {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
118 {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
119 {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
120 {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
121 {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
122 {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
123 {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
124 {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
125 {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
126 {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
127 {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
128 {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
129 {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
130 {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
131 {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
132 {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
133 {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
134 {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
135 {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
136 {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
137 {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
138 {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
139 {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
140 {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
141 {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
142 {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
143 {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
144 {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
145 {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
146 {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
147 {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
148 {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
149 {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
150 {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
151 {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
152 {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
153 {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
154 {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
155 {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
156 {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
157 {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
158 {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
159 {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
160 {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
161 {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
162 {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
163 {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
164 {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
165 {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
166 {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
167 {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
168 {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
169 {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
170 {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
171 {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
172 {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
173 {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
174 {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
175 {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
176 {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
177 {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
178 {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
179 {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
180 {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
181 {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
182 {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
183 {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
184 {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
185 {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
186 {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
187 {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
188 {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
189 {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
190 {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
191 {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
192 {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
193 {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
194 {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
195 {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
196 {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
197 {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
198 {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
199 {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
200 {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
201 {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
202 {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
203 {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
204 {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
205 {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
206 {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
207 {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
208 {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
209 {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
210 {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
211 {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
212 {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
213 {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
214 {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
215 {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
216 {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
217 {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
218 {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
219 {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
220 {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
221 {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
222 {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
223 {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
224 {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
225 {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
226 {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
227 {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
228 {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
229 {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
230 {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
231 {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
232 {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
233 {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
234 {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
235 {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
236 {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
237 {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
238 {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
239 {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
240 {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
241 {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
242 {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
243 {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
244 {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
245 {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
246 {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
247 {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
248 {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
249 {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
250 {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
251 {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
252 {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
253 {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
254 {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
255 {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
256 {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
257 {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
258 {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
259 {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
260 {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
261 {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
262 {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
263 {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
264 {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
265 {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
266 {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
267 {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
268 {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
269 {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
270 {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
271 {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
272 {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
273 {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
274 {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
275 {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
276 {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
277 {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
278 {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
279 {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
280 {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
281 {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
282 {1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */
283};
284
285#endif // #ifdef USEAVX
286
287#ifdef IS_X64
288// same as vecDecodeTable but in 16 bits
289ALIGNED(32)
290static uint16_t vecDecodeTable_uint16[256][8] = {
291 {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
292 {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
293 {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
294 {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
295 {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
296 {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
297 {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
298 {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
299 {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
300 {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
301 {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
302 {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
303 {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
304 {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
305 {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
306 {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
307 {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
308 {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
309 {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
310 {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
311 {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
312 {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
313 {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
314 {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
315 {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
316 {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
317 {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
318 {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
319 {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
320 {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
321 {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
322 {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
323 {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
324 {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
325 {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
326 {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
327 {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
328 {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
329 {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
330 {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
331 {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
332 {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
333 {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
334 {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
335 {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
336 {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
337 {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
338 {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
339 {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
340 {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
341 {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
342 {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
343 {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
344 {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
345 {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
346 {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
347 {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
348 {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
349 {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
350 {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
351 {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
352 {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
353 {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
354 {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
355 {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
356 {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
357 {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
358 {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
359 {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
360 {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
361 {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
362 {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
363 {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
364 {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
365 {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
366 {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
367 {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
368 {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
369 {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
370 {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
371 {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
372 {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
373 {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
374 {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
375 {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
376 {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
377 {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
378 {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
379 {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
380 {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
381 {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
382 {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
383 {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
384 {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
385 {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
386 {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
387 {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
388 {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
389 {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
390 {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
391 {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
392 {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
393 {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
394 {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
395 {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
396 {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
397 {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
398 {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
399 {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
400 {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
401 {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
402 {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
403 {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
404 {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
405 {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
406 {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
407 {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
408 {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
409 {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
410 {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
411 {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
412 {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
413 {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
414 {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
415 {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
416 {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
417 {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
418 {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
419 {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
420 {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
421 {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
422 {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
423 {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
424 {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
425 {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
426 {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
427 {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
428 {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
429 {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
430 {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
431 {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
432 {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
433 {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
434 {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
435 {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
436 {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
437 {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
438 {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
439 {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
440 {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
441 {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
442 {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
443 {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
444 {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
445 {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
446 {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
447 {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
448 {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
449 {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
450 {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
451 {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
452 {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
453 {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
454 {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
455 {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
456 {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
457 {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
458 {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
459 {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
460 {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
461 {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
462 {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
463 {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
464 {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
465 {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
466 {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
467 {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
468 {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
469 {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
470 {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
471 {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
472 {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
473 {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
474 {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
475 {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
476 {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
477 {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
478 {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
479 {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
480 {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
481 {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
482 {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
483 {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
484 {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
485 {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
486 {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
487 {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
488 {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
489 {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
490 {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
491 {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
492 {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
493 {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
494 {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
495 {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
496 {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
497 {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
498 {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
499 {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
500 {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
501 {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
502 {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
503 {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
504 {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
505 {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
506 {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
507 {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
508 {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
509 {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
510 {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
511 {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
512 {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
513 {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
514 {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
515 {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
516 {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
517 {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
518 {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
519 {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
520 {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
521 {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
522 {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
523 {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
524 {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
525 {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
526 {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
527 {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
528 {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
529 {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
530 {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
531 {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
532 {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
533 {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
534 {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
535 {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
536 {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
537 {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
538 {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
539 {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
540 {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
541 {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
542 {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
543 {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
544 {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
545 {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
546 {1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */
547};
548
549#endif
550
551#ifdef USEAVX
552
553size_t bitset_extract_setbits_avx2(uint64_t *array, size_t length, void *vout,
554 size_t outcapacity, uint32_t base) {
555 uint32_t *out = (uint32_t *)vout;
556 uint32_t *initout = out;
557 __m256i baseVec = _mm256_set1_epi32(base - 1);
558 __m256i incVec = _mm256_set1_epi32(64);
559 __m256i add8 = _mm256_set1_epi32(8);
560 uint32_t *safeout = out + outcapacity;
561 size_t i = 0;
562 for (; (i < length) && (out + 64 <= safeout); ++i) {
563 uint64_t w = array[i];
564 if (w == 0) {
565 baseVec = _mm256_add_epi32(baseVec, incVec);
566 } else {
567 for (int k = 0; k < 4; ++k) {
568 uint8_t byteA = (uint8_t)w;
569 uint8_t byteB = (uint8_t)(w >> 8);
570 w >>= 16;
571 __m256i vecA =
572 _mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
573 __m256i vecB =
574 _mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
575 uint8_t advanceA = lengthTable[byteA];
576 uint8_t advanceB = lengthTable[byteB];
577 vecA = _mm256_add_epi32(baseVec, vecA);
578 baseVec = _mm256_add_epi32(baseVec, add8);
579 vecB = _mm256_add_epi32(baseVec, vecB);
580 baseVec = _mm256_add_epi32(baseVec, add8);
581 _mm256_storeu_si256((__m256i *)out, vecA);
582 out += advanceA;
583 _mm256_storeu_si256((__m256i *)out, vecB);
584 out += advanceB;
585 }
586 }
587 }
588 base += i * 64;
589 for (; (i < length) && (out < safeout); ++i) {
590 uint64_t w = array[i];
591 while ((w != 0) && (out < safeout)) {
592 uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
593 int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
594 uint32_t val = r + base;
595 memcpy(out, &val,
596 sizeof(uint32_t)); // should be compiled as a MOV on x64
597 out++;
598 w ^= t;
599 }
600 base += 64;
601 }
602 return out - initout;
603}
604#endif // USEAVX
605
606size_t bitset_extract_setbits(uint64_t *bitset, size_t length, void *vout,
607 uint32_t base) {
608 int outpos = 0;
609 uint32_t *out = (uint32_t *)vout;
610 for (size_t i = 0; i < length; ++i) {
611 uint64_t w = bitset[i];
612 while (w != 0) {
613 uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
614 int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
615 uint32_t val = r + base;
616 memcpy(out + outpos, &val,
617 sizeof(uint32_t)); // should be compiled as a MOV on x64
618 outpos++;
619 w ^= t;
620 }
621 base += 64;
622 }
623 return outpos;
624}
625
626size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ bitset1,
627 const uint64_t * __restrict__ bitset2,
628 size_t length, uint16_t *out,
629 uint16_t base) {
630 int outpos = 0;
631 for (size_t i = 0; i < length; ++i) {
632 uint64_t w = bitset1[i] & bitset2[i];
633 while (w != 0) {
634 uint64_t t = w & (~w + 1);
635 int r = __builtin_ctzll(w);
636 out[outpos++] = r + base;
637 w ^= t;
638 }
639 base += 64;
640 }
641 return outpos;
642}
643
644#ifdef IS_X64
645/*
646 * Given a bitset containing "length" 64-bit words, write out the position
647 * of all the set bits to "out" as 16-bit integers, values start at "base" (can
648 *be set to zero).
649 *
650 * The "out" pointer should be sufficient to store the actual number of bits
651 *set.
652 *
653 * Returns how many values were actually decoded.
654 *
655 * This function uses SSE decoding.
656 */
657size_t bitset_extract_setbits_sse_uint16(const uint64_t *bitset, size_t length,
658 uint16_t *out, size_t outcapacity,
659 uint16_t base) {
660 uint16_t *initout = out;
661 __m128i baseVec = _mm_set1_epi16(base - 1);
662 __m128i incVec = _mm_set1_epi16(64);
663 __m128i add8 = _mm_set1_epi16(8);
664 uint16_t *safeout = out + outcapacity;
665 const int numberofbytes = 2; // process two bytes at a time
666 size_t i = 0;
667 for (; (i < length) && (out + numberofbytes * 8 <= safeout); ++i) {
668 uint64_t w = bitset[i];
669 if (w == 0) {
670 baseVec = _mm_add_epi16(baseVec, incVec);
671 } else {
672 for (int k = 0; k < 4; ++k) {
673 uint8_t byteA = (uint8_t)w;
674 uint8_t byteB = (uint8_t)(w >> 8);
675 w >>= 16;
676 __m128i vecA = _mm_load_si128(
677 (const __m128i *)vecDecodeTable_uint16[byteA]);
678 __m128i vecB = _mm_load_si128(
679 (const __m128i *)vecDecodeTable_uint16[byteB]);
680 uint8_t advanceA = lengthTable[byteA];
681 uint8_t advanceB = lengthTable[byteB];
682 vecA = _mm_add_epi16(baseVec, vecA);
683 baseVec = _mm_add_epi16(baseVec, add8);
684 vecB = _mm_add_epi16(baseVec, vecB);
685 baseVec = _mm_add_epi16(baseVec, add8);
686 _mm_storeu_si128((__m128i *)out, vecA);
687 out += advanceA;
688 _mm_storeu_si128((__m128i *)out, vecB);
689 out += advanceB;
690 }
691 }
692 }
693 base += (uint16_t)(i * 64);
694 for (; (i < length) && (out < safeout); ++i) {
695 uint64_t w = bitset[i];
696 while ((w != 0) && (out < safeout)) {
697 uint64_t t = w & (~w + 1);
698 int r = __builtin_ctzll(w);
699 *out = r + base;
700 out++;
701 w ^= t;
702 }
703 base += 64;
704 }
705 return out - initout;
706}
707#endif
708
709/*
710 * Given a bitset containing "length" 64-bit words, write out the position
711 * of all the set bits to "out", values start at "base" (can be set to zero).
712 *
713 * The "out" pointer should be sufficient to store the actual number of bits
714 *set.
715 *
716 * Returns how many values were actually decoded.
717 */
718size_t bitset_extract_setbits_uint16(const uint64_t *bitset, size_t length,
719 uint16_t *out, uint16_t base) {
720 int outpos = 0;
721 for (size_t i = 0; i < length; ++i) {
722 uint64_t w = bitset[i];
723 while (w != 0) {
724 uint64_t t = w & (~w + 1);
725 int r = __builtin_ctzll(w);
726 out[outpos++] = r + base;
727 w ^= t;
728 }
729 base += 64;
730 }
731 return outpos;
732}
733
734#if defined(ASMBITMANIPOPTIMIZATION)
735
736uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
737 const uint16_t *list, uint64_t length) {
738 uint64_t offset, load, pos;
739 uint64_t shift = 6;
740 const uint16_t *end = list + length;
741 if (!length) return card;
742 // TODO: could unroll for performance, see bitset_set_list
743 // bts is not available as an intrinsic in GCC
744 __asm volatile(
745 "1:\n"
746 "movzwq (%[list]), %[pos]\n"
747 "shrx %[shift], %[pos], %[offset]\n"
748 "mov (%[bitset],%[offset],8), %[load]\n"
749 "bts %[pos], %[load]\n"
750 "mov %[load], (%[bitset],%[offset],8)\n"
751 "sbb $-1, %[card]\n"
752 "add $2, %[list]\n"
753 "cmp %[list], %[end]\n"
754 "jnz 1b"
755 : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
756 [pos] "=&r"(pos), [offset] "=&r"(offset)
757 : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift));
758 return card;
759}
760
761void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
762 uint64_t pos;
763 const uint16_t *end = list + length;
764
765 uint64_t shift = 6;
766 uint64_t offset;
767 uint64_t load;
768 for (; list + 3 < end; list += 4) {
769 pos = list[0];
770 __asm volatile(
771 "shrx %[shift], %[pos], %[offset]\n"
772 "mov (%[bitset],%[offset],8), %[load]\n"
773 "bts %[pos], %[load]\n"
774 "mov %[load], (%[bitset],%[offset],8)"
775 : [load] "=&r"(load), [offset] "=&r"(offset)
776 : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos));
777 pos = list[1];
778 __asm volatile(
779 "shrx %[shift], %[pos], %[offset]\n"
780 "mov (%[bitset],%[offset],8), %[load]\n"
781 "bts %[pos], %[load]\n"
782 "mov %[load], (%[bitset],%[offset],8)"
783 : [load] "=&r"(load), [offset] "=&r"(offset)
784 : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos));
785 pos = list[2];
786 __asm volatile(
787 "shrx %[shift], %[pos], %[offset]\n"
788 "mov (%[bitset],%[offset],8), %[load]\n"
789 "bts %[pos], %[load]\n"
790 "mov %[load], (%[bitset],%[offset],8)"
791 : [load] "=&r"(load), [offset] "=&r"(offset)
792 : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos));
793 pos = list[3];
794 __asm volatile(
795 "shrx %[shift], %[pos], %[offset]\n"
796 "mov (%[bitset],%[offset],8), %[load]\n"
797 "bts %[pos], %[load]\n"
798 "mov %[load], (%[bitset],%[offset],8)"
799 : [load] "=&r"(load), [offset] "=&r"(offset)
800 : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos));
801 }
802
803 while (list != end) {
804 pos = list[0];
805 __asm volatile(
806 "shrx %[shift], %[pos], %[offset]\n"
807 "mov (%[bitset],%[offset],8), %[load]\n"
808 "bts %[pos], %[load]\n"
809 "mov %[load], (%[bitset],%[offset],8)"
810 : [load] "=&r"(load), [offset] "=&r"(offset)
811 : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos));
812 list++;
813 }
814}
815
816uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
817 uint64_t length) {
818 uint64_t offset, load, pos;
819 uint64_t shift = 6;
820 const uint16_t *end = list + length;
821 if (!length) return card;
822 // btr is not available as an intrinsic in GCC
823 __asm volatile(
824 "1:\n"
825 "movzwq (%[list]), %[pos]\n"
826 "shrx %[shift], %[pos], %[offset]\n"
827 "mov (%[bitset],%[offset],8), %[load]\n"
828 "btr %[pos], %[load]\n"
829 "mov %[load], (%[bitset],%[offset],8)\n"
830 "sbb $0, %[card]\n"
831 "add $2, %[list]\n"
832 "cmp %[list], %[end]\n"
833 "jnz 1b"
834 : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
835 [pos] "=&r"(pos), [offset] "=&r"(offset)
836 : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
837 :
838 /* clobbers */ "memory");
839 return card;
840}
841
842#else
843uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
844 uint64_t length) {
845 uint64_t offset, load, newload, pos, index;
846 const uint16_t *end = list + length;
847 while (list != end) {
848 pos = *(const uint16_t *)list;
849 offset = pos >> 6;
850 index = pos % 64;
851 load = ((uint64_t *)bitset)[offset];
852 newload = load & ~(UINT64_C(1) << index);
853 card -= (load ^ newload) >> index;
854 ((uint64_t *)bitset)[offset] = newload;
855 list++;
856 }
857 return card;
858}
859
860uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
861 const uint16_t *list, uint64_t length) {
862 uint64_t offset, load, newload, pos, index;
863 const uint16_t *end = list + length;
864 while (list != end) {
865 pos = *(const uint16_t *)list;
866 offset = pos >> 6;
867 index = pos % 64;
868 load = ((uint64_t *)bitset)[offset];
869 newload = load | (UINT64_C(1) << index);
870 card += (load ^ newload) >> index;
871 ((uint64_t *)bitset)[offset] = newload;
872 list++;
873 }
874 return card;
875}
876
877void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
878 uint64_t offset, load, newload, pos, index;
879 const uint16_t *end = list + length;
880 while (list != end) {
881 pos = *(const uint16_t *)list;
882 offset = pos >> 6;
883 index = pos % 64;
884 load = ((uint64_t *)bitset)[offset];
885 newload = load | (UINT64_C(1) << index);
886 ((uint64_t *)bitset)[offset] = newload;
887 list++;
888 }
889}
890
891#endif
892
893/* flip specified bits */
894/* TODO: consider whether worthwhile to make an asm version */
895
896uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card,
897 const uint16_t *list, uint64_t length) {
898 uint64_t offset, load, newload, pos, index;
899 const uint16_t *end = list + length;
900 while (list != end) {
901 pos = *(const uint16_t *)list;
902 offset = pos >> 6;
903 index = pos % 64;
904 load = ((uint64_t *)bitset)[offset];
905 newload = load ^ (UINT64_C(1) << index);
906 // todo: is a branch here all that bad?
907 card +=
908 (1 - 2 * (((UINT64_C(1) << index) & load) >> index)); // +1 or -1
909 ((uint64_t *)bitset)[offset] = newload;
910 list++;
911 }
912 return card;
913}
914
915void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length) {
916 uint64_t offset, load, newload, pos, index;
917 const uint16_t *end = list + length;
918 while (list != end) {
919 pos = *(const uint16_t *)list;
920 offset = pos >> 6;
921 index = pos % 64;
922 load = ((uint64_t *)bitset)[offset];
923 newload = load ^ (UINT64_C(1) << index);
924 ((uint64_t *)bitset)[offset] = newload;
925 list++;
926 }
927}
928