| 1 | #include <assert.h> |
| 2 | #include <stdint.h> |
| 3 | #include <stdio.h> |
| 4 | #include <stdlib.h> |
| 5 | #include <string.h> |
| 6 | |
| 7 | #include <roaring/bitset_util.h> |
| 8 | |
| 9 | #ifdef IS_X64 |
| 10 | static uint8_t lengthTable[256] = { |
| 11 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, |
| 12 | 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
| 13 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, |
| 14 | 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
| 15 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, |
| 16 | 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
| 17 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, |
| 18 | 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
| 19 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, |
| 20 | 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
| 21 | 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; |
| 22 | #endif |
| 23 | |
| 24 | #ifdef USEAVX |
| 25 | ALIGNED(32) |
| 26 | static uint32_t vecDecodeTable[256][8] = { |
| 27 | {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */ |
| 28 | {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */ |
| 29 | {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */ |
| 30 | {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */ |
| 31 | {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */ |
| 32 | {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */ |
| 33 | {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */ |
| 34 | {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */ |
| 35 | {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */ |
| 36 | {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */ |
| 37 | {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */ |
| 38 | {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */ |
| 39 | {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */ |
| 40 | {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */ |
| 41 | {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */ |
| 42 | {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */ |
| 43 | {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */ |
| 44 | {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */ |
| 45 | {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */ |
| 46 | {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */ |
| 47 | {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */ |
| 48 | {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */ |
| 49 | {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */ |
| 50 | {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */ |
| 51 | {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */ |
| 52 | {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */ |
| 53 | {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */ |
| 54 | {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */ |
| 55 | {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */ |
| 56 | {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */ |
| 57 | {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */ |
| 58 | {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */ |
| 59 | {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */ |
| 60 | {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */ |
| 61 | {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */ |
| 62 | {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */ |
| 63 | {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */ |
| 64 | {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */ |
| 65 | {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */ |
| 66 | {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */ |
| 67 | {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */ |
| 68 | {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */ |
| 69 | {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */ |
| 70 | {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */ |
| 71 | {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */ |
| 72 | {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */ |
| 73 | {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */ |
| 74 | {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */ |
| 75 | {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */ |
| 76 | {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */ |
| 77 | {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */ |
| 78 | {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */ |
| 79 | {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */ |
| 80 | {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */ |
| 81 | {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */ |
| 82 | {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */ |
| 83 | {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */ |
| 84 | {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */ |
| 85 | {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */ |
| 86 | {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */ |
| 87 | {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */ |
| 88 | {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */ |
| 89 | {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */ |
| 90 | {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */ |
| 91 | {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */ |
| 92 | {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */ |
| 93 | {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */ |
| 94 | {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */ |
| 95 | {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */ |
| 96 | {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */ |
| 97 | {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */ |
| 98 | {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */ |
| 99 | {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */ |
| 100 | {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */ |
| 101 | {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */ |
| 102 | {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */ |
| 103 | {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */ |
| 104 | {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */ |
| 105 | {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */ |
| 106 | {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */ |
| 107 | {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */ |
| 108 | {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */ |
| 109 | {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */ |
| 110 | {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */ |
| 111 | {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */ |
| 112 | {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */ |
| 113 | {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */ |
| 114 | {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */ |
| 115 | {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */ |
| 116 | {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */ |
| 117 | {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */ |
| 118 | {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */ |
| 119 | {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */ |
| 120 | {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */ |
| 121 | {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */ |
| 122 | {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */ |
| 123 | {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */ |
| 124 | {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */ |
| 125 | {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */ |
| 126 | {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */ |
| 127 | {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */ |
| 128 | {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */ |
| 129 | {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */ |
| 130 | {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */ |
| 131 | {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */ |
| 132 | {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */ |
| 133 | {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */ |
| 134 | {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */ |
| 135 | {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */ |
| 136 | {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */ |
| 137 | {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */ |
| 138 | {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */ |
| 139 | {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */ |
| 140 | {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */ |
| 141 | {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */ |
| 142 | {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */ |
| 143 | {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */ |
| 144 | {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */ |
| 145 | {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */ |
| 146 | {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */ |
| 147 | {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */ |
| 148 | {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */ |
| 149 | {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */ |
| 150 | {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */ |
| 151 | {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */ |
| 152 | {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */ |
| 153 | {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */ |
| 154 | {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */ |
| 155 | {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */ |
| 156 | {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */ |
| 157 | {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */ |
| 158 | {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */ |
| 159 | {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */ |
| 160 | {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */ |
| 161 | {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */ |
| 162 | {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */ |
| 163 | {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */ |
| 164 | {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */ |
| 165 | {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */ |
| 166 | {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */ |
| 167 | {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */ |
| 168 | {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */ |
| 169 | {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */ |
| 170 | {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */ |
| 171 | {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */ |
| 172 | {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */ |
| 173 | {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */ |
| 174 | {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */ |
| 175 | {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */ |
| 176 | {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */ |
| 177 | {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */ |
| 178 | {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */ |
| 179 | {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */ |
| 180 | {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */ |
| 181 | {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */ |
| 182 | {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */ |
| 183 | {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */ |
| 184 | {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */ |
| 185 | {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */ |
| 186 | {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */ |
| 187 | {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */ |
| 188 | {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */ |
| 189 | {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */ |
| 190 | {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */ |
| 191 | {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */ |
| 192 | {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */ |
| 193 | {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */ |
| 194 | {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */ |
| 195 | {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */ |
| 196 | {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */ |
| 197 | {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */ |
| 198 | {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */ |
| 199 | {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */ |
| 200 | {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */ |
| 201 | {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */ |
| 202 | {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */ |
| 203 | {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */ |
| 204 | {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */ |
| 205 | {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */ |
| 206 | {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */ |
| 207 | {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */ |
| 208 | {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */ |
| 209 | {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */ |
| 210 | {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */ |
| 211 | {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */ |
| 212 | {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */ |
| 213 | {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */ |
| 214 | {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */ |
| 215 | {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */ |
| 216 | {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */ |
| 217 | {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */ |
| 218 | {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */ |
| 219 | {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */ |
| 220 | {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */ |
| 221 | {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */ |
| 222 | {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */ |
| 223 | {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */ |
| 224 | {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */ |
| 225 | {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */ |
| 226 | {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */ |
| 227 | {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */ |
| 228 | {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */ |
| 229 | {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */ |
| 230 | {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */ |
| 231 | {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */ |
| 232 | {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */ |
| 233 | {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */ |
| 234 | {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */ |
| 235 | {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */ |
| 236 | {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */ |
| 237 | {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */ |
| 238 | {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */ |
| 239 | {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */ |
| 240 | {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */ |
| 241 | {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */ |
| 242 | {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */ |
| 243 | {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */ |
| 244 | {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */ |
| 245 | {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */ |
| 246 | {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */ |
| 247 | {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */ |
| 248 | {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */ |
| 249 | {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */ |
| 250 | {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */ |
| 251 | {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */ |
| 252 | {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */ |
| 253 | {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */ |
| 254 | {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */ |
| 255 | {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */ |
| 256 | {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */ |
| 257 | {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */ |
| 258 | {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */ |
| 259 | {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */ |
| 260 | {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */ |
| 261 | {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */ |
| 262 | {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */ |
| 263 | {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */ |
| 264 | {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */ |
| 265 | {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */ |
| 266 | {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */ |
| 267 | {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */ |
| 268 | {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */ |
| 269 | {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */ |
| 270 | {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */ |
| 271 | {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */ |
| 272 | {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */ |
| 273 | {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */ |
| 274 | {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */ |
| 275 | {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */ |
| 276 | {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */ |
| 277 | {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */ |
| 278 | {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */ |
| 279 | {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */ |
| 280 | {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */ |
| 281 | {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */ |
| 282 | {1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */ |
| 283 | }; |
| 284 | |
| 285 | #endif // #ifdef USEAVX |
| 286 | |
| 287 | #ifdef IS_X64 |
| 288 | // same as vecDecodeTable but in 16 bits |
| 289 | ALIGNED(32) |
| 290 | static uint16_t vecDecodeTable_uint16[256][8] = { |
| 291 | {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */ |
| 292 | {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */ |
| 293 | {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */ |
| 294 | {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */ |
| 295 | {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */ |
| 296 | {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */ |
| 297 | {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */ |
| 298 | {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */ |
| 299 | {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */ |
| 300 | {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */ |
| 301 | {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */ |
| 302 | {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */ |
| 303 | {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */ |
| 304 | {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */ |
| 305 | {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */ |
| 306 | {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */ |
| 307 | {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */ |
| 308 | {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */ |
| 309 | {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */ |
| 310 | {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */ |
| 311 | {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */ |
| 312 | {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */ |
| 313 | {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */ |
| 314 | {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */ |
| 315 | {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */ |
| 316 | {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */ |
| 317 | {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */ |
| 318 | {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */ |
| 319 | {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */ |
| 320 | {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */ |
| 321 | {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */ |
| 322 | {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */ |
| 323 | {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */ |
| 324 | {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */ |
| 325 | {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */ |
| 326 | {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */ |
| 327 | {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */ |
| 328 | {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */ |
| 329 | {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */ |
| 330 | {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */ |
| 331 | {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */ |
| 332 | {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */ |
| 333 | {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */ |
| 334 | {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */ |
| 335 | {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */ |
| 336 | {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */ |
| 337 | {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */ |
| 338 | {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */ |
| 339 | {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */ |
| 340 | {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */ |
| 341 | {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */ |
| 342 | {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */ |
| 343 | {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */ |
| 344 | {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */ |
| 345 | {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */ |
| 346 | {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */ |
| 347 | {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */ |
| 348 | {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */ |
| 349 | {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */ |
| 350 | {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */ |
| 351 | {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */ |
| 352 | {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */ |
| 353 | {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */ |
| 354 | {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */ |
| 355 | {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */ |
| 356 | {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */ |
| 357 | {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */ |
| 358 | {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */ |
| 359 | {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */ |
| 360 | {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */ |
| 361 | {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */ |
| 362 | {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */ |
| 363 | {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */ |
| 364 | {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */ |
| 365 | {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */ |
| 366 | {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */ |
| 367 | {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */ |
| 368 | {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */ |
| 369 | {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */ |
| 370 | {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */ |
| 371 | {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */ |
| 372 | {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */ |
| 373 | {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */ |
| 374 | {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */ |
| 375 | {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */ |
| 376 | {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */ |
| 377 | {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */ |
| 378 | {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */ |
| 379 | {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */ |
| 380 | {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */ |
| 381 | {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */ |
| 382 | {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */ |
| 383 | {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */ |
| 384 | {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */ |
| 385 | {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */ |
| 386 | {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */ |
| 387 | {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */ |
| 388 | {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */ |
| 389 | {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */ |
| 390 | {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */ |
| 391 | {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */ |
| 392 | {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */ |
| 393 | {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */ |
| 394 | {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */ |
| 395 | {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */ |
| 396 | {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */ |
| 397 | {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */ |
| 398 | {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */ |
| 399 | {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */ |
| 400 | {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */ |
| 401 | {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */ |
| 402 | {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */ |
| 403 | {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */ |
| 404 | {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */ |
| 405 | {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */ |
| 406 | {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */ |
| 407 | {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */ |
| 408 | {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */ |
| 409 | {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */ |
| 410 | {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */ |
| 411 | {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */ |
| 412 | {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */ |
| 413 | {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */ |
| 414 | {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */ |
| 415 | {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */ |
| 416 | {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */ |
| 417 | {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */ |
| 418 | {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */ |
| 419 | {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */ |
| 420 | {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */ |
| 421 | {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */ |
| 422 | {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */ |
| 423 | {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */ |
| 424 | {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */ |
| 425 | {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */ |
| 426 | {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */ |
| 427 | {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */ |
| 428 | {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */ |
| 429 | {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */ |
| 430 | {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */ |
| 431 | {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */ |
| 432 | {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */ |
| 433 | {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */ |
| 434 | {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */ |
| 435 | {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */ |
| 436 | {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */ |
| 437 | {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */ |
| 438 | {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */ |
| 439 | {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */ |
| 440 | {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */ |
| 441 | {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */ |
| 442 | {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */ |
| 443 | {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */ |
| 444 | {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */ |
| 445 | {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */ |
| 446 | {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */ |
| 447 | {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */ |
| 448 | {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */ |
| 449 | {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */ |
| 450 | {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */ |
| 451 | {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */ |
| 452 | {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */ |
| 453 | {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */ |
| 454 | {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */ |
| 455 | {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */ |
| 456 | {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */ |
| 457 | {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */ |
| 458 | {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */ |
| 459 | {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */ |
| 460 | {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */ |
| 461 | {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */ |
| 462 | {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */ |
| 463 | {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */ |
| 464 | {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */ |
| 465 | {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */ |
| 466 | {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */ |
| 467 | {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */ |
| 468 | {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */ |
| 469 | {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */ |
| 470 | {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */ |
| 471 | {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */ |
| 472 | {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */ |
| 473 | {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */ |
| 474 | {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */ |
| 475 | {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */ |
| 476 | {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */ |
| 477 | {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */ |
| 478 | {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */ |
| 479 | {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */ |
| 480 | {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */ |
| 481 | {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */ |
| 482 | {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */ |
| 483 | {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */ |
| 484 | {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */ |
| 485 | {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */ |
| 486 | {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */ |
| 487 | {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */ |
| 488 | {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */ |
| 489 | {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */ |
| 490 | {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */ |
| 491 | {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */ |
| 492 | {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */ |
| 493 | {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */ |
| 494 | {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */ |
| 495 | {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */ |
| 496 | {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */ |
| 497 | {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */ |
| 498 | {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */ |
| 499 | {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */ |
| 500 | {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */ |
| 501 | {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */ |
| 502 | {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */ |
| 503 | {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */ |
| 504 | {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */ |
| 505 | {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */ |
| 506 | {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */ |
| 507 | {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */ |
| 508 | {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */ |
| 509 | {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */ |
| 510 | {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */ |
| 511 | {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */ |
| 512 | {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */ |
| 513 | {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */ |
| 514 | {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */ |
| 515 | {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */ |
| 516 | {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */ |
| 517 | {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */ |
| 518 | {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */ |
| 519 | {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */ |
| 520 | {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */ |
| 521 | {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */ |
| 522 | {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */ |
| 523 | {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */ |
| 524 | {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */ |
| 525 | {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */ |
| 526 | {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */ |
| 527 | {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */ |
| 528 | {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */ |
| 529 | {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */ |
| 530 | {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */ |
| 531 | {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */ |
| 532 | {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */ |
| 533 | {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */ |
| 534 | {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */ |
| 535 | {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */ |
| 536 | {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */ |
| 537 | {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */ |
| 538 | {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */ |
| 539 | {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */ |
| 540 | {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */ |
| 541 | {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */ |
| 542 | {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */ |
| 543 | {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */ |
| 544 | {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */ |
| 545 | {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */ |
| 546 | {1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */ |
| 547 | }; |
| 548 | |
| 549 | #endif |
| 550 | |
| 551 | #ifdef USEAVX |
| 552 | |
| 553 | size_t (uint64_t *array, size_t length, void *vout, |
| 554 | size_t outcapacity, uint32_t base) { |
| 555 | uint32_t *out = (uint32_t *)vout; |
| 556 | uint32_t *initout = out; |
| 557 | __m256i baseVec = _mm256_set1_epi32(base - 1); |
| 558 | __m256i incVec = _mm256_set1_epi32(64); |
| 559 | __m256i add8 = _mm256_set1_epi32(8); |
| 560 | uint32_t *safeout = out + outcapacity; |
| 561 | size_t i = 0; |
| 562 | for (; (i < length) && (out + 64 <= safeout); ++i) { |
| 563 | uint64_t w = array[i]; |
| 564 | if (w == 0) { |
| 565 | baseVec = _mm256_add_epi32(baseVec, incVec); |
| 566 | } else { |
| 567 | for (int k = 0; k < 4; ++k) { |
| 568 | uint8_t byteA = (uint8_t)w; |
| 569 | uint8_t byteB = (uint8_t)(w >> 8); |
| 570 | w >>= 16; |
| 571 | __m256i vecA = |
| 572 | _mm256_load_si256((const __m256i *)vecDecodeTable[byteA]); |
| 573 | __m256i vecB = |
| 574 | _mm256_load_si256((const __m256i *)vecDecodeTable[byteB]); |
| 575 | uint8_t advanceA = lengthTable[byteA]; |
| 576 | uint8_t advanceB = lengthTable[byteB]; |
| 577 | vecA = _mm256_add_epi32(baseVec, vecA); |
| 578 | baseVec = _mm256_add_epi32(baseVec, add8); |
| 579 | vecB = _mm256_add_epi32(baseVec, vecB); |
| 580 | baseVec = _mm256_add_epi32(baseVec, add8); |
| 581 | _mm256_storeu_si256((__m256i *)out, vecA); |
| 582 | out += advanceA; |
| 583 | _mm256_storeu_si256((__m256i *)out, vecB); |
| 584 | out += advanceB; |
| 585 | } |
| 586 | } |
| 587 | } |
| 588 | base += i * 64; |
| 589 | for (; (i < length) && (out < safeout); ++i) { |
| 590 | uint64_t w = array[i]; |
| 591 | while ((w != 0) && (out < safeout)) { |
| 592 | uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail) |
| 593 | int r = __builtin_ctzll(w); // on x64, should compile to TZCNT |
| 594 | uint32_t val = r + base; |
| 595 | memcpy(out, &val, |
| 596 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
| 597 | out++; |
| 598 | w ^= t; |
| 599 | } |
| 600 | base += 64; |
| 601 | } |
| 602 | return out - initout; |
| 603 | } |
| 604 | #endif // USEAVX |
| 605 | |
| 606 | size_t (uint64_t *bitset, size_t length, void *vout, |
| 607 | uint32_t base) { |
| 608 | int outpos = 0; |
| 609 | uint32_t *out = (uint32_t *)vout; |
| 610 | for (size_t i = 0; i < length; ++i) { |
| 611 | uint64_t w = bitset[i]; |
| 612 | while (w != 0) { |
| 613 | uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail) |
| 614 | int r = __builtin_ctzll(w); // on x64, should compile to TZCNT |
| 615 | uint32_t val = r + base; |
| 616 | memcpy(out + outpos, &val, |
| 617 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
| 618 | outpos++; |
| 619 | w ^= t; |
| 620 | } |
| 621 | base += 64; |
| 622 | } |
| 623 | return outpos; |
| 624 | } |
| 625 | |
| 626 | size_t (const uint64_t * __restrict__ bitset1, |
| 627 | const uint64_t * __restrict__ bitset2, |
| 628 | size_t length, uint16_t *out, |
| 629 | uint16_t base) { |
| 630 | int outpos = 0; |
| 631 | for (size_t i = 0; i < length; ++i) { |
| 632 | uint64_t w = bitset1[i] & bitset2[i]; |
| 633 | while (w != 0) { |
| 634 | uint64_t t = w & (~w + 1); |
| 635 | int r = __builtin_ctzll(w); |
| 636 | out[outpos++] = r + base; |
| 637 | w ^= t; |
| 638 | } |
| 639 | base += 64; |
| 640 | } |
| 641 | return outpos; |
| 642 | } |
| 643 | |
| 644 | #ifdef IS_X64 |
| 645 | /* |
| 646 | * Given a bitset containing "length" 64-bit words, write out the position |
| 647 | * of all the set bits to "out" as 16-bit integers, values start at "base" (can |
| 648 | *be set to zero). |
| 649 | * |
| 650 | * The "out" pointer should be sufficient to store the actual number of bits |
| 651 | *set. |
| 652 | * |
| 653 | * Returns how many values were actually decoded. |
| 654 | * |
| 655 | * This function uses SSE decoding. |
| 656 | */ |
| 657 | size_t (const uint64_t *bitset, size_t length, |
| 658 | uint16_t *out, size_t outcapacity, |
| 659 | uint16_t base) { |
| 660 | uint16_t *initout = out; |
| 661 | __m128i baseVec = _mm_set1_epi16(base - 1); |
| 662 | __m128i incVec = _mm_set1_epi16(64); |
| 663 | __m128i add8 = _mm_set1_epi16(8); |
| 664 | uint16_t *safeout = out + outcapacity; |
| 665 | const int numberofbytes = 2; // process two bytes at a time |
| 666 | size_t i = 0; |
| 667 | for (; (i < length) && (out + numberofbytes * 8 <= safeout); ++i) { |
| 668 | uint64_t w = bitset[i]; |
| 669 | if (w == 0) { |
| 670 | baseVec = _mm_add_epi16(baseVec, incVec); |
| 671 | } else { |
| 672 | for (int k = 0; k < 4; ++k) { |
| 673 | uint8_t byteA = (uint8_t)w; |
| 674 | uint8_t byteB = (uint8_t)(w >> 8); |
| 675 | w >>= 16; |
| 676 | __m128i vecA = _mm_load_si128( |
| 677 | (const __m128i *)vecDecodeTable_uint16[byteA]); |
| 678 | __m128i vecB = _mm_load_si128( |
| 679 | (const __m128i *)vecDecodeTable_uint16[byteB]); |
| 680 | uint8_t advanceA = lengthTable[byteA]; |
| 681 | uint8_t advanceB = lengthTable[byteB]; |
| 682 | vecA = _mm_add_epi16(baseVec, vecA); |
| 683 | baseVec = _mm_add_epi16(baseVec, add8); |
| 684 | vecB = _mm_add_epi16(baseVec, vecB); |
| 685 | baseVec = _mm_add_epi16(baseVec, add8); |
| 686 | _mm_storeu_si128((__m128i *)out, vecA); |
| 687 | out += advanceA; |
| 688 | _mm_storeu_si128((__m128i *)out, vecB); |
| 689 | out += advanceB; |
| 690 | } |
| 691 | } |
| 692 | } |
| 693 | base += (uint16_t)(i * 64); |
| 694 | for (; (i < length) && (out < safeout); ++i) { |
| 695 | uint64_t w = bitset[i]; |
| 696 | while ((w != 0) && (out < safeout)) { |
| 697 | uint64_t t = w & (~w + 1); |
| 698 | int r = __builtin_ctzll(w); |
| 699 | *out = r + base; |
| 700 | out++; |
| 701 | w ^= t; |
| 702 | } |
| 703 | base += 64; |
| 704 | } |
| 705 | return out - initout; |
| 706 | } |
| 707 | #endif |
| 708 | |
| 709 | /* |
| 710 | * Given a bitset containing "length" 64-bit words, write out the position |
| 711 | * of all the set bits to "out", values start at "base" (can be set to zero). |
| 712 | * |
| 713 | * The "out" pointer should be sufficient to store the actual number of bits |
| 714 | *set. |
| 715 | * |
| 716 | * Returns how many values were actually decoded. |
| 717 | */ |
| 718 | size_t (const uint64_t *bitset, size_t length, |
| 719 | uint16_t *out, uint16_t base) { |
| 720 | int outpos = 0; |
| 721 | for (size_t i = 0; i < length; ++i) { |
| 722 | uint64_t w = bitset[i]; |
| 723 | while (w != 0) { |
| 724 | uint64_t t = w & (~w + 1); |
| 725 | int r = __builtin_ctzll(w); |
| 726 | out[outpos++] = r + base; |
| 727 | w ^= t; |
| 728 | } |
| 729 | base += 64; |
| 730 | } |
| 731 | return outpos; |
| 732 | } |
| 733 | |
| 734 | #if defined(ASMBITMANIPOPTIMIZATION) |
| 735 | |
| 736 | uint64_t bitset_set_list_withcard(void *bitset, uint64_t card, |
| 737 | const uint16_t *list, uint64_t length) { |
| 738 | uint64_t offset, load, pos; |
| 739 | uint64_t shift = 6; |
| 740 | const uint16_t *end = list + length; |
| 741 | if (!length) return card; |
| 742 | // TODO: could unroll for performance, see bitset_set_list |
| 743 | // bts is not available as an intrinsic in GCC |
| 744 | __asm volatile( |
| 745 | "1:\n" |
| 746 | "movzwq (%[list]), %[pos]\n" |
| 747 | "shrx %[shift], %[pos], %[offset]\n" |
| 748 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 749 | "bts %[pos], %[load]\n" |
| 750 | "mov %[load], (%[bitset],%[offset],8)\n" |
| 751 | "sbb $-1, %[card]\n" |
| 752 | "add $2, %[list]\n" |
| 753 | "cmp %[list], %[end]\n" |
| 754 | "jnz 1b" |
| 755 | : [card] "+&r" (card), [list] "+&r" (list), [load] "=&r" (load), |
| 756 | [pos] "=&r" (pos), [offset] "=&r" (offset) |
| 757 | : [end] "r" (end), [bitset] "r" (bitset), [shift] "r" (shift)); |
| 758 | return card; |
| 759 | } |
| 760 | |
| 761 | void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) { |
| 762 | uint64_t pos; |
| 763 | const uint16_t *end = list + length; |
| 764 | |
| 765 | uint64_t shift = 6; |
| 766 | uint64_t offset; |
| 767 | uint64_t load; |
| 768 | for (; list + 3 < end; list += 4) { |
| 769 | pos = list[0]; |
| 770 | __asm volatile( |
| 771 | "shrx %[shift], %[pos], %[offset]\n" |
| 772 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 773 | "bts %[pos], %[load]\n" |
| 774 | "mov %[load], (%[bitset],%[offset],8)" |
| 775 | : [load] "=&r" (load), [offset] "=&r" (offset) |
| 776 | : [bitset] "r" (bitset), [shift] "r" (shift), [pos] "r" (pos)); |
| 777 | pos = list[1]; |
| 778 | __asm volatile( |
| 779 | "shrx %[shift], %[pos], %[offset]\n" |
| 780 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 781 | "bts %[pos], %[load]\n" |
| 782 | "mov %[load], (%[bitset],%[offset],8)" |
| 783 | : [load] "=&r" (load), [offset] "=&r" (offset) |
| 784 | : [bitset] "r" (bitset), [shift] "r" (shift), [pos] "r" (pos)); |
| 785 | pos = list[2]; |
| 786 | __asm volatile( |
| 787 | "shrx %[shift], %[pos], %[offset]\n" |
| 788 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 789 | "bts %[pos], %[load]\n" |
| 790 | "mov %[load], (%[bitset],%[offset],8)" |
| 791 | : [load] "=&r" (load), [offset] "=&r" (offset) |
| 792 | : [bitset] "r" (bitset), [shift] "r" (shift), [pos] "r" (pos)); |
| 793 | pos = list[3]; |
| 794 | __asm volatile( |
| 795 | "shrx %[shift], %[pos], %[offset]\n" |
| 796 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 797 | "bts %[pos], %[load]\n" |
| 798 | "mov %[load], (%[bitset],%[offset],8)" |
| 799 | : [load] "=&r" (load), [offset] "=&r" (offset) |
| 800 | : [bitset] "r" (bitset), [shift] "r" (shift), [pos] "r" (pos)); |
| 801 | } |
| 802 | |
| 803 | while (list != end) { |
| 804 | pos = list[0]; |
| 805 | __asm volatile( |
| 806 | "shrx %[shift], %[pos], %[offset]\n" |
| 807 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 808 | "bts %[pos], %[load]\n" |
| 809 | "mov %[load], (%[bitset],%[offset],8)" |
| 810 | : [load] "=&r" (load), [offset] "=&r" (offset) |
| 811 | : [bitset] "r" (bitset), [shift] "r" (shift), [pos] "r" (pos)); |
| 812 | list++; |
| 813 | } |
| 814 | } |
| 815 | |
| 816 | uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list, |
| 817 | uint64_t length) { |
| 818 | uint64_t offset, load, pos; |
| 819 | uint64_t shift = 6; |
| 820 | const uint16_t *end = list + length; |
| 821 | if (!length) return card; |
| 822 | // btr is not available as an intrinsic in GCC |
| 823 | __asm volatile( |
| 824 | "1:\n" |
| 825 | "movzwq (%[list]), %[pos]\n" |
| 826 | "shrx %[shift], %[pos], %[offset]\n" |
| 827 | "mov (%[bitset],%[offset],8), %[load]\n" |
| 828 | "btr %[pos], %[load]\n" |
| 829 | "mov %[load], (%[bitset],%[offset],8)\n" |
| 830 | "sbb $0, %[card]\n" |
| 831 | "add $2, %[list]\n" |
| 832 | "cmp %[list], %[end]\n" |
| 833 | "jnz 1b" |
| 834 | : [card] "+&r" (card), [list] "+&r" (list), [load] "=&r" (load), |
| 835 | [pos] "=&r" (pos), [offset] "=&r" (offset) |
| 836 | : [end] "r" (end), [bitset] "r" (bitset), [shift] "r" (shift) |
| 837 | : |
| 838 | /* clobbers */ "memory" ); |
| 839 | return card; |
| 840 | } |
| 841 | |
| 842 | #else |
| 843 | uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list, |
| 844 | uint64_t length) { |
| 845 | uint64_t offset, load, newload, pos, index; |
| 846 | const uint16_t *end = list + length; |
| 847 | while (list != end) { |
| 848 | pos = *(const uint16_t *)list; |
| 849 | offset = pos >> 6; |
| 850 | index = pos % 64; |
| 851 | load = ((uint64_t *)bitset)[offset]; |
| 852 | newload = load & ~(UINT64_C(1) << index); |
| 853 | card -= (load ^ newload) >> index; |
| 854 | ((uint64_t *)bitset)[offset] = newload; |
| 855 | list++; |
| 856 | } |
| 857 | return card; |
| 858 | } |
| 859 | |
| 860 | uint64_t bitset_set_list_withcard(void *bitset, uint64_t card, |
| 861 | const uint16_t *list, uint64_t length) { |
| 862 | uint64_t offset, load, newload, pos, index; |
| 863 | const uint16_t *end = list + length; |
| 864 | while (list != end) { |
| 865 | pos = *(const uint16_t *)list; |
| 866 | offset = pos >> 6; |
| 867 | index = pos % 64; |
| 868 | load = ((uint64_t *)bitset)[offset]; |
| 869 | newload = load | (UINT64_C(1) << index); |
| 870 | card += (load ^ newload) >> index; |
| 871 | ((uint64_t *)bitset)[offset] = newload; |
| 872 | list++; |
| 873 | } |
| 874 | return card; |
| 875 | } |
| 876 | |
| 877 | void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) { |
| 878 | uint64_t offset, load, newload, pos, index; |
| 879 | const uint16_t *end = list + length; |
| 880 | while (list != end) { |
| 881 | pos = *(const uint16_t *)list; |
| 882 | offset = pos >> 6; |
| 883 | index = pos % 64; |
| 884 | load = ((uint64_t *)bitset)[offset]; |
| 885 | newload = load | (UINT64_C(1) << index); |
| 886 | ((uint64_t *)bitset)[offset] = newload; |
| 887 | list++; |
| 888 | } |
| 889 | } |
| 890 | |
| 891 | #endif |
| 892 | |
| 893 | /* flip specified bits */ |
| 894 | /* TODO: consider whether worthwhile to make an asm version */ |
| 895 | |
| 896 | uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card, |
| 897 | const uint16_t *list, uint64_t length) { |
| 898 | uint64_t offset, load, newload, pos, index; |
| 899 | const uint16_t *end = list + length; |
| 900 | while (list != end) { |
| 901 | pos = *(const uint16_t *)list; |
| 902 | offset = pos >> 6; |
| 903 | index = pos % 64; |
| 904 | load = ((uint64_t *)bitset)[offset]; |
| 905 | newload = load ^ (UINT64_C(1) << index); |
| 906 | // todo: is a branch here all that bad? |
| 907 | card += |
| 908 | (1 - 2 * (((UINT64_C(1) << index) & load) >> index)); // +1 or -1 |
| 909 | ((uint64_t *)bitset)[offset] = newload; |
| 910 | list++; |
| 911 | } |
| 912 | return card; |
| 913 | } |
| 914 | |
| 915 | void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length) { |
| 916 | uint64_t offset, load, newload, pos, index; |
| 917 | const uint16_t *end = list + length; |
| 918 | while (list != end) { |
| 919 | pos = *(const uint16_t *)list; |
| 920 | offset = pos >> 6; |
| 921 | index = pos % 64; |
| 922 | load = ((uint64_t *)bitset)[offset]; |
| 923 | newload = load ^ (UINT64_C(1) << index); |
| 924 | ((uint64_t *)bitset)[offset] = newload; |
| 925 | list++; |
| 926 | } |
| 927 | } |
| 928 | |