#include "simdjson/portability.h"
#include <cstdint>

#ifndef __AVX2__

namespace simdjson {
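// Scalar fallback. For each byte value c, the table stores three flags at
// jump_table[3 * c]: [0] is set only for '"' (it toggles the in-string
// state), [1] is clear only for '\\' (it starts an escape), and [2] is clear
// only for the four JSON whitespace bytes (tab, LF, CR, space), which are
// the bytes we may drop.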
static uint8_t jump_table[256 * 3] = {
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
    1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};

size_t json_minify(const unsigned char *bytes, size_t how_many,
                   unsigned char *out) {
  size_t i = 0, pos = 0;
  uint8_t quote = 0;
  uint8_t nonescape = 1;

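  // quote is 1 while we are inside a string; nonescape is zero for exactly
  // one byte after an unescaped backslash, so an escaped quote does not
  // toggle the string state. The cursor pos advances only for bytes we keep:
  // any byte inside a string, and non-whitespace bytes outside of strings.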
  while (i < how_many) {
    unsigned char c = bytes[i];
    uint8_t *meta = jump_table + 3 * c;

    quote = quote ^ (meta[0] & nonescape);
    out[pos] = c;
    pos += meta[2] | quote;

    i += 1;
    nonescape = (~nonescape) | (meta[1]);
  }
  return pos;
}
} // namespace simdjson
#else
#include "simdprune_tables.h"
#include <cstring>

namespace simdjson {

// a straightforward comparison of a mask against input.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
                                            __m256i mask) {
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
  __m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
  uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
  return res_0 | (res_1 << 32);
}

// Writes up to 16 bytes; only the bytes corresponding to a 1-bit in the mask
// are written out. credit: Anime Tosho
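// The compaction runs in two steps: thintable_epi8 supplies a shuffle that
// compacts each 8-byte half on its own (the +8 offsets point the upper half's
// indices at the upper lane of x), and pshufb_combine_table then slides the
// compacted upper half down so it lands right after the surviving
// lower-half bytes.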
static __m128i skinnycleanm128(__m128i x, int mask) {
  int mask1 = mask & 0xFF;
  int mask2 = (mask >> 8) & 0xFF;
  __m128i shufmask = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64(
                       (const __m128i *)(thintable_epi8 + mask1))),
                   (const __m64 *)(thintable_epi8 + mask2)));
  shufmask =
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
  __m128i pruned = _mm_shuffle_epi8(x, shufmask);
  intptr_t popx2 = BitsSetTable256mul2[mask1];
  __m128i compactmask =
      _mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8));
  return _mm_shuffle_epi8(pruned, compactmask);
}

// Takes input from buf and removes useless whitespace; input and output may
// point to the same buffer. The result is null terminated; returns the
// string length (not counting the null terminator).
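// A minimal usage sketch (assuming simdjson's allocate_padded_buffer helper;
// the output buffer needs trailing padding because the 16-byte stores below
// can write past the last useful byte):
//
//   char *minified = allocate_padded_buffer(length);
//   size_t outlength = simdjson::json_minify(
//       reinterpret_cast<const uint8_t *>(input), length,
//       reinterpret_cast<uint8_t *>(minified));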
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
  // Useful constant masks
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
  uint8_t *initout(out);
  uint64_t prev_iter_ends_odd_backslash =
      0ULL;                               // either 0 or 1, but a 64-bit value
  uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
  size_t idx = 0;
  if (len >= 64) {
    size_t avx_len = len - 63;

    for (; idx < avx_len; idx += 64) {
      __m256i input_lo =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
      __m256i input_hi =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
      uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                     _mm256_set1_epi8('\\'));
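      // Detect backslash runs of odd length: a quote preceded by such a run
      // is escaped and must not toggle the in-string state. This is the same
      // odd/even carry trick used in simdjson's stage 1.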
      uint64_t start_edges = bs_bits & ~(bs_bits << 1);
      uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
      uint64_t even_starts = start_edges & even_start_mask;
      uint64_t odd_starts = start_edges & ~even_start_mask;
      uint64_t even_carries = bs_bits + even_starts;
      uint64_t odd_carries;
      bool iter_ends_odd_backslash =
          add_overflow(bs_bits, odd_starts, &odd_carries);
      odd_carries |= prev_iter_ends_odd_backslash;
      prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
      uint64_t even_carry_ends = even_carries & ~bs_bits;
      uint64_t odd_carry_ends = odd_carries & ~bs_bits;
      uint64_t even_start_odd_end = even_carry_ends & odd_bits;
      uint64_t odd_start_even_end = odd_carry_ends & even_bits;
      uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
      uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                        _mm256_set1_epi8('"'));
      quote_bits = quote_bits & ~odd_ends;
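      // A carry-less multiplication of the quote bits by an all-ones operand
      // computes a running (prefix) XOR, yielding a mask that is all ones
      // strictly between an opening quote and its closing quote.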
      uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
          _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
      quote_mask ^= prev_iter_inside_quote;
      prev_iter_inside_quote = static_cast<uint64_t>(
          static_cast<int64_t>(quote_mask) >>
          63); // might be undefined behavior, should be fully defined in
               // C++20; ok according to John Regehr of the University of Utah
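      // Classify whitespace with the "shufti" trick: look up each nibble in a
      // small table and AND the two results; a bit within 0x18 survives only
      // for the four JSON whitespace bytes (0x09, 0x0A, 0x0D, 0x20).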
      const __m256i low_nibble_mask = _mm256_setr_epi8(
          // 0 9 a b c d
          16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
          0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
      const __m256i high_nibble_mask = _mm256_setr_epi8(
          // 0 2 3 5 7
          8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
          1, 0, 0, 0, 3, 2, 1, 0, 0);
      __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
      __m256i v_lo = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_lo),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
                                               _mm256_set1_epi8(0x7f))));

      __m256i v_hi = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_hi),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
                                               _mm256_set1_epi8(0x7f))));
      __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
      __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));

      uint64_t ws_res_0 =
          static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
      uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
      uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
      whitespace &= ~quote_mask;

      uint64_t non_whitespace = ~whitespace;

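      // Compress each 16-byte lane separately, then write the four results at
      // cumulative popcount offsets. The 16-byte stores overlap, but each
      // store's garbage tail is overwritten by the next one; the caller is
      // expected to provide a padded output buffer for the final tail.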
      __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
      __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
      __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
      __m128i x4 = _mm256_extracti128_si256(input_hi, 1);

      int mask1 = non_whitespace & 0xFFFF;
      int mask2 = (non_whitespace >> 16) & 0xFFFF;
      int mask3 = (non_whitespace >> 32) & 0xFFFF;
      int mask4 = (non_whitespace >> 48) & 0xFFFF;

      x1 = skinnycleanm128(x1, mask1);
      x2 = skinnycleanm128(x2, mask2);
      x3 = skinnycleanm128(x3, mask3);
      x4 = skinnycleanm128(x4, mask4);
      int pop1 = hamming(non_whitespace & 0xFFFF);
      int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
      int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
      int pop4 = hamming(non_whitespace);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
      out += pop4;
    }
  }
  // we finish off the job... copying and pasting the code is not ideal here,
  // but it gets the job done.
  if (idx < len) {
    uint8_t buffer[64];
    memset(buffer, 0, 64);
    memcpy(buffer, buf + idx, len - idx);
    __m256i input_lo =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
    __m256i input_hi =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
    uint64_t bs_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
    uint64_t start_edges = bs_bits & ~(bs_bits << 1);
    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    uint64_t even_starts = start_edges & even_start_mask;
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
    // bool iter_ends_odd_backslash =
    add_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
    // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    // // we never use it
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
    uint64_t quote_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // we
    // don't need this anymore

    __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
    __m256i mask_70 =
        _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
    // but moves any value >= 16 above 128

    __m256i lut_cntrl = _mm256_setr_epi8(
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
        0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);
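    // lut_cntrl maps the low nibble to 0xFF for the control whitespace bytes
    // (tab 0x09, LF 0x0A, CR 0x0D); the saturating add of 0x70 sends every
    // byte >= 0x10 out of pshufb's low-nibble range, so it yields zero.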

    __m256i tmp_ws_lo = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_lo),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
    __m256i tmp_ws_hi = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_hi),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
    uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
    uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
    whitespace &= ~quote_mask;

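    // The tail buffer was zero padded; force the padding bytes to register as
    // whitespace so they are pruned from the output.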
    if (len - idx < 64) {
      whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
    }
    uint64_t non_whitespace = ~whitespace;

    __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
    __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
    __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
    __m128i x4 = _mm256_extracti128_si256(input_hi, 1);

    int mask1 = non_whitespace & 0xFFFF;
    int mask2 = (non_whitespace >> 16) & 0xFFFF;
    int mask3 = (non_whitespace >> 32) & 0xFFFF;
    int mask4 = (non_whitespace >> 48) & 0xFFFF;

    x1 = skinnycleanm128(x1, mask1);
    x2 = skinnycleanm128(x2, mask2);
    x3 = skinnycleanm128(x3, mask3);
    x4 = skinnycleanm128(x4, mask4);
    int pop1 = hamming(non_whitespace & 0xFFFF);
    int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming(non_whitespace);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
    out += pop4;
  }
  *out = '\0'; // NULL termination
  return out - initout;
}

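// An earlier variant, apparently kept for reference: it prunes whitespace
// through the large mask128_epi8 shuffle-mask lookup table instead of the
// two-step compaction in skinnycleanm128.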
size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
  // Useful constant masks
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
  uint8_t *initout(out);
  uint64_t prev_iter_ends_odd_backslash =
      0ULL;                               // either 0 or 1, but a 64-bit value
  uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
  size_t idx = 0;
  if (len >= 64) {
    size_t avx_len = len - 63;

    for (; idx < avx_len; idx += 64) {
      __m256i input_lo =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
      __m256i input_hi =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
      uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                     _mm256_set1_epi8('\\'));
      uint64_t start_edges = bs_bits & ~(bs_bits << 1);
      uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
      uint64_t even_starts = start_edges & even_start_mask;
      uint64_t odd_starts = start_edges & ~even_start_mask;
      uint64_t even_carries = bs_bits + even_starts;
      uint64_t odd_carries;
      bool iter_ends_odd_backslash =
          add_overflow(bs_bits, odd_starts, &odd_carries);
      odd_carries |= prev_iter_ends_odd_backslash;
      prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
      uint64_t even_carry_ends = even_carries & ~bs_bits;
      uint64_t odd_carry_ends = odd_carries & ~bs_bits;
      uint64_t even_start_odd_end = even_carry_ends & odd_bits;
      uint64_t odd_start_even_end = odd_carry_ends & even_bits;
      uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
      uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                        _mm256_set1_epi8('"'));
      quote_bits = quote_bits & ~odd_ends;
      uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
          _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
      quote_mask ^= prev_iter_inside_quote;
      prev_iter_inside_quote = static_cast<uint64_t>(
          static_cast<int64_t>(quote_mask) >>
          63); // might be undefined behavior, should be fully defined in
               // C++20; ok according to John Regehr of the University of Utah
      const __m256i low_nibble_mask = _mm256_setr_epi8(
          // 0 9 a b c d
          16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
          0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
      const __m256i high_nibble_mask = _mm256_setr_epi8(
          // 0 2 3 5 7
          8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
          1, 0, 0, 0, 3, 2, 1, 0, 0);
      __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
      __m256i v_lo = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_lo),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
                                               _mm256_set1_epi8(0x7f))));

      __m256i v_hi = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_hi),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
                                               _mm256_set1_epi8(0x7f))));
      __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
      __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));

      uint64_t ws_res_0 =
          static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
      uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
      uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
      whitespace &= ~quote_mask;
      int mask1 = whitespace & 0xFFFF;
      int mask2 = (whitespace >> 16) & 0xFFFF;
      int mask3 = (whitespace >> 32) & 0xFFFF;
      int mask4 = (whitespace >> 48) & 0xFFFF;
      int pop1 = hamming((~whitespace) & 0xFFFF);
      int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
      int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
      int pop4 = hamming((~whitespace));
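      // mask128_epi8 appears to hold one precomputed 16-byte shuffle mask per
      // 15-bit whitespace pattern; a set bit marks a byte to drop.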
      __m256i vmask1 = _mm256_loadu2_m128i(
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
      __m256i vmask2 = _mm256_loadu2_m128i(
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
      __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
      __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
                           reinterpret_cast<__m128i *>(out), result1);
      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
                           reinterpret_cast<__m128i *>(out + pop2), result2);
      out += pop4;
    }
  }
  // we finish off the job... copying and pasting the code is not ideal here,
  // but it gets the job done.
  if (idx < len) {
    uint8_t buffer[64];
    memset(buffer, 0, 64);
    memcpy(buffer, buf + idx, len - idx);
    __m256i input_lo =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
    __m256i input_hi =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
    uint64_t bs_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
    uint64_t start_edges = bs_bits & ~(bs_bits << 1);
    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    uint64_t even_starts = start_edges & even_start_mask;
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
    // bool iter_ends_odd_backslash =
    add_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
    // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    // // we never use it
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
    uint64_t quote_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // we
    // don't need this anymore

    __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
    __m256i mask_70 =
        _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
    // but moves any value >= 16 above 128

    __m256i lut_cntrl = _mm256_setr_epi8(
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
        0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);

    __m256i tmp_ws_lo = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_lo),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
    __m256i tmp_ws_hi = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_hi),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
    uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
    uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
    whitespace &= ~quote_mask;

    if (len - idx < 64) {
      whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
    }
    int mask1 = whitespace & 0xFFFF;
    int mask2 = (whitespace >> 16) & 0xFFFF;
    int mask3 = (whitespace >> 32) & 0xFFFF;
    int mask4 = (whitespace >> 48) & 0xFFFF;
    int pop1 = hamming((~whitespace) & 0xFFFF);
    int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming((~whitespace));
    __m256i vmask1 = _mm256_loadu2_m128i(
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
    __m256i vmask2 = _mm256_loadu2_m128i(
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
    __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
    __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
                         reinterpret_cast<__m128i *>(buffer), result1);
    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
                         reinterpret_cast<__m128i *>(buffer + pop2), result2);
    memcpy(out, buffer, pop4);
    out += pop4;
  }
  *out = '\0'; // NULL termination
  return out - initout;
}
} // namespace simdjson
#endif
| 470 | |