1 | #include "simdjson/portability.h" |
2 | #include <cstdint> |
3 | |
4 | #ifndef __AVX2__ |
5 | |
6 | namespace simdjson { |
// Per-character metadata for the scalar minifier below, 3 bytes per input
// character, indexed as jump_table[3 * c]:
//   [0] quote flag:      1 for '"' (toggles the in-string state),
//   [1] non-escape flag: 0 for '\\' (suppresses the quote toggle on the
//       character that follows a backslash),
//   [2] keep flag:       0 for JSON whitespace (' ', '\t', '\n', '\r'),
//       i.e. the byte is dropped when it occurs outside a string literal.
static uint8_t jump_table[256 * 3] = {
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
    1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
40 | |
41 | size_t json_minify(const unsigned char *bytes, size_t how_many, |
42 | unsigned char *out) { |
43 | size_t i = 0, pos = 0; |
44 | uint8_t quote = 0; |
45 | uint8_t nonescape = 1; |
46 | |
47 | while (i < how_many) { |
48 | unsigned char c = bytes[i]; |
49 | uint8_t *meta = jump_table + 3 * c; |
50 | |
51 | quote = quote ^ (meta[0] & nonescape); |
52 | out[pos] = c; |
53 | pos += meta[2] | quote; |
54 | |
55 | i += 1; |
56 | nonescape = (~nonescape) | (meta[1]); |
57 | } |
58 | return pos; |
59 | } |
60 | } // namespace simdjson |
61 | #else |
62 | #include "simdprune_tables.h" |
63 | #include <cstring> |
64 | |
65 | namespace simdjson { |
66 | |
67 | // a straightforward comparison of a mask against input. |
// a straightforward comparison of a mask against input.
// Produces a 64-bit mask with one bit per input byte: input_lo supplies bits
// 0..31 and input_hi bits 32..63; a bit is set when the corresponding byte
// equals the byte replicated in `mask`.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
                                            __m256i mask) {
  const uint64_t lo_bits = static_cast<uint32_t>(
      _mm256_movemask_epi8(_mm256_cmpeq_epi8(input_lo, mask)));
  const uint64_t hi_bits = static_cast<uint32_t>(
      _mm256_movemask_epi8(_mm256_cmpeq_epi8(input_hi, mask)));
  return lo_bits | (hi_bits << 32);
}
76 | |
77 | // Write up to 16 bytes, only the bytes corresponding to a 1-bit are written |
78 | // out. credit: Anime Tosho |
// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written
// out. credit: Anime Tosho
// Two-stage pshufb compaction driven by external lookup tables
// (thintable_epi8, BitsSetTable256mul2, pshufb_combine_table from
// simdprune_tables.h): each 8-bit half of `mask` is compacted independently,
// then the two halves are fused.
// NOTE(review): the exact bit convention (1 = keep) follows the comment
// above and the tables' definitions — confirm against simdprune_tables.h.
static __m128i skinnycleanm128(__m128i x, int mask) {
  // Split the 16-bit mask into the two per-8-byte-half table indices.
  int mask1 = mask & 0xFF;
  int mask2 = (mask >> 8) & 0xFF;
  // Load the two 8-byte shuffle patterns into one register: low half from
  // thintable_epi8[mask1], high half from thintable_epi8[mask2].
  __m128i shufmask = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64(
                       (const __m128i *)(thintable_epi8 + mask1))),
                   (const __m64 *)(thintable_epi8 + mask2)));
  // The high-half pattern addresses bytes 0..7; add 8 so it selects from
  // the upper 8 source bytes instead.
  shufmask =
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
  // Each half is now compacted, but the kept bytes of the upper half start
  // at byte 8, leaving a gap after the lower half's kept bytes.
  __m128i pruned = _mm_shuffle_epi8(x, shufmask);
  // popcount(mask1) * 2, used to index the table that closes that gap.
  intptr_t popx2 = BitsSetTable256mul2[mask1];
  // Second shuffle slides the upper half's kept bytes down so both halves
  // are contiguous from byte 0.
  __m128i compactmask =
      _mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8));
  return _mm_shuffle_epi8(pruned, compactmask);
}
94 | |
95 | // take input from buf and remove useless whitespace, input and output can be |
96 | // the same, result is null terminated, return the string length (minus the null |
97 | // termination) |
98 | size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) { |
99 | // Useful constant masks |
100 | const uint64_t even_bits = 0x5555555555555555ULL; |
101 | const uint64_t odd_bits = ~even_bits; |
102 | uint8_t *initout(out); |
103 | uint64_t prev_iter_ends_odd_backslash = |
104 | 0ULL; // either 0 or 1, but a 64-bit value |
105 | uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones |
106 | size_t idx = 0; |
107 | if (len >= 64) { |
108 | size_t avx_len = len - 63; |
109 | |
110 | for (; idx < avx_len; idx += 64) { |
111 | __m256i input_lo = |
112 | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0)); |
113 | __m256i input_hi = |
114 | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32)); |
115 | uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, |
116 | _mm256_set1_epi8('\\')); |
117 | uint64_t start_edges = bs_bits & ~(bs_bits << 1); |
118 | uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; |
119 | uint64_t even_starts = start_edges & even_start_mask; |
120 | uint64_t odd_starts = start_edges & ~even_start_mask; |
121 | uint64_t even_carries = bs_bits + even_starts; |
122 | uint64_t odd_carries; |
123 | bool iter_ends_odd_backslash = |
124 | add_overflow(bs_bits, odd_starts, &odd_carries); |
125 | odd_carries |= prev_iter_ends_odd_backslash; |
126 | prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; |
127 | uint64_t even_carry_ends = even_carries & ~bs_bits; |
128 | uint64_t odd_carry_ends = odd_carries & ~bs_bits; |
129 | uint64_t even_start_odd_end = even_carry_ends & odd_bits; |
130 | uint64_t odd_start_even_end = odd_carry_ends & even_bits; |
131 | uint64_t odd_ends = even_start_odd_end | odd_start_even_end; |
132 | uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, |
133 | _mm256_set1_epi8('"')); |
134 | quote_bits = quote_bits & ~odd_ends; |
135 | uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( |
136 | _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); |
137 | quote_mask ^= prev_iter_inside_quote; |
138 | prev_iter_inside_quote = static_cast<uint64_t>( |
139 | static_cast<int64_t>(quote_mask) >> |
140 | 63); // might be undefined behavior, should be fully defined in C++20, |
141 | // ok according to John Regher from Utah University |
142 | const __m256i low_nibble_mask = _mm256_setr_epi8( |
143 | // 0 9 a b c d |
144 | 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, |
145 | 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); |
146 | const __m256i high_nibble_mask = _mm256_setr_epi8( |
147 | // 0 2 3 5 7 |
148 | 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0, |
149 | 1, 0, 0, 0, 3, 2, 1, 0, 0); |
150 | __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18); |
151 | __m256i v_lo = _mm256_and_si256( |
152 | _mm256_shuffle_epi8(low_nibble_mask, input_lo), |
153 | _mm256_shuffle_epi8(high_nibble_mask, |
154 | _mm256_and_si256(_mm256_srli_epi32(input_lo, 4), |
155 | _mm256_set1_epi8(0x7f)))); |
156 | |
157 | __m256i v_hi = _mm256_and_si256( |
158 | _mm256_shuffle_epi8(low_nibble_mask, input_hi), |
159 | _mm256_shuffle_epi8(high_nibble_mask, |
160 | _mm256_and_si256(_mm256_srli_epi32(input_hi, 4), |
161 | _mm256_set1_epi8(0x7f)))); |
162 | __m256i tmp_ws_lo = _mm256_cmpeq_epi8( |
163 | _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0)); |
164 | __m256i tmp_ws_hi = _mm256_cmpeq_epi8( |
165 | _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); |
166 | |
167 | uint64_t ws_res_0 = |
168 | static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo)); |
169 | uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); |
170 | uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); |
171 | whitespace &= ~quote_mask; |
172 | |
173 | uint64_t non_whitespace = ~whitespace; |
174 | |
175 | __m128i x1 = _mm256_extracti128_si256(input_lo, 0); |
176 | __m128i x2 = _mm256_extracti128_si256(input_lo, 1); |
177 | __m128i x3 = _mm256_extracti128_si256(input_hi, 0); |
178 | __m128i x4 = _mm256_extracti128_si256(input_hi, 1); |
179 | |
180 | int mask1 = non_whitespace & 0xFFFF; |
181 | int mask2 = (non_whitespace >> 16) & 0xFFFF; |
182 | int mask3 = (non_whitespace >> 32) & 0xFFFF; |
183 | int mask4 = (non_whitespace >> 48) & 0xFFFF; |
184 | |
185 | x1 = skinnycleanm128(x1, mask1); |
186 | x2 = skinnycleanm128(x2, mask2); |
187 | x3 = skinnycleanm128(x3, mask3); |
188 | x4 = skinnycleanm128(x4, mask4); |
189 | int pop1 = hamming(non_whitespace & 0xFFFF); |
190 | int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF)); |
191 | int pop3 = hamming(non_whitespace) & UINT64_C(0xFFFFFFFFFFFF)); |
192 | int pop4 = hamming(non_whitespace); |
193 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1); |
194 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2); |
195 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3); |
196 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4); |
197 | out += pop4; |
198 | } |
199 | } |
200 | // we finish off the job... copying and pasting the code is not ideal here, |
201 | // but it gets the job done. |
202 | if (idx < len) { |
203 | uint8_t buffer[64]; |
204 | memset(buffer, 0, 64); |
205 | memcpy(buffer, buf + idx, len - idx); |
206 | __m256i input_lo = |
207 | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer)); |
208 | __m256i input_hi = |
209 | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32)); |
210 | uint64_t bs_bits = |
211 | cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); |
212 | uint64_t start_edges = bs_bits & ~(bs_bits << 1); |
213 | uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; |
214 | uint64_t even_starts = start_edges & even_start_mask; |
215 | uint64_t odd_starts = start_edges & ~even_start_mask; |
216 | uint64_t even_carries = bs_bits + even_starts; |
217 | uint64_t odd_carries; |
218 | // bool iter_ends_odd_backslash = |
219 | add_overflow(bs_bits, odd_starts, &odd_carries); |
220 | odd_carries |= prev_iter_ends_odd_backslash; |
221 | // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; |
222 | // // we never use it |
223 | uint64_t even_carry_ends = even_carries & ~bs_bits; |
224 | uint64_t odd_carry_ends = odd_carries & ~bs_bits; |
225 | uint64_t even_start_odd_end = even_carry_ends & odd_bits; |
226 | uint64_t odd_start_even_end = odd_carry_ends & even_bits; |
227 | uint64_t odd_ends = even_start_odd_end | odd_start_even_end; |
228 | uint64_t quote_bits = |
229 | cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"')); |
230 | quote_bits = quote_bits & ~odd_ends; |
231 | uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( |
232 | _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); |
233 | quote_mask ^= prev_iter_inside_quote; |
234 | // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we |
235 | // don't need this anymore |
236 | |
237 | __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32 |
238 | __m256i mask_70 = |
239 | _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits |
240 | // but moves any value >= 16 above 128 |
241 | |
242 | __m256i lut_cntrl = _mm256_setr_epi8( |
243 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, |
244 | 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
245 | 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00); |
246 | |
247 | __m256i tmp_ws_lo = _mm256_or_si256( |
248 | _mm256_cmpeq_epi8(mask_20, input_lo), |
249 | _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo))); |
250 | __m256i tmp_ws_hi = _mm256_or_si256( |
251 | _mm256_cmpeq_epi8(mask_20, input_hi), |
252 | _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi))); |
253 | uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo)); |
254 | uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); |
255 | uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32)); |
256 | whitespace &= ~quote_mask; |
257 | |
258 | if (len - idx < 64) { |
259 | whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx); |
260 | } |
261 | int mask1 = non_whitespace & 0xFFFF; |
262 | int mask2 = (non_whitespace >> 16) & 0xFFFF; |
263 | int mask3 = (non_whitespace >> 32) & 0xFFFF; |
264 | int mask4 = (non_whitespace >> 48) & 0xFFFF; |
265 | |
266 | x1 = skinnycleanm128(x1, mask1); |
267 | x2 = skinnycleanm128(x2, mask2); |
268 | x3 = skinnycleanm128(x3, mask3); |
269 | x4 = skinnycleanm128(x4, mask4); |
270 | int pop1 = hamming(non_whitespace & 0xFFFF); |
271 | int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF)); |
272 | int pop3 = hamming(non_whitespace) & UINT64_C(0xFFFFFFFFFFFF)); |
273 | int pop4 = hamming(non_whitespace); |
274 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1); |
275 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2); |
276 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3); |
277 | _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4); |
278 | out += pop4; |
279 | } |
280 | *out = '\0'; // NULL termination |
281 | return out - initout; |
282 | } |
283 | |
// Older AVX2 minifier kept for reference/benchmarking. Same contract as
// json_minify (input and output may alias, output is null terminated,
// returns the length without the null terminator), but it compacts each
// 64-byte chunk with mask128_epi8 table lookups instead of skinnycleanm128.
size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
  // Useful constant masks
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
  uint8_t *initout(out);
  uint64_t prev_iter_ends_odd_backslash =
      0ULL; // either 0 or 1, but a 64-bit value
  uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
  size_t idx = 0;
  if (len >= 64) {
    size_t avx_len = len - 63;

    for (; idx < avx_len; idx += 64) {
      __m256i input_lo =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
      __m256i input_hi =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
      // A backslash escapes the next character iff it ends an odd-length run
      // of backslashes; odd/even carry-propagation trick.
      uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                     _mm256_set1_epi8('\\'));
      uint64_t start_edges = bs_bits & ~(bs_bits << 1);
      uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
      uint64_t even_starts = start_edges & even_start_mask;
      uint64_t odd_starts = start_edges & ~even_start_mask;
      uint64_t even_carries = bs_bits + even_starts;
      uint64_t odd_carries;
      bool iter_ends_odd_backslash =
          add_overflow(bs_bits, odd_starts, &odd_carries);
      odd_carries |= prev_iter_ends_odd_backslash;
      prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
      uint64_t even_carry_ends = even_carries & ~bs_bits;
      uint64_t odd_carry_ends = odd_carries & ~bs_bits;
      uint64_t even_start_odd_end = even_carry_ends & odd_bits;
      uint64_t odd_start_even_end = odd_carry_ends & even_bits;
      // bits set right after an escaping backslash (escaped characters)
      uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
      // Unescaped quotes; the carry-less multiply by all-ones computes a
      // prefix XOR, yielding an "inside a string literal" byte mask.
      uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                        _mm256_set1_epi8('"'));
      quote_bits = quote_bits & ~odd_ends;
      uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
          _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
      quote_mask ^= prev_iter_inside_quote;
      prev_iter_inside_quote = static_cast<uint64_t>(
          static_cast<int64_t>(quote_mask) >>
          63); // might be undefined behavior, should be fully defined in C++20,
               // ok according to John Regher from Utah University
      // Shufti-style whitespace classification: a byte is whitespace when the
      // low- and high-nibble lookups share a bit within 0x18.
      const __m256i low_nibble_mask = _mm256_setr_epi8(
          // 0 9 a b c d
          16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
          0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
      const __m256i high_nibble_mask = _mm256_setr_epi8(
          // 0 2 3 5 7
          8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
          1, 0, 0, 0, 3, 2, 1, 0, 0);
      __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
      __m256i v_lo = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_lo),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
                                               _mm256_set1_epi8(0x7f))));

      __m256i v_hi = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_hi),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
                                               _mm256_set1_epi8(0x7f))));
      __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
      __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));

      uint64_t ws_res_0 =
          static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
      uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
      uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
      // whitespace inside string literals must be preserved
      whitespace &= ~quote_mask;
      // Per-16-byte-lane whitespace masks...
      int mask1 = whitespace & 0xFFFF;
      int mask2 = (whitespace >> 16) & 0xFFFF;
      int mask3 = (whitespace >> 32) & 0xFFFF;
      int mask4 = (whitespace >> 48) & 0xFFFF;
      // ...and prefix popcounts of the kept bytes, giving each lane's
      // destination offset in the output.
      int pop1 = hamming((~whitespace) & 0xFFFF);
      int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
      int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
      int pop4 = hamming((~whitespace));
      // mask128_epi8 maps a lane's whitespace mask to a pshufb pattern that
      // compacts the kept bytes to the front of the lane.
      // NOTE(review): only the low 15 bits index the table — presumably the
      // table exploits that bit 15 is redundant; confirm its definition.
      __m256i vmask1 = _mm256_loadu2_m128i(
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
      __m256i vmask2 = _mm256_loadu2_m128i(
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
      __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
      __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
      // NOTE(review): each 16-byte store may write up to 16 bytes past
      // out + pop4, so the output buffer needs slack — confirm with callers.
      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
                           reinterpret_cast<__m128i *>(out), result1);
      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
                           reinterpret_cast<__m128i *>(out + pop2), result2);
      out += pop4;
    }
  }
  // we finish off the job... copying and pasting the code is not ideal here,
  // but it gets the job done.
  if (idx < len) {
    // Remaining (< 64) bytes go through a zero-padded stack buffer so the
    // 64-byte kernel can be reused.
    uint8_t buffer[64];
    memset(buffer, 0, 64);
    memcpy(buffer, buf + idx, len - idx);
    __m256i input_lo =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
    __m256i input_hi =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
    uint64_t bs_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
    uint64_t start_edges = bs_bits & ~(bs_bits << 1);
    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    uint64_t even_starts = start_edges & even_start_mask;
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
    // bool iter_ends_odd_backslash =
    add_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
    // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    // // we never use it
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
    uint64_t quote_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
    // don't need this anymore

    // Direct whitespace classification: a byte is whitespace when it equals
    // 0x20 or when the control-character lookup matches ('\t', '\n', '\r').
    __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
    __m256i mask_70 =
        _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
    // but moves any value >= 16 above 128

    __m256i lut_cntrl = _mm256_setr_epi8(
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
        0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);

    __m256i tmp_ws_lo = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_lo),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
    __m256i tmp_ws_hi = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_hi),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
    uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
    uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
    whitespace &= ~quote_mask;

    // The zero padding must count as whitespace so it is not emitted.
    if (len - idx < 64) {
      whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
    }
    int mask1 = whitespace & 0xFFFF;
    int mask2 = (whitespace >> 16) & 0xFFFF;
    int mask3 = (whitespace >> 32) & 0xFFFF;
    int mask4 = (whitespace >> 48) & 0xFFFF;
    int pop1 = hamming((~whitespace) & 0xFFFF);
    int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming((~whitespace));
    __m256i vmask1 = _mm256_loadu2_m128i(
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
    __m256i vmask2 = _mm256_loadu2_m128i(
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
    __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
    __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
    // Compact into the local buffer, then copy out exactly pop4 bytes so
    // nothing is written past the end of the caller's output.
    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
                         reinterpret_cast<__m128i *>(buffer), result1);
    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
                         reinterpret_cast<__m128i *>(buffer + pop2), result2);
    memcpy(out, buffer, pop4);
    out += pop4;
  }
  *out = '\0'; // NULL termination
  return out - initout;
}
468 | } // namespace simdjson |
469 | #endif |
470 | |