1#include "simdjson/portability.h"
2#include <cstdint>
3
4#ifndef __AVX2__
5
6namespace simdjson {
// Per-character metadata, 3 bytes per character at jump_table[3 * c], as
// consumed by the scalar json_minify below:
//   meta[0]: 1 only for '"'  -- toggles the in-string flag (when unescaped);
//   meta[1]: 1 for everything except '\\' -- keeps the escape state clear;
//   meta[2]: 1 when the byte is kept outside strings, 0 for the JSON
//            whitespace bytes (space, tab, CR, LF) that minification drops.
static uint8_t jump_table[256 * 3] = {
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
    1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
40
// Scalar minifier (non-AVX2 build): copies `how_many` bytes from `bytes` to
// `out`, dropping JSON whitespace that occurs outside string literals, and
// returns the number of bytes written. `out` may alias `bytes`.
size_t json_minify(const unsigned char *bytes, size_t how_many,
                   unsigned char *out) {
  size_t i = 0, pos = 0;
  uint8_t quote = 0;     // 1 while inside a string literal, 0 otherwise
  uint8_t nonescape = 1; // low bit is 0 exactly when the current byte is
                         // escaped (preceded by an unescaped backslash)

  while (i < how_many) {
    unsigned char c = bytes[i];
    // Three metadata bytes per character; see jump_table above.
    uint8_t *meta = jump_table + 3 * c;

    // Toggle the in-string flag on an unescaped double quote: meta[0] is
    // nonzero only for '"', and nonescape's low bit is cleared right after
    // an unescaped backslash.
    quote = quote ^ (meta[0] & nonescape);
    out[pos] = c;
    // Branchless drop: advance the output cursor unless the byte is
    // droppable whitespace (meta[2] == 0) outside a string (quote == 0);
    // a non-advanced byte is simply overwritten by the next write.
    pos += meta[2] | quote;

    i += 1;
    // meta[1] is 0 only for '\\'. For any other byte this sets nonescape to
    // all-ones; for a backslash it complements the previous state, so a run
    // of backslashes alternates the escape flag (handles "\\\\" correctly).
    nonescape = (~nonescape) | (meta[1]);
  }
  return pos;
}
60} // namespace simdjson
61#else
62#include "simdprune_tables.h"
63#include <cstring>
64
65namespace simdjson {
66
67// a straightforward comparison of a mask against input.
// Compare each of the 64 input bytes (input_lo = bytes 0..31, input_hi =
// bytes 32..63) against the splatted byte in `mask`; bit i of the result is
// set exactly when byte i matches.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
                                            __m256i mask) {
  const uint64_t lo_bits = static_cast<uint32_t>(
      _mm256_movemask_epi8(_mm256_cmpeq_epi8(input_lo, mask)));
  const uint64_t hi_bits = static_cast<uint32_t>(
      _mm256_movemask_epi8(_mm256_cmpeq_epi8(input_hi, mask)));
  return lo_bits | (hi_bits << 32);
}
76
77// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written
78// out. credit: Anime Tosho
// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written
// out. credit: Anime Tosho
static __m128i skinnycleanm128(__m128i x, int mask) {
  int mask1 = mask & 0xFF;        // keep-mask for the low 8 bytes
  int mask2 = (mask >> 8) & 0xFF; // keep-mask for the high 8 bytes
  // Build a 16-byte pshufb control from two 8-byte entries of thintable_epi8
  // (defined in simdprune_tables.h; presumably each entry left-packs the
  // indices of the mask's 1-bits -- confirm against the table definition).
  __m128i shufmask = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64(
                       (const __m128i *)(thintable_epi8 + mask1))),
                   (const __m64 *)(thintable_epi8 + mask2)));
  // The upper entry addresses bytes 8..15 of x, so bias those indices by 8.
  shufmask =
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
  // Each 8-byte half of x is now compacted independently...
  __m128i pruned = _mm_shuffle_epi8(x, shufmask);
  // ...then the two halves are glued together with a second shuffle chosen
  // by the popcount of the low mask (BitsSetTable256mul2 = 2 * popcount,
  // per its name -- NOTE(review): confirm in simdprune_tables.h).
  intptr_t popx2 = BitsSetTable256mul2[mask1];
  __m128i compactmask =
      _mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8));
  return _mm_shuffle_epi8(pruned, compactmask);
}
94
95// take input from buf and remove useless whitespace, input and output can be
96// the same, result is null terminated, return the string length (minus the null
97// termination)
98size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
99 // Useful constant masks
100 const uint64_t even_bits = 0x5555555555555555ULL;
101 const uint64_t odd_bits = ~even_bits;
102 uint8_t *initout(out);
103 uint64_t prev_iter_ends_odd_backslash =
104 0ULL; // either 0 or 1, but a 64-bit value
105 uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
106 size_t idx = 0;
107 if (len >= 64) {
108 size_t avx_len = len - 63;
109
110 for (; idx < avx_len; idx += 64) {
111 __m256i input_lo =
112 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
113 __m256i input_hi =
114 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
115 uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
116 _mm256_set1_epi8('\\'));
117 uint64_t start_edges = bs_bits & ~(bs_bits << 1);
118 uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
119 uint64_t even_starts = start_edges & even_start_mask;
120 uint64_t odd_starts = start_edges & ~even_start_mask;
121 uint64_t even_carries = bs_bits + even_starts;
122 uint64_t odd_carries;
123 bool iter_ends_odd_backslash =
124 add_overflow(bs_bits, odd_starts, &odd_carries);
125 odd_carries |= prev_iter_ends_odd_backslash;
126 prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
127 uint64_t even_carry_ends = even_carries & ~bs_bits;
128 uint64_t odd_carry_ends = odd_carries & ~bs_bits;
129 uint64_t even_start_odd_end = even_carry_ends & odd_bits;
130 uint64_t odd_start_even_end = odd_carry_ends & even_bits;
131 uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
132 uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
133 _mm256_set1_epi8('"'));
134 quote_bits = quote_bits & ~odd_ends;
135 uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
136 _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
137 quote_mask ^= prev_iter_inside_quote;
138 prev_iter_inside_quote = static_cast<uint64_t>(
139 static_cast<int64_t>(quote_mask) >>
140 63); // might be undefined behavior, should be fully defined in C++20,
141 // ok according to John Regher from Utah University
142 const __m256i low_nibble_mask = _mm256_setr_epi8(
143 // 0 9 a b c d
144 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
145 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
146 const __m256i high_nibble_mask = _mm256_setr_epi8(
147 // 0 2 3 5 7
148 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
149 1, 0, 0, 0, 3, 2, 1, 0, 0);
150 __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
151 __m256i v_lo = _mm256_and_si256(
152 _mm256_shuffle_epi8(low_nibble_mask, input_lo),
153 _mm256_shuffle_epi8(high_nibble_mask,
154 _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
155 _mm256_set1_epi8(0x7f))));
156
157 __m256i v_hi = _mm256_and_si256(
158 _mm256_shuffle_epi8(low_nibble_mask, input_hi),
159 _mm256_shuffle_epi8(high_nibble_mask,
160 _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
161 _mm256_set1_epi8(0x7f))));
162 __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
163 _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
164 __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
165 _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
166
167 uint64_t ws_res_0 =
168 static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
169 uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
170 uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
171 whitespace &= ~quote_mask;
172
173 uint64_t non_whitespace = ~whitespace;
174
175 __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
176 __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
177 __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
178 __m128i x4 = _mm256_extracti128_si256(input_hi, 1);
179
180 int mask1 = non_whitespace & 0xFFFF;
181 int mask2 = (non_whitespace >> 16) & 0xFFFF;
182 int mask3 = (non_whitespace >> 32) & 0xFFFF;
183 int mask4 = (non_whitespace >> 48) & 0xFFFF;
184
185 x1 = skinnycleanm128(x1, mask1);
186 x2 = skinnycleanm128(x2, mask2);
187 x3 = skinnycleanm128(x3, mask3);
188 x4 = skinnycleanm128(x4, mask4);
189 int pop1 = hamming(non_whitespace & 0xFFFF);
190 int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
191 int pop3 = hamming(non_whitespace) & UINT64_C(0xFFFFFFFFFFFF));
192 int pop4 = hamming(non_whitespace);
193 _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
194 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
195 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
196 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
197 out += pop4;
198 }
199 }
200 // we finish off the job... copying and pasting the code is not ideal here,
201 // but it gets the job done.
202 if (idx < len) {
203 uint8_t buffer[64];
204 memset(buffer, 0, 64);
205 memcpy(buffer, buf + idx, len - idx);
206 __m256i input_lo =
207 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
208 __m256i input_hi =
209 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
210 uint64_t bs_bits =
211 cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
212 uint64_t start_edges = bs_bits & ~(bs_bits << 1);
213 uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
214 uint64_t even_starts = start_edges & even_start_mask;
215 uint64_t odd_starts = start_edges & ~even_start_mask;
216 uint64_t even_carries = bs_bits + even_starts;
217 uint64_t odd_carries;
218 // bool iter_ends_odd_backslash =
219 add_overflow(bs_bits, odd_starts, &odd_carries);
220 odd_carries |= prev_iter_ends_odd_backslash;
221 // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
222 // // we never use it
223 uint64_t even_carry_ends = even_carries & ~bs_bits;
224 uint64_t odd_carry_ends = odd_carries & ~bs_bits;
225 uint64_t even_start_odd_end = even_carry_ends & odd_bits;
226 uint64_t odd_start_even_end = odd_carry_ends & even_bits;
227 uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
228 uint64_t quote_bits =
229 cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
230 quote_bits = quote_bits & ~odd_ends;
231 uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
232 _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
233 quote_mask ^= prev_iter_inside_quote;
234 // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
235 // don't need this anymore
236
237 __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
238 __m256i mask_70 =
239 _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
240 // but moves any value >= 16 above 128
241
242 __m256i lut_cntrl = _mm256_setr_epi8(
243 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
244 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
245 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);
246
247 __m256i tmp_ws_lo = _mm256_or_si256(
248 _mm256_cmpeq_epi8(mask_20, input_lo),
249 _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
250 __m256i tmp_ws_hi = _mm256_or_si256(
251 _mm256_cmpeq_epi8(mask_20, input_hi),
252 _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
253 uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
254 uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
255 uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
256 whitespace &= ~quote_mask;
257
258 if (len - idx < 64) {
259 whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
260 }
261 int mask1 = non_whitespace & 0xFFFF;
262 int mask2 = (non_whitespace >> 16) & 0xFFFF;
263 int mask3 = (non_whitespace >> 32) & 0xFFFF;
264 int mask4 = (non_whitespace >> 48) & 0xFFFF;
265
266 x1 = skinnycleanm128(x1, mask1);
267 x2 = skinnycleanm128(x2, mask2);
268 x3 = skinnycleanm128(x3, mask3);
269 x4 = skinnycleanm128(x4, mask4);
270 int pop1 = hamming(non_whitespace & 0xFFFF);
271 int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
272 int pop3 = hamming(non_whitespace) & UINT64_C(0xFFFFFFFFFFFF));
273 int pop4 = hamming(non_whitespace);
274 _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
275 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
276 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
277 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
278 out += pop4;
279 }
280 *out = '\0'; // NULL termination
281 return out - initout;
282}
283
// Older AVX2 minifier with the same contract as json_minify (strip
// insignificant whitespace, null-terminate, return the length): here the
// compaction is done with 256-bit shuffles whose controls come from the
// mask128_epi8 table instead of skinnycleanm128.
size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
  // Useful constant masks
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
  uint8_t *initout(out);
  uint64_t prev_iter_ends_odd_backslash =
      0ULL; // either 0 or 1, but a 64-bit value
  uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
  size_t idx = 0;
  if (len >= 64) {
    size_t avx_len = len - 63;

    for (; idx < avx_len; idx += 64) {
      __m256i input_lo =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
      __m256i input_hi =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
      // Backslash-run classification: quotes preceded by an odd-length run
      // of backslashes are escaped and must not toggle the string state.
      uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                     _mm256_set1_epi8('\\'));
      uint64_t start_edges = bs_bits & ~(bs_bits << 1);
      uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
      uint64_t even_starts = start_edges & even_start_mask;
      uint64_t odd_starts = start_edges & ~even_start_mask;
      uint64_t even_carries = bs_bits + even_starts;
      uint64_t odd_carries;
      bool iter_ends_odd_backslash =
          add_overflow(bs_bits, odd_starts, &odd_carries);
      odd_carries |= prev_iter_ends_odd_backslash;
      prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
      uint64_t even_carry_ends = even_carries & ~bs_bits;
      uint64_t odd_carry_ends = odd_carries & ~bs_bits;
      uint64_t even_start_odd_end = even_carry_ends & odd_bits;
      uint64_t odd_start_even_end = odd_carry_ends & even_bits;
      uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
      // Unescaped quotes -> "inside string" mask via carry-less multiply
      // (prefix XOR) against all-ones.
      uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                        _mm256_set1_epi8('"'));
      quote_bits = quote_bits & ~odd_ends;
      uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
          _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
      quote_mask ^= prev_iter_inside_quote;
      prev_iter_inside_quote = static_cast<uint64_t>(
          static_cast<int64_t>(quote_mask) >>
          63); // might be undefined behavior, should be fully defined in C++20,
               // ok according to John Regher from Utah University
      // Shufti-style nibble classification; 0x18 marks JSON whitespace.
      const __m256i low_nibble_mask = _mm256_setr_epi8(
          // 0                           9  a   b  c  d
          16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
          0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
      const __m256i high_nibble_mask = _mm256_setr_epi8(
          // 0     2   3     5     7
          8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
          1, 0, 0, 0, 3, 2, 1, 0, 0);
      __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
      __m256i v_lo = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_lo),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
                                               _mm256_set1_epi8(0x7f))));

      __m256i v_hi = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_hi),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
                                               _mm256_set1_epi8(0x7f))));
      __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
      __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));

      uint64_t ws_res_0 =
          static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
      uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
      uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
      // Keep whitespace that sits inside string literals.
      whitespace &= ~quote_mask;
      int mask1 = whitespace & 0xFFFF;
      int mask2 = (whitespace >> 16) & 0xFFFF;
      int mask3 = (whitespace >> 32) & 0xFFFF;
      int mask4 = (whitespace >> 48) & 0xFFFF;
      int pop1 = hamming((~whitespace) & 0xFFFF);
      int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
      int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
      int pop4 = hamming((~whitespace));
      // Load per-16-byte compaction shuffles from mask128_epi8, indexed by
      // the low 15 bits of each whitespace mask. NOTE(review): bytes past
      // each lane's popcount are don't-care since the staggered stores
      // below overwrite them -- confirm against mask128_epi8's definition.
      __m256i vmask1 = _mm256_loadu2_m128i(
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
      __m256i vmask2 = _mm256_loadu2_m128i(
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
      __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
      __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
      // Staggered stores pack the four compacted lanes; spill past out+pop4
      // is overwritten by the next iteration or the tail block.
      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
                           reinterpret_cast<__m128i *>(out), result1);
      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
                           reinterpret_cast<__m128i *>(out + pop2), result2);
      out += pop4;
    }
  }
  // we finish off the job... copying and pasting the code is not ideal here,
  // but it gets the job done.
  if (idx < len) {
    uint8_t buffer[64];
    memset(buffer, 0, 64);
    memcpy(buffer, buf + idx, len - idx);
    __m256i input_lo =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
    __m256i input_hi =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
    uint64_t bs_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
    uint64_t start_edges = bs_bits & ~(bs_bits << 1);
    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    uint64_t even_starts = start_edges & even_start_mask;
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
    // bool iter_ends_odd_backslash =
    add_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
    // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    // // we never use it
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
    uint64_t quote_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
    // don't need this anymore

    // Tail whitespace detection: equal to 0x20, or mapped to 0xFF by
    // lut_cntrl after a saturating add of 0x70 (bytes >= 0x10 land past
    // 0x80, where pshufb yields zero).
    __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
    __m256i mask_70 =
        _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
    // but moves any value >= 16 above 128

    __m256i lut_cntrl = _mm256_setr_epi8(
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
        0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);

    __m256i tmp_ws_lo = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_lo),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
    __m256i tmp_ws_hi = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_hi),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
    uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
    uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
    whitespace &= ~quote_mask;

    // Treat the zero padding past the input's end as whitespace to drop it.
    if (len - idx < 64) {
      whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
    }
    int mask1 = whitespace & 0xFFFF;
    int mask2 = (whitespace >> 16) & 0xFFFF;
    int mask3 = (whitespace >> 32) & 0xFFFF;
    int mask4 = (whitespace >> 48) & 0xFFFF;
    int pop1 = hamming((~whitespace) & 0xFFFF);
    int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming((~whitespace));
    __m256i vmask1 = _mm256_loadu2_m128i(
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
    __m256i vmask2 = _mm256_loadu2_m128i(
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
    __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
    __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
    // Compact into the scratch buffer first, then copy exactly pop4 bytes:
    // nothing would overwrite the stores' spill past out + pop4 here.
    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
                         reinterpret_cast<__m128i *>(buffer), result1);
    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
                         reinterpret_cast<__m128i *>(buffer + pop2), result2);
    memcpy(out, buffer, pop4);
    out += pop4;
  }
  *out = '\0'; // NULL termination
  return out - initout;
}
468} // namespace simdjson
469#endif
470