// Decode one 32-byte block of base64 input into 24 bytes of output.
// On success: advances *s by 32 and *o by 24, decrements *rounds, and
// returns 1. If any input byte is outside the base64 alphabet, returns
// 0 without modifying the pointers.
//
// NOTE: the store below writes a full 32 bytes (24 data bytes plus 8
// trailing bytes), so the caller must guarantee slack past the output;
// see the margin calculation in dec_loop_avx2.
static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Validation tables: lut_lo is indexed by a byte's low nibble,
	// lut_hi by its high nibble. The two lookups AND to zero exactly
	// when the byte belongs to the base64 alphabet (checked below via
	// _mm256_testz_si256). The 16-entry table is repeated in both
	// 128-bit lanes because _mm256_shuffle_epi8 shuffles per lane.
	const __m256i lut_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	const __m256i lut_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	// Delta table, indexed by high nibble (minus one for '/', see the
	// eq_2F adjustment below). The signed delta is added to the ASCII
	// code to yield the 6-bit value, e.g. 'A' (0x41, high nibble 4)
	// + (-65) = 0, 'a' (0x61, nibble 6) + (-71) = 26, '0' (0x30,
	// nibble 3) + 4 = 52, '+' (0x2B, nibble 2) + 19 = 62, and '/'
	// (0x2F, nibble 2 shifted to index 1) + 16 = 63.
	const __m256i lut_roll = _mm256_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	// 0x2F does double duty: it is the '/' character for the compare
	// below, and a nibble mask — _mm256_shuffle_epi8 only looks at an
	// index byte's low 4 bits (and bit 7 for zeroing), so the extra
	// 0x20 bit left by the mask is harmless.
	const __m256i mask_2F = _mm256_set1_epi8(0x2F);

	// Load input:
	__m256i str = _mm256_loadu_si256((__m256i *) *s);

	// See the SSSE3 decoder for an explanation of the algorithm.
	// (There is no byte-wise shift, so the 32-bit shift drags bits
	// across byte boundaries; the AND with mask_2F removes the ones
	// that matter to the shuffle.)
	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
	const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
	const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

	// testz is 1 iff (lo & hi) is all-zero; a set bit flags a byte
	// outside the base64 alphabet, so reject the whole block:
	if (!_mm256_testz_si256(lo, hi)) {
		return 0;
	}

	// For '/' bytes, eq_2F is -1 (all ones), shifting the roll index
	// from 2 down to 1 so '/' gets its own delta (+16) distinct from
	// '+' (+19):
	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
	const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm256_add_epi8(str, roll);

	// Reshuffle the input to packed 12-byte output format:
	str = dec_reshuffle(str);

	// Store the output (writes 32 bytes, of which 24 are data):
	_mm256_storeu_si256((__m256i *) *o, str);

	*s += 32;
	*o += 24;
	*rounds -= 1;

	return 1;
}
55
// Bulk-decode as many 32-byte rounds as the remaining input allows,
// updating the stream pointers and lengths in place. Stops early at
// the first round containing an invalid byte; the tail (and any
// rejected input) is left for the caller's scalar fallback.
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Too little input to run even one round with a safe margin:
	if (*slen < 45) {
		return;
	}

	// Each round eats 32 input bytes and emits 24 output bytes, but
	// the vector store writes 8 extra zero bytes past the output.
	// Holding back 13 bytes of input (11 data bytes plus up to two
	// end-of-string markers) keeps that over-write inside the buffer.
	size_t rounds = (*slen - 13) / 32;

	// Account for every round up front; rounds that end up not being
	// executed are credited back after the loop.
	*slen -= rounds * 32; // 32 bytes consumed per round
	*olen += rounds * 24; // 24 bytes produced per round

	while (rounds > 0) {
		// Unroll by 8, 4 or 2 when enough rounds remain. A failed
		// round means an invalid input byte, which aborts the loop
		// immediately.
		if (rounds >= 8) {
			if (!dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds)) {
				break;
			}
			continue;
		}
		if (rounds >= 4) {
			if (!dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds)) {
				break;
			}
			continue;
		}
		if (rounds >= 2) {
			if (!dec_loop_avx2_inner(s, o, &rounds) ||
			    !dec_loop_avx2_inner(s, o, &rounds)) {
				break;
			}
			continue;
		}
		dec_loop_avx2_inner(s, o, &rounds);
		break;
	}

	// Credit back any rounds that were not completed:
	*slen += rounds * 32;
	*olen -= rounds * 24;
}
111