static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
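	// Classification tables for input validation: lut_lo is indexed by the
	// low nibble of each input byte, lut_hi by the high nibble. For every
	// valid base64 character the two lookups share no set bits, so a
	// nonzero (lo & hi) marks an invalid byte.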
	const __m256i lut_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	const __m256i lut_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

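	// Delta table, indexed by the high nibble (shifted down by one for '/',
	// see eq_2F below): '+' maps to +19, '/' to +16, digits to +4,
	// 'A'..'Z' to -65 and 'a'..'z' to -71. Adding the delta turns an ASCII
	// character into its 6-bit base64 value.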
	const __m256i lut_roll = _mm256_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	const __m256i mask_2F = _mm256_set1_epi8(0x2F);

	// Load input:
	__m256i str = _mm256_loadu_si256((__m256i *) *s);

	// See the SSSE3 decoder for an explanation of the algorithm.
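	// Masking the nibbles with 0x2F instead of 0x0F is harmless:
	// _mm256_shuffle_epi8 only looks at the low four bits (and the sign
	// bit) of each index, and reusing mask_2F saves a constant register.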
	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
	const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
	const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

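	// _mm256_testz_si256 returns 1 only when (lo & hi) is all-zero, i.e.
	// every byte in the block is a valid base64 character. On invalid
	// input, stop and let the caller handle the remainder.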
	if (!_mm256_testz_si256(lo, hi)) {
		return 0;
	}

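	// Select the per-byte delta: eq_2F is -1 for '/' bytes, which shifts
	// their index into lut_roll down by one so that '/' (+16) and '+'
	// (+19) are distinguished despite sharing the high nibble 0x2: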
	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
	const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm256_add_epi8(str, roll);

	// Reshuffle each 128-bit lane into the packed 12-byte format, giving
	// 24 bytes of output in total:
	str = dec_reshuffle(str);

	// Store the output:
	_mm256_storeu_si256((__m256i *) *o, str);
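	// Note: the store writes a full 32 bytes, of which only the first 24
	// are valid output; the outer loop reserves enough trailing input to
	// make the overwrite safe.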

	*s += 32;
	*o += 24;
	*rounds -= 1;

	return 1;
}

static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 45) {
		return;
	}

	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
	// written after the output, ensure that there will be at least 13
	// bytes of input data left to cover the gap. (11 data bytes and up to
	// two end-of-string markers.)
	size_t rounds = (*slen - 13) / 32;
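	// For example, *slen == 77 gives rounds = (77 - 13) / 32 = 2: the
	// vector loop consumes 64 bytes and leaves 13 for the tail.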

	*slen -= rounds * 32;	// 32 bytes consumed per round
	*olen += rounds * 24;	// 24 bytes produced per round

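	// Unroll the inner loop by 8, 4, 2 and 1 depending on how many rounds
	// remain. A failed round (invalid input) short-circuits the && chain
	// and breaks out of the loop early.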
	do {
		if (rounds >= 8) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 4) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 2) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		dec_loop_avx2_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 32;
	*olen -= rounds * 24;
}
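
/*
 * Usage sketch (not part of this file): a generic decoder would typically
 * call the loop like this and then finish the remaining bytes with a scalar
 * routine. The variable names below are illustrative only.
 *
 *	const uint8_t *src  = encoded;       // base64 input
 *	size_t         slen = encoded_len;   // bytes of input remaining
 *	uint8_t       *out  = decoded;       // output buffer
 *	size_t         olen = 0;             // bytes of output produced
 *
 *	dec_loop_avx2(&src, &slen, &out, &olen);
 *
 *	// src/out now point past the data handled by the vector loop, slen
 *	// holds the unconsumed input, and olen the output written so far;
 *	// a scalar tail loop decodes the rest and validates '=' padding.
 */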