1static inline void
2enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
3{
4 // First load is done at s - 0 to not get a segfault:
5 __m256i src = _mm256_loadu_si256((__m256i *) *s);
6
7 // Shift by 4 bytes, as required by enc_reshuffle:
8 src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
9
10 // Reshuffle, translate, store:
11 src = enc_reshuffle(src);
12 src = enc_translate(src);
13 _mm256_storeu_si256((__m256i *) *o, src);
14
15 // Subsequent loads will be done at s - 4, set pointer for next round:
16 *s += 20;
17 *o += 32;
18}
19
20static inline void
21enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
22{
23 // Load input:
24 __m256i src = _mm256_loadu_si256((__m256i *) *s);
25
26 // Reshuffle, translate, store:
27 src = enc_reshuffle(src);
28 src = enc_translate(src);
29 _mm256_storeu_si256((__m256i *) *o, src);
30
31 *s += 24;
32 *o += 32;
33}
34
// Encode as many whole 24-byte input blocks as can be done safely with
// 32-byte vector loads, leaving the remainder for the caller.
//
// s:    in/out, input cursor; advanced by 24 bytes per encoded block.
// slen: in/out, remaining input length; reduced by 24 per encoded block.
// o:    in/out, output cursor; advanced by 32 bytes per encoded block.
// olen: in/out, output length so far; increased by 32 per encoded block.
//
// Nothing is touched when fewer than 32 input bytes are available.
// NOTE(review): the output buffer is assumed to have room for 32 bytes per
// round; this function does not check it.
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// A single vector load needs 32 readable bytes:
	if (*slen < 32) {
		return;
	}

	// Each round consumes 24 bytes but loads 32, starting 4 bytes before
	// the block it encodes. Keeping 4 bytes in reserve guarantees that
	// the load of the final round ends inside the input buffer. The guard
	// above makes this at least 1:
	size_t remaining = (*slen - 4) / 24;

	// Account for all rounds up front:
	*olen += remaining * 32;	// 32 bytes produced per round
	*slen -= remaining * 24;	// 24 bytes consumed per round

	// The first round cannot load from 4 bytes before the buffer start,
	// so it uses a dedicated variant. It leaves *s at the -4 offset that
	// the regular rounds rely on:
	enc_loop_avx2_inner_first(s, o);
	remaining -= 1;

	// Manually unrolled: groups of eight for as long as possible...
	while (remaining >= 8) {
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		remaining -= 8;
	}

	// ...then at most one group of four, one of two, and a single round
	// for whatever is left over (0..7 rounds in total):
	if (remaining >= 4) {
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		remaining -= 4;
	}
	if (remaining >= 2) {
		enc_loop_avx2_inner(s, o);
		enc_loop_avx2_inner(s, o);
		remaining -= 2;
	}
	if (remaining >= 1) {
		enc_loop_avx2_inner(s, o);
	}

	// Undo the -4 load offset so that *s points at the first byte that
	// has not been encoded yet:
	*s += 4;
}
90