static inline void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
	// The first load is done at s - 0, not at the usual s - 4, so that
	// the read cannot start before the beginning of the input buffer:
	__m256i src = _mm256_loadu_si256((__m256i *) *s);

	// Shift the data up by 4 bytes, as required by enc_reshuffle:
	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
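	// The index vector (0, 0, 1, 2, 3, 4, 5, 6) duplicates the lowest
	// 32-bit dword and moves the others up one position, so the payload
	// lands at the same byte offsets that a load from s - 4 would give
	// it; the duplicated dword is junk that enc_reshuffle ignores.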

	// Reshuffle into 6-bit indices, translate to base64 ASCII, store:
	src = enc_reshuffle(src);
	src = enc_translate(src);
	_mm256_storeu_si256((__m256i *) *o, src);

	// Subsequent loads will be done at an offset of -4 from the block
	// start, so advance the source pointer by only 24 - 4 = 20 bytes:
	*s += 20;
	*o += 32;
}

static inline void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
	// Load input; the source pointer is kept 4 bytes behind the start of
	// the current block, as set up by enc_loop_avx2_inner_first:
	__m256i src = _mm256_loadu_si256((__m256i *) *s);

	// Reshuffle, translate, store:
	src = enc_reshuffle(src);
	src = enc_translate(src);
	_mm256_storeu_si256((__m256i *) *o, src);

	*s += 24;	// 24 input bytes consumed per round
	*o += 32;	// 32 output bytes produced per round (4 chars per 3 bytes)
}

static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
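	// The first round loads a full 32-byte vector at s - 0, so bail out
	// early if the input cannot support at least one such read: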
	if (*slen < 32) {
		return;
	}

	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
	// bytes at a time at an offset of -4, ensure that at least 4 bytes
	// will remain after the last round, so that the final read does not
	// pass beyond the bounds of the input buffer:
	size_t rounds = (*slen - 4) / 24;
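	// For example, with *slen = 100: rounds = (100 - 4) / 24 = 4, which
	// consumes 96 bytes. The last 32-byte read then starts at offset
	// 3 * 24 - 4 = 68 and ends at offset 99, the final byte of the
	// buffer.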

	*slen -= rounds * 24;	// 24 bytes consumed per round
	*olen += rounds * 32;	// 32 bytes produced per round

	// The first loop iteration requires special handling: a read at the
	// usual offset of -4 would underflow the buffer, so it loads at
	// s - 0 and shifts the data in-register instead:
	enc_loop_avx2_inner_first(s, o);
	rounds--;

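	// Dispatch the remaining rounds in unrolled batches of 8, 4, 2 and
	// finally 1, amortizing the loop-counter checks over several
	// iterations at a time: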
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_avx2_inner(s, o);
		break;
	}

	// Add the offset back:
	*s += 4;
}
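
// A rough sketch of how a caller might drive this loop (variable names here
// are illustrative; the actual caller and the scalar tail it uses for the
// leftover bytes live elsewhere):
//
//	const uint8_t *s    = src;
//	uint8_t       *o    = out;
//	size_t         slen = srclen;
//	size_t         olen = 0;
//
//	enc_loop_avx2(&s, &slen, &o, &olen);
//
//	// At most 31 bytes are left in slen for the scalar tail, which also
//	// appends any '=' padding.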