static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
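	// Classification tables for input validation: lut_lo is indexed by the
	// low nibble of each input byte, lut_hi by the high nibble. For every
	// valid base64 character the two lookups share no set bits, so a
	// nonzero (lo & hi) marks an invalid byte.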
	const __m256i lut_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	const __m256i lut_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

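	// Delta table, indexed by the high nibble (shifted down by one for '/',
	// see eq_2F below): '+' maps to +19, '/' to +16, digits to +4,
	// 'A'..'Z' to -65 and 'a'..'z' to -71. Adding the delta turns an ASCII
	// character into its 6-bit base64 value.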
	const __m256i lut_roll = _mm256_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	const __m256i mask_2F = _mm256_set1_epi8(0x2F);

	// Load input:
	__m256i str = _mm256_loadu_si256((__m256i *) *s);

	// See the SSSE3 decoder for an explanation of the algorithm.
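	// Masking the nibbles with 0x2F instead of 0x0F is harmless:
	// _mm256_shuffle_epi8 only looks at the low four bits (and the sign
	// bit) of each index, and reusing mask_2F saves a constant register.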
	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
	const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
	const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

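	// _mm256_testz_si256 returns 1 only when (lo & hi) is all-zero, i.e.
	// every byte in the block is a valid base64 character. On invalid
	// input, stop and let the caller handle the remainder.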
	if (!_mm256_testz_si256(lo, hi)) {
		return 0;
	}

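	// Select the per-byte delta: eq_2F is -1 for '/' bytes, which shifts
	// their index into lut_roll down by one so that '/' (+16) and '+'
	// (+19) are distinguished despite sharing the high nibble 0x2: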
	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
	const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm256_add_epi8(str, roll);

	// Reshuffle each 128-bit lane into the packed 12-byte format, giving
	// 24 bytes of output in total:
	str = dec_reshuffle(str);

	// Store the output:
	_mm256_storeu_si256((__m256i *) *o, str);
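	// Note: the store writes a full 32 bytes, of which only the first 24
	// are valid output; the outer loop reserves enough trailing input to
	// make the overwrite safe.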

	*s += 32;
	*o += 24;
	*rounds -= 1;

	return 1;
}

static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 45) {
		return;
	}

	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
	// written after the output, ensure that there will be at least 13
	// bytes of input data left to cover the gap. (11 data bytes and up to
	// two end-of-string markers.)
	size_t rounds = (*slen - 13) / 32;
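	// For example, *slen == 77 gives rounds = (77 - 13) / 32 = 2: the
	// vector loop consumes 64 bytes and leaves 13 for the tail.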

	*slen -= rounds * 32;	// 32 bytes consumed per round
	*olen += rounds * 24;	// 24 bytes produced per round

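	// Unroll the inner loop by 8, 4, 2 and 1 depending on how many rounds
	// remain. A failed round (invalid input) short-circuits the && chain
	// and breaks out of the loop early.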
	do {
		if (rounds >= 8) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 4) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 2) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		dec_loop_avx2_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 32;
	*olen -= rounds * 24;
}
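
/*
 * Usage sketch (not part of this file): a generic decoder would typically
 * call the loop like this and then finish the remaining bytes with a scalar
 * routine. The variable names below are illustrative only.
 *
 *	const uint8_t *src  = encoded;       // base64 input
 *	size_t         slen = encoded_len;   // bytes of input remaining
 *	uint8_t       *out  = decoded;       // output buffer
 *	size_t         olen = 0;             // bytes of output produced
 *
 *	dec_loop_avx2(&src, &slen, &out, &olen);
 *
 *	// src/out now point past the data handled by the vector loop, slen
 *	// holds the unconsumed input, and olen the output written so far;
 *	// a scalar tail loop decodes the rest and validates '=' padding.
 */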