/*
 * AES-NI support functions
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * [AES-WP] https://www.intel.com/content/www/us/en/developer/articles/tool/intel-advanced-encryption-standard-aes-instructions-set.html
 * [CLMUL-WP] https://www.intel.com/content/www/us/en/develop/download/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode.html
 */

#include "common.h"

#if defined(MBEDTLS_AESNI_C)

#include "mbedtls/aesni.h"

#include <string.h>

/* *INDENT-OFF* */
#ifndef asm
#define asm __asm
#endif
/* *INDENT-ON* */

#if defined(MBEDTLS_AESNI_HAVE_CODE)

#if MBEDTLS_AESNI_HAVE_CODE == 2
#if !defined(_WIN32)
#include <cpuid.h>
#else
#include <intrin.h>
#endif
#include <immintrin.h>
#endif

/*
 * AES-NI support detection routine
 */
int mbedtls_aesni_has_support(unsigned int what)
{
    static int done = 0;
    static unsigned int c = 0;

    if (!done) {
#if MBEDTLS_AESNI_HAVE_CODE == 2
        static unsigned info[4] = { 0, 0, 0, 0 };
#if defined(_MSC_VER)
        __cpuid(info, 1);
#else
        __cpuid(1, info[0], info[1], info[2], info[3]);
#endif
        c = info[2];
#else /* AESNI using asm */
        asm ("movl $1, %%eax \n\t"
             "cpuid \n\t"
             : "=c" (c)
             :
             : "eax", "ebx", "edx");
#endif /* MBEDTLS_AESNI_HAVE_CODE */
        done = 1;
    }

    return (c & what) != 0;
}
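
/*
 * Usage sketch (illustrative, not a call site in this file): callers pass
 * one of the CPUID feature masks from mbedtls/aesni.h, e.g.
 *
 *     if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) {
 *         // take the AES-NI code path
 *     }
 *
 * MBEDTLS_AESNI_AES and MBEDTLS_AESNI_CLMUL select the AESNI and PCLMULQDQ
 * feature bits of the ECX word returned by CPUID leaf 1, which is what
 * `c` caches above.
 */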

#if MBEDTLS_AESNI_HAVE_CODE == 2

/*
 * AES-NI AES-ECB block en(de)cryption
 */
int mbedtls_aesni_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    const __m128i *rk = (const __m128i *) (ctx->rk);
    unsigned nr = ctx->nr; // Number of remaining rounds

    // Load round key 0
    __m128i state;
    memcpy(&state, input, 16);
    state = _mm_xor_si128(state, rk[0]); // state ^= *rk;
    ++rk;
    --nr;

    if (mode == 0) {
        while (nr != 0) {
            state = _mm_aesdec_si128(state, *rk);
            ++rk;
            --nr;
        }
        state = _mm_aesdeclast_si128(state, *rk);
    } else {
        while (nr != 0) {
            state = _mm_aesenc_si128(state, *rk);
            ++rk;
            --nr;
        }
        state = _mm_aesenclast_si128(state, *rk);
    }

    memcpy(output, &state, 16);
    return 0;
}
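
/*
 * Usage sketch (illustrative): encrypt one block, assuming ctx already
 * holds an expanded encryption key:
 *
 *     unsigned char out[16];
 *     mbedtls_aesni_crypt_ecb(&ctx, MBEDTLS_AES_ENCRYPT, in, out);
 *
 * mode == 0 (MBEDTLS_AES_DECRYPT) takes the AESDEC branch above, so the
 * context must then hold the inverse key schedule (see
 * mbedtls_aesni_inverse_key() below).
 */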

/*
 * GCM multiplication: c = a times b in GF(2^128)
 * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
 */

static void gcm_clmul(const __m128i aa, const __m128i bb,
                      __m128i *cc, __m128i *dd)
{
    /*
     * Carry-less multiplication dd:cc = aa * bb
     * using [CLMUL-WP] algorithm 1 (p. 12).
     */
    *cc = _mm_clmulepi64_si128(aa, bb, 0x00); // a0*b0 = c1:c0
    *dd = _mm_clmulepi64_si128(aa, bb, 0x11); // a1*b1 = d1:d0
    __m128i ee = _mm_clmulepi64_si128(aa, bb, 0x10); // a0*b1 = e1:e0
    __m128i ff = _mm_clmulepi64_si128(aa, bb, 0x01); // a1*b0 = f1:f0
    ff = _mm_xor_si128(ff, ee);   // e1+f1:e0+f0
    ee = ff;                      // e1+f1:e0+f0
    ff = _mm_srli_si128(ff, 8);   // 0:e1+f1
    ee = _mm_slli_si128(ee, 8);   // e0+f0:0
    *dd = _mm_xor_si128(*dd, ff); // d1:d0+e1+f1
    *cc = _mm_xor_si128(*cc, ee); // c1+e0+f0:c0
}
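
/*
 * Worked equation (from [CLMUL-WP] algorithm 1): splitting the operands
 * into 64-bit halves aa = a1:a0 and bb = b1:b0, the 256-bit carry-less
 * product is
 *
 *     aa * bb = (a1*b1) << 128  ^  (a1*b0 ^ a0*b1) << 64  ^  (a0*b0)
 *
 * gcm_clmul() computes the four partial products with PCLMULQDQ and folds
 * the middle term into the top of cc and the bottom of dd, leaving the
 * result as the 256-bit value dd:cc.
 */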

static void gcm_shift(__m128i *cc, __m128i *dd)
{
    /* [CLMUL-WP] Algorithm 5 Step 1: shift dd:cc one bit to the left,
     * taking advantage of [CLMUL-WP] eq 27 (p. 18). */
    // *cc = r1:r0
    // *dd = r3:r2
    __m128i cc_lo = _mm_slli_epi64(*cc, 1);  // r1<<1:r0<<1
    __m128i dd_lo = _mm_slli_epi64(*dd, 1);  // r3<<1:r2<<1
    __m128i cc_hi = _mm_srli_epi64(*cc, 63); // r1>>63:r0>>63
    __m128i dd_hi = _mm_srli_epi64(*dd, 63); // r3>>63:r2>>63
    __m128i xmm5 = _mm_srli_si128(cc_hi, 8); // 0:r1>>63
    cc_hi = _mm_slli_si128(cc_hi, 8);        // r0>>63:0
    dd_hi = _mm_slli_si128(dd_hi, 8);        // r2>>63:0

    *cc = _mm_or_si128(cc_lo, cc_hi);        // r1<<1|r0>>63:r0<<1
    *dd = _mm_or_si128(_mm_or_si128(dd_lo, dd_hi), xmm5); // r3<<1|r2>>63:r2<<1|r1>>63
}
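
/*
 * Why the shift (a summary of [CLMUL-WP] eq 27, p. 18): GCM represents
 * field elements with the bit order reflected, and for bit-reflected
 * operands the carry-less product comes out shifted one bit to the right:
 * rev(a) * rev(b) = rev(a * b) >> 1 over the 256-bit result. Shifting
 * dd:cc left by one bit realigns the product before reduction.
 */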

static __m128i gcm_reduce(__m128i xx)
{
    // xx = x1:x0
    /* [CLMUL-WP] Algorithm 5 Step 2 */
    __m128i aa = _mm_slli_epi64(xx, 63); // x1<<63:x0<<63 = stuff:a
    __m128i bb = _mm_slli_epi64(xx, 62); // x1<<62:x0<<62 = stuff:b
    __m128i cc = _mm_slli_epi64(xx, 57); // x1<<57:x0<<57 = stuff:c
    __m128i dd = _mm_slli_si128(_mm_xor_si128(_mm_xor_si128(aa, bb), cc), 8); // a+b+c:0
    return _mm_xor_si128(dd, xx); // x1+a+b+c:x0 = d:x0
}

static __m128i gcm_mix(__m128i dx)
{
    /* [CLMUL-WP] Algorithm 5 Steps 3 and 4 */
    __m128i ee = _mm_srli_epi64(dx, 1); // e1:x0>>1 = e1:e0'
    __m128i ff = _mm_srli_epi64(dx, 2); // f1:x0>>2 = f1:f0'
    __m128i gg = _mm_srli_epi64(dx, 7); // g1:x0>>7 = g1:g0'

    // e0'+f0'+g0' is almost e0+f0+g0, except for some missing
    // bits carried from d. Now get those bits back in.
    __m128i eh = _mm_slli_epi64(dx, 63); // d<<63:stuff
    __m128i fh = _mm_slli_epi64(dx, 62); // d<<62:stuff
    __m128i gh = _mm_slli_epi64(dx, 57); // d<<57:stuff
    __m128i hh = _mm_srli_si128(_mm_xor_si128(_mm_xor_si128(eh, fh), gh), 8); // 0:missing bits of d

    return _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(ee, ff), gg), hh), dx);
}

void mbedtls_aesni_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    __m128i aa, bb, cc, dd;

    /* The inputs are in big-endian order, so byte-reverse them */
    for (size_t i = 0; i < 16; i++) {
        ((uint8_t *) &aa)[i] = a[15 - i];
        ((uint8_t *) &bb)[i] = b[15 - i];
    }

    gcm_clmul(aa, bb, &cc, &dd);
    gcm_shift(&cc, &dd);
    /*
     * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1
     * using [CLMUL-WP] algorithm 5 (p. 18).
     * Currently dd:cc holds x3:x2:x1:x0 (already shifted).
     */
    __m128i dx = gcm_reduce(cc);
    __m128i xh = gcm_mix(dx);
    cc = _mm_xor_si128(xh, dd); // x3+h1:x2+h0

    /* Now byte-reverse the outputs */
    for (size_t i = 0; i < 16; i++) {
        c[i] = ((uint8_t *) &cc)[15 - i];
    }
}
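
/*
 * Usage sketch (illustrative; the buffer names are hypothetical, not
 * identifiers from this library): one GHASH step multiplies the running
 * state by the hash key H, both given as 16-byte big-endian blocks:
 *
 *     unsigned char y[16];
 *     mbedtls_aesni_gcm_mult(y, state_xor_block, h);
 */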

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesni_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey, int nr)
{
    __m128i *ik = (__m128i *) invkey;
    const __m128i *fk = (const __m128i *) fwdkey + nr;

    *ik = *fk;
    for (--fk, ++ik; fk > (const __m128i *) fwdkey; --fk, ++ik) {
        *ik = _mm_aesimc_si128(*fk);
    }
    *ik = *fk;
}
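
/*
 * A sketch of the resulting layout, assuming nr = 10 (AES-128): this is
 * the AES "equivalent inverse cipher" key schedule, i.e.
 *
 *     ik[0]  = fk[10]
 *     ik[i]  = InvMixColumns(fk[10 - i])   for 1 <= i <= 9   (AESIMC)
 *     ik[10] = fk[0]
 */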

/*
 * Key expansion, 128-bit case
 */
static __m128i aesni_set_rk_128(__m128i state, __m128i xword)
{
    /*
     * Finish generating the next round key.
     *
     * On entry state is r3:r2:r1:r0 and xword is X:stuff:stuff:stuff
     * with X = rot( sub( r3 ) ) ^ RCON (obtained with AESKEYGENASSIST).
     *
     * On exit, xword is r7:r6:r5:r4
     * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3
     * and this is returned, to be written to the round key buffer.
     */
    xword = _mm_shuffle_epi32(xword, 0xff); // X:X:X:X
    xword = _mm_xor_si128(xword, state);    // X+r3:X+r2:X+r1:r4
    state = _mm_slli_si128(state, 4);       // r2:r1:r0:0
    xword = _mm_xor_si128(xword, state);    // X+r3+r2:X+r2+r1:r5:r4
    state = _mm_slli_si128(state, 4);       // r1:r0:0:0
    xword = _mm_xor_si128(xword, state);    // X+r3+r2+r1:r6:r5:r4
    state = _mm_slli_si128(state, 4);       // r0:0:0:0
    state = _mm_xor_si128(xword, state);    // r7:r6:r5:r4
    return state;
}

static void aesni_setkey_enc_128(unsigned char *rk_bytes,
                                 const unsigned char *key)
{
    __m128i *rk = (__m128i *) rk_bytes;

    memcpy(&rk[0], key, 16);
    rk[1] = aesni_set_rk_128(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));
    rk[2] = aesni_set_rk_128(rk[1], _mm_aeskeygenassist_si128(rk[1], 0x02));
    rk[3] = aesni_set_rk_128(rk[2], _mm_aeskeygenassist_si128(rk[2], 0x04));
    rk[4] = aesni_set_rk_128(rk[3], _mm_aeskeygenassist_si128(rk[3], 0x08));
    rk[5] = aesni_set_rk_128(rk[4], _mm_aeskeygenassist_si128(rk[4], 0x10));
    rk[6] = aesni_set_rk_128(rk[5], _mm_aeskeygenassist_si128(rk[5], 0x20));
    rk[7] = aesni_set_rk_128(rk[6], _mm_aeskeygenassist_si128(rk[6], 0x40));
    rk[8] = aesni_set_rk_128(rk[7], _mm_aeskeygenassist_si128(rk[7], 0x80));
    rk[9] = aesni_set_rk_128(rk[8], _mm_aeskeygenassist_si128(rk[8], 0x1B));
    rk[10] = aesni_set_rk_128(rk[9], _mm_aeskeygenassist_si128(rk[9], 0x36));
}
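
/*
 * The immediates 0x01, 0x02, ..., 0x80, 0x1B, 0x36 are the AES round
 * constants RCON[1..10]: successive powers of x in GF(2^8), where
 * 0x80 * x wraps to 0x1B via the field polynomial x^8 + x^4 + x^3 + x + 1.
 */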

/*
 * Key expansion, 192-bit case
 */
static void aesni_set_rk_192(__m128i *state0, __m128i *state1, __m128i xword,
                             unsigned char *rk)
{
    /*
     * Finish generating the next 6 quarter-keys.
     *
     * On entry state0 is r3:r2:r1:r0, state1 is stuff:stuff:r5:r4
     * and xword is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON
     * (obtained with AESKEYGENASSIST).
     *
     * On exit, state0 is r9:r8:r7:r6 and state1 is stuff:stuff:r11:r10
     * and those are written to the round key buffer.
     */
    xword = _mm_shuffle_epi32(xword, 0x55);  // X:X:X:X
    xword = _mm_xor_si128(xword, *state0);   // X+r3:X+r2:X+r1:X+r0
    *state0 = _mm_slli_si128(*state0, 4);    // r2:r1:r0:0
    xword = _mm_xor_si128(xword, *state0);   // X+r3+r2:X+r2+r1:X+r1+r0:X+r0
    *state0 = _mm_slli_si128(*state0, 4);    // r1:r0:0:0
    xword = _mm_xor_si128(xword, *state0);   // X+r3+r2+r1:X+r2+r1+r0:X+r1+r0:X+r0
    *state0 = _mm_slli_si128(*state0, 4);    // r0:0:0:0
    xword = _mm_xor_si128(xword, *state0);   // X+r3+r2+r1+r0:X+r2+r1+r0:X+r1+r0:X+r0
    *state0 = xword;                         // = r9:r8:r7:r6

    xword = _mm_shuffle_epi32(xword, 0xff);  // r9:r9:r9:r9
    xword = _mm_xor_si128(xword, *state1);   // stuff:stuff:r9+r5:r9+r4
    *state1 = _mm_slli_si128(*state1, 4);    // stuff:r5:r4:0
    xword = _mm_xor_si128(xword, *state1);   // stuff:stuff:r9+r5+r4:r9+r4
    *state1 = xword;                         // = stuff:stuff:r11:r10

    /* Store state0 and the low half of state1 into rk, which is conceptually
     * an array of 24-byte elements. Since 24 is not a multiple of 16,
     * rk is not necessarily aligned so just `*rk = *state0` doesn't work. */
    memcpy(rk, state0, 16);
    memcpy(rk + 16, state1, 8);
}

static void aesni_setkey_enc_192(unsigned char *rk,
                                 const unsigned char *key)
{
    /* First round: use original key */
    memcpy(rk, key, 24);
    /* aes.c guarantees that rk is aligned on a 16-byte boundary. */
    __m128i state0 = ((__m128i *) rk)[0];
    __m128i state1 = _mm_loadl_epi64(((__m128i *) rk) + 1);

    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x01), rk + 24 * 1);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x02), rk + 24 * 2);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x04), rk + 24 * 3);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x08), rk + 24 * 4);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x10), rk + 24 * 5);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x20), rk + 24 * 6);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x40), rk + 24 * 7);
    aesni_set_rk_192(&state0, &state1, _mm_aeskeygenassist_si128(state1, 0x80), rk + 24 * 8);
}
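
/*
 * Arithmetic check: the 6 words (24 bytes) of the original key plus
 * 8 calls emitting 6 words each give 6 + 8 * 6 = 54 words, covering the
 * 4 * (12 + 1) = 52 words that the 12-round AES-192 schedule requires.
 */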

/*
 * Key expansion, 256-bit case
 */
static void aesni_set_rk_256(__m128i state0, __m128i state1, __m128i xword,
                             __m128i *rk0, __m128i *rk1)
{
    /*
     * Finish generating the next two round keys.
     *
     * On entry state0 is r3:r2:r1:r0, state1 is r7:r6:r5:r4 and
     * xword is X:stuff:stuff:stuff with X = rot( sub( r7 ) ) ^ RCON
     * (obtained with AESKEYGENASSIST).
     *
     * On exit, *rk0 is r11:r10:r9:r8 and *rk1 is r15:r14:r13:r12.
     */
    xword = _mm_shuffle_epi32(xword, 0xff);
    xword = _mm_xor_si128(xword, state0);
    state0 = _mm_slli_si128(state0, 4);
    xword = _mm_xor_si128(xword, state0);
    state0 = _mm_slli_si128(state0, 4);
    xword = _mm_xor_si128(xword, state0);
    state0 = _mm_slli_si128(state0, 4);
    state0 = _mm_xor_si128(state0, xword);
    *rk0 = state0;

    /* Set xword to stuff:Y:stuff:stuff with Y = subword( r11 )
     * and proceed to generate the next round key from there. */
    xword = _mm_aeskeygenassist_si128(state0, 0x00);
    xword = _mm_shuffle_epi32(xword, 0xaa);
    xword = _mm_xor_si128(xword, state1);
    state1 = _mm_slli_si128(state1, 4);
    xword = _mm_xor_si128(xword, state1);
    state1 = _mm_slli_si128(state1, 4);
    xword = _mm_xor_si128(xword, state1);
    state1 = _mm_slli_si128(state1, 4);
    state1 = _mm_xor_si128(state1, xword);
    *rk1 = state1;
}

static void aesni_setkey_enc_256(unsigned char *rk_bytes,
                                 const unsigned char *key)
{
    __m128i *rk = (__m128i *) rk_bytes;

    memcpy(&rk[0], key, 16);
    memcpy(&rk[1], key + 16, 16);

    /*
     * Main "loop" - generating one more key than necessary,
     * see definition of mbedtls_aes_context.buf
     */
    aesni_set_rk_256(rk[0], rk[1], _mm_aeskeygenassist_si128(rk[1], 0x01), &rk[2], &rk[3]);
    aesni_set_rk_256(rk[2], rk[3], _mm_aeskeygenassist_si128(rk[3], 0x02), &rk[4], &rk[5]);
    aesni_set_rk_256(rk[4], rk[5], _mm_aeskeygenassist_si128(rk[5], 0x04), &rk[6], &rk[7]);
    aesni_set_rk_256(rk[6], rk[7], _mm_aeskeygenassist_si128(rk[7], 0x08), &rk[8], &rk[9]);
    aesni_set_rk_256(rk[8], rk[9], _mm_aeskeygenassist_si128(rk[9], 0x10), &rk[10], &rk[11]);
    aesni_set_rk_256(rk[10], rk[11], _mm_aeskeygenassist_si128(rk[11], 0x20), &rk[12], &rk[13]);
    aesni_set_rk_256(rk[12], rk[13], _mm_aeskeygenassist_si128(rk[13], 0x40), &rk[14], &rk[15]);
}
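
/*
 * Arithmetic check: 2 initial blocks plus 7 calls producing 2 blocks each
 * yield rk[0..15], i.e. 16 round keys where the 14-round AES-256 schedule
 * uses 15; hence the "one more key than necessary" note above.
 */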

#else /* MBEDTLS_AESNI_HAVE_CODE == 1 */

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#warning \
    "MBEDTLS_AESNI_C is known to cause spurious error reports with some memory sanitizers as they do not understand the assembly code."
#endif
#endif

/*
 * Binutils needs to be at least 2.19 to support AES-NI instructions.
 * Unfortunately, as of this writing (2014-04), many users still have an
 * older version. Emit the opcode bytes directly in order to support "old"
 * versions of gas.
 *
 * Opcodes from the Intel architecture reference manual, vol. 3.
 * We always use registers, so we don't need prefixes for memory operands.
 * Operand macros are in gas order (src, dst) as opposed to Intel order
 * (dst, src) in order to blend better into the surrounding assembly code.
 */
#define AESDEC(regs)     ".byte 0x66,0x0F,0x38,0xDE," regs "\n\t"
#define AESDECLAST(regs) ".byte 0x66,0x0F,0x38,0xDF," regs "\n\t"
#define AESENC(regs)     ".byte 0x66,0x0F,0x38,0xDC," regs "\n\t"
#define AESENCLAST(regs) ".byte 0x66,0x0F,0x38,0xDD," regs "\n\t"
#define AESIMC(regs)     ".byte 0x66,0x0F,0x38,0xDB," regs "\n\t"
#define AESKEYGENA(regs, imm) ".byte 0x66,0x0F,0x3A,0xDF," regs "," imm "\n\t"
#define PCLMULQDQ(regs, imm)  ".byte 0x66,0x0F,0x3A,0x44," regs "," imm "\n\t"

#define xmm0_xmm0 "0xC0"
#define xmm0_xmm1 "0xC8"
#define xmm0_xmm2 "0xD0"
#define xmm0_xmm3 "0xD8"
#define xmm0_xmm4 "0xE0"
#define xmm1_xmm0 "0xC1"
#define xmm1_xmm2 "0xD1"
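
/*
 * Worked example of the encoding: AESENC(xmm1_xmm0) expands to
 * ".byte 0x66,0x0F,0x38,0xDC,0xC1", the encoding of "aesenc %xmm1, %xmm0"
 * (AT&T operand order). The last byte is the ModRM byte:
 * 0xC1 = 11 000 001b, i.e. register-direct mode with reg = xmm0
 * (destination) and r/m = xmm1 (source).
 */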

/*
 * AES-NI AES-ECB block en(de)cryption
 */
int mbedtls_aesni_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    asm ("movdqu (%3), %%xmm0 \n\t" // load input
         "movdqu (%1), %%xmm1 \n\t" // load round key 0
         "pxor %%xmm1, %%xmm0 \n\t" // round 0
         "add $16, %1 \n\t"         // point to next round key
         "subl $1, %0 \n\t"         // normal rounds = nr - 1
         "test %2, %2 \n\t"         // mode?
         "jz 2f \n\t"               // 0 = decrypt

         "1: \n\t"                  // encryption loop
         "movdqu (%1), %%xmm1 \n\t" // load round key
         AESENC(xmm1_xmm0)          // do round
         "add $16, %1 \n\t"         // point to next round key
         "subl $1, %0 \n\t"         // loop
         "jnz 1b \n\t"
         "movdqu (%1), %%xmm1 \n\t" // load round key
         AESENCLAST(xmm1_xmm0)      // last round
         "jmp 3f \n\t"

         "2: \n\t"                  // decryption loop
         "movdqu (%1), %%xmm1 \n\t"
         AESDEC(xmm1_xmm0)          // do round
         "add $16, %1 \n\t"
         "subl $1, %0 \n\t"
         "jnz 2b \n\t"
         "movdqu (%1), %%xmm1 \n\t" // load round key
         AESDECLAST(xmm1_xmm0)      // last round

         "3: \n\t"
         "movdqu %%xmm0, (%4) \n\t" // export output
         :
         : "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
         : "memory", "cc", "xmm0", "xmm1");

    return 0;
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
 */
void mbedtls_aesni_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    unsigned char aa[16], bb[16], cc[16];
    size_t i;

    /* The inputs are in big-endian order, so byte-reverse them */
    for (i = 0; i < 16; i++) {
        aa[i] = a[15 - i];
        bb[i] = b[15 - i];
    }

    asm ("movdqu (%0), %%xmm0 \n\t"   // a1:a0
         "movdqu (%1), %%xmm1 \n\t"   // b1:b0

         /*
          * Carry-less multiplication xmm2:xmm1 = xmm0 * xmm1
          * using [CLMUL-WP] algorithm 1 (p. 12).
          */
         "movdqa %%xmm1, %%xmm2 \n\t" // copy of b1:b0
         "movdqa %%xmm1, %%xmm3 \n\t" // same
         "movdqa %%xmm1, %%xmm4 \n\t" // same
         PCLMULQDQ(xmm0_xmm1, "0x00") // a0*b0 = c1:c0
         PCLMULQDQ(xmm0_xmm2, "0x11") // a1*b1 = d1:d0
         PCLMULQDQ(xmm0_xmm3, "0x10") // a0*b1 = e1:e0
         PCLMULQDQ(xmm0_xmm4, "0x01") // a1*b0 = f1:f0
         "pxor %%xmm3, %%xmm4 \n\t"   // e1+f1:e0+f0
         "movdqa %%xmm4, %%xmm3 \n\t" // same
         "psrldq $8, %%xmm4 \n\t"     // 0:e1+f1
         "pslldq $8, %%xmm3 \n\t"     // e0+f0:0
         "pxor %%xmm4, %%xmm2 \n\t"   // d1:d0+e1+f1
         "pxor %%xmm3, %%xmm1 \n\t"   // c1+e0+f0:c0

         /*
          * Now shift the result one bit to the left,
          * taking advantage of [CLMUL-WP] eq 27 (p. 18)
          */
         "movdqa %%xmm1, %%xmm3 \n\t" // r1:r0
         "movdqa %%xmm2, %%xmm4 \n\t" // r3:r2
         "psllq $1, %%xmm1 \n\t"      // r1<<1:r0<<1
         "psllq $1, %%xmm2 \n\t"      // r3<<1:r2<<1
         "psrlq $63, %%xmm3 \n\t"     // r1>>63:r0>>63
         "psrlq $63, %%xmm4 \n\t"     // r3>>63:r2>>63
         "movdqa %%xmm3, %%xmm5 \n\t" // r1>>63:r0>>63
         "pslldq $8, %%xmm3 \n\t"     // r0>>63:0
         "pslldq $8, %%xmm4 \n\t"     // r2>>63:0
         "psrldq $8, %%xmm5 \n\t"     // 0:r1>>63
         "por %%xmm3, %%xmm1 \n\t"    // r1<<1|r0>>63:r0<<1
         "por %%xmm4, %%xmm2 \n\t"    // r3<<1|r2>>63:r2<<1
         "por %%xmm5, %%xmm2 \n\t"    // r3<<1|r2>>63:r2<<1|r1>>63

         /*
          * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1
          * using [CLMUL-WP] algorithm 5 (p. 18).
          * Currently xmm2:xmm1 holds x3:x2:x1:x0 (already shifted).
          */
         /* Step 2 (1) */
         "movdqa %%xmm1, %%xmm3 \n\t" // x1:x0
         "movdqa %%xmm1, %%xmm4 \n\t" // same
         "movdqa %%xmm1, %%xmm5 \n\t" // same
         "psllq $63, %%xmm3 \n\t"     // x1<<63:x0<<63 = stuff:a
         "psllq $62, %%xmm4 \n\t"     // x1<<62:x0<<62 = stuff:b
         "psllq $57, %%xmm5 \n\t"     // x1<<57:x0<<57 = stuff:c

         /* Step 2 (2) */
         "pxor %%xmm4, %%xmm3 \n\t"   // stuff:a+b
         "pxor %%xmm5, %%xmm3 \n\t"   // stuff:a+b+c
         "pslldq $8, %%xmm3 \n\t"     // a+b+c:0
         "pxor %%xmm3, %%xmm1 \n\t"   // x1+a+b+c:x0 = d:x0

         /* Steps 3 and 4 */
         "movdqa %%xmm1,%%xmm0 \n\t"  // d:x0
         "movdqa %%xmm1,%%xmm4 \n\t"  // same
         "movdqa %%xmm1,%%xmm5 \n\t"  // same
         "psrlq $1, %%xmm0 \n\t"      // e1:x0>>1 = e1:e0'
         "psrlq $2, %%xmm4 \n\t"      // f1:x0>>2 = f1:f0'
         "psrlq $7, %%xmm5 \n\t"      // g1:x0>>7 = g1:g0'
         "pxor %%xmm4, %%xmm0 \n\t"   // e1+f1:e0'+f0'
         "pxor %%xmm5, %%xmm0 \n\t"   // e1+f1+g1:e0'+f0'+g0'
         // e0'+f0'+g0' is almost e0+f0+g0, except for some missing
         // bits carried from d. Now get those bits back in.
         "movdqa %%xmm1,%%xmm3 \n\t"  // d:x0
         "movdqa %%xmm1,%%xmm4 \n\t"  // same
         "movdqa %%xmm1,%%xmm5 \n\t"  // same
         "psllq $63, %%xmm3 \n\t"     // d<<63:stuff
         "psllq $62, %%xmm4 \n\t"     // d<<62:stuff
         "psllq $57, %%xmm5 \n\t"     // d<<57:stuff
         "pxor %%xmm4, %%xmm3 \n\t"   // d<<63+d<<62:stuff
         "pxor %%xmm5, %%xmm3 \n\t"   // missing bits of d:stuff
         "psrldq $8, %%xmm3 \n\t"     // 0:missing bits of d
         "pxor %%xmm3, %%xmm0 \n\t"   // e1+f1+g1:e0+f0+g0
         "pxor %%xmm1, %%xmm0 \n\t"   // h1:h0
         "pxor %%xmm2, %%xmm0 \n\t"   // x3+h1:x2+h0

         "movdqu %%xmm0, (%2) \n\t"   // done
         :
         : "r" (aa), "r" (bb), "r" (cc)
         : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");

    /* Now byte-reverse the outputs */
    for (i = 0; i < 16; i++) {
        c[i] = cc[15 - i];
    }
}

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesni_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey, int nr)
{
    unsigned char *ik = invkey;
    const unsigned char *fk = fwdkey + 16 * nr;

    memcpy(ik, fk, 16);

    for (fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16) {
        asm ("movdqu (%0), %%xmm0 \n\t"
             AESIMC(xmm0_xmm0)
             "movdqu %%xmm0, (%1) \n\t"
             :
             : "r" (fk), "r" (ik)
             : "memory", "xmm0");
    }

    memcpy(ik, fk, 16);
}

/*
 * Key expansion, 128-bit case
 */
static void aesni_setkey_enc_128(unsigned char *rk,
                                 const unsigned char *key)
{
    asm ("movdqu (%1), %%xmm0 \n\t" // copy the original key
         "movdqu %%xmm0, (%0) \n\t" // as round key 0
         "jmp 2f \n\t"              // skip auxiliary routine

         /*
          * Finish generating the next round key.
          *
          * On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff
          * with X = rot( sub( r3 ) ) ^ RCON.
          *
          * On exit, xmm0 is r7:r6:r5:r4
          * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3
          * and those are written to the round key buffer.
          */
         "1: \n\t"
         "pshufd $0xff, %%xmm1, %%xmm1 \n\t" // X:X:X:X
         "pxor %%xmm0, %%xmm1 \n\t"          // X+r3:X+r2:X+r1:r4
         "pslldq $4, %%xmm0 \n\t"            // r2:r1:r0:0
         "pxor %%xmm0, %%xmm1 \n\t"          // X+r3+r2:X+r2+r1:r5:r4
         "pslldq $4, %%xmm0 \n\t"            // etc
         "pxor %%xmm0, %%xmm1 \n\t"
         "pslldq $4, %%xmm0 \n\t"
         "pxor %%xmm1, %%xmm0 \n\t"          // update xmm0 for next time!
         "add $16, %0 \n\t"                  // point to next round key
         "movdqu %%xmm0, (%0) \n\t"          // write it
         "ret \n\t"

         /* Main "loop" */
         "2: \n\t"
         AESKEYGENA(xmm0_xmm1, "0x01") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x02") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x04") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x08") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x10") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x20") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x40") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x80") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x1B") "call 1b \n\t"
         AESKEYGENA(xmm0_xmm1, "0x36") "call 1b \n\t"
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0");
}

/*
 * Key expansion, 192-bit case
 */
static void aesni_setkey_enc_192(unsigned char *rk,
                                 const unsigned char *key)
{
    asm ("movdqu (%1), %%xmm0 \n\t" // copy original round key
         "movdqu %%xmm0, (%0) \n\t"
         "add $16, %0 \n\t"
         "movq 16(%1), %%xmm1 \n\t"
         "movq %%xmm1, (%0) \n\t"
         "add $8, %0 \n\t"
         "jmp 2f \n\t"              // skip auxiliary routine

         /*
          * Finish generating the next 6 quarter-keys.
          *
          * On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4
          * and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON.
          *
          * On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10
          * and those are written to the round key buffer.
          */
         "1: \n\t"
         "pshufd $0x55, %%xmm2, %%xmm2 \n\t" // X:X:X:X
         "pxor %%xmm0, %%xmm2 \n\t"          // X+r3:X+r2:X+r1:X+r0
         "pslldq $4, %%xmm0 \n\t"            // etc
         "pxor %%xmm0, %%xmm2 \n\t"
         "pslldq $4, %%xmm0 \n\t"
         "pxor %%xmm0, %%xmm2 \n\t"
         "pslldq $4, %%xmm0 \n\t"
         "pxor %%xmm2, %%xmm0 \n\t"          // update xmm0 = r9:r8:r7:r6
         "movdqu %%xmm0, (%0) \n\t"
         "add $16, %0 \n\t"
         "pshufd $0xff, %%xmm0, %%xmm2 \n\t" // r9:r9:r9:r9
         "pxor %%xmm1, %%xmm2 \n\t"          // stuff:stuff:r9+r5:r10
         "pslldq $4, %%xmm1 \n\t"            // stuff:stuff:r4:0
         "pxor %%xmm2, %%xmm1 \n\t"          // xmm1 = stuff:stuff:r11:r10
         "movq %%xmm1, (%0) \n\t"
         "add $8, %0 \n\t"
         "ret \n\t"

         "2: \n\t"
         AESKEYGENA(xmm1_xmm2, "0x01") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x02") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x04") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x08") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x10") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x20") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x40") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x80") "call 1b \n\t"
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0");
}

/*
 * Key expansion, 256-bit case
 */
static void aesni_setkey_enc_256(unsigned char *rk,
                                 const unsigned char *key)
{
    asm ("movdqu (%1), %%xmm0 \n\t"
         "movdqu %%xmm0, (%0) \n\t"
         "add $16, %0 \n\t"
         "movdqu 16(%1), %%xmm1 \n\t"
         "movdqu %%xmm1, (%0) \n\t"
         "jmp 2f \n\t" // skip auxiliary routine

         /*
          * Finish generating the next two round keys.
          *
          * On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and
          * xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 ) ) ^ RCON.
          *
          * On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12
          * and those have been written to the output buffer.
          */
         "1: \n\t"
         "pshufd $0xff, %%xmm2, %%xmm2 \n\t"
         "pxor %%xmm0, %%xmm2 \n\t"
         "pslldq $4, %%xmm0 \n\t"
         "pxor %%xmm0, %%xmm2 \n\t"
         "pslldq $4, %%xmm0 \n\t"
         "pxor %%xmm0, %%xmm2 \n\t"
         "pslldq $4, %%xmm0 \n\t"
         "pxor %%xmm2, %%xmm0 \n\t"
         "add $16, %0 \n\t"
         "movdqu %%xmm0, (%0) \n\t"

         /* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 )
          * and proceed to generate the next round key from there. */
         AESKEYGENA(xmm0_xmm2, "0x00")
         "pshufd $0xaa, %%xmm2, %%xmm2 \n\t"
         "pxor %%xmm1, %%xmm2 \n\t"
         "pslldq $4, %%xmm1 \n\t"
         "pxor %%xmm1, %%xmm2 \n\t"
         "pslldq $4, %%xmm1 \n\t"
         "pxor %%xmm1, %%xmm2 \n\t"
         "pslldq $4, %%xmm1 \n\t"
         "pxor %%xmm2, %%xmm1 \n\t"
         "add $16, %0 \n\t"
         "movdqu %%xmm1, (%0) \n\t"
         "ret \n\t"

         /*
          * Main "loop" - generating one more key than necessary,
          * see definition of mbedtls_aes_context.buf
          */
         "2: \n\t"
         AESKEYGENA(xmm1_xmm2, "0x01") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x02") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x04") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x08") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x10") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x20") "call 1b \n\t"
         AESKEYGENA(xmm1_xmm2, "0x40") "call 1b \n\t"
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0");
}

#endif /* MBEDTLS_AESNI_HAVE_CODE */

/*
 * Key expansion, wrapper
 */
int mbedtls_aesni_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128: aesni_setkey_enc_128(rk, key); break;
        case 192: aesni_setkey_enc_192(rk, key); break;
        case 256: aesni_setkey_enc_256(rk, key); break;
        default: return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}
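
/*
 * Usage sketch (illustrative; the exact call site lives in aes.c, not
 * here): mbedtls_aes_setkey_enc() dispatches to this wrapper roughly as
 *
 *     if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) {
 *         return mbedtls_aesni_setkey_enc((unsigned char *) ctx->rk, key, keybits);
 *     }
 */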

#endif /* MBEDTLS_AESNI_HAVE_CODE */

#endif /* MBEDTLS_AESNI_C */