/*
 * Copyright (c) 2018, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
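// schoolbookAAD loads the i-th stored power of H from htbl and XOR-accumulates the four
// pclmulqdq partial products of (data x H^i) into the caller's accumulators:
//   tmp0 ^= lo(data) * lo(H^i)   (imm 0x00)
//   tmp1 ^= hi(data) * hi(H^i)   (imm 0x11)
//   tmp2 ^= lo * hi and hi * lo  (imms 0x01 and 0x10), the middle terms
// The caller later folds tmp2 into tmp0/tmp1 and performs a single reduction for the
// whole batch (see avx_ghash below).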
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
                                   XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
    movdqu(xmm15, Address(htbl, i * 16));
    vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
    vpclmulldq(tmp3, data, xmm15);    // 0x00
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
    vpclmulhdq(tmp3, data, xmm15);    // 0x11
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
    vpclmullqhqdq(tmp3, data, xmm15); // 0x10
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Multiply two 128 bit numbers resulting in a 256 bit value
// Result of the multiplication followed by reduction stored in state
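// Note: gfmul() ends in ret(0), so it is reached through a call() to a bound GFMUL label
// (see generateHtbl_eight_blocks and avx_ghash) rather than by falling through. Besides
// writing state, it clobbers xmm4 through xmm11.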
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    const XMMRegister tmp4 = xmm7;

    vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
    vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
    vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
    vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
    vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
    // Follows the Shift-XOR reduction technique described in Gueron-Kounavis, May 2010.
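    // Background (a summary, not part of the original comments): GHASH multiplies in
    // GF(2^128) modulo g(x) = x^128 + x^7 + x^2 + x + 1 on bit-reflected operands. The
    // 256-bit product sits in tmp4:tmp1 (high:low); because of the bit reflection, the
    // low half carries the highest-degree coefficients and is folded into the high half.
    // Working per 32-bit lane, the left shifts by 31, 30, 25 (= 32-1, 32-2, 32-7) and the
    // right shifts by 1, 2, 7 below correspond to the x, x^2 and x^7 terms of g(x).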
    // First phase of the reduction
    //
    vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift by 31
    vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift by 30
    vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift by 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    //
    // Second phase of the reduction
    //
    vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);   // packed right shift by 1
    vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);  // packed right shift by 2
    vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);  // packed right shift by 7
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
    vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
    vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
    ret(0);
}

// This method takes the expanded subkey as input and generates the first power of the
// subkey H (H * 2), stored at htbl + 1 * 16. This power of H is used in the reduction
// step of single-block GHASH.
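// In outline (an interpretation of the code below, not an upstream comment): H is loaded,
// byte-reflected with the long swap mask, and then doubled in the field, i.e.
// H * 2 = H * x mod g(x). The 128-bit left shift is assembled from per-dword shifts with
// carry propagation, and the reduction polynomial, pre-masked into xmm5, is XORed in; the
// mask makes the reduction take effect only when the shifted-out bit requires it, so the
// conditional reduction is branchless.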
void MacroAssembler::generateHtbl_one_block(Register htbl) {
    const XMMRegister t = xmm13;

    // load the original subkey hash
    movdqu(t, Address(htbl, 0));
    // shuffle using long swap mask
    movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    vpshufb(t, t, xmm10, Assembler::AVX_128bit);

    // Compute H' = GFMUL(H, 2)
    vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
    movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
    vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
    movl(rax, 0xff00);
    movdl(xmm4, rax);
    vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
    movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
    vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
    vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
    vpslld(xmm4, t, 1, Assembler::AVX_128bit);
    vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
    vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds H << 1, i.e. H * 2 before reduction

    // xor the masked reduction polynomial in xmm5 into H << 1
    vpxor(t, t, xmm5, Assembler::AVX_128bit);
    movdqu(Address(htbl, 1 * 16), t); // H * 2

    ret(0);
}

// This method takes the expanded subkey as input and generates the remaining powers of the
// subkey H (H^2 through H^8). These powers of H are used in the reduction step of
// eight-block GHASH.
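// Resulting htbl layout (16-byte entries): slot 1 already holds H * 2 (written by
// generateHtbl_one_block); this routine fills slots 2 through 8 with H^2 * 2 through
// H^8 * 2 (as noted on the stores below) by repeatedly calling gfmul(tmp0, t), where
// tmp0 keeps the original H and t accumulates the next power.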
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
    const XMMRegister t = xmm13;
    const XMMRegister tmp0 = xmm1;
    Label GFMUL;

    movdqu(t, Address(htbl, 1 * 16));
    movdqu(tmp0, t);

    // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 2 * 16), t); // H ^ 2 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 3 * 16), t); // H ^ 3 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 4 * 16), t); // H ^ 4 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 5 * 16), t); // H ^ 5 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 6 * 16), t); // H ^ 6 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 7 * 16), t); // H ^ 7 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 8 * 16), t); // H ^ 8 * 2
    ret(0);

    bind(GFMUL);
    gfmul(tmp0, t);
}

// Multi-block and single-block GHASH computation using the Shift-XOR reduction technique
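// Outline: the table of powers of H is generated lazily on first use; batches of eight
// blocks are multiplied against H^8 * 2 .. H * 2 and folded with one shared reduction
// (schoolbookAAD + BLOCK8_REDUCTION); any remaining blocks go through the one-block path
// using gfmul(). The subroutines bound at the bottom (GFMUL, GENERATE_HTBL_1_BLK,
// GENERATE_HTBL_8_BLKS) end in ret(0) and are reached via call().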
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

    // temporary variables to hold input data and input state
    const XMMRegister data = xmm1;
    const XMMRegister state = xmm0;
    // temporary variables to hold intermediate results
    const XMMRegister tmp0 = xmm3;
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    // temporary variables to hold byte and long swap masks
    const XMMRegister bswap_mask = xmm2;
    const XMMRegister lswap_mask = xmm14;

    Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
          ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

    testptr(blocks, blocks);
    jcc(Assembler::zero, EXIT_GHASH);
    // Check if the Hashtable (1*16) has already been generated.
    // For anything less than 8 blocks, we generate only the first power of H.
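    // A non-zero entry at htbl + 1 * 16 is taken to mean the table was already generated;
    // this presumes the caller passes a zero-initialized htbl buffer on first use (an
    // assumption about the caller, not something enforced here).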
    movdqu(tmp2, Address(htbl, 1 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, BEGIN_PROCESS);
    call(GENERATE_HTBL_1_BLK, relocInfo::none);

    // Shuffle the input state
    bind(BEGIN_PROCESS);
    movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    movdqu(state, Address(input_state, 0));
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    // If we have 8 blocks or more data, then generate remaining powers of H
    movdqu(tmp2, Address(htbl, 8 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, PROCESS_8_BLOCKS);
    call(GENERATE_HTBL_8_BLKS, relocInfo::none);

    // Do 8 multiplies followed by a reduction, processing 8 blocks of data at a time.
    // Each block = 16 bytes.
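    // The eight products are accumulated and reduced only once, using the identity
    //   (((state ^ B0)*H ^ B1)*H ^ ... ^ B7)*H = (state ^ B0)*H^8 ^ B1*H^7 ^ ... ^ B7*H
    // which is why data block #7 is multiplied by the first table entry (H * 2) and data
    // block #0, xored with the incoming state, by the last one (H^8 * 2).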
    bind(PROCESS_8_BLOCKS);
    subl(blocks, 8);
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
    movdqu(data, Address(input_data, 16 * 7));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Load H * 2 from htbl + 1 * 16; the table of computed powers of H starts at that offset.
    movdqu(xmm15, Address(htbl, 1 * 16));
    // Perform carry-less multiplication of (H * 2, data block #7)
    vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
    vpclmulldq(tmp0, data, xmm15);    // a0 * b0
    vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
    vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    movdqu(data, Address(input_data, 16 * 6));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^2 * 2, data block #6)
    schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

    movdqu(data, Address(input_data, 16 * 5));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^3 * 2, data block #5)
    schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 4));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^4 * 2, data block #4)
    schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 3));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^5 * 2, data block #3)
    schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 2));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^6 * 2, data block #2)
    schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 1));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^7 * 2, data block #1)
    schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 0));
    // xor data block #0 with the input state before performing carry-less multiplication
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(data, data, state, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^8 * 2, data block #0)
    schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0 and tmp1 now contain the aggregated
    vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // results of the multiplications

    // We now have the two accumulated 128-bit halves of the product in tmp1:tmp0,
    // with the high 128 bits in tmp1 and the low 128 bits in tmp0.
    // Follows the Shift-XOR reduction technique described in Gueron-Kounavis, May 2010.
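    // The shift amounts below have the same origin as in gfmul(): 31, 30, 25 and 1, 2, 7
    // come from the x, x^2 and x^7 terms of the reduction polynomial, applied per 32-bit lane.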
    bind(BLOCK8_REDUCTION);
    // First phase of the reduction
    vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift by 31
    vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift by 30
    vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

    vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction is complete
    // Second phase of the reduction
    vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);   // packed right shift by 1
    vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit);  // packed right shift by 2
    vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);   // packed right shift by 7
    // xor the shifted versions
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
    // Final result is in state
    vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

    lea(input_data, Address(input_data, 16 * 8));
    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    jmp(PROCESS_8_BLOCKS);

    // Since this is a one-block operation we use only H * 2, i.e. the first power of H
    bind(ONE_BLK_INIT);
    movdqu(tmp0, Address(htbl, 1 * 16));
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    // Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
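    // Each iteration implements the GHASH recurrence state = (state ^ block) * H, with the
    // multiplication and reduction done by gfmul() against the H * 2 loaded into tmp0 above.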
    bind(PROCESS_1_BLOCK);
    cmpl(blocks, 0);
    jcc(Assembler::equal, SAVE_STATE);
    subl(blocks, 1);
    movdqu(data, Address(input_data, 0));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(state, state, data, Assembler::AVX_128bit);
    // gfmul(H * 2, state)
    call(GFMUL, relocInfo::none);
    addptr(input_data, 16);
    jmp(PROCESS_1_BLOCK);

    bind(SAVE_STATE);
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
    movdqu(Address(input_state, 0), state);
    jmp(EXIT_GHASH);

    bind(GFMUL);
    gfmul(tmp0, state);

    bind(GENERATE_HTBL_1_BLK);
    generateHtbl_one_block(htbl);

    bind(GENERATE_HTBL_8_BLKS);
    generateHtbl_eight_blocks(htbl);

    bind(EXIT_GHASH);
    // zero out xmm registers that held the hash state, data, and Htbl entries
    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
#endif // _LP64