/*
 * Copyright (c) 2018, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
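// schoolbookAAD loads the i-th stored power of H from htbl and XOR-accumulates the four
// pclmulqdq partial products of (data x H^i) into the caller's accumulators:
//   tmp0 ^= lo(data) * lo(H^i)   (imm 0x00)
//   tmp1 ^= hi(data) * hi(H^i)   (imm 0x11)
//   tmp2 ^= lo * hi and hi * lo  (imms 0x01 and 0x10), the middle terms
// The caller later folds tmp2 into tmp0/tmp1 and performs a single reduction for the
// whole batch (see avx_ghash below).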
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
                                   XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
    movdqu(xmm15, Address(htbl, i * 16));
    vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
    vpclmulldq(tmp3, data, xmm15);    // 0x00
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
    vpclmulhdq(tmp3, data, xmm15);    // 0x11
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
    vpclmullqhqdq(tmp3, data, xmm15); // 0x10
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Multiply two 128 bit numbers resulting in a 256 bit value
// Result of the multiplication followed by reduction stored in state
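// Note: gfmul() ends in ret(0), so it is reached through a call() to a bound GFMUL label
// (see generateHtbl_eight_blocks and avx_ghash) rather than by falling through. Besides
// writing state, it clobbers xmm4 through xmm11.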
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    const XMMRegister tmp4 = xmm7;

    vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
    vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
    vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
    vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
    vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
    // Follows the Shift-XOR reduction technique described in Gueron-Kounavis, May 2010.
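    // Background (a summary, not part of the original comments): GHASH multiplies in
    // GF(2^128) modulo g(x) = x^128 + x^7 + x^2 + x + 1 on bit-reflected operands. The
    // 256-bit product sits in tmp4:tmp1 (high:low); because of the bit reflection, the
    // low half carries the highest-degree coefficients and is folded into the high half.
    // Working per 32-bit lane, the left shifts by 31, 30, 25 (= 32-1, 32-2, 32-7) and the
    // right shifts by 1, 2, 7 below correspond to the x, x^2 and x^7 terms of g(x).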
    // First phase of the reduction
    //
    vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift by 31
    vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift by 30
    vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift by 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    //
    // Second phase of the reduction
    //
    vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);   // packed right shift by 1
    vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);  // packed right shift by 2
    vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);  // packed right shift by 7
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
    vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
    vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
    ret(0);
}

// This method takes the expanded subkey as input and generates the first power of the
// subkey H (H * 2), stored at htbl + 1 * 16. This power of H is used in the reduction
// step of single-block GHASH.
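// In outline (an interpretation of the code below, not an upstream comment): H is loaded,
// byte-reflected with the long swap mask, and then doubled in the field, i.e.
// H * 2 = H * x mod g(x). The 128-bit left shift is assembled from per-dword shifts with
// carry propagation, and the reduction polynomial, pre-masked into xmm5, is XORed in; the
// mask makes the reduction take effect only when the shifted-out bit requires it, so the
// conditional reduction is branchless.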
void MacroAssembler::generateHtbl_one_block(Register htbl) {
    const XMMRegister t = xmm13;

    // load the original subkey hash
    movdqu(t, Address(htbl, 0));
    // shuffle using long swap mask
    movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    vpshufb(t, t, xmm10, Assembler::AVX_128bit);

    // Compute H' = GFMUL(H, 2)
    vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
    movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
    vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
    movl(rax, 0xff00);
    movdl(xmm4, rax);
    vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
    movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
    vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
    vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
    vpslld(xmm4, t, 1, Assembler::AVX_128bit);
    vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
    vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds H << 1, i.e. H * 2 before reduction

    // xor the masked reduction polynomial in xmm5 into H << 1
    vpxor(t, t, xmm5, Assembler::AVX_128bit);
    movdqu(Address(htbl, 1 * 16), t); // H * 2

    ret(0);
}

// This method takes the expanded subkey as input and generates the remaining powers of the
// subkey H (H^2 through H^8). These powers of H are used in the reduction step of
// eight-block GHASH.
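// Resulting htbl layout (16-byte entries): slot 1 already holds H * 2 (written by
// generateHtbl_one_block); this routine fills slots 2 through 8 with H^2 * 2 through
// H^8 * 2 (as noted on the stores below) by repeatedly calling gfmul(tmp0, t), where
// tmp0 keeps the original H and t accumulates the next power.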
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
    const XMMRegister t = xmm13;
    const XMMRegister tmp0 = xmm1;
    Label GFMUL;

    movdqu(t, Address(htbl, 1 * 16));
    movdqu(tmp0, t);

    // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 2 * 16), t); // H ^ 2 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 3 * 16), t); // H ^ 3 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 4 * 16), t); // H ^ 4 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 5 * 16), t); // H ^ 5 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 6 * 16), t); // H ^ 6 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 7 * 16), t); // H ^ 7 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 8 * 16), t); // H ^ 8 * 2
    ret(0);

    bind(GFMUL);
    gfmul(tmp0, t);
}

// Multi-block and single-block GHASH computation using the Shift-XOR reduction technique
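// Outline: the table of powers of H is generated lazily on first use; batches of eight
// blocks are multiplied against H^8 * 2 .. H * 2 and folded with one shared reduction
// (schoolbookAAD + BLOCK8_REDUCTION); any remaining blocks go through the one-block path
// using gfmul(). The subroutines bound at the bottom (GFMUL, GENERATE_HTBL_1_BLK,
// GENERATE_HTBL_8_BLKS) end in ret(0) and are reached via call().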
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

    // temporary variables to hold input data and input state
    const XMMRegister data = xmm1;
    const XMMRegister state = xmm0;
    // temporary variables to hold intermediate results
    const XMMRegister tmp0 = xmm3;
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    // temporary variables to hold byte and long swap masks
    const XMMRegister bswap_mask = xmm2;
    const XMMRegister lswap_mask = xmm14;

    Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
          ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

    testptr(blocks, blocks);
    jcc(Assembler::zero, EXIT_GHASH);
    // Check if the Hashtable (1*16) has already been generated.
    // For anything less than 8 blocks, we generate only the first power of H.
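    // A non-zero entry at htbl + 1 * 16 is taken to mean the table was already generated;
    // this presumes the caller passes a zero-initialized htbl buffer on first use (an
    // assumption about the caller, not something enforced here).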
    movdqu(tmp2, Address(htbl, 1 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, BEGIN_PROCESS);
    call(GENERATE_HTBL_1_BLK, relocInfo::none);

    // Shuffle the input state
    bind(BEGIN_PROCESS);
    movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    movdqu(state, Address(input_state, 0));
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    // If we have 8 blocks or more data, then generate remaining powers of H
    movdqu(tmp2, Address(htbl, 8 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, PROCESS_8_BLOCKS);
    call(GENERATE_HTBL_8_BLKS, relocInfo::none);

    // Do 8 multiplies followed by a reduction, processing 8 blocks of data at a time.
    // Each block = 16 bytes.
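    // The eight products are accumulated and reduced only once, using the identity
    //   (((state ^ B0)*H ^ B1)*H ^ ... ^ B7)*H = (state ^ B0)*H^8 ^ B1*H^7 ^ ... ^ B7*H
    // which is why data block #7 is multiplied by the first table entry (H * 2) and data
    // block #0, xored with the incoming state, by the last one (H^8 * 2).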
    bind(PROCESS_8_BLOCKS);
    subl(blocks, 8);
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
    movdqu(data, Address(input_data, 16 * 7));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Load H * 2 from htbl + 1 * 16; the table of computed powers of H starts at that offset.
    movdqu(xmm15, Address(htbl, 1 * 16));
    // Perform carry-less multiplication of (H * 2, data block #7)
    vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
    vpclmulldq(tmp0, data, xmm15);    // a0 * b0
    vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
    vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    movdqu(data, Address(input_data, 16 * 6));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^2 * 2, data block #6)
    schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

    movdqu(data, Address(input_data, 16 * 5));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^3 * 2, data block #5)
    schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 4));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^4 * 2, data block #4)
    schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 3));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^5 * 2, data block #3)
    schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 2));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^6 * 2, data block #2)
    schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 1));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^7 * 2, data block #1)
    schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 0));
    // xor data block #0 with the input state before performing carry-less multiplication
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(data, data, state, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^8 * 2, data block #0)
    schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0 and tmp1 now contain the aggregated
    vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // results of the multiplications

    // We now have the two accumulated 128-bit halves of the product in tmp1:tmp0,
    // with the high 128 bits in tmp1 and the low 128 bits in tmp0.
    // Follows the Shift-XOR reduction technique described in Gueron-Kounavis, May 2010.
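    // The shift amounts below have the same origin as in gfmul(): 31, 30, 25 and 1, 2, 7
    // come from the x, x^2 and x^7 terms of the reduction polynomial, applied per 32-bit lane.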
    bind(BLOCK8_REDUCTION);
    // First phase of the reduction
    vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift by 31
    vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift by 30
    vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

    vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction is complete
    // Second phase of the reduction
    vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);   // packed right shift by 1
    vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit);  // packed right shift by 2
    vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);   // packed right shift by 7
    // xor the shifted versions
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
    // Final result is in state
    vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

    lea(input_data, Address(input_data, 16 * 8));
    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    jmp(PROCESS_8_BLOCKS);

    // Since this is a one-block operation we use only H * 2, i.e. the first power of H
    bind(ONE_BLK_INIT);
    movdqu(tmp0, Address(htbl, 1 * 16));
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    // Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
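    // Each iteration implements the GHASH recurrence state = (state ^ block) * H, with the
    // multiplication and reduction done by gfmul() against the H * 2 loaded into tmp0 above.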
    bind(PROCESS_1_BLOCK);
    cmpl(blocks, 0);
    jcc(Assembler::equal, SAVE_STATE);
    subl(blocks, 1);
    movdqu(data, Address(input_data, 0));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(state, state, data, Assembler::AVX_128bit);
    // gfmul(H * 2, state)
    call(GFMUL, relocInfo::none);
    addptr(input_data, 16);
    jmp(PROCESS_1_BLOCK);

    bind(SAVE_STATE);
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
    movdqu(Address(input_state, 0), state);
    jmp(EXIT_GHASH);

    bind(GFMUL);
    gfmul(tmp0, state);

    bind(GENERATE_HTBL_1_BLK);
    generateHtbl_one_block(htbl);

    bind(GENERATE_HTBL_8_BLKS);
    generateHtbl_eight_blocks(htbl);

    bind(EXIT_GHASH);
    // zero out xmm registers that held the hash state, data, and Htbl entries
    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
#endif // _LP64