1 | /* |
2 | * Copyright (c) 2018, Intel Corporation. |
3 | * |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
5 | * |
6 | * This code is free software; you can redistribute it and/or modify it |
7 | * under the terms of the GNU General Public License version 2 only, as |
8 | * published by the Free Software Foundation. |
9 | * |
10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
13 | * version 2 for more details (a copy is included in the LICENSE file that |
14 | * accompanied this code). |
15 | * |
16 | * You should have received a copy of the GNU General Public License version |
17 | * 2 along with this work; if not, write to the Free Software Foundation, |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
19 | * |
20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
21 | * or visit www.oracle.com if you need additional information or have any |
22 | * questions. |
23 | * |
24 | */ |
25 | |
26 | #include "precompiled.hpp" |
27 | #include "asm/assembler.hpp" |
28 | #include "asm/assembler.inline.hpp" |
29 | #include "runtime/stubRoutines.hpp" |
30 | #include "macroAssembler_x86.hpp" |
31 | |
32 | #ifdef _LP64 |
33 | // Multiply 128 x 128 bits, using 4 pclmulqdq operations |
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
// Emits one schoolbook carry-less multiply of data * (H^i * 2), where the
// i-th power of the hash subkey is loaded from htbl at offset i * 16.
// The four 64x64 partial products are XOR-accumulated into the caller's
// running sums: tmp0 (low halves, a0*b0), tmp1 (high halves, a1*b1) and
// tmp2 (middle terms, a0*b1 + a1*b0). tmp3 is scratch.
// Clobbers xmm15 (holds H^i) and tmp3.
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
                                   XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
  movdqu(xmm15, Address(htbl, i * 16));           // xmm15 = H^i * 2
  vpclmulhqlqdq(tmp3, data, xmm15);               // 0x01 (a0 * b1)
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // accumulate middle term
  vpclmulldq(tmp3, data, xmm15);                  // 0x00 (a0 * b0)
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // accumulate low half
  vpclmulhdq(tmp3, data, xmm15);                  // 0x11 (a1 * b1)
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // accumulate high half
  vpclmullqhqdq(tmp3, data, xmm15);               // 0x10 (a1 * b0)
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // accumulate middle term
}
46 | |
47 | // Multiply two 128 bit numbers resulting in a 256 bit value |
48 | // Result of the multiplication followed by reduction stored in state |
// Multiply two 128 bit numbers resulting in a 256 bit value
// Result of the multiplication followed by reduction stored in state
// Emitted as a callable subroutine (ends with ret(0)); it is reached via
// call(GFMUL, ...) from generateHtbl_eight_blocks and avx_ghash.
// Computes state = (state * tmp0) in GF(2^128), reducing the 256-bit
// carry-less product modulo the GHASH polynomial.
// Clobbers xmm4-xmm7 (tmp1-tmp4) and xmm8-xmm11 (reduction scratch).
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  const XMMRegister tmp4 = xmm7;

  // Schoolbook multiply: four 64x64 carry-less partial products
  vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
  vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
  vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
  vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  // Split the 128-bit middle term and fold it into the low (tmp1) and
  // high (tmp4) halves of the 256-bit product
  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
  vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
  // Follows the reduction technique mentioned in
  // Shift-XOR reduction described in Gueron-Kounavis May 2010
  // First phase of reduction
  //
  vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift << 31
  vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift << 30
  vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift << 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
  //
  // Second phase of the reduction
  //
  vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);  // packed right shift >> 1
  vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift >> 2
  vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift >> 7
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
  vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
  vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
  ret(0);
}
92 | |
93 | // This method takes the subkey after expansion as input and generates 1 * 16 power of subkey H. |
94 | // The power of H is used in reduction process for one block ghash |
// This method takes the subkey after expansion as input and generates 1 * 16 power of subkey H.
// The power of H is used in reduction process for one block ghash
// Emitted as a callable subroutine (ends with ret(0)).
// Reads the raw subkey from htbl[0], byte-reverses it, doubles it in
// GF(2^128) (H' = GFMUL(H, 2)) and stores the result at htbl[1 * 16].
// Clobbers xmm13 (t), xmm10, xmm3, xmm4, xmm5 and rax.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
  const XMMRegister t = xmm13;

  // load the original subkey hash
  movdqu(t, Address(htbl, 0));
  // shuffle using long swap mask
  movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  vpshufb(t, t, xmm10, Assembler::AVX_128bit);

  // Compute H' = GFMUL(H, 2)
  // Build a mask in xmm4 from the top bits of H via shuffles of the 0xff00
  // pattern, then AND it with the reduction polynomial (xmm5) — the
  // polynomial is applied only when the << 1 below carries out of the top
  // bit. NOTE(review): mask construction relies on the layouts of
  // ghash_shufflemask/ghash_polynomial stub constants — verify against those.
  vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
  movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
  vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
  movl(rax, 0xff00);
  movdl(xmm4, rax);
  vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
  movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
  vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
  // Shift H left by one bit across all 128 bits: carry each dword's top
  // bit (xmm3) into the next dword of the << 1 result (xmm4).
  vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
  vpslld(xmm4, t, 1, Assembler::AVX_128bit);
  vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
  vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds p(x) << 1, i.e. H * 2

  // Adding p(x)<<1 to xmm5 which holds the (masked) reduction polynomial
  vpxor(t, t, xmm5, Assembler::AVX_128bit);
  movdqu(Address(htbl, 1 * 16), t); // H * 2

  ret(0);
}
124 | |
125 | // This method takes the subkey after expansion as input and generates the remaining powers of subkey H. |
126 | // The power of H is used in reduction process for eight block ghash |
127 | void MacroAssembler::generateHtbl_eight_blocks(Register htbl) { |
128 | const XMMRegister t = xmm13; |
129 | const XMMRegister tmp0 = xmm1; |
130 | Label GFMUL; |
131 | |
132 | movdqu(t, Address(htbl, 1 * 16)); |
133 | movdqu(tmp0, t); |
134 | |
135 | // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H) |
136 | call(GFMUL, relocInfo::none); |
137 | movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2 |
138 | call(GFMUL, relocInfo::none); |
139 | movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2 |
140 | call(GFMUL, relocInfo::none); |
141 | movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2 |
142 | call(GFMUL, relocInfo::none); |
143 | movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2 |
144 | call(GFMUL, relocInfo::none); |
145 | movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2 |
146 | call(GFMUL, relocInfo::none); |
147 | movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2 |
148 | call(GFMUL, relocInfo::none); |
149 | movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2 |
150 | ret(0); |
151 | |
152 | bind(GFMUL); |
153 | gfmul(tmp0, t); |
154 | } |
155 | |
156 | // Multiblock and single block GHASH computation using Shift XOR reduction technique |
// Multiblock and single block GHASH computation using Shift XOR reduction technique
// input_state: pointer to the 128-bit hash state; read at entry, written back
//              (byte-reversed) at exit
// htbl:        table of powers of the hash subkey; slot k holds H^k * 2 and
//              is generated lazily (ptest against zero detects empty slots)
// input_data:  pointer to the message blocks (16 bytes each), advanced as
//              blocks are consumed
// blocks:      number of 16-byte blocks to process; decremented to zero
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

  // temporary variables to hold input data and input state
  const XMMRegister data = xmm1;
  const XMMRegister state = xmm0;
  // temporary variables to hold intermediate results
  const XMMRegister tmp0 = xmm3;
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  // temporary variables to hold byte and long swap masks
  const XMMRegister bswap_mask = xmm2;
  const XMMRegister lswap_mask = xmm14;

  Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
        ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

  // Nothing to do for an empty input
  testptr(blocks, blocks);
  jcc(Assembler::zero, EXIT_GHASH);

  // Check if Hashtable (1*16) has been already generated
  // For anything less than 8 blocks, we generate only the first power of H.
  movdqu(tmp2, Address(htbl, 1 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, BEGIN_PROCESS);
  call(GENERATE_HTBL_1_BLK, relocInfo::none);

  // Shuffle the input state
  bind(BEGIN_PROCESS);
  movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  movdqu(state, Address(input_state, 0));
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  // If we have 8 blocks or more data, then generate remaining powers of H
  movdqu(tmp2, Address(htbl, 8 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, PROCESS_8_BLOCKS);
  call(GENERATE_HTBL_8_BLKS, relocInfo::none);

  // Do 8 multiplies followed by a reduction processing 8 blocks of data at a time
  // Each block = 16 bytes. Blocks are combined highest-numbered first so
  // that block #k is multiplied by H^(8-k).
  bind(PROCESS_8_BLOCKS);
  subl(blocks, 8);
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
  movdqu(data, Address(input_data, 16 * 7));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Loading 1*16 as calculated powers of H required starts at that location.
  movdqu(xmm15, Address(htbl, 1 * 16));
  // Perform carryless multiplication of (H*2, data block #7); this first
  // multiply initializes the accumulators tmp0/tmp1/tmp2 that the
  // schoolbookAAD calls below XOR into.
  vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
  vpclmulldq(tmp0, data, xmm15);    // a0 * b0
  vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
  vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  movdqu(data, Address(input_data, 16 * 6));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^2 * 2, data block #6)
  schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 5));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^3 * 2, data block #5)
  schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 4));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^4 * 2, data block #4)
  schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 3));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^5 * 2, data block #3)
  schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 2));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^6 * 2, data block #2)
  schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 1));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^7 * 2, data block #1)
  schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 0));
  // xor data block#0 with input state before performing carry-less multiplication
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(data, data, state, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^8 * 2, data block #0)
  schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
  // Fold the 128-bit middle term (tmp2) into the low/high halves
  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contains aggregated results of
  vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // the multiplication operation

  // we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1
  // with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0
  // Follows the reduction technique mentioned in
  // Shift-XOR reduction described in Gueron-Kounavis May 2010
  bind(BLOCK8_REDUCTION);
  // First Phase of the reduction
  vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift << 31
  vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift << 30
  vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift << 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

  vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
  // second phase of the reduction
  vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);  // packed right shift >> 1
  vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift >> 2
  vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);  // packed right shift >> 7
  // xor the shifted versions
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
  // Final result is in state
  vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

  // Advance past the 8 consumed blocks; loop while at least 8 remain
  lea(input_data, Address(input_data, 16 * 8));
  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  jmp(PROCESS_8_BLOCKS);

  // Since this is one block operation we will only use H * 2 i.e. the first power of H
  bind(ONE_BLK_INIT);
  movdqu(tmp0, Address(htbl, 1 * 16));
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

  // Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
  bind(PROCESS_1_BLOCK);
  cmpl(blocks, 0);
  jcc(Assembler::equal, SAVE_STATE);
  subl(blocks, 1);
  movdqu(data, Address(input_data, 0));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(state, state, data, Assembler::AVX_128bit);
  // gfmul(H*2, state)
  call(GFMUL, relocInfo::none);
  addptr(input_data, 16);
  jmp(PROCESS_1_BLOCK);

  // Byte-reverse the state back to memory order and store it
  bind(SAVE_STATE);
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
  movdqu(Address(input_state, 0), state);
  jmp(EXIT_GHASH);

  // Out-of-line subroutine bodies reached only via call(); each generator
  // below emits its own ret(0), so control returns to the call sites above.
  bind(GFMUL);
  gfmul(tmp0, state);

  bind(GENERATE_HTBL_1_BLK);
  generateHtbl_one_block(htbl);

  bind(GENERATE_HTBL_8_BLKS);
  generateHtbl_eight_blocks(htbl);

  bind(EXIT_GHASH);
  // Zero the xmm registers that held sensitive material: xmm0 (state),
  // xmm1 (data), xmm3 (tmp0) and xmm15 (powers of H loaded from Htbl)
  vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
  vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
  vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
  vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
324 | #endif // _LP64 |
325 | |