/*
 * Copyright (c) 2018, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
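// Each vpclmul* helper wraps vpclmulqdq with a fixed immediate: bit 0 of the
// immediate selects the low (0) or high (1) 64-bit half of the first source
// and bit 4 selects the half of the second source, so 0x00 and 0x11 yield the
// low and high products while 0x01 and 0x10 yield the two cross products.
// The partial products of data * H^i are accumulated into tmp1 (high),
// tmp2 (middle) and tmp0 (low).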
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
    XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
    movdqu(xmm15, Address(htbl, i * 16));
    vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
    vpclmulldq(tmp3, data, xmm15); // 0x00
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
    vpclmulhdq(tmp3, data, xmm15); // 0x11
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
    vpclmullqhqdq(tmp3, data, xmm15); // 0x10
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Carry-less multiply of two 128-bit operands, producing a 256-bit product.
// The product is reduced modulo the GHASH polynomial and the result is stored in state.
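// GHASH multiplication is carry-less multiplication in GF(2^128) followed by
// reduction modulo g(x) = x^128 + x^7 + x^2 + x + 1. The operands are kept in
// a bit-reflected form (see the byte- and long-swap shuffles in the callers),
// which is why the reduction below works on the low half of the product using
// packed 32-bit shifts.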
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    const XMMRegister tmp4 = xmm7;

    vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
    vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
    vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
    vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
    vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of the carry-less multiplication
    // Reduce the 256-bit product using the Shift-XOR reduction technique
    // described in Gueron-Kounavis, May 2010
    //
    // First phase of the reduction
    //
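    // The left shifts by 31, 30 and 25 (i.e. 32 - 1, 32 - 2 and 32 - 7) and the
    // right shifts by 1, 2 and 7 below correspond to the x, x^2 and x^7 terms
    // of the GHASH polynomial, applied with 32-bit packed shifts per the
    // Gueron-Kounavis method.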
    vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift << 31
    vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift << 30
    vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift << 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    //
    // Second phase of the reduction
    //
    vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);  // packed right shift >> 1
    vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift >> 2
    vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift >> 7
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
    vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
    vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
    ret(0);
}

// This method takes the expanded subkey as input and generates the first power of
// the hash subkey H (stored at offset 1 * 16 of htbl). This power of H is used in
// the reduction process for one-block GHASH.
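// Computing H * 2 (doubling in GF(2^128)) is a 128-bit left shift by one bit
// followed by a conditional xor of the reduction polynomial when the bit
// shifted out was set; the shuffle/mask sequence below builds that condition
// branchlessly.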
void MacroAssembler::generateHtbl_one_block(Register htbl) {
    const XMMRegister t = xmm13;

    // load the original subkey hash
    movdqu(t, Address(htbl, 0));
    // shuffle using long swap mask
    movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    vpshufb(t, t, xmm10, Assembler::AVX_128bit);

    // Compute H' = GFMUL(H, 2)
    vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
    movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
    vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
    movl(rax, 0xff00);
    movdl(xmm4, rax);
    vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
    movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
    vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
    vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
    vpslld(xmm4, t, 1, Assembler::AVX_128bit);
    vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
    vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds H << 1, i.e. H * 2

    // xor the masked reduction polynomial (xmm5) into H << 1
    vpxor(t, t, xmm5, Assembler::AVX_128bit);
    movdqu(Address(htbl, 1 * 16), t); // H * 2

    ret(0);
}

// This method takes the expanded subkey as input and generates the remaining powers
// of the subkey H (H^2 through H^8). These powers of H are used in the reduction
// process for eight-block GHASH.
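// Precomputing H^1 * 2 .. H^8 * 2 lets the eight-block loop in avx_ghash
// multiply block i by H^(8-i) and defer the reduction until all eight products
// have been accumulated (the aggregated reduction method).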
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
    const XMMRegister t = xmm13;
    const XMMRegister tmp0 = xmm1;
    Label GFMUL;

    movdqu(t, Address(htbl, 1 * 16));
    movdqu(tmp0, t);

    // tmp0 and t both hold H. Each GFMUL call computes t = t * H, the next power of H
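    // GFMUL is a local subroutine (bound after the ret below); it is reached
    // with call so that the ret(0) inside gfmul returns here.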
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 2 * 16), t); // H^2 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 3 * 16), t); // H^3 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 4 * 16), t); // H^4 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 5 * 16), t); // H^5 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 6 * 16), t); // H^6 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 7 * 16), t); // H^7 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 8 * 16), t); // H^8 * 2
    ret(0);

    bind(GFMUL);
    gfmul(tmp0, t);
}

// Single-block and multi-block GHASH computation using the Shift-XOR reduction technique
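// For each 16-byte block B, GHASH updates the state as state = (state ^ B) * H
// in GF(2^128); the byte- and long-swap shuffles convert between memory order
// and the bit-reflected representation used internally.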
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

    // temporary variables to hold input data and input state
    const XMMRegister data = xmm1;
    const XMMRegister state = xmm0;
    // temporary variables to hold intermediate results
    const XMMRegister tmp0 = xmm3;
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    // temporary variables to hold byte and long swap masks
    const XMMRegister bswap_mask = xmm2;
    const XMMRegister lswap_mask = xmm14;

    Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
          ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

    testptr(blocks, blocks);
    jcc(Assembler::zero, EXIT_GHASH);

    // Check if the hashtable entry at offset 1 * 16 has already been generated.
    // For anything less than 8 blocks we generate only the first power of H.
    movdqu(tmp2, Address(htbl, 1 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, BEGIN_PROCESS);
    call(GENERATE_HTBL_1_BLK, relocInfo::none);

    // Shuffle the input state
    bind(BEGIN_PROCESS);
    movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    movdqu(state, Address(input_state, 0));
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    // If we have 8 or more blocks of data, generate the remaining powers of H
    movdqu(tmp2, Address(htbl, 8 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, PROCESS_8_BLOCKS);
    call(GENERATE_HTBL_8_BLKS, relocInfo::none);

    // Do 8 carry-less multiplies followed by a single reduction, processing
    // 8 blocks of data at a time. Each block = 16 bytes.
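    // By linearity, (...((state ^ B0) * H ^ B1) * H ... ^ B7) * H equals
    // (state ^ B0) * H^8 ^ B1 * H^7 ^ ... ^ B7 * H, so all eight products can
    // be accumulated before a single reduction.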
    bind(PROCESS_8_BLOCKS);
    subl(blocks, 8);
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
    movdqu(data, Address(input_data, 16 * 7));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Load H * 2 from offset 1 * 16; the precomputed powers of H start there.
    movdqu(xmm15, Address(htbl, 1 * 16));
    // Perform carry-less multiplication of (H * 2, data block #7)
    vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
    vpclmulldq(tmp0, data, xmm15);    // a0 * b0
    vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
    vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    movdqu(data, Address(input_data, 16 * 6));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^2 * 2, data block #6)
    schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

    movdqu(data, Address(input_data, 16 * 5));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^3 * 2, data block #5)
    schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 4));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^4 * 2, data block #4)
    schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 3));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^5 * 2, data block #3)
    schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 2));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^6 * 2, data block #2)
    schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 1));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^7 * 2, data block #1)
    schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 0));
    // xor data block #0 with the input state before performing carry-less multiplication
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(data, data, state, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^8 * 2, data block #0)
    schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contain the aggregated
    vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // results of the multiplications

    // We have the two 128-bit partially accumulated multiplication results in
    // tmp0:tmp1, with the upper 128 bits in tmp1 and the lower 128 bits in tmp0.
    // Reduce them using the Shift-XOR reduction technique described in
    // Gueron-Kounavis, May 2010
    bind(BLOCK8_REDUCTION);
    // First phase of the reduction
    vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift << 31
    vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift << 30
    vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift << 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

    vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    // Second phase of the reduction
    vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);  // packed right shift >> 1
    vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift >> 2
    vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);  // packed right shift >> 7
    // xor the shifted versions
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
    // Final result is in state
    vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

    lea(input_data, Address(input_data, 16 * 8));
    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    jmp(PROCESS_8_BLOCKS);

    // Since this is a one-block operation we use only H * 2, i.e. the first power of H
    bind(ONE_BLK_INIT);
    movdqu(tmp0, Address(htbl, 1 * 16));
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    // Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
    bind(PROCESS_1_BLOCK);
    cmpl(blocks, 0);
    jcc(Assembler::equal, SAVE_STATE);
    subl(blocks, 1);
    movdqu(data, Address(input_data, 0));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(state, state, data, Assembler::AVX_128bit);
    // gfmul(H * 2, state)
    call(GFMUL, relocInfo::none);
    addptr(input_data, 16);
    jmp(PROCESS_1_BLOCK);

    bind(SAVE_STATE);
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
    movdqu(Address(input_state, 0), state);
    jmp(EXIT_GHASH);

    bind(GFMUL);
    gfmul(tmp0, state);

    bind(GENERATE_HTBL_1_BLK);
    generateHtbl_one_block(htbl);

    bind(GENERATE_HTBL_8_BLKS);
    generateHtbl_eight_blocks(htbl);

    bind(EXIT_GHASH);
    // zero out xmm registers that held the state, data and powers of H
    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
#endif // _LP64