| 1 | /* |
| 2 | * Copyright (c) 2016, Intel Corporation. |
| 3 | * |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This code is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 only, as |
| 8 | * published by the Free Software Foundation. |
| 9 | * |
| 10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 13 | * version 2 for more details (a copy is included in the LICENSE file that |
| 14 | * accompanied this code). |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License version |
| 17 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 19 | * |
| 20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 21 | * or visit www.oracle.com if you need additional information or have any |
| 22 | * questions. |
| 23 | * |
| 24 | */ |
| 25 | |
| 26 | #include "precompiled.hpp" |
| 27 | #include "asm/assembler.hpp" |
| 28 | #include "asm/assembler.inline.hpp" |
| 29 | #include "runtime/stubRoutines.hpp" |
| 30 | #include "macroAssembler_x86.hpp" |
| 31 | |
| 32 | // ofs and limit are used for multi-block byte array. |
| 33 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
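|  | // When multi_block is true, one 64-byte block is consumed per pass of loop0, |
|  | // buf and ofs advance by 64 until ofs passes limit, and the updated ofs is |
|  | // left in rax as the intrinsic's return value. |
|  | // Register roles (an outline of the SHA-NI scheme): abcd holds the a,b,c,d |
|  | // working variables, e0/e1 alternate as the E value, sha1rnds4's immediate |
|  | // selects the round group (rounds 0-19, 20-39, 40-59, 60-79), sha1nexte |
|  | // derives the E input for the next four rounds, and sha1msg1/sha1msg2 |
|  | // perform the message-word scheduling. |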
| 34 | void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, |
| 35 | XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, |
| 36 | Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) { |
| 37 | |
| 38 | Label start, done_hash, loop0; |
| 39 | |
| 40 | address upper_word_mask = StubRoutines::x86::upper_word_mask_addr(); |
| 41 | address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr(); |
| 42 | |
| 43 | bind(start); |
| 44 | movdqu(abcd, Address(state, 0)); |
| 45 | pinsrd(e0, Address(state, 16), 3); |
| 46 | movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000 |
| 47 | pand(e0, shuf_mask); |
| 48 | pshufd(abcd, abcd, 0x1B); |
| 49 | movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f |
| 50 | |
| 51 | bind(loop0); |
| 52 | // Save hash values for addition after rounds |
| 53 | movdqu(Address(rsp, 0), e0); |
| 54 | movdqu(Address(rsp, 16), abcd); |
| 55 | |
| 56 | |
| 57 | // Rounds 0 - 3 |
| 58 | movdqu(msg0, Address(buf, 0)); |
| 59 | pshufb(msg0, shuf_mask); |
| 60 | paddd(e0, msg0); |
| 61 | movdqa(e1, abcd); |
| 62 | sha1rnds4(abcd, e0, 0); |
| 63 | |
| 64 | // Rounds 4 - 7 |
| 65 | movdqu(msg1, Address(buf, 16)); |
| 66 | pshufb(msg1, shuf_mask); |
| 67 | sha1nexte(e1, msg1); |
| 68 | movdqa(e0, abcd); |
| 69 | sha1rnds4(abcd, e1, 0); |
| 70 | sha1msg1(msg0, msg1); |
| 71 | |
| 72 | // Rounds 8 - 11 |
| 73 | movdqu(msg2, Address(buf, 32)); |
| 74 | pshufb(msg2, shuf_mask); |
| 75 | sha1nexte(e0, msg2); |
| 76 | movdqa(e1, abcd); |
| 77 | sha1rnds4(abcd, e0, 0); |
| 78 | sha1msg1(msg1, msg2); |
| 79 | pxor(msg0, msg2); |
| 80 | |
| 81 | // Rounds 12 - 15 |
| 82 | movdqu(msg3, Address(buf, 48)); |
| 83 | pshufb(msg3, shuf_mask); |
| 84 | sha1nexte(e1, msg3); |
| 85 | movdqa(e0, abcd); |
| 86 | sha1msg2(msg0, msg3); |
| 87 | sha1rnds4(abcd, e1, 0); |
| 88 | sha1msg1(msg2, msg3); |
| 89 | pxor(msg1, msg3); |
| 90 | |
| 91 | // Rounds 16 - 19 |
| 92 | sha1nexte(e0, msg0); |
| 93 | movdqa(e1, abcd); |
| 94 | sha1msg2(msg1, msg0); |
| 95 | sha1rnds4(abcd, e0, 0); |
| 96 | sha1msg1(msg3, msg0); |
| 97 | pxor(msg2, msg0); |
| 98 | |
| 99 | // Rounds 20 - 23 |
| 100 | sha1nexte(e1, msg1); |
| 101 | movdqa(e0, abcd); |
| 102 | sha1msg2(msg2, msg1); |
| 103 | sha1rnds4(abcd, e1, 1); |
| 104 | sha1msg1(msg0, msg1); |
| 105 | pxor(msg3, msg1); |
| 106 | |
| 107 | // Rounds 24 - 27 |
| 108 | sha1nexte(e0, msg2); |
| 109 | movdqa(e1, abcd); |
| 110 | sha1msg2(msg3, msg2); |
| 111 | sha1rnds4(abcd, e0, 1); |
| 112 | sha1msg1(msg1, msg2); |
| 113 | pxor(msg0, msg2); |
| 114 | |
| 115 | // Rounds 28 - 31 |
| 116 | sha1nexte(e1, msg3); |
| 117 | movdqa(e0, abcd); |
| 118 | sha1msg2(msg0, msg3); |
| 119 | sha1rnds4(abcd, e1, 1); |
| 120 | sha1msg1(msg2, msg3); |
| 121 | pxor(msg1, msg3); |
| 122 | |
| 123 | // Rounds 32 - 35 |
| 124 | sha1nexte(e0, msg0); |
| 125 | movdqa(e1, abcd); |
| 126 | sha1msg2(msg1, msg0); |
| 127 | sha1rnds4(abcd, e0, 1); |
| 128 | sha1msg1(msg3, msg0); |
| 129 | pxor(msg2, msg0); |
| 130 | |
| 131 | // Rounds 36 - 39 |
| 132 | sha1nexte(e1, msg1); |
| 133 | movdqa(e0, abcd); |
| 134 | sha1msg2(msg2, msg1); |
| 135 | sha1rnds4(abcd, e1, 1); |
| 136 | sha1msg1(msg0, msg1); |
| 137 | pxor(msg3, msg1); |
| 138 | |
| 139 | // Rounds 40 - 43 |
| 140 | sha1nexte(e0, msg2); |
| 141 | movdqa(e1, abcd); |
| 142 | sha1msg2(msg3, msg2); |
| 143 | sha1rnds4(abcd, e0, 2); |
| 144 | sha1msg1(msg1, msg2); |
| 145 | pxor(msg0, msg2); |
| 146 | |
| 147 | // Rounds 44 - 47 |
| 148 | sha1nexte(e1, msg3); |
| 149 | movdqa(e0, abcd); |
| 150 | sha1msg2(msg0, msg3); |
| 151 | sha1rnds4(abcd, e1, 2); |
| 152 | sha1msg1(msg2, msg3); |
| 153 | pxor(msg1, msg3); |
| 154 | |
| 155 | // Rounds 48 - 51 |
| 156 | sha1nexte(e0, msg0); |
| 157 | movdqa(e1, abcd); |
| 158 | sha1msg2(msg1, msg0); |
| 159 | sha1rnds4(abcd, e0, 2); |
| 160 | sha1msg1(msg3, msg0); |
| 161 | pxor(msg2, msg0); |
| 162 | |
| 163 | // Rounds 52 - 55 |
| 164 | sha1nexte(e1, msg1); |
| 165 | movdqa(e0, abcd); |
| 166 | sha1msg2(msg2, msg1); |
| 167 | sha1rnds4(abcd, e1, 2); |
| 168 | sha1msg1(msg0, msg1); |
| 169 | pxor(msg3, msg1); |
| 170 | |
| 171 | // Rounds 56 - 59 |
| 172 | sha1nexte(e0, msg2); |
| 173 | movdqa(e1, abcd); |
| 174 | sha1msg2(msg3, msg2); |
| 175 | sha1rnds4(abcd, e0, 2); |
| 176 | sha1msg1(msg1, msg2); |
| 177 | pxor(msg0, msg2); |
| 178 | |
| 179 | // Rounds 60 - 63 |
| 180 | sha1nexte(e1, msg3); |
| 181 | movdqa(e0, abcd); |
| 182 | sha1msg2(msg0, msg3); |
| 183 | sha1rnds4(abcd, e1, 3); |
| 184 | sha1msg1(msg2, msg3); |
| 185 | pxor(msg1, msg3); |
| 186 | |
| 187 | // Rounds 64 - 67 |
| 188 | sha1nexte(e0, msg0); |
| 189 | movdqa(e1, abcd); |
| 190 | sha1msg2(msg1, msg0); |
| 191 | sha1rnds4(abcd, e0, 3); |
| 192 | sha1msg1(msg3, msg0); |
| 193 | pxor(msg2, msg0); |
| 194 | |
| 195 | // Rounds 68 - 71 |
| 196 | sha1nexte(e1, msg1); |
| 197 | movdqa(e0, abcd); |
| 198 | sha1msg2(msg2, msg1); |
| 199 | sha1rnds4(abcd, e1, 3); |
| 200 | pxor(msg3, msg1); |
| 201 | |
| 202 | // Rounds 72 - 75 |
| 203 | sha1nexte(e0, msg2); |
| 204 | movdqa(e1, abcd); |
| 205 | sha1msg2(msg3, msg2); |
| 206 | sha1rnds4(abcd, e0, 3); |
| 207 | |
| 208 | // Rounds 76 - 79 |
| 209 | sha1nexte(e1, msg3); |
| 210 | movdqa(e0, abcd); |
| 211 | sha1rnds4(abcd, e1, 3); |
| 212 | |
| 213 | // add current hash values with previously saved |
| 214 | movdqu(msg0, Address(rsp, 0)); |
| 215 | sha1nexte(e0, msg0); |
| 216 | movdqu(msg0, Address(rsp, 16)); |
| 217 | paddd(abcd, msg0); |
| 218 | |
| 219 | if (multi_block) { |
| 220 | // increment data pointer and loop if more to process |
| 221 | addptr(buf, 64); |
| 222 | addptr(ofs, 64); |
| 223 | cmpptr(ofs, limit); |
| 224 | jcc(Assembler::belowEqual, loop0); |
| 225 | movptr(rax, ofs); //return ofs |
| 226 | } |
| 227 | // write hash values back in the correct order |
| 228 | pshufd(abcd, abcd, 0x1b); |
| 229 | movdqu(Address(state, 0), abcd); |
| 230 | pextrd(Address(state, 16), e0, 3); |
| 231 | |
| 232 | bind(done_hash); |
| 233 | |
| 234 | } |
| 235 | |
| 236 | // xmm0 (msg) is used as an implicit argument to sha256rnds2, |
| 237 | // so state0 and state1 must never be assigned to xmm0. |
| 238 | // ofs and limit are used for multi-block byte array. |
| 239 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
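|  | // In outline: sha256rnds2 performs two rounds per invocation and reads the two |
|  | // round inputs (K[t]+W[t] and K[t+1]+W[t+1]) implicitly from the low qword of |
|  | // xmm0, which is why msg must be xmm0. The shuffle sequence after loading the |
|  | // state rearranges {a,b,c,d}/{e,f,g,h} into the {ABEF}/{CDGH} layout that |
|  | // sha256rnds2 expects; the inverse shuffle restores the natural order before |
|  | // the digest is written back. |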
| 240 | #ifdef _LP64 |
| 241 | void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 242 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 243 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 244 | bool multi_block, XMMRegister shuf_mask) { |
| 245 | #else |
| 246 | void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 247 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 248 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 249 | bool multi_block) { |
| 250 | #endif |
| 251 | Label start, done_hash, loop0; |
| 252 | |
| 253 | address K256 = StubRoutines::x86::k256_addr(); |
| 254 | address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); |
| 255 | |
| 256 | bind(start); |
| 257 | movdqu(state0, Address(state, 0)); |
| 258 | movdqu(state1, Address(state, 16)); |
| 259 | |
| 260 | pshufd(state0, state0, 0xB1); |
| 261 | pshufd(state1, state1, 0x1B); |
| 262 | movdqa(msgtmp4, state0); |
| 263 | palignr(state0, state1, 8); |
| 264 | pblendw(state1, msgtmp4, 0xF0); |
| 265 | |
| 266 | #ifdef _LP64 |
| 267 | movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); |
| 268 | #endif |
| 269 | lea(rax, ExternalAddress(K256)); |
| 270 | |
| 271 | bind(loop0); |
| 272 | movdqu(Address(rsp, 0), state0); |
| 273 | movdqu(Address(rsp, 16), state1); |
| 274 | |
| 275 | // Rounds 0-3 |
| 276 | movdqu(msg, Address(buf, 0)); |
| 277 | #ifdef _LP64 |
| 278 | pshufb(msg, shuf_mask); |
| 279 | #else |
| 280 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 281 | #endif |
| 282 | movdqa(msgtmp0, msg); |
| 283 | paddd(msg, Address(rax, 0)); |
| 284 | sha256rnds2(state1, state0); |
| 285 | pshufd(msg, msg, 0x0E); |
| 286 | sha256rnds2(state0, state1); |
| 287 | |
| 288 | // Rounds 4-7 |
| 289 | movdqu(msg, Address(buf, 16)); |
| 290 | #ifdef _LP64 |
| 291 | pshufb(msg, shuf_mask); |
| 292 | #else |
| 293 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 294 | #endif |
| 295 | movdqa(msgtmp1, msg); |
| 296 | paddd(msg, Address(rax, 16)); |
| 297 | sha256rnds2(state1, state0); |
| 298 | pshufd(msg, msg, 0x0E); |
| 299 | sha256rnds2(state0, state1); |
| 300 | sha256msg1(msgtmp0, msgtmp1); |
| 301 | |
| 302 | // Rounds 8-11 |
| 303 | movdqu(msg, Address(buf, 32)); |
| 304 | #ifdef _LP64 |
| 305 | pshufb(msg, shuf_mask); |
| 306 | #else |
| 307 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 308 | #endif |
| 309 | movdqa(msgtmp2, msg); |
| 310 | paddd(msg, Address(rax, 32)); |
| 311 | sha256rnds2(state1, state0); |
| 312 | pshufd(msg, msg, 0x0E); |
| 313 | sha256rnds2(state0, state1); |
| 314 | sha256msg1(msgtmp1, msgtmp2); |
| 315 | |
| 316 | // Rounds 12-15 |
| 317 | movdqu(msg, Address(buf, 48)); |
| 318 | #ifdef _LP64 |
| 319 | pshufb(msg, shuf_mask); |
| 320 | #else |
| 321 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 322 | #endif |
| 323 | movdqa(msgtmp3, msg); |
| 324 | paddd(msg, Address(rax, 48)); |
| 325 | sha256rnds2(state1, state0); |
| 326 | movdqa(msgtmp4, msgtmp3); |
| 327 | palignr(msgtmp4, msgtmp2, 4); |
| 328 | paddd(msgtmp0, msgtmp4); |
| 329 | sha256msg2(msgtmp0, msgtmp3); |
| 330 | pshufd(msg, msg, 0x0E); |
| 331 | sha256rnds2(state0, state1); |
| 332 | sha256msg1(msgtmp2, msgtmp3); |
| 333 | |
| 334 | // Rounds 16-19 |
| 335 | movdqa(msg, msgtmp0); |
| 336 | paddd(msg, Address(rax, 64)); |
| 337 | sha256rnds2(state1, state0); |
| 338 | movdqa(msgtmp4, msgtmp0); |
| 339 | palignr(msgtmp4, msgtmp3, 4); |
| 340 | paddd(msgtmp1, msgtmp4); |
| 341 | sha256msg2(msgtmp1, msgtmp0); |
| 342 | pshufd(msg, msg, 0x0E); |
| 343 | sha256rnds2(state0, state1); |
| 344 | sha256msg1(msgtmp3, msgtmp0); |
| 345 | |
| 346 | // Rounds 20-23 |
| 347 | movdqa(msg, msgtmp1); |
| 348 | paddd(msg, Address(rax, 80)); |
| 349 | sha256rnds2(state1, state0); |
| 350 | movdqa(msgtmp4, msgtmp1); |
| 351 | palignr(msgtmp4, msgtmp0, 4); |
| 352 | paddd(msgtmp2, msgtmp4); |
| 353 | sha256msg2(msgtmp2, msgtmp1); |
| 354 | pshufd(msg, msg, 0x0E); |
| 355 | sha256rnds2(state0, state1); |
| 356 | sha256msg1(msgtmp0, msgtmp1); |
| 357 | |
| 358 | // Rounds 24-27 |
| 359 | movdqa(msg, msgtmp2); |
| 360 | paddd(msg, Address(rax, 96)); |
| 361 | sha256rnds2(state1, state0); |
| 362 | movdqa(msgtmp4, msgtmp2); |
| 363 | palignr(msgtmp4, msgtmp1, 4); |
| 364 | paddd(msgtmp3, msgtmp4); |
| 365 | sha256msg2(msgtmp3, msgtmp2); |
| 366 | pshufd(msg, msg, 0x0E); |
| 367 | sha256rnds2(state0, state1); |
| 368 | sha256msg1(msgtmp1, msgtmp2); |
| 369 | |
| 370 | // Rounds 28-31 |
| 371 | movdqa(msg, msgtmp3); |
| 372 | paddd(msg, Address(rax, 112)); |
| 373 | sha256rnds2(state1, state0); |
| 374 | movdqa(msgtmp4, msgtmp3); |
| 375 | palignr(msgtmp4, msgtmp2, 4); |
| 376 | paddd(msgtmp0, msgtmp4); |
| 377 | sha256msg2(msgtmp0, msgtmp3); |
| 378 | pshufd(msg, msg, 0x0E); |
| 379 | sha256rnds2(state0, state1); |
| 380 | sha256msg1(msgtmp2, msgtmp3); |
| 381 | |
| 382 | // Rounds 32-35 |
| 383 | movdqa(msg, msgtmp0); |
| 384 | paddd(msg, Address(rax, 128)); |
| 385 | sha256rnds2(state1, state0); |
| 386 | movdqa(msgtmp4, msgtmp0); |
| 387 | palignr(msgtmp4, msgtmp3, 4); |
| 388 | paddd(msgtmp1, msgtmp4); |
| 389 | sha256msg2(msgtmp1, msgtmp0); |
| 390 | pshufd(msg, msg, 0x0E); |
| 391 | sha256rnds2(state0, state1); |
| 392 | sha256msg1(msgtmp3, msgtmp0); |
| 393 | |
| 394 | // Rounds 36-39 |
| 395 | movdqa(msg, msgtmp1); |
| 396 | paddd(msg, Address(rax, 144)); |
| 397 | sha256rnds2(state1, state0); |
| 398 | movdqa(msgtmp4, msgtmp1); |
| 399 | palignr(msgtmp4, msgtmp0, 4); |
| 400 | paddd(msgtmp2, msgtmp4); |
| 401 | sha256msg2(msgtmp2, msgtmp1); |
| 402 | pshufd(msg, msg, 0x0E); |
| 403 | sha256rnds2(state0, state1); |
| 404 | sha256msg1(msgtmp0, msgtmp1); |
| 405 | |
| 406 | // Rounds 40-43 |
| 407 | movdqa(msg, msgtmp2); |
| 408 | paddd(msg, Address(rax, 160)); |
| 409 | sha256rnds2(state1, state0); |
| 410 | movdqa(msgtmp4, msgtmp2); |
| 411 | palignr(msgtmp4, msgtmp1, 4); |
| 412 | paddd(msgtmp3, msgtmp4); |
| 413 | sha256msg2(msgtmp3, msgtmp2); |
| 414 | pshufd(msg, msg, 0x0E); |
| 415 | sha256rnds2(state0, state1); |
| 416 | sha256msg1(msgtmp1, msgtmp2); |
| 417 | |
| 418 | // Rounds 44-47 |
| 419 | movdqa(msg, msgtmp3); |
| 420 | paddd(msg, Address(rax, 176)); |
| 421 | sha256rnds2(state1, state0); |
| 422 | movdqa(msgtmp4, msgtmp3); |
| 423 | palignr(msgtmp4, msgtmp2, 4); |
| 424 | paddd(msgtmp0, msgtmp4); |
| 425 | sha256msg2(msgtmp0, msgtmp3); |
| 426 | pshufd(msg, msg, 0x0E); |
| 427 | sha256rnds2(state0, state1); |
| 428 | sha256msg1(msgtmp2, msgtmp3); |
| 429 | |
| 430 | // Rounds 48-51 |
| 431 | movdqa(msg, msgtmp0); |
| 432 | paddd(msg, Address(rax, 192)); |
| 433 | sha256rnds2(state1, state0); |
| 434 | movdqa(msgtmp4, msgtmp0); |
| 435 | palignr(msgtmp4, msgtmp3, 4); |
| 436 | paddd(msgtmp1, msgtmp4); |
| 437 | sha256msg2(msgtmp1, msgtmp0); |
| 438 | pshufd(msg, msg, 0x0E); |
| 439 | sha256rnds2(state0, state1); |
| 440 | sha256msg1(msgtmp3, msgtmp0); |
| 441 | |
| 442 | // Rounds 52-55 |
| 443 | movdqa(msg, msgtmp1); |
| 444 | paddd(msg, Address(rax, 208)); |
| 445 | sha256rnds2(state1, state0); |
| 446 | movdqa(msgtmp4, msgtmp1); |
| 447 | palignr(msgtmp4, msgtmp0, 4); |
| 448 | paddd(msgtmp2, msgtmp4); |
| 449 | sha256msg2(msgtmp2, msgtmp1); |
| 450 | pshufd(msg, msg, 0x0E); |
| 451 | sha256rnds2(state0, state1); |
| 452 | |
| 453 | // Rounds 56-59 |
| 454 | movdqa(msg, msgtmp2); |
| 455 | paddd(msg, Address(rax, 224)); |
| 456 | sha256rnds2(state1, state0); |
| 457 | movdqa(msgtmp4, msgtmp2); |
| 458 | palignr(msgtmp4, msgtmp1, 4); |
| 459 | paddd(msgtmp3, msgtmp4); |
| 460 | sha256msg2(msgtmp3, msgtmp2); |
| 461 | pshufd(msg, msg, 0x0E); |
| 462 | sha256rnds2(state0, state1); |
| 463 | |
| 464 | // Rounds 60-63 |
| 465 | movdqa(msg, msgtmp3); |
| 466 | paddd(msg, Address(rax, 240)); |
| 467 | sha256rnds2(state1, state0); |
| 468 | pshufd(msg, msg, 0x0E); |
| 469 | sha256rnds2(state0, state1); |
| 470 | movdqu(msg, Address(rsp, 0)); |
| 471 | paddd(state0, msg); |
| 472 | movdqu(msg, Address(rsp, 16)); |
| 473 | paddd(state1, msg); |
| 474 | |
| 475 | if (multi_block) { |
| 476 | // increment data pointer and loop if more to process |
| 477 | addptr(buf, 64); |
| 478 | addptr(ofs, 64); |
| 479 | cmpptr(ofs, limit); |
| 480 | jcc(Assembler::belowEqual, loop0); |
| 481 | movptr(rax, ofs); //return ofs |
| 482 | } |
| 483 | |
| 484 | pshufd(state0, state0, 0x1B); |
| 485 | pshufd(state1, state1, 0xB1); |
| 486 | movdqa(msgtmp4, state0); |
| 487 | pblendw(state0, state1, 0xF0); |
| 488 | palignr(state1, msgtmp4, 8); |
| 489 | |
| 490 | movdqu(Address(state, 0), state0); |
| 491 | movdqu(Address(state, 16), state1); |
| 492 | |
| 493 | bind(done_hash); |
| 494 | |
| 495 | } |
| 496 | |
| 497 | #ifdef _LP64 |
| 498 | /* |
| 499 | The algorithm below is based on the Intel publication: |
| 500 | "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal. |
| 501 | The assembly code was originally provided by Sean Gulley and in many places preserves |
| 502 | the original assembly NAMES and comments to simplify matching the generated assembly with the original. |
| 503 | The Java version was substantially redesigned to replace 1200 assembly instructions with a |
| 504 | much shorter run-time generator of the same code in memory. |
| 505 | */ |
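|  | // For reference, the per-round comments below use this shorthand for the |
|  | // FIPS 180-4 SHA-256 round (written with the register names of the code): |
|  | //   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25) |
|  | //   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22) |
|  | //   CH  = ((f ^ g) & e) ^ g          == (e & f) ^ (~e & g) |
|  | //   MAJ = ((a | c) & b) | (a & c)    == (a & b) ^ (a & c) ^ (b & c) |
|  | //   t1  = h + S1 + CH + K[i] + W[i];   t2 = S0 + MAJ |
|  | //   d  += t1;   h = t1 + t2   (the register roles rotate every round) |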
| 506 | |
| 507 | void MacroAssembler::sha256_AVX2_one_round_compute( |
| 508 | Register reg_old_h, |
| 509 | Register reg_a, |
| 510 | Register reg_b, |
| 511 | Register reg_c, |
| 512 | Register reg_d, |
| 513 | Register reg_e, |
| 514 | Register reg_f, |
| 515 | Register reg_g, |
| 516 | Register reg_h, |
| 517 | int iter) { |
| 518 | const Register& reg_y0 = r13; |
| 519 | const Register& reg_y1 = r14; |
| 520 | const Register& reg_y2 = r15; |
| 521 | const Register& reg_y3 = rcx; |
| 522 | const Register& reg_T1 = r12; |
| 523 | //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 524 | if (iter%4 > 0) { |
| 525 | addl(reg_old_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
| 526 | } |
| 527 | movl(reg_y2, reg_f); // reg_y2 = reg_f ; CH |
| 528 | rorxd(reg_y0, reg_e, 25); // reg_y0 = reg_e >> 25 ; S1A |
| 529 | rorxd(reg_y1, reg_e, 11); // reg_y1 = reg_e >> 11 ; S1B |
| 530 | xorl(reg_y2, reg_g); // reg_y2 = reg_f^reg_g ; CH |
| 531 | |
| 532 | xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1 |
| 533 | rorxd(reg_y1, reg_e, 6); // reg_y1 = (reg_e >> 6) ; S1 |
| 534 | andl(reg_y2, reg_e); // reg_y2 = (reg_f^reg_g)&reg_e ; CH |
| 535 | |
| 536 | if (iter%4 > 0) { |
| 537 | addl(reg_old_h, reg_y3); // reg_h = t1 + S0 + MAJ ; -- |
| 538 | } |
| 539 | |
| 540 | xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1 |
| 541 | rorxd(reg_T1, reg_a, 13); // reg_T1 = reg_a >> 13 ; S0B |
| 542 | xorl(reg_y2, reg_g); // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH |
| 543 | rorxd(reg_y1, reg_a, 22); // reg_y1 = reg_a >> 22 ; S0A |
| 544 | movl(reg_y3, reg_a); // reg_y3 = reg_a ; MAJA |
| 545 | |
| 546 | xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0 |
| 547 | rorxd(reg_T1, reg_a, 2); // reg_T1 = (reg_a >> 2) ; S0 |
| 548 | addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; -- |
| 549 | orl(reg_y3, reg_c); // reg_y3 = reg_a|reg_c ; MAJA |
| 550 | |
| 551 | xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0 |
| 552 | movl(reg_T1, reg_a); // reg_T1 = reg_a ; MAJB |
| 553 | andl(reg_y3, reg_b); // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA |
| 554 | andl(reg_T1, reg_c); // reg_T1 = reg_a&reg_c ; MAJB |
| 555 | addl(reg_y2, reg_y0); // reg_y2 = S1 + CH ; -- |
| 556 | |
| 557 | |
| 558 | addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; -- |
| 559 | orl(reg_y3, reg_T1); // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ |
| 560 | addl(reg_h, reg_y1); // reg_h = k + w + reg_h + S0 ; -- |
| 561 | |
| 562 | addl(reg_d, reg_y2); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; -- |
| 563 | |
| 564 | |
| 565 | if (iter%4 == 3) { |
| 566 | addl(reg_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
| 567 | addl(reg_h, reg_y3); // reg_h = t1 + S0 + MAJ ; -- |
| 568 | } |
| 569 | } |
| 570 | |
| 571 | void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) { |
| 572 | sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0); |
| 573 | sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1); |
| 574 | sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2); |
| 575 | sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3); |
| 576 | } |
| 577 | |
| 578 | void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) { |
| 579 | sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0); |
| 580 | sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1); |
| 581 | sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2); |
| 582 | sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3); |
| 583 | } |
| 584 | |
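|  | // The vector half of each iteration below computes four new words of the |
|  | // SHA-256 message schedule, W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], |
|  | // with s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3) and |
|  | //      s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10), |
|  | // spread across four consecutive iterations so it interleaves with the |
|  | // scalar round computation. |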
| 585 | void MacroAssembler::sha256_AVX2_one_round_and_sched( |
| 586 | XMMRegister xmm_0, /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */ |
| 587 | XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */ |
| 588 | XMMRegister xmm_2, /* ymm6 */ |
| 589 | XMMRegister xmm_3, /* ymm7 */ |
| 590 | Register reg_a, /* == rax on iteration 0, then rotate the 8 registers right on each subsequent iteration */ |
| 591 | Register reg_b, /* rbx */ /* full cycle is 8 iterations */ |
| 592 | Register reg_c, /* rdi */ |
| 593 | Register reg_d, /* rsi */ |
| 594 | Register reg_e, /* r8 */ |
| 595 | Register reg_f, /* r9d */ |
| 596 | Register reg_g, /* r10d */ |
| 597 | Register reg_h, /* r11d */ |
| 598 | int iter) |
| 599 | { |
| 600 | movl(rcx, reg_a); // rcx = reg_a ; MAJA |
| 601 | rorxd(r13, reg_e, 25); // r13 = reg_e >> 25 ; S1A |
| 602 | rorxd(r14, reg_e, 11); // r14 = reg_e >> 11 ; S1B |
| 603 | addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); |
| 604 | orl(rcx, reg_c); // rcx = reg_a|reg_c ; MAJA |
| 605 | |
| 606 | movl(r15, reg_f); // r15 = reg_f ; CH |
| 607 | rorxd(r12, reg_a, 13); // r12 = reg_a >> 13 ; S0B |
| 608 | xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1 |
| 609 | xorl(r15, reg_g); // r15 = reg_f^reg_g ; CH |
| 610 | |
| 611 | rorxd(r14, reg_e, 6); // r14 = (reg_e >> 6) ; S1 |
| 612 | andl(r15, reg_e); // r15 = (reg_f^reg_g)&reg_e ; CH |
| 613 | |
| 614 | xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1 |
| 615 | rorxd(r14, reg_a, 22); // r14 = reg_a >> 22 ; S0A |
| 616 | addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; -- |
| 617 | |
| 618 | andl(rcx, reg_b); // rcx = (reg_a|reg_c)&reg_b ; MAJA |
| 619 | xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0 |
| 620 | |
| 621 | rorxd(r12, reg_a, 2); // r12 = (reg_a >> 2) ; S0 |
| 622 | xorl(r15, reg_g); // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH |
| 623 | |
| 624 | xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0 |
| 625 | movl(r12, reg_a); // r12 = reg_a ; MAJB |
| 626 | andl(r12, reg_c); // r12 = reg_a&reg_c ; MAJB |
| 627 | addl(r15, r13); // r15 = S1 + CH ; -- |
| 628 | |
| 629 | orl(rcx, r12); // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ |
| 630 | addl(reg_h, r14); // reg_h = k + w + reg_h + S0 ; -- |
| 631 | addl(reg_d, r15); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; -- |
| 632 | |
| 633 | addl(reg_h, r15); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
| 634 | addl(reg_h, rcx); // reg_h = t1 + S0 + MAJ ; -- |
| 635 | |
| 636 | if (iter%4 == 0) { |
| 637 | vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit); // ymm0 = W[-7] |
| 638 | vpaddd(xmm0, xmm0, xmm_0, AVX_256bit); // ymm0 = W[-7] + W[-16] |
| 639 | vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit); // ymm1 = W[-15] |
| 640 | vpsrld(xmm2, xmm1, 7, AVX_256bit); |
| 641 | vpslld(xmm3, xmm1, 32-7, AVX_256bit); |
| 642 | vpor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 |
| 643 | vpsrld(xmm2, xmm1,18, AVX_256bit); |
| 644 | } else if (iter%4 == 1 ) { |
| 645 | vpsrld(xmm8, xmm1, 3, AVX_256bit); // ymm8 = W[-15] >> 3 |
| 646 | vpslld(xmm1, xmm1, 32-18, AVX_256bit); |
| 647 | vpxor(xmm3, xmm3, xmm1, AVX_256bit); |
| 648 | vpxor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 ^ W[-15] ror 18 |
| 649 | vpxor(xmm1, xmm3, xmm8, AVX_256bit); // ymm1 = s0 |
| 650 | vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit); // 11111010b ; ymm2 = W[-2] {BBAA} |
| 651 | vpaddd(xmm0, xmm0, xmm1, AVX_256bit); // ymm0 = W[-16] + W[-7] + s0 |
| 652 | vpsrld(xmm8, xmm2, 10, AVX_256bit); // ymm8 = W[-2] >> 10 {BBAA} |
| 653 | } else if (iter%4 == 2) { |
| 654 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xBxA} |
| 655 | vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xBxA} |
| 656 | vpxor(xmm2, xmm2, xmm3, AVX_256bit); |
| 657 | vpxor(xmm8, xmm8, xmm2, AVX_256bit); // ymm8 = s1 {xBxA} |
| 658 | vpshufb(xmm8, xmm8, xmm10, AVX_256bit); // ymm8 = s1 {00BA} |
| 659 | vpaddd(xmm0, xmm0, xmm8, AVX_256bit); // ymm0 = {..., ..., W[1], W[0]} |
| 660 | vpshufd(xmm2, xmm0, 0x50, AVX_256bit); // 01010000b ; ymm2 = W[-2] {DDCC} |
| 661 | } else if (iter%4 == 3) { |
| 662 | vpsrld(xmm11, xmm2, 10, AVX_256bit); // ymm11 = W[-2] >> 10 {DDCC} |
| 663 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xDxC} |
| 664 | vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xDxC} |
| 665 | vpxor(xmm2, xmm2, xmm3, AVX_256bit); |
| 666 | vpxor(xmm11, xmm11, xmm2, AVX_256bit); // ymm11 = s1 {xDxC} |
| 667 | vpshufb(xmm11, xmm11, xmm12, AVX_256bit); // ymm11 = s1 {DC00} |
| 668 | vpaddd(xmm_0, xmm11, xmm0, AVX_256bit); // xmm_0 = {W[3], W[2], W[1], W[0]} |
| 669 | } |
| 670 | } |
| 671 | |
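|  | // addm/addmq: add register r2 into the digest word at [r1 + disp] and store the |
|  | // result back (32-bit and 64-bit variants, used to fold the working variables |
|  | // into the saved state). |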
| 672 | void MacroAssembler::addm(int disp, Register r1, Register r2) { |
| 673 | addl(r2, Address(r1, disp)); |
| 674 | movl(Address(r1, disp), r2); |
| 675 | } |
| 676 | |
| 677 | void MacroAssembler::addmq(int disp, Register r1, Register r2) { |
| 678 | addq(r2, Address(r1, disp)); |
| 679 | movq(Address(r1, disp), r2); |
| 680 | } |
| 681 | |
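|  | // AVX2/rorx SHA-256 compression. Each pass of loop0 loads 128 bytes (two blocks), |
|  | // schedules the message words for both blocks at once, and then runs the second |
|  | // block's rounds from the saved schedule (loop3). As in fast_sha256, ofs and limit |
|  | // drive the multi-block case of |
|  | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
|  | // and the updated ofs is left in rax. |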
| 682 | void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 683 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 684 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 685 | bool multi_block, XMMRegister shuf_mask) { |
| 686 | |
| 687 | Label loop0, loop1, loop2, loop3, |
| 688 | last_block_enter, do_last_block, only_one_block, done_hash, |
| 689 | compute_size, compute_size_end, |
| 690 | compute_size1, compute_size_end1; |
| 691 | |
| 692 | address K256_W = StubRoutines::x86::k256_W_addr(); |
| 693 | address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); |
| 694 | address pshuffle_byte_flip_mask_addr = 0; |
| 695 | |
| 696 | const XMMRegister& SHUF_00BA = xmm10; // ymm10: shuffle xBxA -> 00BA |
| 697 | const XMMRegister& SHUF_DC00 = xmm12; // ymm12: shuffle xDxC -> DC00 |
| 698 | const XMMRegister& BYTE_FLIP_MASK = xmm13; // ymm13 |
| 699 | |
| 700 | const XMMRegister& X_BYTE_FLIP_MASK = xmm13; //XMM version of BYTE_FLIP_MASK |
| 701 | |
| 702 | const Register& NUM_BLKS = r8; // 3rd arg |
| 703 | const Register& CTX = rdx; // 2nd arg |
| 704 | const Register& INP = rcx; // 1st arg |
| 705 | |
| 706 | const Register& c = rdi; |
| 707 | const Register& d = rsi; |
| 708 | const Register& e = r8; // clobbers NUM_BLKS |
| 709 | const Register& y3 = rcx; // clobbers INP |
| 710 | |
| 711 | const Register& TBL = rbp; |
| 712 | const Register& SRND = CTX; // SRND is same register as CTX |
| 713 | |
| 714 | const Register& a = rax; |
| 715 | const Register& b = rbx; |
| 716 | const Register& f = r9; |
| 717 | const Register& g = r10; |
| 718 | const Register& h = r11; |
| 719 | |
| 720 | const Register& T1 = r12; |
| 721 | const Register& y0 = r13; |
| 722 | const Register& y1 = r14; |
| 723 | const Register& y2 = r15; |
| 724 | |
| 725 | |
| 726 | enum { |
| 727 | _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round |
| 728 | _INP_END_SIZE = 8, |
| 729 | _INP_SIZE = 8, |
| 730 | _CTX_SIZE = 8, |
| 731 | _RSP_SIZE = 8, |
| 732 | |
| 733 | _XFER = 0, |
| 734 | _INP_END = _XFER + _XFER_SIZE, |
| 735 | _INP = _INP_END + _INP_END_SIZE, |
| 736 | _CTX = _INP + _INP_SIZE, |
| 737 | _RSP = _CTX + _CTX_SIZE, |
| 738 | STACK_SIZE = _RSP + _RSP_SIZE |
| 739 | }; |
| 740 | |
| 741 | #ifndef _WIN64 |
| 742 | push(rcx); // linux: this is limit, needed at the end |
| 743 | push(rdx); // linux: this is ofs |
| 744 | #else |
| 745 | push(r8); // win64: this is ofs |
| 746 | push(r9); // win64: this is limit, we need them again at the very end |
| 747 | #endif |
| 748 | |
| 749 | |
| 750 | push(rbx); |
| 751 | #ifdef _WIN64 |
| 752 | push(rsi); |
| 753 | push(rdi); |
| 754 | #endif |
| 755 | push(rbp); |
| 756 | push(r12); |
| 757 | push(r13); |
| 758 | push(r14); |
| 759 | push(r15); |
| 760 | |
| 761 | movq(rax, rsp); |
| 762 | subq(rsp, STACK_SIZE); |
| 763 | andq(rsp, -32); |
| 764 | movq(Address(rsp, _RSP), rax); |
| 765 | |
| 766 | #ifndef _WIN64 |
| 767 | // copy the Linux argument registers to their Win64 counterparts, so the rest of the code is the same for both ABIs |
| 768 | movq(r9, rcx); |
| 769 | movq(r8, rdx); |
| 770 | movq(rdx, rsi); |
| 771 | movq(rcx, rdi); |
| 772 | #endif |
| 773 | |
| 774 | // setting original assembly ABI |
| 775 | /** message to hash in INP */ |
| 776 | lea(INP, Address(rcx, 0)); // rcx == message (buf) ;; linux: INP = buf = rdi |
| 777 | /** digest in CTX */ |
| 778 | movq(CTX, rdx); // rdx = digest (state) ;; linux: CTX = state = rsi |
| 779 | |
| 780 | /** NUM_BLKS is the number of bytes to process (a multiple of 64); derive it from ofs and limit */ |
| 781 | if (multi_block) { |
| 782 | |
| 783 | // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8 |
| 784 | // on entry r8 = ofs |
| 785 | // on exit r8 = NUM_BLKS |
| 786 | |
| 787 | xorq(rax, rax); |
| 788 | |
| 789 | bind(compute_size); |
| 790 | cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx |
| 791 | jccb(Assembler::aboveEqual, compute_size_end); |
| 792 | addq(r8, 64); //;; linux: ofs = rdx |
| 793 | addq(rax, 64); |
| 794 | jmpb(compute_size); |
| 795 | |
| 796 | bind(compute_size_end); |
| 797 | movq(NUM_BLKS, rax); // NUM_BLK (r8) ;; linux: NUM_BLK = rdx |
| 798 | |
| 799 | cmpq(NUM_BLKS, 0); |
| 800 | jcc(Assembler::equal, done_hash); |
| 801 | |
| 802 | } else { |
| 803 | xorq(NUM_BLKS, NUM_BLKS); |
| 804 | addq(NUM_BLKS, 64); |
| 805 | }//if (!multi_block) |
| 806 | |
| 807 | lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block |
| 808 | movq(Address(rsp, _INP_END), NUM_BLKS); // |
| 809 | |
| 810 | cmpptr(INP, NUM_BLKS); //cmp INP, NUM_BLKS |
| 811 | jcc(Assembler::equal, only_one_block); //je only_one_block |
| 812 | |
| 813 | // load initial digest |
| 814 | movl(a, Address(CTX, 4*0)); |
| 815 | movl(b, Address(CTX, 4*1)); |
| 816 | movl(c, Address(CTX, 4*2)); |
| 817 | movl(d, Address(CTX, 4*3)); |
| 818 | movl(e, Address(CTX, 4*4)); |
| 819 | movl(f, Address(CTX, 4*5)); |
| 820 | // load g - r10 after it is used as scratch |
| 821 | movl(h, Address(CTX, 4*7)); |
| 822 | |
| 823 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; |
| 824 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] |
| 825 | vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] |
| 826 | vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] |
| 827 | |
| 828 | movl(g, Address(CTX, 4*6)); |
| 829 | |
| 830 | movq(Address(rsp, _CTX), CTX); // store |
| 831 | |
| 832 | bind(loop0); |
| 833 | lea(TBL, ExternalAddress(K256_W)); |
| 834 | |
| 835 | // assume buffers not aligned |
| 836 | |
| 837 | // Load first 16 dwords from two blocks |
| 838 | vmovdqu(xmm0, Address(INP, 0*32)); |
| 839 | vmovdqu(xmm1, Address(INP, 1*32)); |
| 840 | vmovdqu(xmm2, Address(INP, 2*32)); |
| 841 | vmovdqu(xmm3, Address(INP, 3*32)); |
| 842 | |
| 843 | // byte swap data |
| 844 | vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit); |
| 845 | vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit); |
| 846 | vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit); |
| 847 | vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit); |
| 848 | |
| 849 | // transpose data into high/low halves |
| 850 | vperm2i128(xmm4, xmm0, xmm2, 0x20); |
| 851 | vperm2i128(xmm5, xmm0, xmm2, 0x31); |
| 852 | vperm2i128(xmm6, xmm1, xmm3, 0x20); |
| 853 | vperm2i128(xmm7, xmm1, xmm3, 0x31); |
| 854 | |
| 855 | bind(last_block_enter); |
| 856 | addq(INP, 64); |
| 857 | movq(Address(rsp, _INP), INP); |
| 858 | |
| 859 | //;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each |
| 860 | xorq(SRND, SRND); |
| 861 | |
| 862 | align(16); |
| 863 | bind(loop1); |
| 864 | vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); |
| 865 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); |
| 866 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0); |
| 867 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1); |
| 868 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2); |
| 869 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3); |
| 870 | |
| 871 | vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); |
| 872 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); |
| 873 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0); |
| 874 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1); |
| 875 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2); |
| 876 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3); |
| 877 | |
| 878 | vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit); |
| 879 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9); |
| 880 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0); |
| 881 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1); |
| 882 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2); |
| 883 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3); |
| 884 | |
| 885 | vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit); |
| 886 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9); |
| 887 | |
| 888 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0); |
| 889 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1); |
| 890 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2); |
| 891 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3); |
| 892 | |
| 893 | addq(SRND, 4*32); |
| 894 | cmpq(SRND, 3 * 4*32); |
| 895 | jcc(Assembler::below, loop1); |
| 896 | |
| 897 | bind(loop2); |
| 898 | // Do last 16 rounds with no scheduling |
| 899 | vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); |
| 900 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); |
| 901 | sha256_AVX2_four_rounds_compute_first(0); |
| 902 | |
| 903 | vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); |
| 904 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); |
| 905 | sha256_AVX2_four_rounds_compute_last(0 + 8); |
| 906 | |
| 907 | addq(SRND, 2*32); |
| 908 | |
| 909 | vmovdqu(xmm4, xmm6); |
| 910 | vmovdqu(xmm5, xmm7); |
| 911 | |
| 912 | cmpq(SRND, 4 * 4*32); |
| 913 | jcc(Assembler::below, loop2); |
| 914 | |
| 915 | movq(CTX, Address(rsp, _CTX)); |
| 916 | movq(INP, Address(rsp, _INP)); |
| 917 | |
| 918 | addm(4*0, CTX, a); |
| 919 | addm(4*1, CTX, b); |
| 920 | addm(4*2, CTX, c); |
| 921 | addm(4*3, CTX, d); |
| 922 | addm(4*4, CTX, e); |
| 923 | addm(4*5, CTX, f); |
| 924 | addm(4*6, CTX, g); |
| 925 | addm(4*7, CTX, h); |
| 926 | |
| 927 | cmpq(INP, Address(rsp, _INP_END)); |
| 928 | jcc(Assembler::above, done_hash); |
| 929 | |
| 930 | //Do second block using previously scheduled results |
| 931 | xorq(SRND, SRND); |
| 932 | align(16); |
| 933 | bind(loop3); |
| 934 | sha256_AVX2_four_rounds_compute_first(4); |
| 935 | sha256_AVX2_four_rounds_compute_last(4+8); |
| 936 | |
| 937 | addq(SRND, 2*32); |
| 938 | cmpq(SRND, 4 * 4*32); |
| 939 | jcc(Assembler::below, loop3); |
| 940 | |
| 941 | movq(CTX, Address(rsp, _CTX)); |
| 942 | movq(INP, Address(rsp, _INP)); |
| 943 | addq(INP, 64); |
| 944 | |
| 945 | addm(4*0, CTX, a); |
| 946 | addm(4*1, CTX, b); |
| 947 | addm(4*2, CTX, c); |
| 948 | addm(4*3, CTX, d); |
| 949 | addm(4*4, CTX, e); |
| 950 | addm(4*5, CTX, f); |
| 951 | addm(4*6, CTX, g); |
| 952 | addm(4*7, CTX, h); |
| 953 | |
| 954 | cmpq(INP, Address(rsp, _INP_END)); |
| 955 | jcc(Assembler::below, loop0); |
| 956 | jccb(Assembler::above, done_hash); |
| 957 | |
| 958 | bind(do_last_block); |
| 959 | lea(TBL, ExternalAddress(K256_W)); |
| 960 | |
| 961 | movdqu(xmm4, Address(INP, 0*16)); |
| 962 | movdqu(xmm5, Address(INP, 1*16)); |
| 963 | movdqu(xmm6, Address(INP, 2*16)); |
| 964 | movdqu(xmm7, Address(INP, 3*16)); |
| 965 | |
| 966 | vpshufb(xmm4, xmm4, xmm13, AVX_128bit); |
| 967 | vpshufb(xmm5, xmm5, xmm13, AVX_128bit); |
| 968 | vpshufb(xmm6, xmm6, xmm13, AVX_128bit); |
| 969 | vpshufb(xmm7, xmm7, xmm13, AVX_128bit); |
| 970 | |
| 971 | jmp(last_block_enter); |
| 972 | |
| 973 | bind(only_one_block); |
| 974 | |
| 975 | // load initial digest ;; for the first block the state holds the SHA-256 initial hash values listed below |
| 976 | movl(a, Address(CTX, 4*0)); // 0x6a09e667 |
| 977 | movl(b, Address(CTX, 4*1)); // 0xbb67ae85 |
| 978 | movl(c, Address(CTX, 4*2)); // 0x3c6ef372 |
| 979 | movl(d, Address(CTX, 4*3)); // 0xa54ff53a |
| 980 | movl(e, Address(CTX, 4*4)); // 0x510e527f |
| 981 | movl(f, Address(CTX, 4*5)); // 0x9b05688c |
| 982 | // load g - r10 after use as scratch |
| 983 | movl(h, Address(CTX, 4*7)); // 0x5be0cd19 |
| 984 | |
| 985 | |
| 986 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; |
| 987 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] |
| 988 | vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] |
| 989 | vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] |
| 990 | |
| 991 | movl(g, Address(CTX, 4*6)); // 0x1f83d9ab |
| 992 | |
| 993 | movq(Address(rsp, _CTX), CTX); |
| 994 | jmpb(do_last_block); |
| 995 | |
| 996 | bind(done_hash); |
| 997 | |
| 998 | movq(rsp, Address(rsp, _RSP)); |
| 999 | |
| 1000 | pop(r15); |
| 1001 | pop(r14); |
| 1002 | pop(r13); |
| 1003 | pop(r12); |
| 1004 | pop(rbp); |
| 1005 | #ifdef _WIN64 |
| 1006 | pop(rdi); |
| 1007 | pop(rsi); |
| 1008 | #endif |
| 1009 | pop(rbx); |
| 1010 | |
| 1011 | #ifdef _WIN64 |
| 1012 | pop(r9); |
| 1013 | pop(r8); |
| 1014 | #else |
| 1015 | pop(rdx); |
| 1016 | pop(rcx); |
| 1017 | #endif |
| 1018 | |
| 1019 | if (multi_block) { |
| 1020 | #ifdef _WIN64 |
| 1021 | const Register& limit_end = r9; |
| 1022 | const Register& ofs_end = r8; |
| 1023 | #else |
| 1024 | const Register& limit_end = rcx; |
| 1025 | const Register& ofs_end = rdx; |
| 1026 | #endif |
| 1027 | movq(rax, ofs_end); |
| 1028 | |
| 1029 | bind(compute_size1); |
| 1030 | cmpptr(rax, limit_end); // assume the original ofs <= limit |
| 1031 | jccb(Assembler::aboveEqual, compute_size_end1); |
| 1032 | addq(rax, 64); |
| 1033 | jmpb(compute_size1); |
| 1034 | |
| 1035 | bind(compute_size_end1); |
| 1036 | } |
| 1037 | } |
| 1038 | |
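|  | // One SHA-512 round (scalar part only). The comments use the same shorthand as |
|  | // the SHA-256 rounds above, with the SHA-512 rotation amounts: |
|  | //   S1 = (e ror 14) ^ (e ror 18) ^ (e ror 41) |
|  | //   S0 = (a ror 28) ^ (a ror 34) ^ (a ror 39) |
|  | //   CH = ((f ^ g) & e) ^ g,   MAJ = ((a | c) & b) | (a & c) |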
| 1039 | void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, |
| 1040 | Register d, Register e, Register f, Register g, Register h, |
| 1041 | int iteration) |
| 1042 | { |
| 1043 | |
| 1044 | const Register& y0 = r13; |
| 1045 | const Register& y1 = r14; |
| 1046 | const Register& y2 = r15; |
| 1047 | #ifdef _WIN64 |
| 1048 | const Register& y3 = rcx; |
| 1049 | #else |
| 1050 | const Register& y3 = rdi; |
| 1051 | #endif |
| 1052 | const Register& T1 = r12; |
| 1053 | |
| 1054 | if (iteration % 4 > 0) { |
| 1055 | addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; |
| 1056 | } |
| 1057 | movq(y2, f); //y2 = f; CH |
| 1058 | rorxq(y0, e, 41); //y0 = e >> 41; S1A |
| 1059 | rorxq(y1, e, 18); //y1 = e >> 18; S1B |
| 1060 | xorq(y2, g); //y2 = f^g; CH |
| 1061 | |
| 1062 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1 |
| 1063 | rorxq(y1, e, 14); //y1 = (e >> 14); S1 |
| 1064 | andq(y2, e); //y2 = (f^g)&e; CH |
| 1065 | |
| 1066 | if (iteration % 4 > 0 ) { |
| 1067 | addq(old_h, y3); //h = t1 + S0 + MAJ |
| 1068 | } |
| 1069 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1 |
| 1070 | rorxq(T1, a, 34); //T1 = a >> 34; S0B |
| 1071 | xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH |
| 1072 | rorxq(y1, a, 39); //y1 = a >> 39; S0A |
| 1073 | movq(y3, a); //y3 = a; MAJA |
| 1074 | |
| 1075 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0 |
| 1076 | rorxq(T1, a, 28); //T1 = (a >> 28); S0 |
| 1077 | addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; -- |
| 1078 | orq(y3, c); //y3 = a | c; MAJA |
| 1079 | |
| 1080 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0 |
| 1081 | movq(T1, a); //T1 = a; MAJB |
| 1082 | andq(y3, b); //y3 = (a | c)&b; MAJA |
| 1083 | andq(T1, c); //T1 = a&c; MAJB |
| 1084 | addq(y2, y0); //y2 = S1 + CH; -- |
| 1085 | |
| 1086 | addq(d, h); //d = k + w + h + d; -- |
| 1087 | orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ |
| 1088 | addq(h, y1); //h = k + w + h + S0; -- |
| 1089 | |
| 1090 | addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; -- |
| 1091 | |
| 1092 | if (iteration % 4 == 3) { |
| 1093 | addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; -- |
| 1094 | addq(h, y3); //h = t1 + S0 + MAJ; -- |
| 1095 | } |
| 1096 | } |
| 1097 | |
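|  | // One SHA-512 round fused with a quarter of the message schedule |
|  | //   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], where |
|  | //   s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7) and |
|  | //   s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6). |
|  | // Four consecutive iterations produce the next four message qwords in the |
|  | // register passed as xmm4 (the first parameter). |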
| 1098 | void MacroAssembler::sha512_AVX2_one_round_and_schedule( |
| 1099 | XMMRegister xmm4, // ymm4 |
| 1100 | XMMRegister xmm5, // ymm5 |
| 1101 | XMMRegister xmm6, // ymm6 |
| 1102 | XMMRegister xmm7, // ymm7 |
| 1103 | Register a, //rax |
| 1104 | Register b, //rbx |
| 1105 | Register c, //rdi |
| 1106 | Register d, //rsi |
| 1107 | Register e, //r8 |
| 1108 | Register f, //r9 |
| 1109 | Register g, //r10 |
| 1110 | Register h, //r11 |
| 1111 | int iteration) |
| 1112 | { |
| 1113 | |
| 1114 | const Register& y0 = r13; |
| 1115 | const Register& y1 = r14; |
| 1116 | const Register& y2 = r15; |
| 1117 | #ifdef _WIN64 |
| 1118 | const Register& y3 = rcx; |
| 1119 | #else |
| 1120 | const Register& y3 = rdi; |
| 1121 | #endif |
| 1122 | const Register& T1 = r12; |
| 1123 | |
| 1124 | if (iteration % 4 == 0) { |
| 1125 | // Extract w[t - 7] |
| 1126 | // xmm0 = W[-7] |
| 1127 | vperm2f128(xmm0, xmm7, xmm6, 3); |
| 1128 | vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit); |
| 1129 | |
| 1130 | // Calculate w[t - 16] + w[t - 7] |
| 1131 | vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16] |
| 1132 | // Extract w[t - 15] |
| 1133 | //xmm1 = W[-15] |
| 1134 | vperm2f128(xmm1, xmm5, xmm4, 3); |
| 1135 | vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit); |
| 1136 | |
| 1137 | // Calculate sigma0 |
| 1138 | // Calculate w[t - 15] ror 1 |
| 1139 | vpsrlq(xmm2, xmm1, 1, AVX_256bit); |
| 1140 | vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit); |
| 1141 | vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1 |
| 1142 | // Calculate w[t - 15] shr 7 |
| 1143 | vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7 |
| 1144 | |
| 1145 | } else if (iteration % 4 == 1) { |
| 1146 | //Calculate w[t - 15] ror 8 |
| 1147 | vpsrlq(xmm2, xmm1, 8, AVX_256bit); |
| 1148 | vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit); |
| 1149 | vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8 |
| 1150 | |
| 1151 | //XOR the three components |
| 1152 | vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7 |
| 1153 | vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0 |
| 1154 | |
| 1155 | //Add three components, w[t - 16], w[t - 7] and sigma0 |
| 1156 | vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0 |
| 1157 | |
| 1158 | // Move to appropriate lanes for calculating w[16] and w[17] |
| 1159 | vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA } |
| 1160 | |
| 1161 | //Move to appropriate lanes for calculating w[18] and w[19] |
| 1162 | vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 } |
| 1163 | //Calculate w[16] and w[17] in both 128 bit lanes |
| 1164 | //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes |
| 1165 | vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA} |
| 1166 | vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA} |
| 1167 | |
| 1168 | } else if (iteration % 4 == 2) { |
| 1169 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA} |
| 1170 | vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA} |
| 1171 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA} |
| 1172 | vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} |
| 1173 | vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA} |
| 1174 | vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA} |
| 1175 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA} |
| 1176 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA } |
| 1177 | |
| 1178 | //Add sigma1 to the other components to get w[16] and w[17] |
| 1179 | vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] } |
| 1180 | |
| 1181 | //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane |
| 1182 | vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--} |
| 1183 | |
| 1184 | } else if (iteration % 4 == 3){ |
| 1185 | vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--} |
| 1186 | vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--} |
| 1187 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--} |
| 1188 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} |
| 1189 | vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--} |
| 1190 | vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--} |
| 1191 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--} |
| 1192 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- } |
| 1193 | |
| 1194 | //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] |
| 1195 | vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- } |
| 1196 | |
| 1197 | //Form w[19], w[18], w[17], w[16] |
| 1198 | vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] } |
| 1199 | } |
| 1200 | |
| 1201 | movq(y3, a); //y3 = a; MAJA |
| 1202 | rorxq(y0, e, 41); // y0 = e >> 41; S1A |
| 1203 | rorxq(y1, e, 18); //y1 = e >> 18; S1B |
| 1204 | addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; -- |
| 1205 | orq(y3, c); //y3 = a | c; MAJA |
| 1206 | movq(y2, f); //y2 = f; CH |
| 1207 | |
| 1208 | xorq(y2, g); //y2 = f^g; CH |
| 1209 | |
| 1210 | rorxq(T1, a, 34); //T1 = a >> 34; S0B |
| 1211 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1 |
| 1212 | |
| 1213 | rorxq(y1, e, 14); //y1 = (e >> 14); S1 |
| 1214 | |
| 1215 | andq(y2, e); //y2 = (f^g) & e; CH |
| 1216 | addq(d, h); //d = k + w + h + d; -- |
| 1217 | |
| 1218 | andq(y3, b); //y3 = (a | c)&b; MAJA |
| 1219 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1 |
| 1220 | rorxq(y1, a, 39); //y1 = a >> 39; S0A |
| 1221 | |
| 1222 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0 |
| 1223 | rorxq(T1, a, 28); //T1 = (a >> 28); S0 |
| 1224 | xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH |
| 1225 | |
| 1226 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0 |
| 1227 | movq(T1, a); //T1 = a; MAJB |
| 1228 | |
| 1229 | andq(T1, c); //T1 = a&c; MAJB |
| 1230 | addq(y2, y0); //y2 = S1 + CH; -- |
| 1231 | |
| 1232 | orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ |
| 1233 | addq(h, y1); //h = k + w + h + S0; -- |
| 1234 | |
| 1235 | addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; -- |
| 1236 | addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; -- |
| 1237 | addq(h, y3); //h = t1 + S0 + MAJ; -- |
| 1238 | } |
| 1239 | |
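|  | // AVX2/rorx SHA-512 compression; one 128-byte block is processed per pass of loop0. |
|  | // ofs and limit are used for the multi-block byte array case, as in |
|  | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit), |
|  | // and the updated ofs is left in rax. |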
| 1240 | void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 1241 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 1242 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 1243 | bool multi_block, XMMRegister shuf_mask) |
| 1244 | { |
| 1245 | |
| 1246 | Label loop0, loop1, loop2, done_hash, |
| 1247 | compute_block_size, compute_size, |
| 1248 | compute_block_size_end, compute_size_end; |
| 1249 | |
| 1250 | address K512_W = StubRoutines::x86::k512_W_addr(); |
| 1251 | address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512(); |
| 1252 | address pshuffle_byte_flip_mask_addr = 0; |
| 1253 | |
| 1254 | const XMMRegister& XFER = xmm0; // YTMP0 |
| 1255 | const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9 |
| 1256 | const XMMRegister& YMM_MASK_LO = xmm10; // ymm10 |
| 1257 | #ifdef _WIN64 |
| 1258 | const Register& INP = rcx; //1st arg |
| 1259 | const Register& CTX = rdx; //2nd arg |
| 1260 | const Register& NUM_BLKS = r8; //3rd arg |
| 1261 | const Register& c = rdi; |
| 1262 | const Register& d = rsi; |
| 1263 | const Register& e = r8; |
| 1264 | const Register& y3 = rcx; |
| 1265 | const Register& offset = r8; |
| 1266 | const Register& input_limit = r9; |
| 1267 | #else |
| 1268 | const Register& INP = rdi; //1st arg |
| 1269 | const Register& CTX = rsi; //2nd arg |
| 1270 | const Register& NUM_BLKS = rdx; //3rd arg |
| 1271 | const Register& c = rcx; |
| 1272 | const Register& d = r8; |
| 1273 | const Register& e = rdx; |
| 1274 | const Register& y3 = rdi; |
| 1275 | const Register& offset = rdx; |
| 1276 | const Register& input_limit = rcx; |
| 1277 | #endif |
| 1278 | |
| 1279 | const Register& TBL = rbp; |
| 1280 | |
| 1281 | const Register& a = rax; |
| 1282 | const Register& b = rbx; |
| 1283 | |
| 1284 | const Register& f = r9; |
| 1285 | const Register& g = r10; |
| 1286 | const Register& h = r11; |
| 1287 | |
| 1288 | //Local variables as defined in assembly file. |
| 1289 | enum |
| 1290 | { |
| 1291 | _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8 |
| 1292 | _SRND_SIZE = 8, // resq 1 |
| 1293 | _INP_SIZE = 8, |
| 1294 | _INP_END_SIZE = 8, |
| 1295 | _RSP_SAVE_SIZE = 8, // defined as resq 1 |
| 1296 | |
| 1297 | #ifdef _WIN64 |
| 1298 | _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8 |
| 1299 | #else |
| 1300 | _GPR_SAVE_SIZE = 6 * 8 // resq 6 |
| 1301 | #endif |
| 1302 | }; |
| 1303 | |
| 1304 | enum |
| 1305 | { |
| 1306 | _XFER = 0, |
| 1307 | _SRND = _XFER + _XFER_SIZE, // 32 |
| 1308 | _INP = _SRND + _SRND_SIZE, // 40 |
| 1309 | _INP_END = _INP + _INP_SIZE, // 48 |
| 1310 | _RSP = _INP_END + _INP_END_SIZE, // 56 |
| 1311 | _GPR = _RSP + _RSP_SAVE_SIZE, // 64 |
| 1312 | _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux. |
| 1313 | }; |
| 1314 | |
| 1315 | //Save offset and limit; they are needed for the block-size calculation for multi-block SHA-512. |
| 1316 | #ifdef _WIN64 |
| 1317 | push(r8); // win64: this is ofs |
| 1318 | push(r9); // win64: this is limit, we need them again at the very end. |
| 1319 | #else |
| 1320 | push(rdx); // linux : this is ofs, need at the end for multiblock calculation |
| 1321 | push(rcx); // linux: This is the limit. |
| 1322 | #endif |
| 1323 | |
| 1324 | //Allocate Stack Space |
| 1325 | movq(rax, rsp); |
| 1326 | subq(rsp, _STACK_SIZE); |
| 1327 | andq(rsp, -32); |
| 1328 | movq(Address(rsp, _RSP), rax); |
| 1329 | |
| 1330 | //Save GPRs |
| 1331 | movq(Address(rsp, _GPR), rbp); |
| 1332 | movq(Address(rsp, (_GPR + 8)), rbx); |
| 1333 | movq(Address(rsp, (_GPR + 16)), r12); |
| 1334 | movq(Address(rsp, (_GPR + 24)), r13); |
| 1335 | movq(Address(rsp, (_GPR + 32)), r14); |
| 1336 | movq(Address(rsp, (_GPR + 40)), r15); |
| 1337 | |
| 1338 | #ifdef _WIN64 |
| 1339 | movq(Address(rsp, (_GPR + 48)), rsi); |
| 1340 | movq(Address(rsp, (_GPR + 56)), rdi); |
| 1341 | #endif |
| 1342 | |
| 1343 | vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit); |
| 1344 | vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit); |
| 1345 | |
| 1346 | if (multi_block) { |
| 1347 | xorq(rax, rax); |
| 1348 | bind(compute_block_size); |
| 1349 | cmpptr(offset, input_limit); // Assuming that offset is less than limit. |
| 1350 | jccb(Assembler::aboveEqual, compute_block_size_end); |
| 1351 | addq(offset, 128); |
| 1352 | addq(rax, 128); |
| 1353 | jmpb(compute_block_size); |
| 1354 | |
| 1355 | bind(compute_block_size_end); |
| 1356 | movq(NUM_BLKS, rax); |
| 1357 | |
| 1358 | cmpq(NUM_BLKS, 0); |
| 1359 | jcc(Assembler::equal, done_hash); |
| 1360 | } else { |
| 1361 | xorq(NUM_BLKS, NUM_BLKS); //If single block. |
| 1362 | addq(NUM_BLKS, 128); |
| 1363 | } |
| 1364 | |
| 1365 | addq(NUM_BLKS, INP); //pointer to end of data |
| 1366 | movq(Address(rsp, _INP_END), NUM_BLKS); |
| 1367 | |
| 1368 | //load initial digest |
| 1369 | movq(a, Address(CTX, 8 * 0)); |
| 1370 | movq(b, Address(CTX, 8 * 1)); |
| 1371 | movq(c, Address(CTX, 8 * 2)); |
| 1372 | movq(d, Address(CTX, 8 * 3)); |
| 1373 | movq(e, Address(CTX, 8 * 4)); |
| 1374 | movq(f, Address(CTX, 8 * 5)); |
| 1375 | // load g - r10 after it is used as scratch |
| 1376 | movq(h, Address(CTX, 8 * 7)); |
| 1377 | |
| 1378 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512; |
| 1379 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip |
| 1380 | vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); |
| 1381 | |
| 1382 | movq(g, Address(CTX, 8 * 6)); |
| 1383 | |
| 1384 | bind(loop0); |
| 1385 | lea(TBL, ExternalAddress(K512_W)); |
| 1386 | |
| 1387 | //byte swap first 16 qwords |
| 1388 | vmovdqu(xmm4, Address(INP, 32 * 0)); |
| 1389 | vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit); |
| 1390 | vmovdqu(xmm5, Address(INP, 32 * 1)); |
| 1391 | vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit); |
| 1392 | vmovdqu(xmm6, Address(INP, 32 * 2)); |
| 1393 | vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit); |
| 1394 | vmovdqu(xmm7, Address(INP, 32 * 3)); |
| 1395 | vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit); |
| 1396 | |
| 1397 | movq(Address(rsp, _INP), INP); |
| 1398 | |
| 1399 | movslq(Address(rsp, _SRND), 4); |
| 1400 | align(16); |
| 1401 | |
| 1402 | //Schedule 64 message qwords and run the first 64 rounds, by calling sha512_AVX2_one_round_and_schedule |
| 1403 | bind(loop1); |
| 1404 | vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit); |
| 1405 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1406 | //four rounds and schedule |
| 1407 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0); |
| 1408 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1); |
| 1409 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2); |
| 1410 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3); |
| 1411 | |
| 1412 | vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit); |
| 1413 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1414 | //four rounds and schedule |
| 1415 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0); |
| 1416 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1); |
| 1417 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2); |
| 1418 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3); |
| 1419 | |
| 1420 | vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit); |
| 1421 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1422 | //four rounds and schedule |
| 1423 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0); |
| 1424 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1); |
| 1425 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2); |
| 1426 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3); |
| 1427 | |
| 1428 | vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit); |
| 1429 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1430 | addq(TBL, 4 * 32); |
| 1431 | //four rounds and schedule |
| 1432 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0); |
| 1433 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1); |
| 1434 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2); |
| 1435 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3); |
| 1436 | |
| 1437 | subq(Address(rsp, _SRND), 1); |
| 1438 | jcc(Assembler::notEqual, loop1); |
| 1439 | |
| 1440 | movslq(Address(rsp, _SRND), 2); |
| 1441 | |
| 1442 | bind(loop2); |
| 1443 | vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit); |
| 1444 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1445 | //four rounds and compute. |
| 1446 | sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0); |
| 1447 | sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1); |
| 1448 | sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2); |
| 1449 | sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3); |
| 1450 | |
| 1451 | vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit); |
| 1452 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1453 | addq(TBL, 2 * 32); |
| 1454 | // four rounds and compute. |
| 1455 | sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0); |
| 1456 | sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1); |
| 1457 | sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2); |
| 1458 | sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3); |
| 1459 | |
| 1460 | vmovdqu(xmm4, xmm6); |
| 1461 | vmovdqu(xmm5, xmm7); |
| 1462 | |
| 1463 | subq(Address(rsp, _SRND), 1); |
| 1464 | jcc(Assembler::notEqual, loop2); |
| 1465 | |
| 1466 | addmq(8 * 0, CTX, a); |
| 1467 | addmq(8 * 1, CTX, b); |
| 1468 | addmq(8 * 2, CTX, c); |
| 1469 | addmq(8 * 3, CTX, d); |
| 1470 | addmq(8 * 4, CTX, e); |
| 1471 | addmq(8 * 5, CTX, f); |
| 1472 | addmq(8 * 6, CTX, g); |
| 1473 | addmq(8 * 7, CTX, h); |
| 1474 | |
| 1475 | movq(INP, Address(rsp, _INP)); |
| 1476 | addq(INP, 128); |
| 1477 | cmpq(INP, Address(rsp, _INP_END)); |
| 1478 | jcc(Assembler::notEqual, loop0); |
| 1479 | |
| 1480 | bind(done_hash); |
| 1481 | |
| 1482 | //Restore GPRs |
| 1483 | movq(rbp, Address(rsp, (_GPR + 0))); |
| 1484 | movq(rbx, Address(rsp, (_GPR + 8))); |
| 1485 | movq(r12, Address(rsp, (_GPR + 16))); |
| 1486 | movq(r13, Address(rsp, (_GPR + 24))); |
| 1487 | movq(r14, Address(rsp, (_GPR + 32))); |
| 1488 | movq(r15, Address(rsp, (_GPR + 40))); |
| 1489 | |
| 1490 | #ifdef _WIN64 |
| 1491 | movq(rsi, Address(rsp, (_GPR + 48))); |
| 1492 | movq(rdi, Address(rsp, (_GPR + 56))); |
| 1493 | #endif |
| 1494 | |
| 1495 | //Restore Stack Pointer |
| 1496 | movq(rsp, Address(rsp, _RSP)); |
| 1497 | |
| 1498 | #ifdef _WIN64 |
| 1499 | pop(r9); |
| 1500 | pop(r8); |
| 1501 | #else |
| 1502 | pop(rcx); |
| 1503 | pop(rdx); |
| 1504 | #endif |
| 1505 | |
| 1506 | if (multi_block) { |
| 1507 | #ifdef _WIN64 |
| 1508 | const Register& limit_end = r9; |
| 1509 | const Register& ofs_end = r8; |
| 1510 | #else |
| 1511 | const Register& limit_end = rcx; |
| 1512 | const Register& ofs_end = rdx; |
| 1513 | #endif |
| 1514 | movq(rax, ofs_end); |
| 1515 | bind(compute_size); |
| 1516 | cmpptr(rax, limit_end); |
| 1517 | jccb(Assembler::aboveEqual, compute_size_end); |
| 1518 | addq(rax, 128); |
| 1519 | jmpb(compute_size); |
| 1520 | bind(compute_size_end); |
| 1521 | } |
| 1522 | } |
| 1523 | |
| 1524 | #endif //#ifdef _LP64 |
| 1525 | |
| 1526 | |