1/*
2* Copyright (c) 2016, Intel Corporation.
3*
4* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5*
6* This code is free software; you can redistribute it and/or modify it
7* under the terms of the GNU General Public License version 2 only, as
8* published by the Free Software Foundation.
9*
10* This code is distributed in the hope that it will be useful, but WITHOUT
11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13* version 2 for more details (a copy is included in the LICENSE file that
14* accompanied this code).
15*
16* You should have received a copy of the GNU General Public License version
17* 2 along with this work; if not, write to the Free Software Foundation,
18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19*
20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21* or visit www.oracle.com if you need additional information or have any
22* questions.
23*
24*/
25
26#include "precompiled.hpp"
27#include "asm/assembler.hpp"
28#include "asm/assembler.inline.hpp"
29#include "runtime/stubRoutines.hpp"
30#include "macroAssembler_x86.hpp"
31
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
//
// Emits SHA-1 block compression using the x86 SHA extension instructions
// (sha1rnds4 / sha1nexte / sha1msg1 / sha1msg2).
//
// Register roles:
//   abcd        - the four state words a,b,c,d in one 128-bit register
//   e0, e1      - alternately carry the e state word (kept in the top dword)
//   msg0..msg3  - the four 16-byte message-schedule chunks
//   shuf_mask   - scratch, loaded with the two constant masks below
//   buf         - pointer to the 64-byte input block(s)
//   state       - pointer to the 20-byte SHA-1 digest (a..e)
//   ofs, limit  - byte-array offset/limit, used only when multi_block is true
//   rsp         - pointer to a 32-byte scratch area used to save the entering
//                 state so it can be added back after the 80 rounds
//   multi_block - when true, loop over consecutive 64-byte blocks until
//                 ofs > limit and leave the updated ofs in rax (the Java
//                 intrinsic's return value)
//
// The immediate of sha1rnds4 (0..3) selects the 20-round group, i.e. which
// round function and K constant the instruction applies.
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  // Load a,b,c,d and insert e into the top dword of e0; the pand keeps only
  // that top dword (mask below), and pshufd 0x1B reverses the dword order of
  // abcd into the layout the SHA instructions expect.
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);
  pshufd(abcd, abcd, 0x1B);
  // Byte-swap mask for converting the big-endian message to dwords.
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);


  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);
  paddd(e0, msg0);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // From here on the schedule is computed on the fly: each 4-round group
  // advances one msg register via sha1msg1/pxor/sha1msg2 while e0/e1
  // alternate as the rotated e accumulator.

  // Rounds 16 - 19
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);

  // add current hash values with previously saved
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);

  bind(done_hash);

}
235
// xmm0 (msg) is used as an implicit argument to sha256rnds2
// and state0 and state1 can never use the xmm0 register.
238// ofs and limit are used for multi-block byte array.
239// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
  // Emits SHA-256 block compression using the x86 SHA extension instructions
  // (sha256rnds2 / sha256msg1 / sha256msg2).
  //
  // Register roles:
  //   msg            - must be xmm0: sha256rnds2 reads its WK operand from xmm0
  //   state0, state1 - the eight state words in the ABEF/CDGH layout that
  //                    sha256rnds2 requires (built by the shuffle below)
  //   msgtmp0..3     - the four 16-byte message-schedule chunks
  //   msgtmp4        - scratch for palignr-based schedule updates
  //   buf, state     - input block pointer / 32-byte digest pointer
  //   ofs, limit     - used only when multi_block is true; updated ofs is
  //                    returned in rax
  //   rsp            - 32-byte scratch area saving the state across rounds
  //   shuf_mask      - (64-bit only) holds the byte-flip mask; the 32-bit
  //                    build folds it as a memory operand into pshufb
  //
  // rax is used as the pointer to the K256 round-constant table.
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  // Rearrange the linear a..h words into the ABEF (state0) / CDGH (state1)
  // register layout expected by sha256rnds2.
  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256));

  bind(loop0);
  // Save the entering state; added back after the 64 rounds.
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Each 4-round group below: load/derive 4 schedule dwords into msg(xmm0),
  // add the round constants, run two sha256rnds2 (low pair then, after the
  // pshufd 0x0E moves the high qword down, the high pair).

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  // Add the state saved at loop entry (Merkle-Damgard feed-forward).
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  // Undo the ABEF/CDGH shuffle and store the digest back as linear a..h.
  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}
496
497#ifdef _LP64
498/*
499 The algorithm below is based on Intel publication:
500 "Fast SHA-256 Implementations on Intelë Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
501 The assembly code was originally provided by Sean Gulley and in many places preserves
502 the original assembly NAMES and comments to simplify matching Java assembly with its original.
503 The Java version was substantially redesigned to replace 1200 assembly instruction with
504 much shorter run-time generator of the same code in memory.
505*/
506
// Emits one scalar round of the AVX2 SHA-256 compression function.
//
// reg_a..reg_h are the eight working variables for this round; the callers
// rotate the register assignment right by one for each successive round.
// reg_old_h (the same register the callers pass as reg_a) receives the
// deferred final additions of the PREVIOUS round — they are interleaved into
// this round for better instruction scheduling, hence the iter%4 guards: the
// first round of a 4-round group has nothing to catch up on, and the last
// round (iter%4 == 3) completes its own h inline.
//
// The pre-computed k+w value for round `iter` is read from the stack at
// rsp + rdx + 4*iter (rdx is the SRND round-block offset in the caller).
// Clobbers rcx and r12-r15 as scratch.
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register reg_old_h,
    Register reg_a,
    Register reg_b,
    Register reg_c,
    Register reg_d,
    Register reg_e,
    Register reg_f,
    Register reg_g,
    Register reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11   ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                          ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                  ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
  }

  xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
  xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
  rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
  movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
  rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)               ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                          ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
  andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                  ; MAJA
  andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                          ; MAJB
  addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --


  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                 ; --
  orl(reg_y3, reg_T1);        // reg_y3 = MAJ = (reg_a|reg_c)&reg_b)|(reg_a&reg_c)             ; MAJ
  addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --

  addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --


  if (iter%4 == 3) {
    addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0;      --
    addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
  }
}
570
571void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
572 sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0);
573 sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1);
574 sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2);
575 sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3);
576}
577
578void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
579 sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
580 sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1);
581 sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2);
582 sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3);
583}
584
// Emits one scalar SHA-256 round (same dataflow as
// sha256_AVX2_one_round_compute, but with the final h additions done inline)
// interleaved with one quarter of the AVX2 message-schedule update for a
// future 4-round group. Four consecutive calls (iter%4 == 0..3) advance the
// schedule register xmm_0 by 16 dwords-worth of W values across two blocks.
//
// Scratch: rcx, r12-r15 (scalar) and ymm0-ymm3, ymm8, ymm11 (vector).
// ymm10 and ymm12 must hold the SHUF_00BA / SHUF_DC00 shuffle constants
// loaded by the caller (sha256_AVX2).
void MacroAssembler::sha256_AVX2_one_round_and_sched(
        XMMRegister  xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister  xmm_2,     /* ymm6 */
        XMMRegister  xmm_3,     /* ymm7 */
        Register  reg_a,        /* == rax on 0 iteration, then rotate 8 register right on each next iteration */
        Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
        Register  reg_c,        /* rdi */
        Register  reg_d,        /* rsi */
        Register  reg_e,        /* r8 */
        Register  reg_f,        /* r9d */
        Register  reg_g,        /* r10d */
        Register  reg_h,        /* r11d */
        int iter)
{
  // Scalar round: S1/S0 rotations via rorx, CH and MAJ via and/or/xor,
  // k+w read from the stack slot prepared by the caller.
  movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
  rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
  rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA

  movl(r15, reg_f);           // r15 = reg_f               ; CH
  rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH

  rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)     ; S1
  andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH

  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --

  andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0

  rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)               ; S0
  xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH

  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
  andl(r12, reg_c);          // r12 = reg_a&reg_c                          ; MAJB
  addl(r15, r13);            // r15 = S1 + CH                          ; --

  orl(rcx, r12);             // rcx = MAJ = (reg_a|reg_c)&reg_b)|(reg_a&reg_c)             ; MAJ
  addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
  addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --

  addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --

  // Message-schedule quarter: the sigma0/sigma1 expansion of W[t] =
  // sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], split across four
  // calls so it overlaps four scalar rounds.
  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1,18, AVX_256bit);
  } else if (iter%4 == 1 ) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}
671
672void MacroAssembler::addm(int disp, Register r1, Register r2) {
673 addl(r2, Address(r1, disp));
674 movl(Address(r1, disp), r2);
675}
676
677void MacroAssembler::addmq(int disp, Register r1, Register r2) {
678 addq(r2, Address(r1, disp));
679 movq(Address(r1, disp), r2);
680}
681
682void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
683 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
684 Register buf, Register state, Register ofs, Register limit, Register rsp,
685 bool multi_block, XMMRegister shuf_mask) {
686
687 Label loop0, loop1, loop2, loop3,
688 last_block_enter, do_last_block, only_one_block, done_hash,
689 compute_size, compute_size_end,
690 compute_size1, compute_size_end1;
691
692 address K256_W = StubRoutines::x86::k256_W_addr();
693 address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
694 address pshuffle_byte_flip_mask_addr = 0;
695
696const XMMRegister& SHUF_00BA = xmm10; // ymm10: shuffle xBxA -> 00BA
697const XMMRegister& SHUF_DC00 = xmm12; // ymm12: shuffle xDxC -> DC00
698const XMMRegister& BYTE_FLIP_MASK = xmm13; // ymm13
699
700const XMMRegister& X_BYTE_FLIP_MASK = xmm13; //XMM version of BYTE_FLIP_MASK
701
702const Register& NUM_BLKS = r8; // 3rd arg
703const Register& CTX = rdx; // 2nd arg
704const Register& INP = rcx; // 1st arg
705
706const Register& c = rdi;
707const Register& d = rsi;
708const Register& e = r8; // clobbers NUM_BLKS
709const Register& y3 = rcx; // clobbers INP
710
711const Register& TBL = rbp;
712const Register& SRND = CTX; // SRND is same register as CTX
713
714const Register& a = rax;
715const Register& b = rbx;
716const Register& f = r9;
717const Register& g = r10;
718const Register& h = r11;
719
720const Register& T1 = r12;
721const Register& y0 = r13;
722const Register& y1 = r14;
723const Register& y2 = r15;
724
725
726enum {
727 _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
728 _INP_END_SIZE = 8,
729 _INP_SIZE = 8,
730 _CTX_SIZE = 8,
731 _RSP_SIZE = 8,
732
733 _XFER = 0,
734 _INP_END = _XFER + _XFER_SIZE,
735 _INP = _INP_END + _INP_END_SIZE,
736 _CTX = _INP + _INP_SIZE,
737 _RSP = _CTX + _CTX_SIZE,
738 STACK_SIZE = _RSP + _RSP_SIZE
739};
740
741#ifndef _WIN64
742 push(rcx); // linux: this is limit, need at the end
743 push(rdx); // linux: this is ofs
744#else
745 push(r8); // win64: this is ofs
746 push(r9); // win64: this is limit, we need them again at the very and
747#endif
748
749
750 push(rbx);
751#ifdef _WIN64
752 push(rsi);
753 push(rdi);
754#endif
755 push(rbp);
756 push(r12);
757 push(r13);
758 push(r14);
759 push(r15);
760
761 movq(rax, rsp);
762 subq(rsp, STACK_SIZE);
763 andq(rsp, -32);
764 movq(Address(rsp, _RSP), rax);
765
766#ifndef _WIN64
767 // copy linux params to win64 params, therefore the rest of code will be the same for both
768 movq(r9, rcx);
769 movq(r8, rdx);
770 movq(rdx, rsi);
771 movq(rcx, rdi);
772#endif
773
774 // setting original assembly ABI
775 /** message to encrypt in INP */
776 lea(INP, Address(rcx, 0)); // rcx == message (buf) ;; linux: INP = buf = rdi
777 /** digest in CTX */
778 movq(CTX, rdx); // rdx = digest (state) ;; linux: CTX = state = rsi
779
780 /** NUM_BLK is the length of message, need to set it from ofs and limit */
781 if (multi_block) {
782
783 // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
784 // on entry r8 = ofs
785 // on exit r8 = NUM_BLKS
786
787 xorq(rax, rax);
788
789 bind(compute_size);
790 cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx
791 jccb(Assembler::aboveEqual, compute_size_end);
792 addq(r8, 64); //;; linux: ofs = rdx
793 addq(rax, 64);
794 jmpb(compute_size);
795
796 bind(compute_size_end);
797 movq(NUM_BLKS, rax); // NUM_BLK (r8) ;; linux: NUM_BLK = rdx
798
799 cmpq(NUM_BLKS, 0);
800 jcc(Assembler::equal, done_hash);
801
802 } else {
803 xorq(NUM_BLKS, NUM_BLKS);
804 addq(NUM_BLKS, 64);
805 }//if (!multi_block)
806
807 lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
808 movq(Address(rsp, _INP_END), NUM_BLKS); //
809
810 cmpptr(INP, NUM_BLKS); //cmp INP, NUM_BLKS
811 jcc(Assembler::equal, only_one_block); //je only_one_block
812
813 // load initial digest
814 movl(a, Address(CTX, 4*0));
815 movl(b, Address(CTX, 4*1));
816 movl(c, Address(CTX, 4*2));
817 movl(d, Address(CTX, 4*3));
818 movl(e, Address(CTX, 4*4));
819 movl(f, Address(CTX, 4*5));
820 // load g - r10 after it is used as scratch
821 movl(h, Address(CTX, 4*7));
822
823 pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
824 vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
825 vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
826 vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]
827
828 movl(g, Address(CTX, 4*6));
829
830 movq(Address(rsp, _CTX), CTX); // store
831
832bind(loop0);
833 lea(TBL, ExternalAddress(K256_W));
834
835 // assume buffers not aligned
836
837 // Load first 16 dwords from two blocks
838 vmovdqu(xmm0, Address(INP, 0*32));
839 vmovdqu(xmm1, Address(INP, 1*32));
840 vmovdqu(xmm2, Address(INP, 2*32));
841 vmovdqu(xmm3, Address(INP, 3*32));
842
843 // byte swap data
844 vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
845 vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
846 vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
847 vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
848
849 // transpose data into high/low halves
850 vperm2i128(xmm4, xmm0, xmm2, 0x20);
851 vperm2i128(xmm5, xmm0, xmm2, 0x31);
852 vperm2i128(xmm6, xmm1, xmm3, 0x20);
853 vperm2i128(xmm7, xmm1, xmm3, 0x31);
854
855bind(last_block_enter);
856 addq(INP, 64);
857 movq(Address(rsp, _INP), INP);
858
859 //;; schedule 48 input dwords, by doing 3 rounds of 12 each
860 xorq(SRND, SRND);
861
862align(16);
863bind(loop1);
864 vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
865 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
866 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0);
867 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1);
868 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2);
869 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3);
870
871 vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
872 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
873 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0);
874 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1);
875 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2);
876 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3);
877
878 vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
879 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
880 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0);
881 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1);
882 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2);
883 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3);
884
885 vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
886 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
887
888 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0);
889 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1);
890 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2);
891 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3);
892
893 addq(SRND, 4*32);
894 cmpq(SRND, 3 * 4*32);
895 jcc(Assembler::below, loop1);
896
897bind(loop2);
898 // Do last 16 rounds with no scheduling
899 vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
900 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
901 sha256_AVX2_four_rounds_compute_first(0);
902
903 vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
904 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
905 sha256_AVX2_four_rounds_compute_last(0 + 8);
906
907 addq(SRND, 2*32);
908
909 vmovdqu(xmm4, xmm6);
910 vmovdqu(xmm5, xmm7);
911
912 cmpq(SRND, 4 * 4*32);
913 jcc(Assembler::below, loop2);
914
915 movq(CTX, Address(rsp, _CTX));
916 movq(INP, Address(rsp, _INP));
917
918 addm(4*0, CTX, a);
919 addm(4*1, CTX, b);
920 addm(4*2, CTX, c);
921 addm(4*3, CTX, d);
922 addm(4*4, CTX, e);
923 addm(4*5, CTX, f);
924 addm(4*6, CTX, g);
925 addm(4*7, CTX, h);
926
927 cmpq(INP, Address(rsp, _INP_END));
928 jcc(Assembler::above, done_hash);
929
930 //Do second block using previously scheduled results
931 xorq(SRND, SRND);
932align(16);
933bind(loop3);
934 sha256_AVX2_four_rounds_compute_first(4);
935 sha256_AVX2_four_rounds_compute_last(4+8);
936
937 addq(SRND, 2*32);
938 cmpq(SRND, 4 * 4*32);
939 jcc(Assembler::below, loop3);
940
941 movq(CTX, Address(rsp, _CTX));
942 movq(INP, Address(rsp, _INP));
943 addq(INP, 64);
944
945 addm(4*0, CTX, a);
946 addm(4*1, CTX, b);
947 addm(4*2, CTX, c);
948 addm(4*3, CTX, d);
949 addm(4*4, CTX, e);
950 addm(4*5, CTX, f);
951 addm(4*6, CTX, g);
952 addm(4*7, CTX, h);
953
954 cmpq(INP, Address(rsp, _INP_END));
955 jcc(Assembler::below, loop0);
956 jccb(Assembler::above, done_hash);
957
958bind(do_last_block);
959 lea(TBL, ExternalAddress(K256_W));
960
961 movdqu(xmm4, Address(INP, 0*16));
962 movdqu(xmm5, Address(INP, 1*16));
963 movdqu(xmm6, Address(INP, 2*16));
964 movdqu(xmm7, Address(INP, 3*16));
965
966 vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
967 vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
968 vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
969 vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
970
971 jmp(last_block_enter);
972
973bind(only_one_block);
974
975 // load initial digest ;; table should be preloaded with following values
976 movl(a, Address(CTX, 4*0)); // 0x6a09e667
977 movl(b, Address(CTX, 4*1)); // 0xbb67ae85
978 movl(c, Address(CTX, 4*2)); // 0x3c6ef372
979 movl(d, Address(CTX, 4*3)); // 0xa54ff53a
980 movl(e, Address(CTX, 4*4)); // 0x510e527f
981 movl(f, Address(CTX, 4*5)); // 0x9b05688c
982 // load g - r10 after use as scratch
983 movl(h, Address(CTX, 4*7)); // 0x5be0cd19
984
985
986 pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
987 vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
988 vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
989 vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]
990
991 movl(g, Address(CTX, 4*6)); // 0x1f83d9ab
992
993 movq(Address(rsp, _CTX), CTX);
994 jmpb(do_last_block);
995
996bind(done_hash);
997
998 movq(rsp, Address(rsp, _RSP));
999
1000 pop(r15);
1001 pop(r14);
1002 pop(r13);
1003 pop(r12);
1004 pop(rbp);
1005#ifdef _WIN64
1006 pop(rdi);
1007 pop(rsi);
1008#endif
1009 pop(rbx);
1010
1011#ifdef _WIN64
1012 pop(r9);
1013 pop(r8);
1014#else
1015 pop(rdx);
1016 pop(rcx);
1017#endif
1018
1019 if (multi_block) {
1020#ifdef _WIN64
1021const Register& limit_end = r9;
1022const Register& ofs_end = r8;
1023#else
1024const Register& limit_end = rcx;
1025const Register& ofs_end = rdx;
1026#endif
1027 movq(rax, ofs_end);
1028
1029bind(compute_size1);
1030 cmpptr(rax, limit_end); // assume the original ofs <= limit
1031 jccb(Assembler::aboveEqual, compute_size_end1);
1032 addq(rax, 64);
1033 jmpb(compute_size1);
1034
1035bind(compute_size_end1);
1036 }
1037}
1038
1039void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c,
1040 Register d, Register e, Register f, Register g, Register h,
1041 int iteration)
1042{
1043
1044 const Register& y0 = r13;
1045 const Register& y1 = r14;
1046 const Register& y2 = r15;
1047#ifdef _WIN64
1048 const Register& y3 = rcx;
1049#else
1050 const Register& y3 = rdi;
1051#endif
1052 const Register& T1 = r12;
1053
1054 if (iteration % 4 > 0) {
1055 addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
1056 }
1057 movq(y2, f); //y2 = f; CH
1058 rorxq(y0, e, 41); //y0 = e >> 41; S1A
1059 rorxq(y1, e, 18); //y1 = e >> 18; S1B
1060 xorq(y2, g); //y2 = f^g; CH
1061
1062 xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
1063 rorxq(y1, e, 14); //y1 = (e >> 14); S1
1064 andq(y2, e); //y2 = (f^g)&e; CH
1065
1066 if (iteration % 4 > 0 ) {
1067 addq(old_h, y3); //h = t1 + S0 + MAJ
1068 }
1069 xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
1070 rorxq(T1, a, 34); //T1 = a >> 34; S0B
1071 xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
1072 rorxq(y1, a, 39); //y1 = a >> 39; S0A
1073 movq(y3, a); //y3 = a; MAJA
1074
1075 xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
1076 rorxq(T1, a, 28); //T1 = (a >> 28); S0
1077 addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
1078 orq(y3, c); //y3 = a | c; MAJA
1079
1080 xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
1081 movq(T1, a); //T1 = a; MAJB
1082 andq(y3, b); //y3 = (a | c)&b; MAJA
1083 andq(T1, c); //T1 = a&c; MAJB
1084 addq(y2, y0); //y2 = S1 + CH; --
1085
1086 addq(d, h); //d = k + w + h + d; --
1087 orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
1088 addq(h, y1); //h = k + w + h + S0; --
1089
1090 addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
1091
1092 if (iteration % 4 == 3) {
1093 addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
1094 addq(h, y3); //h = t1 + S0 + MAJ; --
1095 }
1096}
1097
1098void MacroAssembler::sha512_AVX2_one_round_and_schedule(
1099 XMMRegister xmm4, // ymm4
1100 XMMRegister xmm5, // ymm5
1101 XMMRegister xmm6, // ymm6
1102 XMMRegister xmm7, // ymm7
1103 Register a, //rax
1104 Register b, //rbx
1105 Register c, //rdi
1106 Register d, //rsi
1107 Register e, //r8
1108 Register f, //r9
1109 Register g, //r10
1110 Register h, //r11
1111 int iteration)
1112{
1113
1114 const Register& y0 = r13;
1115 const Register& y1 = r14;
1116 const Register& y2 = r15;
1117#ifdef _WIN64
1118 const Register& y3 = rcx;
1119#else
1120 const Register& y3 = rdi;
1121#endif
1122 const Register& T1 = r12;
1123
1124 if (iteration % 4 == 0) {
1125 // Extract w[t - 7]
1126 // xmm0 = W[-7]
1127 vperm2f128(xmm0, xmm7, xmm6, 3);
1128 vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);
1129
1130 // Calculate w[t - 16] + w[t - 7]
1131 vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
1132 // Extract w[t - 15]
1133 //xmm1 = W[-15]
1134 vperm2f128(xmm1, xmm5, xmm4, 3);
1135 vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);
1136
1137 // Calculate sigma0
1138 // Calculate w[t - 15] ror 1
1139 vpsrlq(xmm2, xmm1, 1, AVX_256bit);
1140 vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
1141 vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
1142 // Calculate w[t - 15] shr 7
1143 vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7
1144
1145 } else if (iteration % 4 == 1) {
1146 //Calculate w[t - 15] ror 8
1147 vpsrlq(xmm2, xmm1, 8, AVX_256bit);
1148 vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
1149 vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8
1150
1151 //XOR the three components
1152 vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
1153 vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0
1154
1155 //Add three components, w[t - 16], w[t - 7] and sigma0
1156 vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0
1157
1158 // Move to appropriate lanes for calculating w[16] and w[17]
1159 vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }
1160
1161 //Move to appropriate lanes for calculating w[18] and w[19]
1162 vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
1163 //Calculate w[16] and w[17] in both 128 bit lanes
1164 //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
1165 vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
1166 vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}
1167
1168 } else if (iteration % 4 == 2) {
1169 vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
1170 vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
1171 vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
1172 vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
1173 vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
1174 vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
1175 vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
1176 vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }
1177
1178 //Add sigma1 to the other components to get w[16] and w[17]
1179 vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }
1180
1181 //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
1182 vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}
1183
1184 } else if (iteration % 4 == 3){
1185 vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
1186 vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
1187 vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
1188 vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
1189 vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
1190 vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
1191 vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
1192 vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }
1193
1194 //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
1195 vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }
1196
1197 //Form w[19, w[18], w17], w[16]
1198 vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
1199 }
1200
1201 movq(y3, a); //y3 = a; MAJA
1202 rorxq(y0, e, 41); // y0 = e >> 41; S1A
1203 rorxq(y1, e, 18); //y1 = e >> 18; S1B
1204 addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
1205 orq(y3, c); //y3 = a | c; MAJA
1206 movq(y2, f); //y2 = f; CH
1207
1208 xorq(y2, g); //y2 = f^g; CH
1209
1210 rorxq(T1, a, 34); //T1 = a >> 34; S0B
1211 xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
1212
1213 rorxq(y1, e, 14); //y1 = (e >> 14); S1
1214
1215 andq(y2, e); //y2 = (f^g) & e; CH
1216 addq(d, h); //d = k + w + h + d; --
1217
1218 andq(y3, b); //y3 = (a | c)&b; MAJA
1219 xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
1220 rorxq(y1, a, 39); //y1 = a >> 39; S0A
1221
1222 xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
1223 rorxq(T1, a, 28); //T1 = (a >> 28); S0
1224 xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH
1225
1226 xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
1227 movq(T1, a); //T1 = a; MAJB
1228
1229 andq(T1, c); //T1 = a&c; MAJB
1230 addq(y2, y0); //y2 = S1 + CH; --
1231
1232 orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
1233 addq(h, y1); //h = k + w + h + S0; --
1234
1235 addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
1236 addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
1237 addq(h, y3); //h = t1 + S0 + MAJ; --
1238}
1239
// Emit the AVX2 SHA-512 block-compression stub body (80 rounds per
// 128-byte block). The digest context (CTX) holds eight 64-bit words
// a..h at offsets 8*0 .. 8*7. When multi_block is true, the number of
// 128-byte blocks to process is derived from the ofs/limit arguments
// and the updated offset is returned in rax.
// NOTE(review): the XMMRegister parameters and buf/state/ofs/limit are
// not referenced in this body -- the implementation binds the fixed
// registers below per the original Intel assembly; confirm the stub
// caller passes (buf, state, ofs, limit) in the C calling convention
// registers named here.
void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                                 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                                 Register buf, Register state, Register ofs, Register limit, Register rsp,
                                 bool multi_block, XMMRegister shuf_mask)
{

  Label loop0, loop1, loop2, done_hash,
        compute_block_size, compute_size,
        compute_block_size_end, compute_size_end;

  // Round-constant table K[0..79] replicated for 4-lane use, and the
  // byte-order shuffle masks, provided by the stub-routine generator.
  address K512_W = StubRoutines::x86::k512_W_addr();
  address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& XFER = xmm0; // YTMP0
  const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
  const XMMRegister& YMM_MASK_LO = xmm10; // ymm10 -- mask used to keep only the DC00 lanes during scheduling
#ifdef _WIN64
  const Register& INP = rcx; //1st arg
  const Register& CTX = rdx; //2nd arg
  const Register& NUM_BLKS = r8; //3rd arg
  const Register& c = rdi;
  const Register& d = rsi;
  const Register& e = r8;     // note: aliases NUM_BLKS/offset -- NUM_BLKS is consumed before e is loaded
  const Register& y3 = rcx;
  const Register& offset = r8;
  const Register& input_limit = r9;
#else
  const Register& INP = rdi; //1st arg
  const Register& CTX = rsi; //2nd arg
  const Register& NUM_BLKS = rdx; //3rd arg
  const Register& c = rcx;
  const Register& d = r8;
  const Register& e = rdx;    // note: aliases NUM_BLKS/offset -- NUM_BLKS is consumed before e is loaded
  const Register& y3 = rdi;
  const Register& offset = rdx;
  const Register& input_limit = rcx;
#endif

  const Register& TBL = rbp;  // walking pointer into K512_W

  const Register& a = rax;
  const Register& b = rbx;

  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  //Local variables as defined in assembly file.
  // Stack-frame slot sizes (bytes), mirroring the original .asm layout.
  enum
  {
    _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
    _SRND_SIZE = 8, // resq 1
    _INP_SIZE = 8,
    _INP_END_SIZE = 8,
    _RSP_SAVE_SIZE = 8, // defined as resq 1

#ifdef _WIN64
    _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
#else
    _GPR_SAVE_SIZE = 6 * 8 // resq 6
#endif
  };

  // Stack-frame slot offsets derived from the sizes above.
  enum
  {
    _XFER = 0,                              // k[t]+w[t] staging (written each 4-round group)
    _SRND = _XFER + _XFER_SIZE, // 32       // in-memory loop down-counter
    _INP = _SRND + _SRND_SIZE, // 40        // saved input pointer
    _INP_END = _INP + _INP_SIZE, // 48      // end-of-input pointer
    _RSP = _INP_END + _INP_END_SIZE, // 56  // caller rsp (frame is 32-byte aligned below)
    _GPR = _RSP + _RSP_SAVE_SIZE, // 64     // callee-saved GPR spill area
    _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
  };

//Saving offset and limit as it will help with blocksize calculation for multiblock SHA512.
#ifdef _WIN64
  push(r8);    // win64: this is ofs
  push(r9);    // win64: this is limit, we need them again at the very end.
#else
  push(rdx);   // linux : this is ofs, need at the end for multiblock calculation
  push(rcx);   // linux: This is the limit.
#endif

  //Allocate Stack Space
  movq(rax, rsp);           // keep original rsp so it can be restored after alignment
  subq(rsp, _STACK_SIZE);
  andq(rsp, -32);           // 32-byte align for aligned ymm spills
  movq(Address(rsp, _RSP), rax);

  //Save GPRs
  movq(Address(rsp, _GPR), rbp);
  movq(Address(rsp, (_GPR + 8)), rbx);
  movq(Address(rsp, (_GPR + 16)), r12);
  movq(Address(rsp, (_GPR + 24)), r13);
  movq(Address(rsp, (_GPR + 32)), r14);
  movq(Address(rsp, (_GPR + 40)), r15);

#ifdef _WIN64
  movq(Address(rsp, (_GPR + 48)), rsi);
  movq(Address(rsp, (_GPR + 56)), rdi);
#endif

  // NOTE(review): xmm1 has not been initialized at this point and xmm0 is
  // overwritten before its first real use (the vpaddq at loop1), so these
  // two blends appear to have no architectural effect -- possibly a
  // leftover/warm-up from the original port; confirm before removing.
  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);

  if (multi_block) {
    // NUM_BLKS (bytes) = count of whole 128-byte blocks between ofs and
    // limit, computed by stepping offset up in 128-byte increments.
    xorq(rax, rax);
    bind(compute_block_size);
    cmpptr(offset, input_limit); // Assuming that offset is less than limit.
    jccb(Assembler::aboveEqual, compute_block_size_end);
    addq(offset, 128);
    addq(rax, 128);
    jmpb(compute_block_size);

    bind(compute_block_size_end);
    movq(NUM_BLKS, rax);

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);    // nothing to do
  } else {
    xorq(NUM_BLKS, NUM_BLKS); //If single block.
    addq(NUM_BLKS, 128);
  }

  addq(NUM_BLKS, INP); //pointer to end of data
  movq(Address(rsp, _INP_END), NUM_BLKS);

  //load initial digest
  movq(a, Address(CTX, 8 * 0));
  movq(b, Address(CTX, 8 * 1));
  movq(c, Address(CTX, 8 * 2));
  movq(d, Address(CTX, 8 * 3));
  movq(e, Address(CTX, 8 * 4));
  movq(f, Address(CTX, 8 * 5));
  // load g - r10 after it is used as scratch
  movq(h, Address(CTX, 8 * 7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
  vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));

  movq(g, Address(CTX, 8 * 6));  // deferred: see comment above

  // Outer loop: one iteration per 128-byte input block.
  bind(loop0);
  lea(TBL, ExternalAddress(K512_W));

  //byte swap first 16 dwords
  // Load the 16 message words and convert big-endian -> little-endian.
  vmovdqu(xmm4, Address(INP, 32 * 0));
  vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm5, Address(INP, 32 * 1));
  vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm6, Address(INP, 32 * 2));
  vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm7, Address(INP, 32 * 3));
  vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);

  movq(Address(rsp, _INP), INP);

  // loop1 runs 4 times x 16 rounds = rounds 0..63 (with scheduling);
  // the counter lives in memory at _SRND.
  movslq(Address(rsp, _SRND), 4);
  align(16);

  //Schedule 64 input dwords, by calling sha512_AVX2_one_round_and_schedule
  bind(loop1);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);   // stage k[t]+w[t] for the next 4 rounds
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);

  vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 4 * 32);   // advance to the next 16 round constants
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop1);

  // loop2 runs 2 times x 8 rounds = rounds 64..79 (no more scheduling).
  movslq(Address(rsp, _SRND), 2);

  bind(loop2);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and compute.
  sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 2 * 32);
  // four rounds and compute.
  sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);

  // Shift the remaining scheduled words down for the next 8 rounds.
  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop2);

  // Add this block's working variables back into the digest context.
  addmq(8 * 0, CTX, a);
  addmq(8 * 1, CTX, b);
  addmq(8 * 2, CTX, c);
  addmq(8 * 3, CTX, d);
  addmq(8 * 4, CTX, e);
  addmq(8 * 5, CTX, f);
  addmq(8 * 6, CTX, g);
  addmq(8 * 7, CTX, h);

  movq(INP, Address(rsp, _INP));
  addq(INP, 128);
  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::notEqual, loop0);

  bind(done_hash);

  //Restore GPRs
  movq(rbp, Address(rsp, (_GPR + 0)));
  movq(rbx, Address(rsp, (_GPR + 8)));
  movq(r12, Address(rsp, (_GPR + 16)));
  movq(r13, Address(rsp, (_GPR + 24)));
  movq(r14, Address(rsp, (_GPR + 32)));
  movq(r15, Address(rsp, (_GPR + 40)));

#ifdef _WIN64
  movq(rsi, Address(rsp, (_GPR + 48)));
  movq(rdi, Address(rsp, (_GPR + 56)));
#endif

  //Restore Stack Pointer
  movq(rsp, Address(rsp, _RSP));

  // Recover the ofs/limit values pushed at entry.
#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rcx);
  pop(rdx);
#endif

  if (multi_block) {
    // Recompute the updated offset in rax (the stub's return value):
    // step ofs up by the block size until it reaches limit. Assumes the
    // original ofs <= limit, as in the sha256 variant above.
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end = rdx;
#endif
    movq(rax, ofs_end);
    bind(compute_size);
    cmpptr(rax, limit_end);
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(rax, 128);
    jmpb(compute_size);
    bind(compute_size_end);
  }
}
1523
1524#endif //#ifdef _LP64
1525
1526