| 1 | /* |
| 2 | * Copyright (c) 2016, Intel Corporation. |
| 3 | * |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This code is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 only, as |
| 8 | * published by the Free Software Foundation. |
| 9 | * |
| 10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 13 | * version 2 for more details (a copy is included in the LICENSE file that |
| 14 | * accompanied this code). |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License version |
| 17 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 19 | * |
| 20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 21 | * or visit www.oracle.com if you need additional information or have any |
| 22 | * questions. |
| 23 | * |
| 24 | */ |
| 25 | |
| 26 | #include "precompiled.hpp" |
| 27 | #include "asm/assembler.hpp" |
| 28 | #include "asm/assembler.inline.hpp" |
| 29 | #include "runtime/stubRoutines.hpp" |
| 30 | #include "macroAssembler_x86.hpp" |
| 31 | |
| 32 | // ofs and limit are used for multi-block byte array. |
| 33 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
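|  | // When multi_block is true, one 64-byte block is consumed per pass of loop0, |
|  | // buf and ofs advance by 64 until ofs passes limit, and the updated ofs is |
|  | // left in rax as the intrinsic's return value. |
|  | // Register roles (an outline of the SHA-NI scheme): abcd holds the a,b,c,d |
|  | // working variables, e0/e1 alternate as the E value, sha1rnds4's immediate |
|  | // selects the round group (rounds 0-19, 20-39, 40-59, 60-79), sha1nexte |
|  | // derives the E input for the next four rounds, and sha1msg1/sha1msg2 |
|  | // perform the message-word scheduling. |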
| 34 | void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, |
| 35 | XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, |
| 36 | Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) { |
| 37 | |
| 38 | Label start, done_hash, loop0; |
| 39 | |
| 40 | address upper_word_mask = StubRoutines::x86::upper_word_mask_addr(); |
| 41 | address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr(); |
| 42 | |
| 43 | bind(start); |
| 44 | movdqu(abcd, Address(state, 0)); |
| 45 | pinsrd(e0, Address(state, 16), 3); |
| 46 | movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000 |
| 47 | pand(e0, shuf_mask); |
| 48 | pshufd(abcd, abcd, 0x1B); |
| 49 | movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f |
| 50 | |
| 51 | bind(loop0); |
| 52 | // Save hash values for addition after rounds |
| 53 | movdqu(Address(rsp, 0), e0); |
| 54 | movdqu(Address(rsp, 16), abcd); |
| 55 | |
| 56 | |
| 57 | // Rounds 0 - 3 |
| 58 | movdqu(msg0, Address(buf, 0)); |
| 59 | pshufb(msg0, shuf_mask); |
| 60 | paddd(e0, msg0); |
| 61 | movdqa(e1, abcd); |
| 62 | sha1rnds4(abcd, e0, 0); |
| 63 | |
| 64 | // Rounds 4 - 7 |
| 65 | movdqu(msg1, Address(buf, 16)); |
| 66 | pshufb(msg1, shuf_mask); |
| 67 | sha1nexte(e1, msg1); |
| 68 | movdqa(e0, abcd); |
| 69 | sha1rnds4(abcd, e1, 0); |
| 70 | sha1msg1(msg0, msg1); |
| 71 | |
| 72 | // Rounds 8 - 11 |
| 73 | movdqu(msg2, Address(buf, 32)); |
| 74 | pshufb(msg2, shuf_mask); |
| 75 | sha1nexte(e0, msg2); |
| 76 | movdqa(e1, abcd); |
| 77 | sha1rnds4(abcd, e0, 0); |
| 78 | sha1msg1(msg1, msg2); |
| 79 | pxor(msg0, msg2); |
| 80 | |
| 81 | // Rounds 12 - 15 |
| 82 | movdqu(msg3, Address(buf, 48)); |
| 83 | pshufb(msg3, shuf_mask); |
| 84 | sha1nexte(e1, msg3); |
| 85 | movdqa(e0, abcd); |
| 86 | sha1msg2(msg0, msg3); |
| 87 | sha1rnds4(abcd, e1, 0); |
| 88 | sha1msg1(msg2, msg3); |
| 89 | pxor(msg1, msg3); |
| 90 | |
| 91 | // Rounds 16 - 19 |
| 92 | sha1nexte(e0, msg0); |
| 93 | movdqa(e1, abcd); |
| 94 | sha1msg2(msg1, msg0); |
| 95 | sha1rnds4(abcd, e0, 0); |
| 96 | sha1msg1(msg3, msg0); |
| 97 | pxor(msg2, msg0); |
| 98 | |
| 99 | // Rounds 20 - 23 |
| 100 | sha1nexte(e1, msg1); |
| 101 | movdqa(e0, abcd); |
| 102 | sha1msg2(msg2, msg1); |
| 103 | sha1rnds4(abcd, e1, 1); |
| 104 | sha1msg1(msg0, msg1); |
| 105 | pxor(msg3, msg1); |
| 106 | |
| 107 | // Rounds 24 - 27 |
| 108 | sha1nexte(e0, msg2); |
| 109 | movdqa(e1, abcd); |
| 110 | sha1msg2(msg3, msg2); |
| 111 | sha1rnds4(abcd, e0, 1); |
| 112 | sha1msg1(msg1, msg2); |
| 113 | pxor(msg0, msg2); |
| 114 | |
| 115 | // Rounds 28 - 31 |
| 116 | sha1nexte(e1, msg3); |
| 117 | movdqa(e0, abcd); |
| 118 | sha1msg2(msg0, msg3); |
| 119 | sha1rnds4(abcd, e1, 1); |
| 120 | sha1msg1(msg2, msg3); |
| 121 | pxor(msg1, msg3); |
| 122 | |
| 123 | // Rounds 32 - 35 |
| 124 | sha1nexte(e0, msg0); |
| 125 | movdqa(e1, abcd); |
| 126 | sha1msg2(msg1, msg0); |
| 127 | sha1rnds4(abcd, e0, 1); |
| 128 | sha1msg1(msg3, msg0); |
| 129 | pxor(msg2, msg0); |
| 130 | |
| 131 | // Rounds 36 - 39 |
| 132 | sha1nexte(e1, msg1); |
| 133 | movdqa(e0, abcd); |
| 134 | sha1msg2(msg2, msg1); |
| 135 | sha1rnds4(abcd, e1, 1); |
| 136 | sha1msg1(msg0, msg1); |
| 137 | pxor(msg3, msg1); |
| 138 | |
| 139 | // Rounds 40 - 43 |
| 140 | sha1nexte(e0, msg2); |
| 141 | movdqa(e1, abcd); |
| 142 | sha1msg2(msg3, msg2); |
| 143 | sha1rnds4(abcd, e0, 2); |
| 144 | sha1msg1(msg1, msg2); |
| 145 | pxor(msg0, msg2); |
| 146 | |
| 147 | // Rounds 44 - 47 |
| 148 | sha1nexte(e1, msg3); |
| 149 | movdqa(e0, abcd); |
| 150 | sha1msg2(msg0, msg3); |
| 151 | sha1rnds4(abcd, e1, 2); |
| 152 | sha1msg1(msg2, msg3); |
| 153 | pxor(msg1, msg3); |
| 154 | |
| 155 | // Rounds 48 - 51 |
| 156 | sha1nexte(e0, msg0); |
| 157 | movdqa(e1, abcd); |
| 158 | sha1msg2(msg1, msg0); |
| 159 | sha1rnds4(abcd, e0, 2); |
| 160 | sha1msg1(msg3, msg0); |
| 161 | pxor(msg2, msg0); |
| 162 | |
| 163 | // Rounds 52 - 55 |
| 164 | sha1nexte(e1, msg1); |
| 165 | movdqa(e0, abcd); |
| 166 | sha1msg2(msg2, msg1); |
| 167 | sha1rnds4(abcd, e1, 2); |
| 168 | sha1msg1(msg0, msg1); |
| 169 | pxor(msg3, msg1); |
| 170 | |
| 171 | // Rounds 56 - 59 |
| 172 | sha1nexte(e0, msg2); |
| 173 | movdqa(e1, abcd); |
| 174 | sha1msg2(msg3, msg2); |
| 175 | sha1rnds4(abcd, e0, 2); |
| 176 | sha1msg1(msg1, msg2); |
| 177 | pxor(msg0, msg2); |
| 178 | |
| 179 | // Rounds 60 - 63 |
| 180 | sha1nexte(e1, msg3); |
| 181 | movdqa(e0, abcd); |
| 182 | sha1msg2(msg0, msg3); |
| 183 | sha1rnds4(abcd, e1, 3); |
| 184 | sha1msg1(msg2, msg3); |
| 185 | pxor(msg1, msg3); |
| 186 | |
| 187 | // Rounds 64 - 67 |
| 188 | sha1nexte(e0, msg0); |
| 189 | movdqa(e1, abcd); |
| 190 | sha1msg2(msg1, msg0); |
| 191 | sha1rnds4(abcd, e0, 3); |
| 192 | sha1msg1(msg3, msg0); |
| 193 | pxor(msg2, msg0); |
| 194 | |
| 195 | // Rounds 68 - 71 |
| 196 | sha1nexte(e1, msg1); |
| 197 | movdqa(e0, abcd); |
| 198 | sha1msg2(msg2, msg1); |
| 199 | sha1rnds4(abcd, e1, 3); |
| 200 | pxor(msg3, msg1); |
| 201 | |
| 202 | // Rounds 72 - 75 |
| 203 | sha1nexte(e0, msg2); |
| 204 | movdqa(e1, abcd); |
| 205 | sha1msg2(msg3, msg2); |
| 206 | sha1rnds4(abcd, e0, 3); |
| 207 | |
| 208 | // Rounds 76 - 79 |
| 209 | sha1nexte(e1, msg3); |
| 210 | movdqa(e0, abcd); |
| 211 | sha1rnds4(abcd, e1, 3); |
| 212 | |
| 213 | // add current hash values with previously saved |
| 214 | movdqu(msg0, Address(rsp, 0)); |
| 215 | sha1nexte(e0, msg0); |
| 216 | movdqu(msg0, Address(rsp, 16)); |
| 217 | paddd(abcd, msg0); |
| 218 | |
| 219 | if (multi_block) { |
| 220 | // increment data pointer and loop if more to process |
| 221 | addptr(buf, 64); |
| 222 | addptr(ofs, 64); |
| 223 | cmpptr(ofs, limit); |
| 224 | jcc(Assembler::belowEqual, loop0); |
| 225 | movptr(rax, ofs); //return ofs |
| 226 | } |
| 227 | // write hash values back in the correct order |
| 228 | pshufd(abcd, abcd, 0x1b); |
| 229 | movdqu(Address(state, 0), abcd); |
| 230 | pextrd(Address(state, 16), e0, 3); |
| 231 | |
| 232 | bind(done_hash); |
| 233 | |
| 234 | } |
| 235 | |
| 236 | // xmm0 (msg) is used as an implicit argument to sha256rnds2, |
| 237 | // so state0 and state1 must never be assigned to xmm0. |
| 238 | // ofs and limit are used for multi-block byte array. |
| 239 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
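|  | // In outline: sha256rnds2 performs two rounds per invocation and reads the two |
|  | // round inputs (K[t]+W[t] and K[t+1]+W[t+1]) implicitly from the low qword of |
|  | // xmm0, which is why msg must be xmm0. The shuffle sequence after loading the |
|  | // state rearranges {a,b,c,d}/{e,f,g,h} into the {ABEF}/{CDGH} layout that |
|  | // sha256rnds2 expects; the inverse shuffle restores the natural order before |
|  | // the digest is written back. |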
| 240 | #ifdef _LP64 |
| 241 | void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 242 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 243 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 244 | bool multi_block, XMMRegister shuf_mask) { |
| 245 | #else |
| 246 | void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 247 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 248 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 249 | bool multi_block) { |
| 250 | #endif |
| 251 | Label start, done_hash, loop0; |
| 252 | |
| 253 | address K256 = StubRoutines::x86::k256_addr(); |
| 254 | address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); |
| 255 | |
| 256 | bind(start); |
| 257 | movdqu(state0, Address(state, 0)); |
| 258 | movdqu(state1, Address(state, 16)); |
| 259 | |
| 260 | pshufd(state0, state0, 0xB1); |
| 261 | pshufd(state1, state1, 0x1B); |
| 262 | movdqa(msgtmp4, state0); |
| 263 | palignr(state0, state1, 8); |
| 264 | pblendw(state1, msgtmp4, 0xF0); |
| 265 | |
| 266 | #ifdef _LP64 |
| 267 | movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); |
| 268 | #endif |
| 269 | lea(rax, ExternalAddress(K256)); |
| 270 | |
| 271 | bind(loop0); |
| 272 | movdqu(Address(rsp, 0), state0); |
| 273 | movdqu(Address(rsp, 16), state1); |
| 274 | |
| 275 | // Rounds 0-3 |
| 276 | movdqu(msg, Address(buf, 0)); |
| 277 | #ifdef _LP64 |
| 278 | pshufb(msg, shuf_mask); |
| 279 | #else |
| 280 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 281 | #endif |
| 282 | movdqa(msgtmp0, msg); |
| 283 | paddd(msg, Address(rax, 0)); |
| 284 | sha256rnds2(state1, state0); |
| 285 | pshufd(msg, msg, 0x0E); |
| 286 | sha256rnds2(state0, state1); |
| 287 | |
| 288 | // Rounds 4-7 |
| 289 | movdqu(msg, Address(buf, 16)); |
| 290 | #ifdef _LP64 |
| 291 | pshufb(msg, shuf_mask); |
| 292 | #else |
| 293 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 294 | #endif |
| 295 | movdqa(msgtmp1, msg); |
| 296 | paddd(msg, Address(rax, 16)); |
| 297 | sha256rnds2(state1, state0); |
| 298 | pshufd(msg, msg, 0x0E); |
| 299 | sha256rnds2(state0, state1); |
| 300 | sha256msg1(msgtmp0, msgtmp1); |
| 301 | |
| 302 | // Rounds 8-11 |
| 303 | movdqu(msg, Address(buf, 32)); |
| 304 | #ifdef _LP64 |
| 305 | pshufb(msg, shuf_mask); |
| 306 | #else |
| 307 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 308 | #endif |
| 309 | movdqa(msgtmp2, msg); |
| 310 | paddd(msg, Address(rax, 32)); |
| 311 | sha256rnds2(state1, state0); |
| 312 | pshufd(msg, msg, 0x0E); |
| 313 | sha256rnds2(state0, state1); |
| 314 | sha256msg1(msgtmp1, msgtmp2); |
| 315 | |
| 316 | // Rounds 12-15 |
| 317 | movdqu(msg, Address(buf, 48)); |
| 318 | #ifdef _LP64 |
| 319 | pshufb(msg, shuf_mask); |
| 320 | #else |
| 321 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
| 322 | #endif |
| 323 | movdqa(msgtmp3, msg); |
| 324 | paddd(msg, Address(rax, 48)); |
| 325 | sha256rnds2(state1, state0); |
| 326 | movdqa(msgtmp4, msgtmp3); |
| 327 | palignr(msgtmp4, msgtmp2, 4); |
| 328 | paddd(msgtmp0, msgtmp4); |
| 329 | sha256msg2(msgtmp0, msgtmp3); |
| 330 | pshufd(msg, msg, 0x0E); |
| 331 | sha256rnds2(state0, state1); |
| 332 | sha256msg1(msgtmp2, msgtmp3); |
| 333 | |
| 334 | // Rounds 16-19 |
| 335 | movdqa(msg, msgtmp0); |
| 336 | paddd(msg, Address(rax, 64)); |
| 337 | sha256rnds2(state1, state0); |
| 338 | movdqa(msgtmp4, msgtmp0); |
| 339 | palignr(msgtmp4, msgtmp3, 4); |
| 340 | paddd(msgtmp1, msgtmp4); |
| 341 | sha256msg2(msgtmp1, msgtmp0); |
| 342 | pshufd(msg, msg, 0x0E); |
| 343 | sha256rnds2(state0, state1); |
| 344 | sha256msg1(msgtmp3, msgtmp0); |
| 345 | |
| 346 | // Rounds 20-23 |
| 347 | movdqa(msg, msgtmp1); |
| 348 | paddd(msg, Address(rax, 80)); |
| 349 | sha256rnds2(state1, state0); |
| 350 | movdqa(msgtmp4, msgtmp1); |
| 351 | palignr(msgtmp4, msgtmp0, 4); |
| 352 | paddd(msgtmp2, msgtmp4); |
| 353 | sha256msg2(msgtmp2, msgtmp1); |
| 354 | pshufd(msg, msg, 0x0E); |
| 355 | sha256rnds2(state0, state1); |
| 356 | sha256msg1(msgtmp0, msgtmp1); |
| 357 | |
| 358 | // Rounds 24-27 |
| 359 | movdqa(msg, msgtmp2); |
| 360 | paddd(msg, Address(rax, 96)); |
| 361 | sha256rnds2(state1, state0); |
| 362 | movdqa(msgtmp4, msgtmp2); |
| 363 | palignr(msgtmp4, msgtmp1, 4); |
| 364 | paddd(msgtmp3, msgtmp4); |
| 365 | sha256msg2(msgtmp3, msgtmp2); |
| 366 | pshufd(msg, msg, 0x0E); |
| 367 | sha256rnds2(state0, state1); |
| 368 | sha256msg1(msgtmp1, msgtmp2); |
| 369 | |
| 370 | // Rounds 28-31 |
| 371 | movdqa(msg, msgtmp3); |
| 372 | paddd(msg, Address(rax, 112)); |
| 373 | sha256rnds2(state1, state0); |
| 374 | movdqa(msgtmp4, msgtmp3); |
| 375 | palignr(msgtmp4, msgtmp2, 4); |
| 376 | paddd(msgtmp0, msgtmp4); |
| 377 | sha256msg2(msgtmp0, msgtmp3); |
| 378 | pshufd(msg, msg, 0x0E); |
| 379 | sha256rnds2(state0, state1); |
| 380 | sha256msg1(msgtmp2, msgtmp3); |
| 381 | |
| 382 | // Rounds 32-35 |
| 383 | movdqa(msg, msgtmp0); |
| 384 | paddd(msg, Address(rax, 128)); |
| 385 | sha256rnds2(state1, state0); |
| 386 | movdqa(msgtmp4, msgtmp0); |
| 387 | palignr(msgtmp4, msgtmp3, 4); |
| 388 | paddd(msgtmp1, msgtmp4); |
| 389 | sha256msg2(msgtmp1, msgtmp0); |
| 390 | pshufd(msg, msg, 0x0E); |
| 391 | sha256rnds2(state0, state1); |
| 392 | sha256msg1(msgtmp3, msgtmp0); |
| 393 | |
| 394 | // Rounds 36-39 |
| 395 | movdqa(msg, msgtmp1); |
| 396 | paddd(msg, Address(rax, 144)); |
| 397 | sha256rnds2(state1, state0); |
| 398 | movdqa(msgtmp4, msgtmp1); |
| 399 | palignr(msgtmp4, msgtmp0, 4); |
| 400 | paddd(msgtmp2, msgtmp4); |
| 401 | sha256msg2(msgtmp2, msgtmp1); |
| 402 | pshufd(msg, msg, 0x0E); |
| 403 | sha256rnds2(state0, state1); |
| 404 | sha256msg1(msgtmp0, msgtmp1); |
| 405 | |
| 406 | // Rounds 40-43 |
| 407 | movdqa(msg, msgtmp2); |
| 408 | paddd(msg, Address(rax, 160)); |
| 409 | sha256rnds2(state1, state0); |
| 410 | movdqa(msgtmp4, msgtmp2); |
| 411 | palignr(msgtmp4, msgtmp1, 4); |
| 412 | paddd(msgtmp3, msgtmp4); |
| 413 | sha256msg2(msgtmp3, msgtmp2); |
| 414 | pshufd(msg, msg, 0x0E); |
| 415 | sha256rnds2(state0, state1); |
| 416 | sha256msg1(msgtmp1, msgtmp2); |
| 417 | |
| 418 | // Rounds 44-47 |
| 419 | movdqa(msg, msgtmp3); |
| 420 | paddd(msg, Address(rax, 176)); |
| 421 | sha256rnds2(state1, state0); |
| 422 | movdqa(msgtmp4, msgtmp3); |
| 423 | palignr(msgtmp4, msgtmp2, 4); |
| 424 | paddd(msgtmp0, msgtmp4); |
| 425 | sha256msg2(msgtmp0, msgtmp3); |
| 426 | pshufd(msg, msg, 0x0E); |
| 427 | sha256rnds2(state0, state1); |
| 428 | sha256msg1(msgtmp2, msgtmp3); |
| 429 | |
| 430 | // Rounds 48-51 |
| 431 | movdqa(msg, msgtmp0); |
| 432 | paddd(msg, Address(rax, 192)); |
| 433 | sha256rnds2(state1, state0); |
| 434 | movdqa(msgtmp4, msgtmp0); |
| 435 | palignr(msgtmp4, msgtmp3, 4); |
| 436 | paddd(msgtmp1, msgtmp4); |
| 437 | sha256msg2(msgtmp1, msgtmp0); |
| 438 | pshufd(msg, msg, 0x0E); |
| 439 | sha256rnds2(state0, state1); |
| 440 | sha256msg1(msgtmp3, msgtmp0); |
| 441 | |
| 442 | // Rounds 52-55 |
| 443 | movdqa(msg, msgtmp1); |
| 444 | paddd(msg, Address(rax, 208)); |
| 445 | sha256rnds2(state1, state0); |
| 446 | movdqa(msgtmp4, msgtmp1); |
| 447 | palignr(msgtmp4, msgtmp0, 4); |
| 448 | paddd(msgtmp2, msgtmp4); |
| 449 | sha256msg2(msgtmp2, msgtmp1); |
| 450 | pshufd(msg, msg, 0x0E); |
| 451 | sha256rnds2(state0, state1); |
| 452 | |
| 453 | // Rounds 56-59 |
| 454 | movdqa(msg, msgtmp2); |
| 455 | paddd(msg, Address(rax, 224)); |
| 456 | sha256rnds2(state1, state0); |
| 457 | movdqa(msgtmp4, msgtmp2); |
| 458 | palignr(msgtmp4, msgtmp1, 4); |
| 459 | paddd(msgtmp3, msgtmp4); |
| 460 | sha256msg2(msgtmp3, msgtmp2); |
| 461 | pshufd(msg, msg, 0x0E); |
| 462 | sha256rnds2(state0, state1); |
| 463 | |
| 464 | // Rounds 60-63 |
| 465 | movdqa(msg, msgtmp3); |
| 466 | paddd(msg, Address(rax, 240)); |
| 467 | sha256rnds2(state1, state0); |
| 468 | pshufd(msg, msg, 0x0E); |
| 469 | sha256rnds2(state0, state1); |
| 470 | movdqu(msg, Address(rsp, 0)); |
| 471 | paddd(state0, msg); |
| 472 | movdqu(msg, Address(rsp, 16)); |
| 473 | paddd(state1, msg); |
| 474 | |
| 475 | if (multi_block) { |
| 476 | // increment data pointer and loop if more to process |
| 477 | addptr(buf, 64); |
| 478 | addptr(ofs, 64); |
| 479 | cmpptr(ofs, limit); |
| 480 | jcc(Assembler::belowEqual, loop0); |
| 481 | movptr(rax, ofs); //return ofs |
| 482 | } |
| 483 | |
| 484 | pshufd(state0, state0, 0x1B); |
| 485 | pshufd(state1, state1, 0xB1); |
| 486 | movdqa(msgtmp4, state0); |
| 487 | pblendw(state0, state1, 0xF0); |
| 488 | palignr(state1, msgtmp4, 8); |
| 489 | |
| 490 | movdqu(Address(state, 0), state0); |
| 491 | movdqu(Address(state, 16), state1); |
| 492 | |
| 493 | bind(done_hash); |
| 494 | |
| 495 | } |
| 496 | |
| 497 | #ifdef _LP64 |
| 498 | /* |
| 499 | The algorithm below is based on the Intel publication: |
| 500 | "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal. |
| 501 | The assembly code was originally provided by Sean Gulley and in many places preserves |
| 502 | the original assembly NAMES and comments to simplify matching the generated assembly with the original. |
| 503 | The Java version was substantially redesigned to replace 1200 assembly instructions with a |
| 504 | much shorter run-time generator of the same code in memory. |
| 505 | */ |
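|  | // For reference, the per-round comments below use this shorthand for the |
|  | // FIPS 180-4 SHA-256 round (written with the register names of the code): |
|  | //   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25) |
|  | //   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22) |
|  | //   CH  = ((f ^ g) & e) ^ g          == (e & f) ^ (~e & g) |
|  | //   MAJ = ((a | c) & b) | (a & c)    == (a & b) ^ (a & c) ^ (b & c) |
|  | //   t1  = h + S1 + CH + K[i] + W[i];   t2 = S0 + MAJ |
|  | //   d  += t1;   h = t1 + t2   (the register roles rotate every round) |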
| 506 | |
| 507 | void MacroAssembler::sha256_AVX2_one_round_compute( |
| 508 | Register reg_old_h, |
| 509 | Register reg_a, |
| 510 | Register reg_b, |
| 511 | Register reg_c, |
| 512 | Register reg_d, |
| 513 | Register reg_e, |
| 514 | Register reg_f, |
| 515 | Register reg_g, |
| 516 | Register reg_h, |
| 517 | int iter) { |
| 518 | const Register& reg_y0 = r13; |
| 519 | const Register& reg_y1 = r14; |
| 520 | const Register& reg_y2 = r15; |
| 521 | const Register& reg_y3 = rcx; |
| 522 | const Register& reg_T1 = r12; |
| 523 | //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 524 | if (iter%4 > 0) { |
| 525 | addl(reg_old_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
| 526 | } |
| 527 | movl(reg_y2, reg_f); // reg_y2 = reg_f ; CH |
| 528 | rorxd(reg_y0, reg_e, 25); // reg_y0 = reg_e >> 25 ; S1A |
| 529 | rorxd(reg_y1, reg_e, 11); // reg_y1 = reg_e >> 11 ; S1B |
| 530 | xorl(reg_y2, reg_g); // reg_y2 = reg_f^reg_g ; CH |
| 531 | |
| 532 | xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1 |
| 533 | rorxd(reg_y1, reg_e, 6); // reg_y1 = (reg_e >> 6) ; S1 |
| 534 | andl(reg_y2, reg_e); // reg_y2 = (reg_f^reg_g)&reg_e ; CH |
| 535 | |
| 536 | if (iter%4 > 0) { |
| 537 | addl(reg_old_h, reg_y3); // reg_h = t1 + S0 + MAJ ; -- |
| 538 | } |
| 539 | |
| 540 | xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1 |
| 541 | rorxd(reg_T1, reg_a, 13); // reg_T1 = reg_a >> 13 ; S0B |
| 542 | xorl(reg_y2, reg_g); // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH |
| 543 | rorxd(reg_y1, reg_a, 22); // reg_y1 = reg_a >> 22 ; S0A |
| 544 | movl(reg_y3, reg_a); // reg_y3 = reg_a ; MAJA |
| 545 | |
| 546 | xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0 |
| 547 | rorxd(reg_T1, reg_a, 2); // reg_T1 = (reg_a >> 2) ; S0 |
| 548 | addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; -- |
| 549 | orl(reg_y3, reg_c); // reg_y3 = reg_a|reg_c ; MAJA |
| 550 | |
| 551 | xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0 |
| 552 | movl(reg_T1, reg_a); // reg_T1 = reg_a ; MAJB |
| 553 | andl(reg_y3, reg_b); // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA |
| 554 | andl(reg_T1, reg_c); // reg_T1 = reg_a&reg_c ; MAJB |
| 555 | addl(reg_y2, reg_y0); // reg_y2 = S1 + CH ; -- |
| 556 | |
| 557 | |
| 558 | addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; -- |
| 559 | orl(reg_y3, reg_T1); // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ |
| 560 | addl(reg_h, reg_y1); // reg_h = k + w + reg_h + S0 ; -- |
| 561 | |
| 562 | addl(reg_d, reg_y2); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; -- |
| 563 | |
| 564 | |
| 565 | if (iter%4 == 3) { |
| 566 | addl(reg_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
| 567 | addl(reg_h, reg_y3); // reg_h = t1 + S0 + MAJ ; -- |
| 568 | } |
| 569 | } |
| 570 | |
| 571 | void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) { |
| 572 | sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0); |
| 573 | sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1); |
| 574 | sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2); |
| 575 | sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3); |
| 576 | } |
| 577 | |
| 578 | void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) { |
| 579 | sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0); |
| 580 | sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1); |
| 581 | sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2); |
| 582 | sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3); |
| 583 | } |
| 584 | |
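|  | // The vector half of each iteration below computes four new words of the |
|  | // SHA-256 message schedule, W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], |
|  | // with s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3) and |
|  | //      s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10), |
|  | // spread across four consecutive iterations so it interleaves with the |
|  | // scalar round computation. |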
| 585 | void MacroAssembler::sha256_AVX2_one_round_and_sched( |
| 586 | XMMRegister xmm_0, /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */ |
| 587 | XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */ |
| 588 | XMMRegister xmm_2, /* ymm6 */ |
| 589 | XMMRegister xmm_3, /* ymm7 */ |
| 590 | Register reg_a, /* == rax on iteration 0, then rotate the 8 registers right on each subsequent iteration */ |
| 591 | Register reg_b, /* rbx */ /* full cycle is 8 iterations */ |
| 592 | Register reg_c, /* rdi */ |
| 593 | Register reg_d, /* rsi */ |
| 594 | Register reg_e, /* r8 */ |
| 595 | Register reg_f, /* r9d */ |
| 596 | Register reg_g, /* r10d */ |
| 597 | Register reg_h, /* r11d */ |
| 598 | int iter) |
| 599 | { |
| 600 | movl(rcx, reg_a); // rcx = reg_a ; MAJA |
| 601 | rorxd(r13, reg_e, 25); // r13 = reg_e >> 25 ; S1A |
| 602 | rorxd(r14, reg_e, 11); // r14 = reg_e >> 11 ; S1B |
| 603 | addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); |
| 604 | orl(rcx, reg_c); // rcx = reg_a|reg_c ; MAJA |
| 605 | |
| 606 | movl(r15, reg_f); // r15 = reg_f ; CH |
| 607 | rorxd(r12, reg_a, 13); // r12 = reg_a >> 13 ; S0B |
| 608 | xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1 |
| 609 | xorl(r15, reg_g); // r15 = reg_f^reg_g ; CH |
| 610 | |
| 611 | rorxd(r14, reg_e, 6); // r14 = (reg_e >> 6) ; S1 |
| 612 | andl(r15, reg_e); // r15 = (reg_f^reg_g)&reg_e ; CH |
| 613 | |
| 614 | xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1 |
| 615 | rorxd(r14, reg_a, 22); // r14 = reg_a >> 22 ; S0A |
| 616 | addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; -- |
| 617 | |
| 618 | andl(rcx, reg_b); // rcx = (reg_a|reg_c)&reg_b ; MAJA |
| 619 | xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0 |
| 620 | |
| 621 | rorxd(r12, reg_a, 2); // r12 = (reg_a >> 2) ; S0 |
| 622 | xorl(r15, reg_g); // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH |
| 623 | |
| 624 | xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0 |
| 625 | movl(r12, reg_a); // r12 = reg_a ; MAJB |
| 626 | andl(r12, reg_c); // r12 = reg_a&reg_c ; MAJB |
| 627 | addl(r15, r13); // r15 = S1 + CH ; -- |
| 628 | |
| 629 | orl(rcx, r12); // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ |
| 630 | addl(reg_h, r14); // reg_h = k + w + reg_h + S0 ; -- |
| 631 | addl(reg_d, r15); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; -- |
| 632 | |
| 633 | addl(reg_h, r15); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
| 634 | addl(reg_h, rcx); // reg_h = t1 + S0 + MAJ ; -- |
| 635 | |
| 636 | if (iter%4 == 0) { |
| 637 | vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit); // ymm0 = W[-7] |
| 638 | vpaddd(xmm0, xmm0, xmm_0, AVX_256bit); // ymm0 = W[-7] + W[-16] |
| 639 | vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit); // ymm1 = W[-15] |
| 640 | vpsrld(xmm2, xmm1, 7, AVX_256bit); |
| 641 | vpslld(xmm3, xmm1, 32-7, AVX_256bit); |
| 642 | vpor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 |
| 643 | vpsrld(xmm2, xmm1,18, AVX_256bit); |
| 644 | } else if (iter%4 == 1 ) { |
| 645 | vpsrld(xmm8, xmm1, 3, AVX_256bit); // ymm8 = W[-15] >> 3 |
| 646 | vpslld(xmm1, xmm1, 32-18, AVX_256bit); |
| 647 | vpxor(xmm3, xmm3, xmm1, AVX_256bit); |
| 648 | vpxor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 ^ W[-15] ror 18 |
| 649 | vpxor(xmm1, xmm3, xmm8, AVX_256bit); // ymm1 = s0 |
| 650 | vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit); // 11111010b ; ymm2 = W[-2] {BBAA} |
| 651 | vpaddd(xmm0, xmm0, xmm1, AVX_256bit); // ymm0 = W[-16] + W[-7] + s0 |
| 652 | vpsrld(xmm8, xmm2, 10, AVX_256bit); // ymm8 = W[-2] >> 10 {BBAA} |
| 653 | } else if (iter%4 == 2) { |
| 654 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xBxA} |
| 655 | vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xBxA} |
| 656 | vpxor(xmm2, xmm2, xmm3, AVX_256bit); |
| 657 | vpxor(xmm8, xmm8, xmm2, AVX_256bit); // ymm8 = s1 {xBxA} |
| 658 | vpshufb(xmm8, xmm8, xmm10, AVX_256bit); // ymm8 = s1 {00BA} |
| 659 | vpaddd(xmm0, xmm0, xmm8, AVX_256bit); // ymm0 = {..., ..., W[1], W[0]} |
| 660 | vpshufd(xmm2, xmm0, 0x50, AVX_256bit); // 01010000b ; ymm2 = W[-2] {DDCC} |
| 661 | } else if (iter%4 == 3) { |
| 662 | vpsrld(xmm11, xmm2, 10, AVX_256bit); // ymm11 = W[-2] >> 10 {DDCC} |
| 663 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xDxC} |
| 664 | vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xDxC} |
| 665 | vpxor(xmm2, xmm2, xmm3, AVX_256bit); |
| 666 | vpxor(xmm11, xmm11, xmm2, AVX_256bit); // ymm11 = s1 {xDxC} |
| 667 | vpshufb(xmm11, xmm11, xmm12, AVX_256bit); // ymm11 = s1 {DC00} |
| 668 | vpaddd(xmm_0, xmm11, xmm0, AVX_256bit); // xmm_0 = {W[3], W[2], W[1], W[0]} |
| 669 | } |
| 670 | } |
| 671 | |
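|  | // addm/addmq: add register r2 into the digest word at [r1 + disp] and store the |
|  | // result back (32-bit and 64-bit variants, used to fold the working variables |
|  | // into the saved state). |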
| 672 | void MacroAssembler::addm(int disp, Register r1, Register r2) { |
| 673 | addl(r2, Address(r1, disp)); |
| 674 | movl(Address(r1, disp), r2); |
| 675 | } |
| 676 | |
| 677 | void MacroAssembler::addmq(int disp, Register r1, Register r2) { |
| 678 | addq(r2, Address(r1, disp)); |
| 679 | movq(Address(r1, disp), r2); |
| 680 | } |
| 681 | |
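|  | // AVX2/rorx SHA-256 compression. Each pass of loop0 loads 128 bytes (two blocks), |
|  | // schedules the message words for both blocks at once, and then runs the second |
|  | // block's rounds from the saved schedule (loop3). As in fast_sha256, ofs and limit |
|  | // drive the multi-block case of |
|  | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
|  | // and the updated ofs is left in rax. |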
| 682 | void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 683 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 684 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 685 | bool multi_block, XMMRegister shuf_mask) { |
| 686 | |
| 687 | Label loop0, loop1, loop2, loop3, |
| 688 | last_block_enter, do_last_block, only_one_block, done_hash, |
| 689 | compute_size, compute_size_end, |
| 690 | compute_size1, compute_size_end1; |
| 691 | |
| 692 | address K256_W = StubRoutines::x86::k256_W_addr(); |
| 693 | address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); |
| 694 | address pshuffle_byte_flip_mask_addr = 0; |
| 695 | |
| 696 | const XMMRegister& SHUF_00BA = xmm10; // ymm10: shuffle xBxA -> 00BA |
| 697 | const XMMRegister& SHUF_DC00 = xmm12; // ymm12: shuffle xDxC -> DC00 |
| 698 | const XMMRegister& BYTE_FLIP_MASK = xmm13; // ymm13 |
| 699 | |
| 700 | const XMMRegister& X_BYTE_FLIP_MASK = xmm13; //XMM version of BYTE_FLIP_MASK |
| 701 | |
| 702 | const Register& NUM_BLKS = r8; // 3rd arg |
| 703 | const Register& CTX = rdx; // 2nd arg |
| 704 | const Register& INP = rcx; // 1st arg |
| 705 | |
| 706 | const Register& c = rdi; |
| 707 | const Register& d = rsi; |
| 708 | const Register& e = r8; // clobbers NUM_BLKS |
| 709 | const Register& y3 = rcx; // clobbers INP |
| 710 | |
| 711 | const Register& TBL = rbp; |
| 712 | const Register& SRND = CTX; // SRND is same register as CTX |
| 713 | |
| 714 | const Register& a = rax; |
| 715 | const Register& b = rbx; |
| 716 | const Register& f = r9; |
| 717 | const Register& g = r10; |
| 718 | const Register& h = r11; |
| 719 | |
| 720 | const Register& T1 = r12; |
| 721 | const Register& y0 = r13; |
| 722 | const Register& y1 = r14; |
| 723 | const Register& y2 = r15; |
| 724 | |
| 725 | |
| 726 | enum { |
| 727 | _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round |
| 728 | _INP_END_SIZE = 8, |
| 729 | _INP_SIZE = 8, |
| 730 | _CTX_SIZE = 8, |
| 731 | _RSP_SIZE = 8, |
| 732 | |
| 733 | _XFER = 0, |
| 734 | _INP_END = _XFER + _XFER_SIZE, |
| 735 | _INP = _INP_END + _INP_END_SIZE, |
| 736 | _CTX = _INP + _INP_SIZE, |
| 737 | _RSP = _CTX + _CTX_SIZE, |
| 738 | STACK_SIZE = _RSP + _RSP_SIZE |
| 739 | }; |
| 740 | |
| 741 | #ifndef _WIN64 |
| 742 | push(rcx); // linux: this is limit, needed at the end |
| 743 | push(rdx); // linux: this is ofs |
| 744 | #else |
| 745 | push(r8); // win64: this is ofs |
| 746 | push(r9); // win64: this is limit, we need them again at the very end |
| 747 | #endif |
| 748 | |
| 749 | |
| 750 | push(rbx); |
| 751 | #ifdef _WIN64 |
| 752 | push(rsi); |
| 753 | push(rdi); |
| 754 | #endif |
| 755 | push(rbp); |
| 756 | push(r12); |
| 757 | push(r13); |
| 758 | push(r14); |
| 759 | push(r15); |
| 760 | |
| 761 | movq(rax, rsp); |
| 762 | subq(rsp, STACK_SIZE); |
| 763 | andq(rsp, -32); |
| 764 | movq(Address(rsp, _RSP), rax); |
| 765 | |
| 766 | #ifndef _WIN64 |
| 767 | // copy the Linux argument registers to their Win64 counterparts, so the rest of the code is the same for both ABIs |
| 768 | movq(r9, rcx); |
| 769 | movq(r8, rdx); |
| 770 | movq(rdx, rsi); |
| 771 | movq(rcx, rdi); |
| 772 | #endif |
| 773 | |
| 774 | // setting original assembly ABI |
| 775 | /** message to hash in INP */ |
| 776 | lea(INP, Address(rcx, 0)); // rcx == message (buf) ;; linux: INP = buf = rdi |
| 777 | /** digest in CTX */ |
| 778 | movq(CTX, rdx); // rdx = digest (state) ;; linux: CTX = state = rsi |
| 779 | |
| 780 | /** NUM_BLKS is the number of bytes to process (a multiple of 64); derive it from ofs and limit */ |
| 781 | if (multi_block) { |
| 782 | |
| 783 | // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8 |
| 784 | // on entry r8 = ofs |
| 785 | // on exit r8 = NUM_BLKS |
| 786 | |
| 787 | xorq(rax, rax); |
| 788 | |
| 789 | bind(compute_size); |
| 790 | cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx |
| 791 | jccb(Assembler::aboveEqual, compute_size_end); |
| 792 | addq(r8, 64); //;; linux: ofs = rdx |
| 793 | addq(rax, 64); |
| 794 | jmpb(compute_size); |
| 795 | |
| 796 | bind(compute_size_end); |
| 797 | movq(NUM_BLKS, rax); // NUM_BLK (r8) ;; linux: NUM_BLK = rdx |
| 798 | |
| 799 | cmpq(NUM_BLKS, 0); |
| 800 | jcc(Assembler::equal, done_hash); |
| 801 | |
| 802 | } else { |
| 803 | xorq(NUM_BLKS, NUM_BLKS); |
| 804 | addq(NUM_BLKS, 64); |
| 805 | }//if (!multi_block) |
| 806 | |
| 807 | lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block |
| 808 | movq(Address(rsp, _INP_END), NUM_BLKS); // |
| 809 | |
| 810 | cmpptr(INP, NUM_BLKS); //cmp INP, NUM_BLKS |
| 811 | jcc(Assembler::equal, only_one_block); //je only_one_block |
| 812 | |
| 813 | // load initial digest |
| 814 | movl(a, Address(CTX, 4*0)); |
| 815 | movl(b, Address(CTX, 4*1)); |
| 816 | movl(c, Address(CTX, 4*2)); |
| 817 | movl(d, Address(CTX, 4*3)); |
| 818 | movl(e, Address(CTX, 4*4)); |
| 819 | movl(f, Address(CTX, 4*5)); |
| 820 | // load g - r10 after it is used as scratch |
| 821 | movl(h, Address(CTX, 4*7)); |
| 822 | |
| 823 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; |
| 824 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] |
| 825 | vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] |
| 826 | vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] |
| 827 | |
| 828 | movl(g, Address(CTX, 4*6)); |
| 829 | |
| 830 | movq(Address(rsp, _CTX), CTX); // store |
| 831 | |
| 832 | bind(loop0); |
| 833 | lea(TBL, ExternalAddress(K256_W)); |
| 834 | |
| 835 | // assume buffers not aligned |
| 836 | |
| 837 | // Load first 16 dwords from two blocks |
| 838 | vmovdqu(xmm0, Address(INP, 0*32)); |
| 839 | vmovdqu(xmm1, Address(INP, 1*32)); |
| 840 | vmovdqu(xmm2, Address(INP, 2*32)); |
| 841 | vmovdqu(xmm3, Address(INP, 3*32)); |
| 842 | |
| 843 | // byte swap data |
| 844 | vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit); |
| 845 | vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit); |
| 846 | vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit); |
| 847 | vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit); |
| 848 | |
| 849 | // transpose data into high/low halves |
| 850 | vperm2i128(xmm4, xmm0, xmm2, 0x20); |
| 851 | vperm2i128(xmm5, xmm0, xmm2, 0x31); |
| 852 | vperm2i128(xmm6, xmm1, xmm3, 0x20); |
| 853 | vperm2i128(xmm7, xmm1, xmm3, 0x31); |
| 854 | |
| 855 | bind(last_block_enter); |
| 856 | addq(INP, 64); |
| 857 | movq(Address(rsp, _INP), INP); |
| 858 | |
| 859 | //;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each |
| 860 | xorq(SRND, SRND); |
| 861 | |
| 862 | align(16); |
| 863 | bind(loop1); |
| 864 | vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); |
| 865 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); |
| 866 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0); |
| 867 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1); |
| 868 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2); |
| 869 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3); |
| 870 | |
| 871 | vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); |
| 872 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); |
| 873 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0); |
| 874 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1); |
| 875 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2); |
| 876 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3); |
| 877 | |
| 878 | vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit); |
| 879 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9); |
| 880 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0); |
| 881 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1); |
| 882 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2); |
| 883 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3); |
| 884 | |
| 885 | vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit); |
| 886 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9); |
| 887 | |
| 888 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0); |
| 889 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1); |
| 890 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2); |
| 891 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3); |
| 892 | |
| 893 | addq(SRND, 4*32); |
| 894 | cmpq(SRND, 3 * 4*32); |
| 895 | jcc(Assembler::below, loop1); |
| 896 | |
| 897 | bind(loop2); |
| 898 | // Do last 16 rounds with no scheduling |
| 899 | vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); |
| 900 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); |
| 901 | sha256_AVX2_four_rounds_compute_first(0); |
| 902 | |
| 903 | vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); |
| 904 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); |
| 905 | sha256_AVX2_four_rounds_compute_last(0 + 8); |
| 906 | |
| 907 | addq(SRND, 2*32); |
| 908 | |
| 909 | vmovdqu(xmm4, xmm6); |
| 910 | vmovdqu(xmm5, xmm7); |
| 911 | |
| 912 | cmpq(SRND, 4 * 4*32); |
| 913 | jcc(Assembler::below, loop2); |
| 914 | |
| 915 | movq(CTX, Address(rsp, _CTX)); |
| 916 | movq(INP, Address(rsp, _INP)); |
| 917 | |
| 918 | addm(4*0, CTX, a); |
| 919 | addm(4*1, CTX, b); |
| 920 | addm(4*2, CTX, c); |
| 921 | addm(4*3, CTX, d); |
| 922 | addm(4*4, CTX, e); |
| 923 | addm(4*5, CTX, f); |
| 924 | addm(4*6, CTX, g); |
| 925 | addm(4*7, CTX, h); |
| 926 | |
| 927 | cmpq(INP, Address(rsp, _INP_END)); |
| 928 | jcc(Assembler::above, done_hash); |
| 929 | |
| 930 | //Do second block using previously scheduled results |
| 931 | xorq(SRND, SRND); |
| 932 | align(16); |
| 933 | bind(loop3); |
| 934 | sha256_AVX2_four_rounds_compute_first(4); |
| 935 | sha256_AVX2_four_rounds_compute_last(4+8); |
| 936 | |
| 937 | addq(SRND, 2*32); |
| 938 | cmpq(SRND, 4 * 4*32); |
| 939 | jcc(Assembler::below, loop3); |
| 940 | |
| 941 | movq(CTX, Address(rsp, _CTX)); |
| 942 | movq(INP, Address(rsp, _INP)); |
| 943 | addq(INP, 64); |
| 944 | |
| 945 | addm(4*0, CTX, a); |
| 946 | addm(4*1, CTX, b); |
| 947 | addm(4*2, CTX, c); |
| 948 | addm(4*3, CTX, d); |
| 949 | addm(4*4, CTX, e); |
| 950 | addm(4*5, CTX, f); |
| 951 | addm(4*6, CTX, g); |
| 952 | addm(4*7, CTX, h); |
| 953 | |
| 954 | cmpq(INP, Address(rsp, _INP_END)); |
| 955 | jcc(Assembler::below, loop0); |
| 956 | jccb(Assembler::above, done_hash); |
| 957 | |
| 958 | bind(do_last_block); |
| 959 | lea(TBL, ExternalAddress(K256_W)); |
| 960 | |
| 961 | movdqu(xmm4, Address(INP, 0*16)); |
| 962 | movdqu(xmm5, Address(INP, 1*16)); |
| 963 | movdqu(xmm6, Address(INP, 2*16)); |
| 964 | movdqu(xmm7, Address(INP, 3*16)); |
| 965 | |
| 966 | vpshufb(xmm4, xmm4, xmm13, AVX_128bit); |
| 967 | vpshufb(xmm5, xmm5, xmm13, AVX_128bit); |
| 968 | vpshufb(xmm6, xmm6, xmm13, AVX_128bit); |
| 969 | vpshufb(xmm7, xmm7, xmm13, AVX_128bit); |
| 970 | |
| 971 | jmp(last_block_enter); |
| 972 | |
| 973 | bind(only_one_block); |
| 974 | |
| 975 | // load initial digest ;; for the first block the state holds the SHA-256 initial hash values listed below |
| 976 | movl(a, Address(CTX, 4*0)); // 0x6a09e667 |
| 977 | movl(b, Address(CTX, 4*1)); // 0xbb67ae85 |
| 978 | movl(c, Address(CTX, 4*2)); // 0x3c6ef372 |
| 979 | movl(d, Address(CTX, 4*3)); // 0xa54ff53a |
| 980 | movl(e, Address(CTX, 4*4)); // 0x510e527f |
| 981 | movl(f, Address(CTX, 4*5)); // 0x9b05688c |
| 982 | // load g - r10 after use as scratch |
| 983 | movl(h, Address(CTX, 4*7)); // 0x5be0cd19 |
| 984 | |
| 985 | |
| 986 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; |
| 987 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] |
| 988 | vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] |
| 989 | vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] |
| 990 | |
| 991 | movl(g, Address(CTX, 4*6)); // 0x1f83d9ab |
| 992 | |
| 993 | movq(Address(rsp, _CTX), CTX); |
| 994 | jmpb(do_last_block); |
| 995 | |
| 996 | bind(done_hash); |
| 997 | |
| 998 | movq(rsp, Address(rsp, _RSP)); |
| 999 | |
| 1000 | pop(r15); |
| 1001 | pop(r14); |
| 1002 | pop(r13); |
| 1003 | pop(r12); |
| 1004 | pop(rbp); |
| 1005 | #ifdef _WIN64 |
| 1006 | pop(rdi); |
| 1007 | pop(rsi); |
| 1008 | #endif |
| 1009 | pop(rbx); |
| 1010 | |
| 1011 | #ifdef _WIN64 |
| 1012 | pop(r9); |
| 1013 | pop(r8); |
| 1014 | #else |
| 1015 | pop(rdx); |
| 1016 | pop(rcx); |
| 1017 | #endif |
| 1018 | |
| 1019 | if (multi_block) { |
| 1020 | #ifdef _WIN64 |
| 1021 | const Register& limit_end = r9; |
| 1022 | const Register& ofs_end = r8; |
| 1023 | #else |
| 1024 | const Register& limit_end = rcx; |
| 1025 | const Register& ofs_end = rdx; |
| 1026 | #endif |
| 1027 | movq(rax, ofs_end); |
| 1028 | |
| 1029 | bind(compute_size1); |
| 1030 | cmpptr(rax, limit_end); // assume the original ofs <= limit |
| 1031 | jccb(Assembler::aboveEqual, compute_size_end1); |
| 1032 | addq(rax, 64); |
| 1033 | jmpb(compute_size1); |
| 1034 | |
| 1035 | bind(compute_size_end1); |
| 1036 | } |
| 1037 | } |
| 1038 | |
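|  | // One SHA-512 round (scalar part only). The comments use the same shorthand as |
|  | // the SHA-256 rounds above, with the SHA-512 rotation amounts: |
|  | //   S1 = (e ror 14) ^ (e ror 18) ^ (e ror 41) |
|  | //   S0 = (a ror 28) ^ (a ror 34) ^ (a ror 39) |
|  | //   CH = ((f ^ g) & e) ^ g,   MAJ = ((a | c) & b) | (a & c) |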
| 1039 | void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, |
| 1040 | Register d, Register e, Register f, Register g, Register h, |
| 1041 | int iteration) |
| 1042 | { |
| 1043 | |
| 1044 | const Register& y0 = r13; |
| 1045 | const Register& y1 = r14; |
| 1046 | const Register& y2 = r15; |
| 1047 | #ifdef _WIN64 |
| 1048 | const Register& y3 = rcx; |
| 1049 | #else |
| 1050 | const Register& y3 = rdi; |
| 1051 | #endif |
| 1052 | const Register& T1 = r12; |
| 1053 | |
| 1054 | if (iteration % 4 > 0) { |
| 1055 | addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; |
| 1056 | } |
| 1057 | movq(y2, f); //y2 = f; CH |
| 1058 | rorxq(y0, e, 41); //y0 = e >> 41; S1A |
| 1059 | rorxq(y1, e, 18); //y1 = e >> 18; S1B |
| 1060 | xorq(y2, g); //y2 = f^g; CH |
| 1061 | |
| 1062 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1 |
| 1063 | rorxq(y1, e, 14); //y1 = (e >> 14); S1 |
| 1064 | andq(y2, e); //y2 = (f^g)&e; CH |
| 1065 | |
| 1066 | if (iteration % 4 > 0 ) { |
| 1067 | addq(old_h, y3); //h = t1 + S0 + MAJ |
| 1068 | } |
| 1069 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1 |
| 1070 | rorxq(T1, a, 34); //T1 = a >> 34; S0B |
| 1071 | xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH |
| 1072 | rorxq(y1, a, 39); //y1 = a >> 39; S0A |
| 1073 | movq(y3, a); //y3 = a; MAJA |
| 1074 | |
| 1075 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0 |
| 1076 | rorxq(T1, a, 28); //T1 = (a >> 28); S0 |
| 1077 | addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; -- |
| 1078 | orq(y3, c); //y3 = a | c; MAJA |
| 1079 | |
| 1080 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0 |
| 1081 | movq(T1, a); //T1 = a; MAJB |
| 1082 | andq(y3, b); //y3 = (a | c)&b; MAJA |
| 1083 | andq(T1, c); //T1 = a&c; MAJB |
| 1084 | addq(y2, y0); //y2 = S1 + CH; -- |
| 1085 | |
| 1086 | addq(d, h); //d = k + w + h + d; -- |
| 1087 | orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ |
| 1088 | addq(h, y1); //h = k + w + h + S0; -- |
| 1089 | |
| 1090 | addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; -- |
| 1091 | |
| 1092 | if (iteration % 4 == 3) { |
| 1093 | addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; -- |
| 1094 | addq(h, y3); //h = t1 + S0 + MAJ; -- |
| 1095 | } |
| 1096 | } |
| 1097 | |
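|  | // One SHA-512 round fused with a quarter of the message schedule |
|  | //   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], where |
|  | //   s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7) and |
|  | //   s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6). |
|  | // Four consecutive iterations produce the next four message qwords in the |
|  | // register passed as xmm4 (the first parameter). |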
| 1098 | void MacroAssembler::sha512_AVX2_one_round_and_schedule( |
| 1099 | XMMRegister xmm4, // ymm4 |
| 1100 | XMMRegister xmm5, // ymm5 |
| 1101 | XMMRegister xmm6, // ymm6 |
| 1102 | XMMRegister xmm7, // ymm7 |
| 1103 | Register a, //rax |
| 1104 | Register b, //rbx |
| 1105 | Register c, //rdi |
| 1106 | Register d, //rsi |
| 1107 | Register e, //r8 |
| 1108 | Register f, //r9 |
| 1109 | Register g, //r10 |
| 1110 | Register h, //r11 |
| 1111 | int iteration) |
| 1112 | { |
| 1113 | |
| 1114 | const Register& y0 = r13; |
| 1115 | const Register& y1 = r14; |
| 1116 | const Register& y2 = r15; |
| 1117 | #ifdef _WIN64 |
| 1118 | const Register& y3 = rcx; |
| 1119 | #else |
| 1120 | const Register& y3 = rdi; |
| 1121 | #endif |
| 1122 | const Register& T1 = r12; |
| 1123 | |
| 1124 | if (iteration % 4 == 0) { |
| 1125 | // Extract w[t - 7] |
| 1126 | // xmm0 = W[-7] |
| 1127 | vperm2f128(xmm0, xmm7, xmm6, 3); |
| 1128 | vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit); |
| 1129 | |
| 1130 | // Calculate w[t - 16] + w[t - 7] |
| 1131 | vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16] |
| 1132 | // Extract w[t - 15] |
| 1133 | //xmm1 = W[-15] |
| 1134 | vperm2f128(xmm1, xmm5, xmm4, 3); |
| 1135 | vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit); |
| 1136 | |
| 1137 | // Calculate sigma0 |
| 1138 | // Calculate w[t - 15] ror 1 |
| 1139 | vpsrlq(xmm2, xmm1, 1, AVX_256bit); |
| 1140 | vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit); |
| 1141 | vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1 |
| 1142 | // Calculate w[t - 15] shr 7 |
| 1143 | vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7 |
| 1144 | |
| 1145 | } else if (iteration % 4 == 1) { |
| 1146 | //Calculate w[t - 15] ror 8 |
| 1147 | vpsrlq(xmm2, xmm1, 8, AVX_256bit); |
| 1148 | vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit); |
| 1149 | vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8 |
| 1150 | |
| 1151 | //XOR the three components |
| 1152 | vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7 |
| 1153 | vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0 |
| 1154 | |
| 1155 | //Add three components, w[t - 16], w[t - 7] and sigma0 |
| 1156 | vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0 |
| 1157 | |
| 1158 | // Move to appropriate lanes for calculating w[16] and w[17] |
| 1159 | vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA } |
| 1160 | |
| 1161 | //Move to appropriate lanes for calculating w[18] and w[19] |
| 1162 | vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 } |
| 1163 | //Calculate w[16] and w[17] in both 128 bit lanes |
| 1164 | //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes |
| 1165 | vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA} |
| 1166 | vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA} |
| 1167 | |
| 1168 | } else if (iteration % 4 == 2) { |
| 1169 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA} |
| 1170 | vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA} |
| 1171 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA} |
| 1172 | vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} |
| 1173 | vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA} |
| 1174 | vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA} |
| 1175 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA} |
| 1176 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA } |
| 1177 | |
| 1178 | //Add sigma1 to the other components to get w[16] and w[17] |
| 1179 | vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] } |
| 1180 | |
| 1181 | //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane |
| 1182 | vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--} |
| 1183 | |
| 1184 | } else if (iteration % 4 == 3){ |
| 1185 | vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--} |
| 1186 | vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--} |
| 1187 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--} |
| 1188 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} |
| 1189 | vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--} |
| 1190 | vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--} |
| 1191 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--} |
| 1192 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- } |
| 1193 | |
| 1194 | //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] |
| 1195 | vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- } |
| 1196 | |
| 1197 | //Form w[19], w[18], w[17], w[16] |
| 1198 | vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] } |
| 1199 | } |
| 1200 | |
| 1201 | movq(y3, a); //y3 = a; MAJA |
| 1202 | rorxq(y0, e, 41); // y0 = e >> 41; S1A |
| 1203 | rorxq(y1, e, 18); //y1 = e >> 18; S1B |
| 1204 | addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; -- |
| 1205 | orq(y3, c); //y3 = a | c; MAJA |
| 1206 | movq(y2, f); //y2 = f; CH |
| 1207 | |
| 1208 | xorq(y2, g); //y2 = f^g; CH |
| 1209 | |
| 1210 | rorxq(T1, a, 34); //T1 = a >> 34; S0B |
| 1211 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1 |
| 1212 | |
| 1213 | rorxq(y1, e, 14); //y1 = (e >> 14); S1 |
| 1214 | |
| 1215 | andq(y2, e); //y2 = (f^g) & e; CH |
| 1216 | addq(d, h); //d = k + w + h + d; -- |
| 1217 | |
| 1218 | andq(y3, b); //y3 = (a | c)&b; MAJA |
| 1219 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1 |
| 1220 | rorxq(y1, a, 39); //y1 = a >> 39; S0A |
| 1221 | |
| 1222 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0 |
| 1223 | rorxq(T1, a, 28); //T1 = (a >> 28); S0 |
| 1224 | xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH |
| 1225 | |
| 1226 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0 |
| 1227 | movq(T1, a); //T1 = a; MAJB |
| 1228 | |
| 1229 | andq(T1, c); //T1 = a&c; MAJB |
| 1230 | addq(y2, y0); //y2 = S1 + CH; -- |
| 1231 | |
| 1232 | orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ |
| 1233 | addq(h, y1); //h = k + w + h + S0; -- |
| 1234 | |
| 1235 | addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; -- |
| 1236 | addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; -- |
| 1237 | addq(h, y3); //h = t1 + S0 + MAJ; -- |
| 1238 | } |
| 1239 | |
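|  | // AVX2/rorx SHA-512 compression; one 128-byte block is processed per pass of loop0. |
|  | // ofs and limit are used for the multi-block byte array case, as in |
|  | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit), |
|  | // and the updated ofs is left in rax. |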
| 1240 | void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
| 1241 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
| 1242 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
| 1243 | bool multi_block, XMMRegister shuf_mask) |
| 1244 | { |
| 1245 | |
| 1246 | Label loop0, loop1, loop2, done_hash, |
| 1247 | compute_block_size, compute_size, |
| 1248 | compute_block_size_end, compute_size_end; |
| 1249 | |
| 1250 | address K512_W = StubRoutines::x86::k512_W_addr(); |
| 1251 | address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512(); |
| 1252 | address pshuffle_byte_flip_mask_addr = 0; |
| 1253 | |
| 1254 | const XMMRegister& XFER = xmm0; // YTMP0 |
| 1255 | const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9 |
| 1256 | const XMMRegister& YMM_MASK_LO = xmm10; // ymm10 |
| 1257 | #ifdef _WIN64 |
| 1258 | const Register& INP = rcx; //1st arg |
| 1259 | const Register& CTX = rdx; //2nd arg |
| 1260 | const Register& NUM_BLKS = r8; //3rd arg |
| 1261 | const Register& c = rdi; |
| 1262 | const Register& d = rsi; |
| 1263 | const Register& e = r8; |
| 1264 | const Register& y3 = rcx; |
| 1265 | const Register& offset = r8; |
| 1266 | const Register& input_limit = r9; |
| 1267 | #else |
| 1268 | const Register& INP = rdi; //1st arg |
| 1269 | const Register& CTX = rsi; //2nd arg |
| 1270 | const Register& NUM_BLKS = rdx; //3rd arg |
| 1271 | const Register& c = rcx; |
| 1272 | const Register& d = r8; |
| 1273 | const Register& e = rdx; |
| 1274 | const Register& y3 = rdi; |
| 1275 | const Register& offset = rdx; |
| 1276 | const Register& input_limit = rcx; |
| 1277 | #endif |
| 1278 | |
| 1279 | const Register& TBL = rbp; |
| 1280 | |
| 1281 | const Register& a = rax; |
| 1282 | const Register& b = rbx; |
| 1283 | |
| 1284 | const Register& f = r9; |
| 1285 | const Register& g = r10; |
| 1286 | const Register& h = r11; |
| 1287 | |
| 1288 | //Local variables as defined in assembly file. |
| 1289 | enum |
| 1290 | { |
| 1291 | _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8 |
| 1292 | _SRND_SIZE = 8, // resq 1 |
| 1293 | _INP_SIZE = 8, |
| 1294 | _INP_END_SIZE = 8, |
| 1295 | _RSP_SAVE_SIZE = 8, // defined as resq 1 |
| 1296 | |
| 1297 | #ifdef _WIN64 |
| 1298 | _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8 |
| 1299 | #else |
| 1300 | _GPR_SAVE_SIZE = 6 * 8 // resq 6 |
| 1301 | #endif |
| 1302 | }; |
| 1303 | |
| 1304 | enum |
| 1305 | { |
| 1306 | _XFER = 0, |
| 1307 | _SRND = _XFER + _XFER_SIZE, // 32 |
| 1308 | _INP = _SRND + _SRND_SIZE, // 40 |
| 1309 | _INP_END = _INP + _INP_SIZE, // 48 |
| 1310 | _RSP = _INP_END + _INP_END_SIZE, // 56 |
| 1311 | _GPR = _RSP + _RSP_SAVE_SIZE, // 64 |
| 1312 | _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux. |
| 1313 | }; |
| 1314 | |
| 1315 | //Save offset and limit; they are needed for the block-size calculation for multi-block SHA-512. |
| 1316 | #ifdef _WIN64 |
| 1317 | push(r8); // win64: this is ofs |
| 1318 | push(r9); // win64: this is limit, we need them again at the very end. |
| 1319 | #else |
| 1320 | push(rdx); // linux : this is ofs, need at the end for multiblock calculation |
| 1321 | push(rcx); // linux: This is the limit. |
| 1322 | #endif |
| 1323 | |
| 1324 | //Allocate Stack Space |
| 1325 | movq(rax, rsp); |
| 1326 | subq(rsp, _STACK_SIZE); |
| 1327 | andq(rsp, -32); |
| 1328 | movq(Address(rsp, _RSP), rax); |
| 1329 | |
| 1330 | //Save GPRs |
| 1331 | movq(Address(rsp, _GPR), rbp); |
| 1332 | movq(Address(rsp, (_GPR + 8)), rbx); |
| 1333 | movq(Address(rsp, (_GPR + 16)), r12); |
| 1334 | movq(Address(rsp, (_GPR + 24)), r13); |
| 1335 | movq(Address(rsp, (_GPR + 32)), r14); |
| 1336 | movq(Address(rsp, (_GPR + 40)), r15); |
| 1337 | |
| 1338 | #ifdef _WIN64 |
| 1339 | movq(Address(rsp, (_GPR + 48)), rsi); |
| 1340 | movq(Address(rsp, (_GPR + 56)), rdi); |
| 1341 | #endif |
| 1342 | |
| 1343 | vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit); |
| 1344 | vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit); |
| 1345 | |
| 1346 | if (multi_block) { |
| 1347 | xorq(rax, rax); |
| 1348 | bind(compute_block_size); |
| 1349 | cmpptr(offset, input_limit); // Assuming that offset is less than limit. |
| 1350 | jccb(Assembler::aboveEqual, compute_block_size_end); |
| 1351 | addq(offset, 128); |
| 1352 | addq(rax, 128); |
| 1353 | jmpb(compute_block_size); |
| 1354 | |
| 1355 | bind(compute_block_size_end); |
| 1356 | movq(NUM_BLKS, rax); |
| 1357 | |
| 1358 | cmpq(NUM_BLKS, 0); |
| 1359 | jcc(Assembler::equal, done_hash); |
| 1360 | } else { |
| 1361 | xorq(NUM_BLKS, NUM_BLKS); //If single block. |
| 1362 | addq(NUM_BLKS, 128); |
| 1363 | } |
| 1364 | |
| 1365 | addq(NUM_BLKS, INP); //pointer to end of data |
| 1366 | movq(Address(rsp, _INP_END), NUM_BLKS); |
| 1367 | |
| 1368 | //load initial digest |
| 1369 | movq(a, Address(CTX, 8 * 0)); |
| 1370 | movq(b, Address(CTX, 8 * 1)); |
| 1371 | movq(c, Address(CTX, 8 * 2)); |
| 1372 | movq(d, Address(CTX, 8 * 3)); |
| 1373 | movq(e, Address(CTX, 8 * 4)); |
| 1374 | movq(f, Address(CTX, 8 * 5)); |
| 1375 | // load g - r10 after it is used as scratch |
| 1376 | movq(h, Address(CTX, 8 * 7)); |
| 1377 | |
| 1378 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512; |
| 1379 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip |
| 1380 | vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); |
| 1381 | |
| 1382 | movq(g, Address(CTX, 8 * 6)); |
| 1383 | |
| 1384 | bind(loop0); |
| 1385 | lea(TBL, ExternalAddress(K512_W)); |
| 1386 | |
| 1387 | //byte swap first 16 qwords |
| 1388 | vmovdqu(xmm4, Address(INP, 32 * 0)); |
| 1389 | vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit); |
| 1390 | vmovdqu(xmm5, Address(INP, 32 * 1)); |
| 1391 | vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit); |
| 1392 | vmovdqu(xmm6, Address(INP, 32 * 2)); |
| 1393 | vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit); |
| 1394 | vmovdqu(xmm7, Address(INP, 32 * 3)); |
| 1395 | vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit); |
| 1396 | |
| 1397 | movq(Address(rsp, _INP), INP); |
| 1398 | |
| 1399 | movslq(Address(rsp, _SRND), 4); |
| 1400 | align(16); |
| 1401 | |
| 1402 | //Schedule 64 message qwords and run the first 64 rounds, by calling sha512_AVX2_one_round_and_schedule |
| 1403 | bind(loop1); |
| 1404 | vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit); |
| 1405 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1406 | //four rounds and schedule |
| 1407 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0); |
| 1408 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1); |
| 1409 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2); |
| 1410 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3); |
| 1411 | |
| 1412 | vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit); |
| 1413 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1414 | //four rounds and schedule |
| 1415 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0); |
| 1416 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1); |
| 1417 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2); |
| 1418 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3); |
| 1419 | |
| 1420 | vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit); |
| 1421 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1422 | //four rounds and schedule |
| 1423 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0); |
| 1424 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1); |
| 1425 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2); |
| 1426 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3); |
| 1427 | |
| 1428 | vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit); |
| 1429 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1430 | addq(TBL, 4 * 32); |
| 1431 | //four rounds and schedule |
| 1432 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0); |
| 1433 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1); |
| 1434 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2); |
| 1435 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3); |
| 1436 | |
| 1437 | subq(Address(rsp, _SRND), 1); |
| 1438 | jcc(Assembler::notEqual, loop1); |
| 1439 | |
| 1440 | movslq(Address(rsp, _SRND), 2); |
| 1441 | |
| 1442 | bind(loop2); |
| 1443 | vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit); |
| 1444 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1445 | //four rounds and compute. |
| 1446 | sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0); |
| 1447 | sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1); |
| 1448 | sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2); |
| 1449 | sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3); |
| 1450 | |
| 1451 | vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit); |
| 1452 | vmovdqu(Address(rsp, _XFER), xmm0); |
| 1453 | addq(TBL, 2 * 32); |
| 1454 | // four rounds and compute. |
| 1455 | sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0); |
| 1456 | sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1); |
| 1457 | sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2); |
| 1458 | sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3); |
| 1459 | |
| 1460 | vmovdqu(xmm4, xmm6); |
| 1461 | vmovdqu(xmm5, xmm7); |
| 1462 | |
| 1463 | subq(Address(rsp, _SRND), 1); |
| 1464 | jcc(Assembler::notEqual, loop2); |
| 1465 | |
| 1466 | addmq(8 * 0, CTX, a); |
| 1467 | addmq(8 * 1, CTX, b); |
| 1468 | addmq(8 * 2, CTX, c); |
| 1469 | addmq(8 * 3, CTX, d); |
| 1470 | addmq(8 * 4, CTX, e); |
| 1471 | addmq(8 * 5, CTX, f); |
| 1472 | addmq(8 * 6, CTX, g); |
| 1473 | addmq(8 * 7, CTX, h); |
| 1474 | |
| 1475 | movq(INP, Address(rsp, _INP)); |
| 1476 | addq(INP, 128); |
| 1477 | cmpq(INP, Address(rsp, _INP_END)); |
| 1478 | jcc(Assembler::notEqual, loop0); |
| 1479 | |
| 1480 | bind(done_hash); |
| 1481 | |
| 1482 | //Restore GPRs |
| 1483 | movq(rbp, Address(rsp, (_GPR + 0))); |
| 1484 | movq(rbx, Address(rsp, (_GPR + 8))); |
| 1485 | movq(r12, Address(rsp, (_GPR + 16))); |
| 1486 | movq(r13, Address(rsp, (_GPR + 24))); |
| 1487 | movq(r14, Address(rsp, (_GPR + 32))); |
| 1488 | movq(r15, Address(rsp, (_GPR + 40))); |
| 1489 | |
| 1490 | #ifdef _WIN64 |
| 1491 | movq(rsi, Address(rsp, (_GPR + 48))); |
| 1492 | movq(rdi, Address(rsp, (_GPR + 56))); |
| 1493 | #endif |
| 1494 | |
| 1495 | //Restore Stack Pointer |
| 1496 | movq(rsp, Address(rsp, _RSP)); |
| 1497 | |
| 1498 | #ifdef _WIN64 |
| 1499 | pop(r9); |
| 1500 | pop(r8); |
| 1501 | #else |
| 1502 | pop(rcx); |
| 1503 | pop(rdx); |
| 1504 | #endif |
| 1505 | |
| 1506 | if (multi_block) { |
| 1507 | #ifdef _WIN64 |
| 1508 | const Register& limit_end = r9; |
| 1509 | const Register& ofs_end = r8; |
| 1510 | #else |
| 1511 | const Register& limit_end = rcx; |
| 1512 | const Register& ofs_end = rdx; |
| 1513 | #endif |
| 1514 | movq(rax, ofs_end); |
| 1515 | bind(compute_size); |
| 1516 | cmpptr(rax, limit_end); |
| 1517 | jccb(Assembler::aboveEqual, compute_size_end); |
| 1518 | addq(rax, 128); |
| 1519 | jmpb(compute_size); |
| 1520 | bind(compute_size_end); |
| 1521 | } |
| 1522 | } |
| 1523 | |
| 1524 | #endif //#ifdef _LP64 |
| 1525 | |
| 1526 | |