1 | /* |
2 | * Copyright (c) 2016, Intel Corporation. |
3 | * |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
5 | * |
6 | * This code is free software; you can redistribute it and/or modify it |
7 | * under the terms of the GNU General Public License version 2 only, as |
8 | * published by the Free Software Foundation. |
9 | * |
10 | * This code is distributed in the hope that it will be useful, but WITHOUT |
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
13 | * version 2 for more details (a copy is included in the LICENSE file that |
14 | * accompanied this code). |
15 | * |
16 | * You should have received a copy of the GNU General Public License version |
17 | * 2 along with this work; if not, write to the Free Software Foundation, |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
19 | * |
20 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
21 | * or visit www.oracle.com if you need additional information or have any |
22 | * questions. |
23 | * |
24 | */ |
25 | |
26 | #include "precompiled.hpp" |
27 | #include "asm/assembler.hpp" |
28 | #include "asm/assembler.inline.hpp" |
29 | #include "runtime/stubRoutines.hpp" |
30 | #include "macroAssembler_x86.hpp" |
31 | |
32 | // ofs and limit are used for multi-block byte array. |
33 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
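//
// A rough sketch of the contract implemented below (illustrative only; the helper
// name is made up, but the ofs/limit/return-value behaviour matches the code
// generated for the multi_block case):
//
//   int ofs_out = ofs;
//   do {
//     compress_one_64_byte_block(state, b + ofs_out);
//     ofs_out += 64;
//   } while (multi_block && ofs_out <= limit);
//   return ofs_out;   // returned in rax when multi_block is true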
34 | void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, |
35 | XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, |
36 | Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) { |
37 | |
38 | Label start, done_hash, loop0; |
39 | |
40 | address upper_word_mask = StubRoutines::x86::upper_word_mask_addr(); |
41 | address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr(); |
42 | |
43 | bind(start); |
44 | movdqu(abcd, Address(state, 0)); |
45 | pinsrd(e0, Address(state, 16), 3); |
46 | movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000 |
47 | pand(e0, shuf_mask); |
48 | pshufd(abcd, abcd, 0x1B); |
49 | movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f |
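  // At this point abcd holds {a,b,c,d} reversed into the dword order sha1rnds4
  // expects, e0 carries E in its top dword (the other dwords are masked to zero),
  // and shuf_mask holds the big-endian byte flip applied to every message word below.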
50 | |
51 | bind(loop0); |
52 | // Save hash values for addition after rounds |
53 | movdqu(Address(rsp, 0), e0); |
54 | movdqu(Address(rsp, 16), abcd); |
55 | |
56 | |
57 | // Rounds 0 - 3 |
58 | movdqu(msg0, Address(buf, 0)); |
59 | pshufb(msg0, shuf_mask); |
60 | paddd(e0, msg0); |
61 | movdqa(e1, abcd); |
62 | sha1rnds4(abcd, e0, 0); |
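  // Each sha1rnds4 performs four SHA-1 rounds; its immediate selects the round
  // function and constant group (0: rounds 0-19, 1: 20-39, 2: 40-59, 3: 60-79).
  // sha1nexte derives the E operand for the next four rounds from the earlier A
  // value and the scheduled message words, while sha1msg1/sha1msg2 plus the pxor
  // compute the W[t] message schedule. The same pattern repeats for the remaining
  // four-round groups below.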
63 | |
64 | // Rounds 4 - 7 |
65 | movdqu(msg1, Address(buf, 16)); |
66 | pshufb(msg1, shuf_mask); |
67 | sha1nexte(e1, msg1); |
68 | movdqa(e0, abcd); |
69 | sha1rnds4(abcd, e1, 0); |
70 | sha1msg1(msg0, msg1); |
71 | |
72 | // Rounds 8 - 11 |
73 | movdqu(msg2, Address(buf, 32)); |
74 | pshufb(msg2, shuf_mask); |
75 | sha1nexte(e0, msg2); |
76 | movdqa(e1, abcd); |
77 | sha1rnds4(abcd, e0, 0); |
78 | sha1msg1(msg1, msg2); |
79 | pxor(msg0, msg2); |
80 | |
81 | // Rounds 12 - 15 |
82 | movdqu(msg3, Address(buf, 48)); |
83 | pshufb(msg3, shuf_mask); |
84 | sha1nexte(e1, msg3); |
85 | movdqa(e0, abcd); |
86 | sha1msg2(msg0, msg3); |
87 | sha1rnds4(abcd, e1, 0); |
88 | sha1msg1(msg2, msg3); |
89 | pxor(msg1, msg3); |
90 | |
91 | // Rounds 16 - 19 |
92 | sha1nexte(e0, msg0); |
93 | movdqa(e1, abcd); |
94 | sha1msg2(msg1, msg0); |
95 | sha1rnds4(abcd, e0, 0); |
96 | sha1msg1(msg3, msg0); |
97 | pxor(msg2, msg0); |
98 | |
99 | // Rounds 20 - 23 |
100 | sha1nexte(e1, msg1); |
101 | movdqa(e0, abcd); |
102 | sha1msg2(msg2, msg1); |
103 | sha1rnds4(abcd, e1, 1); |
104 | sha1msg1(msg0, msg1); |
105 | pxor(msg3, msg1); |
106 | |
107 | // Rounds 24 - 27 |
108 | sha1nexte(e0, msg2); |
109 | movdqa(e1, abcd); |
110 | sha1msg2(msg3, msg2); |
111 | sha1rnds4(abcd, e0, 1); |
112 | sha1msg1(msg1, msg2); |
113 | pxor(msg0, msg2); |
114 | |
115 | // Rounds 28 - 31 |
116 | sha1nexte(e1, msg3); |
117 | movdqa(e0, abcd); |
118 | sha1msg2(msg0, msg3); |
119 | sha1rnds4(abcd, e1, 1); |
120 | sha1msg1(msg2, msg3); |
121 | pxor(msg1, msg3); |
122 | |
123 | // Rounds 32 - 35 |
124 | sha1nexte(e0, msg0); |
125 | movdqa(e1, abcd); |
126 | sha1msg2(msg1, msg0); |
127 | sha1rnds4(abcd, e0, 1); |
128 | sha1msg1(msg3, msg0); |
129 | pxor(msg2, msg0); |
130 | |
131 | // Rounds 36 - 39 |
132 | sha1nexte(e1, msg1); |
133 | movdqa(e0, abcd); |
134 | sha1msg2(msg2, msg1); |
135 | sha1rnds4(abcd, e1, 1); |
136 | sha1msg1(msg0, msg1); |
137 | pxor(msg3, msg1); |
138 | |
139 | // Rounds 40 - 43 |
140 | sha1nexte(e0, msg2); |
141 | movdqa(e1, abcd); |
142 | sha1msg2(msg3, msg2); |
143 | sha1rnds4(abcd, e0, 2); |
144 | sha1msg1(msg1, msg2); |
145 | pxor(msg0, msg2); |
146 | |
147 | // Rounds 44 - 47 |
148 | sha1nexte(e1, msg3); |
149 | movdqa(e0, abcd); |
150 | sha1msg2(msg0, msg3); |
151 | sha1rnds4(abcd, e1, 2); |
152 | sha1msg1(msg2, msg3); |
153 | pxor(msg1, msg3); |
154 | |
155 | // Rounds 48 - 51 |
156 | sha1nexte(e0, msg0); |
157 | movdqa(e1, abcd); |
158 | sha1msg2(msg1, msg0); |
159 | sha1rnds4(abcd, e0, 2); |
160 | sha1msg1(msg3, msg0); |
161 | pxor(msg2, msg0); |
162 | |
163 | // Rounds 52 - 55 |
164 | sha1nexte(e1, msg1); |
165 | movdqa(e0, abcd); |
166 | sha1msg2(msg2, msg1); |
167 | sha1rnds4(abcd, e1, 2); |
168 | sha1msg1(msg0, msg1); |
169 | pxor(msg3, msg1); |
170 | |
171 | // Rounds 56 - 59 |
172 | sha1nexte(e0, msg2); |
173 | movdqa(e1, abcd); |
174 | sha1msg2(msg3, msg2); |
175 | sha1rnds4(abcd, e0, 2); |
176 | sha1msg1(msg1, msg2); |
177 | pxor(msg0, msg2); |
178 | |
179 | // Rounds 60 - 63 |
180 | sha1nexte(e1, msg3); |
181 | movdqa(e0, abcd); |
182 | sha1msg2(msg0, msg3); |
183 | sha1rnds4(abcd, e1, 3); |
184 | sha1msg1(msg2, msg3); |
185 | pxor(msg1, msg3); |
186 | |
187 | // Rounds 64 - 67 |
188 | sha1nexte(e0, msg0); |
189 | movdqa(e1, abcd); |
190 | sha1msg2(msg1, msg0); |
191 | sha1rnds4(abcd, e0, 3); |
192 | sha1msg1(msg3, msg0); |
193 | pxor(msg2, msg0); |
194 | |
195 | // Rounds 68 - 71 |
196 | sha1nexte(e1, msg1); |
197 | movdqa(e0, abcd); |
198 | sha1msg2(msg2, msg1); |
199 | sha1rnds4(abcd, e1, 3); |
200 | pxor(msg3, msg1); |
201 | |
202 | // Rounds 72 - 75 |
203 | sha1nexte(e0, msg2); |
204 | movdqa(e1, abcd); |
205 | sha1msg2(msg3, msg2); |
206 | sha1rnds4(abcd, e0, 3); |
207 | |
208 | // Rounds 76 - 79 |
209 | sha1nexte(e1, msg3); |
210 | movdqa(e0, abcd); |
211 | sha1rnds4(abcd, e1, 3); |
212 | |
  // add current hash values to the previously saved ones
214 | movdqu(msg0, Address(rsp, 0)); |
215 | sha1nexte(e0, msg0); |
216 | movdqu(msg0, Address(rsp, 16)); |
217 | paddd(abcd, msg0); |
218 | |
219 | if (multi_block) { |
220 | // increment data pointer and loop if more to process |
221 | addptr(buf, 64); |
222 | addptr(ofs, 64); |
223 | cmpptr(ofs, limit); |
224 | jcc(Assembler::belowEqual, loop0); |
225 | movptr(rax, ofs); //return ofs |
226 | } |
227 | // write hash values back in the correct order |
228 | pshufd(abcd, abcd, 0x1b); |
229 | movdqu(Address(state, 0), abcd); |
230 | pextrd(Address(state, 16), e0, 3); |
231 | |
232 | bind(done_hash); |
233 | |
234 | } |
235 | |
// xmm0 (msg) is used as an implicit argument to sha256rnds2,
// so state0 and state1 must never be assigned the xmm0 register.
238 | // ofs and limit are used for multi-block byte array. |
239 | // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) |
240 | #ifdef _LP64 |
241 | void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
242 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
243 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
244 | bool multi_block, XMMRegister shuf_mask) { |
245 | #else |
246 | void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
247 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
248 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
249 | bool multi_block) { |
250 | #endif |
251 | Label start, done_hash, loop0; |
252 | |
253 | address K256 = StubRoutines::x86::k256_addr(); |
254 | address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); |
255 | |
256 | bind(start); |
257 | movdqu(state0, Address(state, 0)); |
258 | movdqu(state1, Address(state, 16)); |
259 | |
260 | pshufd(state0, state0, 0xB1); |
261 | pshufd(state1, state1, 0x1B); |
262 | movdqa(msgtmp4, state0); |
263 | palignr(state0, state1, 8); |
264 | pblendw(state1, msgtmp4, 0xF0); |
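  // sha256rnds2 works on the state split as {A,B,E,F} and {C,D,G,H}; the
  // pshufd/palignr/pblendw sequence above converts the linear {a,b,c,d} and
  // {e,f,g,h} words loaded from 'state' into that layout (it is undone again
  // after the last block before the result is stored back).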
265 | |
266 | #ifdef _LP64 |
267 | movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); |
268 | #endif |
269 | lea(rax, ExternalAddress(K256)); |
270 | |
271 | bind(loop0); |
272 | movdqu(Address(rsp, 0), state0); |
273 | movdqu(Address(rsp, 16), state1); |
274 | |
275 | // Rounds 0-3 |
276 | movdqu(msg, Address(buf, 0)); |
277 | #ifdef _LP64 |
278 | pshufb(msg, shuf_mask); |
279 | #else |
280 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
281 | #endif |
282 | movdqa(msgtmp0, msg); |
283 | paddd(msg, Address(rax, 0)); |
284 | sha256rnds2(state1, state0); |
285 | pshufd(msg, msg, 0x0E); |
286 | sha256rnds2(state0, state1); |
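  // Each sha256rnds2 performs two rounds, taking its message-plus-constant words
  // from the low 64 bits of the implicit xmm0 (msg) register; the
  // pshufd(msg, msg, 0x0E) moves the upper two words down for the second pair of
  // rounds. Every four-round group below follows this same pattern.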
287 | |
288 | // Rounds 4-7 |
289 | movdqu(msg, Address(buf, 16)); |
290 | #ifdef _LP64 |
291 | pshufb(msg, shuf_mask); |
292 | #else |
293 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
294 | #endif |
295 | movdqa(msgtmp1, msg); |
296 | paddd(msg, Address(rax, 16)); |
297 | sha256rnds2(state1, state0); |
298 | pshufd(msg, msg, 0x0E); |
299 | sha256rnds2(state0, state1); |
300 | sha256msg1(msgtmp0, msgtmp1); |
301 | |
302 | // Rounds 8-11 |
303 | movdqu(msg, Address(buf, 32)); |
304 | #ifdef _LP64 |
305 | pshufb(msg, shuf_mask); |
306 | #else |
307 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
308 | #endif |
309 | movdqa(msgtmp2, msg); |
310 | paddd(msg, Address(rax, 32)); |
311 | sha256rnds2(state1, state0); |
312 | pshufd(msg, msg, 0x0E); |
313 | sha256rnds2(state0, state1); |
314 | sha256msg1(msgtmp1, msgtmp2); |
315 | |
316 | // Rounds 12-15 |
317 | movdqu(msg, Address(buf, 48)); |
318 | #ifdef _LP64 |
319 | pshufb(msg, shuf_mask); |
320 | #else |
321 | pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); |
322 | #endif |
323 | movdqa(msgtmp3, msg); |
324 | paddd(msg, Address(rax, 48)); |
325 | sha256rnds2(state1, state0); |
326 | movdqa(msgtmp4, msgtmp3); |
327 | palignr(msgtmp4, msgtmp2, 4); |
328 | paddd(msgtmp0, msgtmp4); |
329 | sha256msg2(msgtmp0, msgtmp3); |
330 | pshufd(msg, msg, 0x0E); |
331 | sha256rnds2(state0, state1); |
332 | sha256msg1(msgtmp2, msgtmp3); |
333 | |
334 | // Rounds 16-19 |
335 | movdqa(msg, msgtmp0); |
336 | paddd(msg, Address(rax, 64)); |
337 | sha256rnds2(state1, state0); |
338 | movdqa(msgtmp4, msgtmp0); |
339 | palignr(msgtmp4, msgtmp3, 4); |
340 | paddd(msgtmp1, msgtmp4); |
341 | sha256msg2(msgtmp1, msgtmp0); |
342 | pshufd(msg, msg, 0x0E); |
343 | sha256rnds2(state0, state1); |
344 | sha256msg1(msgtmp3, msgtmp0); |
345 | |
346 | // Rounds 20-23 |
347 | movdqa(msg, msgtmp1); |
348 | paddd(msg, Address(rax, 80)); |
349 | sha256rnds2(state1, state0); |
350 | movdqa(msgtmp4, msgtmp1); |
351 | palignr(msgtmp4, msgtmp0, 4); |
352 | paddd(msgtmp2, msgtmp4); |
353 | sha256msg2(msgtmp2, msgtmp1); |
354 | pshufd(msg, msg, 0x0E); |
355 | sha256rnds2(state0, state1); |
356 | sha256msg1(msgtmp0, msgtmp1); |
357 | |
358 | // Rounds 24-27 |
359 | movdqa(msg, msgtmp2); |
360 | paddd(msg, Address(rax, 96)); |
361 | sha256rnds2(state1, state0); |
362 | movdqa(msgtmp4, msgtmp2); |
363 | palignr(msgtmp4, msgtmp1, 4); |
364 | paddd(msgtmp3, msgtmp4); |
365 | sha256msg2(msgtmp3, msgtmp2); |
366 | pshufd(msg, msg, 0x0E); |
367 | sha256rnds2(state0, state1); |
368 | sha256msg1(msgtmp1, msgtmp2); |
369 | |
370 | // Rounds 28-31 |
371 | movdqa(msg, msgtmp3); |
372 | paddd(msg, Address(rax, 112)); |
373 | sha256rnds2(state1, state0); |
374 | movdqa(msgtmp4, msgtmp3); |
375 | palignr(msgtmp4, msgtmp2, 4); |
376 | paddd(msgtmp0, msgtmp4); |
377 | sha256msg2(msgtmp0, msgtmp3); |
378 | pshufd(msg, msg, 0x0E); |
379 | sha256rnds2(state0, state1); |
380 | sha256msg1(msgtmp2, msgtmp3); |
381 | |
382 | // Rounds 32-35 |
383 | movdqa(msg, msgtmp0); |
384 | paddd(msg, Address(rax, 128)); |
385 | sha256rnds2(state1, state0); |
386 | movdqa(msgtmp4, msgtmp0); |
387 | palignr(msgtmp4, msgtmp3, 4); |
388 | paddd(msgtmp1, msgtmp4); |
389 | sha256msg2(msgtmp1, msgtmp0); |
390 | pshufd(msg, msg, 0x0E); |
391 | sha256rnds2(state0, state1); |
392 | sha256msg1(msgtmp3, msgtmp0); |
393 | |
394 | // Rounds 36-39 |
395 | movdqa(msg, msgtmp1); |
396 | paddd(msg, Address(rax, 144)); |
397 | sha256rnds2(state1, state0); |
398 | movdqa(msgtmp4, msgtmp1); |
399 | palignr(msgtmp4, msgtmp0, 4); |
400 | paddd(msgtmp2, msgtmp4); |
401 | sha256msg2(msgtmp2, msgtmp1); |
402 | pshufd(msg, msg, 0x0E); |
403 | sha256rnds2(state0, state1); |
404 | sha256msg1(msgtmp0, msgtmp1); |
405 | |
406 | // Rounds 40-43 |
407 | movdqa(msg, msgtmp2); |
408 | paddd(msg, Address(rax, 160)); |
409 | sha256rnds2(state1, state0); |
410 | movdqa(msgtmp4, msgtmp2); |
411 | palignr(msgtmp4, msgtmp1, 4); |
412 | paddd(msgtmp3, msgtmp4); |
413 | sha256msg2(msgtmp3, msgtmp2); |
414 | pshufd(msg, msg, 0x0E); |
415 | sha256rnds2(state0, state1); |
416 | sha256msg1(msgtmp1, msgtmp2); |
417 | |
418 | // Rounds 44-47 |
419 | movdqa(msg, msgtmp3); |
420 | paddd(msg, Address(rax, 176)); |
421 | sha256rnds2(state1, state0); |
422 | movdqa(msgtmp4, msgtmp3); |
423 | palignr(msgtmp4, msgtmp2, 4); |
424 | paddd(msgtmp0, msgtmp4); |
425 | sha256msg2(msgtmp0, msgtmp3); |
426 | pshufd(msg, msg, 0x0E); |
427 | sha256rnds2(state0, state1); |
428 | sha256msg1(msgtmp2, msgtmp3); |
429 | |
430 | // Rounds 48-51 |
431 | movdqa(msg, msgtmp0); |
432 | paddd(msg, Address(rax, 192)); |
433 | sha256rnds2(state1, state0); |
434 | movdqa(msgtmp4, msgtmp0); |
435 | palignr(msgtmp4, msgtmp3, 4); |
436 | paddd(msgtmp1, msgtmp4); |
437 | sha256msg2(msgtmp1, msgtmp0); |
438 | pshufd(msg, msg, 0x0E); |
439 | sha256rnds2(state0, state1); |
440 | sha256msg1(msgtmp3, msgtmp0); |
441 | |
442 | // Rounds 52-55 |
443 | movdqa(msg, msgtmp1); |
444 | paddd(msg, Address(rax, 208)); |
445 | sha256rnds2(state1, state0); |
446 | movdqa(msgtmp4, msgtmp1); |
447 | palignr(msgtmp4, msgtmp0, 4); |
448 | paddd(msgtmp2, msgtmp4); |
449 | sha256msg2(msgtmp2, msgtmp1); |
450 | pshufd(msg, msg, 0x0E); |
451 | sha256rnds2(state0, state1); |
452 | |
453 | // Rounds 56-59 |
454 | movdqa(msg, msgtmp2); |
455 | paddd(msg, Address(rax, 224)); |
456 | sha256rnds2(state1, state0); |
457 | movdqa(msgtmp4, msgtmp2); |
458 | palignr(msgtmp4, msgtmp1, 4); |
459 | paddd(msgtmp3, msgtmp4); |
460 | sha256msg2(msgtmp3, msgtmp2); |
461 | pshufd(msg, msg, 0x0E); |
462 | sha256rnds2(state0, state1); |
463 | |
464 | // Rounds 60-63 |
465 | movdqa(msg, msgtmp3); |
466 | paddd(msg, Address(rax, 240)); |
467 | sha256rnds2(state1, state0); |
468 | pshufd(msg, msg, 0x0E); |
469 | sha256rnds2(state0, state1); |
470 | movdqu(msg, Address(rsp, 0)); |
471 | paddd(state0, msg); |
472 | movdqu(msg, Address(rsp, 16)); |
473 | paddd(state1, msg); |
474 | |
475 | if (multi_block) { |
476 | // increment data pointer and loop if more to process |
477 | addptr(buf, 64); |
478 | addptr(ofs, 64); |
479 | cmpptr(ofs, limit); |
480 | jcc(Assembler::belowEqual, loop0); |
481 | movptr(rax, ofs); //return ofs |
482 | } |
483 | |
484 | pshufd(state0, state0, 0x1B); |
485 | pshufd(state1, state1, 0xB1); |
486 | movdqa(msgtmp4, state0); |
487 | pblendw(state0, state1, 0xF0); |
488 | palignr(state1, msgtmp4, 8); |
489 | |
490 | movdqu(Address(state, 0), state0); |
491 | movdqu(Address(state, 16), state1); |
492 | |
493 | bind(done_hash); |
494 | |
495 | } |
496 | |
497 | #ifdef _LP64 |
498 | /* |
  The algorithm below is based on the Intel publication:
  "Fast SHA-256 Implementations on Intel Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching the generated code with its original.
  The Java version was substantially redesigned to replace roughly 1200 assembly instructions with
  a much shorter run-time generator of the same code in memory.
505 | */ |
506 | |
507 | void MacroAssembler::sha256_AVX2_one_round_compute( |
508 | Register reg_old_h, |
509 | Register reg_a, |
510 | Register reg_b, |
511 | Register reg_c, |
512 | Register reg_d, |
513 | Register reg_e, |
514 | Register reg_f, |
515 | Register reg_g, |
516 | Register reg_h, |
517 | int iter) { |
518 | const Register& reg_y0 = r13; |
519 | const Register& reg_y1 = r14; |
520 | const Register& reg_y2 = r15; |
521 | const Register& reg_y3 = rcx; |
522 | const Register& reg_T1 = r12; |
523 | //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;; |
524 | if (iter%4 > 0) { |
525 | addl(reg_old_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
526 | } |
527 | movl(reg_y2, reg_f); // reg_y2 = reg_f ; CH |
528 | rorxd(reg_y0, reg_e, 25); // reg_y0 = reg_e >> 25 ; S1A |
529 | rorxd(reg_y1, reg_e, 11); // reg_y1 = reg_e >> 11 ; S1B |
530 | xorl(reg_y2, reg_g); // reg_y2 = reg_f^reg_g ; CH |
531 | |
  xorl(reg_y0, reg_y1);                  // reg_y0 = (reg_e>>25) ^ (reg_e>>11)     ; S1
533 | rorxd(reg_y1, reg_e, 6); // reg_y1 = (reg_e >> 6) ; S1 |
  andl(reg_y2, reg_e);                   // reg_y2 = (reg_f^reg_g)&reg_e           ; CH
535 | |
536 | if (iter%4 > 0) { |
537 | addl(reg_old_h, reg_y3); // reg_h = t1 + S0 + MAJ ; -- |
538 | } |
539 | |
540 | xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1 |
541 | rorxd(reg_T1, reg_a, 13); // reg_T1 = reg_a >> 13 ; S0B |
  xorl(reg_y2, reg_g);                   // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
543 | rorxd(reg_y1, reg_a, 22); // reg_y1 = reg_a >> 22 ; S0A |
544 | movl(reg_y3, reg_a); // reg_y3 = reg_a ; MAJA |
545 | |
546 | xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0 |
547 | rorxd(reg_T1, reg_a, 2); // reg_T1 = (reg_a >> 2) ; S0 |
548 | addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; -- |
549 | orl(reg_y3, reg_c); // reg_y3 = reg_a|reg_c ; MAJA |
550 | |
551 | xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0 |
552 | movl(reg_T1, reg_a); // reg_T1 = reg_a ; MAJB |
  andl(reg_y3, reg_b);                   // reg_y3 = (reg_a|reg_c)&reg_b           ; MAJA
  andl(reg_T1, reg_c);                   // reg_T1 = reg_a&reg_c                   ; MAJB
555 | addl(reg_y2, reg_y0); // reg_y2 = S1 + CH ; -- |
556 | |
557 | |
558 | addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; -- |
  orl(reg_y3, reg_T1);                   // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
560 | addl(reg_h, reg_y1); // reg_h = k + w + reg_h + S0 ; -- |
561 | |
562 | addl(reg_d, reg_y2); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; -- |
563 | |
564 | |
565 | if (iter%4 == 3) { |
566 | addl(reg_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
567 | addl(reg_h, reg_y3); // reg_h = t1 + S0 + MAJ ; -- |
568 | } |
569 | } |
570 | |
571 | void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) { |
572 | sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0); |
573 | sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1); |
574 | sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2); |
575 | sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3); |
576 | } |
577 | |
578 | void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) { |
579 | sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0); |
580 | sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1); |
581 | sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2); |
582 | sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3); |
583 | } |
584 | |
585 | void MacroAssembler::sha256_AVX2_one_round_and_sched( |
586 | XMMRegister xmm_0, /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */ |
587 | XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */ |
588 | XMMRegister xmm_2, /* ymm6 */ |
589 | XMMRegister xmm_3, /* ymm7 */ |
  Register reg_a, /* == rax on 0 iteration, then rotate 8 registers right on each next iteration */
591 | Register reg_b, /* rbx */ /* full cycle is 8 iterations */ |
592 | Register reg_c, /* rdi */ |
593 | Register reg_d, /* rsi */ |
594 | Register reg_e, /* r8 */ |
595 | Register reg_f, /* r9d */ |
596 | Register reg_g, /* r10d */ |
597 | Register reg_h, /* r11d */ |
598 | int iter) |
599 | { |
600 | movl(rcx, reg_a); // rcx = reg_a ; MAJA |
601 | rorxd(r13, reg_e, 25); // r13 = reg_e >> 25 ; S1A |
602 | rorxd(r14, reg_e, 11); // r14 = reg_e >> 11 ; S1B |
603 | addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); |
604 | orl(rcx, reg_c); // rcx = reg_a|reg_c ; MAJA |
605 | |
606 | movl(r15, reg_f); // r15 = reg_f ; CH |
607 | rorxd(r12, reg_a, 13); // r12 = reg_a >> 13 ; S0B |
608 | xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1 |
609 | xorl(r15, reg_g); // r15 = reg_f^reg_g ; CH |
610 | |
611 | rorxd(r14, reg_e, 6); // r14 = (reg_e >> 6) ; S1 |
  andl(r15, reg_e);                      // r15 = (reg_f^reg_g)&reg_e ; CH
613 | |
614 | xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1 |
615 | rorxd(r14, reg_a, 22); // r14 = reg_a >> 22 ; S0A |
616 | addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; -- |
617 | |
  andl(rcx, reg_b);                      // rcx = (reg_a|reg_c)&reg_b ; MAJA
619 | xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0 |
620 | |
621 | rorxd(r12, reg_a, 2); // r12 = (reg_a >> 2) ; S0 |
  xorl(r15, reg_g);                      // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
623 | |
624 | xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0 |
625 | movl(r12, reg_a); // r12 = reg_a ; MAJB |
  andl(r12, reg_c);                      // r12 = reg_a&reg_c ; MAJB
627 | addl(r15, r13); // r15 = S1 + CH ; -- |
628 | |
  orl(rcx, r12);                         // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
630 | addl(reg_h, r14); // reg_h = k + w + reg_h + S0 ; -- |
631 | addl(reg_d, r15); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; -- |
632 | |
633 | addl(reg_h, r15); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; -- |
634 | addl(reg_h, rcx); // reg_h = t1 + S0 + MAJ ; -- |
635 | |
636 | if (iter%4 == 0) { |
637 | vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit); // ymm0 = W[-7] |
638 | vpaddd(xmm0, xmm0, xmm_0, AVX_256bit); // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1 |
639 | vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit); // ymm1 = W[-15] |
640 | vpsrld(xmm2, xmm1, 7, AVX_256bit); |
641 | vpslld(xmm3, xmm1, 32-7, AVX_256bit); |
642 | vpor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 |
643 | vpsrld(xmm2, xmm1,18, AVX_256bit); |
644 | } else if (iter%4 == 1 ) { |
645 | vpsrld(xmm8, xmm1, 3, AVX_256bit); // ymm8 = W[-15] >> 3 |
646 | vpslld(xmm1, xmm1, 32-18, AVX_256bit); |
647 | vpxor(xmm3, xmm3, xmm1, AVX_256bit); |
648 | vpxor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 ^ W[-15] ror 18 |
649 | vpxor(xmm1, xmm3, xmm8, AVX_256bit); // ymm1 = s0 |
650 | vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit); // 11111010b ; ymm2 = W[-2] {BBAA} |
651 | vpaddd(xmm0, xmm0, xmm1, AVX_256bit); // ymm0 = W[-16] + W[-7] + s0 |
652 | vpsrld(xmm8, xmm2, 10, AVX_256bit); // ymm8 = W[-2] >> 10 {BBAA} |
653 | } else if (iter%4 == 2) { |
654 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xBxA} |
655 | vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xBxA} |
656 | vpxor(xmm2, xmm2, xmm3, AVX_256bit); |
657 | vpxor(xmm8, xmm8, xmm2, AVX_256bit); // ymm8 = s1 {xBxA} |
658 | vpshufb(xmm8, xmm8, xmm10, AVX_256bit); // ymm8 = s1 {00BA} |
659 | vpaddd(xmm0, xmm0, xmm8, AVX_256bit); // ymm0 = {..., ..., W[1], W[0]} |
660 | vpshufd(xmm2, xmm0, 0x50, AVX_256bit); // 01010000b ; ymm2 = W[-2] {DDCC} |
661 | } else if (iter%4 == 3) { |
662 | vpsrld(xmm11, xmm2, 10, AVX_256bit); // ymm11 = W[-2] >> 10 {DDCC} |
663 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xDxC} |
664 | vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xDxC} |
665 | vpxor(xmm2, xmm2, xmm3, AVX_256bit); |
666 | vpxor(xmm11, xmm11, xmm2, AVX_256bit); // ymm11 = s1 {xDxC} |
667 | vpshufb(xmm11, xmm11, xmm12, AVX_256bit); // ymm11 = s1 {DC00} |
668 | vpaddd(xmm_0, xmm11, xmm0, AVX_256bit); // xmm_0 = {W[3], W[2], W[1], W[0]} |
669 | } |
670 | } |
671 | |
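// addm/addmq add the 32-bit (64-bit) word at [r1 + disp] into r2 and store the sum
// back to memory, i.e. state[i] += working_variable; they are used to fold the
// working variables back into the hash state after each block.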
672 | void MacroAssembler::addm(int disp, Register r1, Register r2) { |
673 | addl(r2, Address(r1, disp)); |
674 | movl(Address(r1, disp), r2); |
675 | } |
676 | |
677 | void MacroAssembler::addmq(int disp, Register r1, Register r2) { |
678 | addq(r2, Address(r1, disp)); |
679 | movq(Address(r1, disp), r2); |
680 | } |
681 | |
682 | void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
683 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
684 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
685 | bool multi_block, XMMRegister shuf_mask) { |
686 | |
687 | Label loop0, loop1, loop2, loop3, |
688 | last_block_enter, do_last_block, only_one_block, done_hash, |
689 | compute_size, compute_size_end, |
690 | compute_size1, compute_size_end1; |
691 | |
692 | address K256_W = StubRoutines::x86::k256_W_addr(); |
693 | address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); |
694 | address pshuffle_byte_flip_mask_addr = 0; |
695 | |
696 | const XMMRegister& SHUF_00BA = xmm10; // ymm10: shuffle xBxA -> 00BA |
697 | const XMMRegister& SHUF_DC00 = xmm12; // ymm12: shuffle xDxC -> DC00 |
698 | const XMMRegister& BYTE_FLIP_MASK = xmm13; // ymm13 |
699 | |
700 | const XMMRegister& X_BYTE_FLIP_MASK = xmm13; //XMM version of BYTE_FLIP_MASK |
701 | |
702 | const Register& NUM_BLKS = r8; // 3rd arg |
703 | const Register& CTX = rdx; // 2nd arg |
704 | const Register& INP = rcx; // 1st arg |
705 | |
706 | const Register& c = rdi; |
707 | const Register& d = rsi; |
708 | const Register& e = r8; // clobbers NUM_BLKS |
709 | const Register& y3 = rcx; // clobbers INP |
710 | |
711 | const Register& TBL = rbp; |
712 | const Register& SRND = CTX; // SRND is same register as CTX |
713 | |
714 | const Register& a = rax; |
715 | const Register& b = rbx; |
716 | const Register& f = r9; |
717 | const Register& g = r10; |
718 | const Register& h = r11; |
719 | |
720 | const Register& T1 = r12; |
721 | const Register& y0 = r13; |
722 | const Register& y1 = r14; |
723 | const Register& y2 = r15; |
724 | |
725 | |
726 | enum { |
727 | _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round |
728 | _INP_END_SIZE = 8, |
729 | _INP_SIZE = 8, |
730 | _CTX_SIZE = 8, |
731 | _RSP_SIZE = 8, |
732 | |
733 | _XFER = 0, |
734 | _INP_END = _XFER + _XFER_SIZE, |
735 | _INP = _INP_END + _INP_END_SIZE, |
736 | _CTX = _INP + _INP_SIZE, |
737 | _RSP = _CTX + _CTX_SIZE, |
738 | STACK_SIZE = _RSP + _RSP_SIZE |
739 | }; |
740 | |
741 | #ifndef _WIN64 |
742 | push(rcx); // linux: this is limit, need at the end |
743 | push(rdx); // linux: this is ofs |
744 | #else |
745 | push(r8); // win64: this is ofs |
  push(r9); // win64: this is limit, we need them again at the very end
747 | #endif |
748 | |
749 | |
750 | push(rbx); |
751 | #ifdef _WIN64 |
752 | push(rsi); |
753 | push(rdi); |
754 | #endif |
755 | push(rbp); |
756 | push(r12); |
757 | push(r13); |
758 | push(r14); |
759 | push(r15); |
760 | |
761 | movq(rax, rsp); |
762 | subq(rsp, STACK_SIZE); |
763 | andq(rsp, -32); |
764 | movq(Address(rsp, _RSP), rax); |
765 | |
766 | #ifndef _WIN64 |
  // copy Linux parameters into the Win64 argument registers so the rest of the code is the same for both ABIs
768 | movq(r9, rcx); |
769 | movq(r8, rdx); |
770 | movq(rdx, rsi); |
771 | movq(rcx, rdi); |
772 | #endif |
773 | |
774 | // setting original assembly ABI |
  /** message to hash is in INP */
776 | lea(INP, Address(rcx, 0)); // rcx == message (buf) ;; linux: INP = buf = rdi |
777 | /** digest in CTX */ |
778 | movq(CTX, rdx); // rdx = digest (state) ;; linux: CTX = state = rsi |
779 | |
  /** NUM_BLKS is the byte length of the message to process; it has to be derived from ofs and limit */
781 | if (multi_block) { |
782 | |
783 | // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8 |
784 | // on entry r8 = ofs |
785 | // on exit r8 = NUM_BLKS |
786 | |
787 | xorq(rax, rax); |
788 | |
789 | bind(compute_size); |
790 | cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx |
791 | jccb(Assembler::aboveEqual, compute_size_end); |
792 | addq(r8, 64); //;; linux: ofs = rdx |
793 | addq(rax, 64); |
794 | jmpb(compute_size); |
795 | |
796 | bind(compute_size_end); |
797 | movq(NUM_BLKS, rax); // NUM_BLK (r8) ;; linux: NUM_BLK = rdx |
798 | |
799 | cmpq(NUM_BLKS, 0); |
800 | jcc(Assembler::equal, done_hash); |
801 | |
802 | } else { |
803 | xorq(NUM_BLKS, NUM_BLKS); |
804 | addq(NUM_BLKS, 64); |
805 | }//if (!multi_block) |
806 | |
807 | lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block |
  movq(Address(rsp, _INP_END), NUM_BLKS);
809 | |
810 | cmpptr(INP, NUM_BLKS); //cmp INP, NUM_BLKS |
811 | jcc(Assembler::equal, only_one_block); //je only_one_block |
812 | |
813 | // load initial digest |
814 | movl(a, Address(CTX, 4*0)); |
815 | movl(b, Address(CTX, 4*1)); |
816 | movl(c, Address(CTX, 4*2)); |
817 | movl(d, Address(CTX, 4*3)); |
818 | movl(e, Address(CTX, 4*4)); |
819 | movl(f, Address(CTX, 4*5)); |
820 | // load g - r10 after it is used as scratch |
821 | movl(h, Address(CTX, 4*7)); |
822 | |
823 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; |
824 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] |
825 | vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] |
826 | vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] |
827 | |
828 | movl(g, Address(CTX, 4*6)); |
829 | |
830 | movq(Address(rsp, _CTX), CTX); // store |
831 | |
832 | bind(loop0); |
833 | lea(TBL, ExternalAddress(K256_W)); |
834 | |
835 | // assume buffers not aligned |
836 | |
837 | // Load first 16 dwords from two blocks |
838 | vmovdqu(xmm0, Address(INP, 0*32)); |
839 | vmovdqu(xmm1, Address(INP, 1*32)); |
840 | vmovdqu(xmm2, Address(INP, 2*32)); |
841 | vmovdqu(xmm3, Address(INP, 3*32)); |
842 | |
843 | // byte swap data |
844 | vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit); |
845 | vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit); |
846 | vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit); |
847 | vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit); |
848 | |
849 | // transpose data into high/low halves |
850 | vperm2i128(xmm4, xmm0, xmm2, 0x20); |
851 | vperm2i128(xmm5, xmm0, xmm2, 0x31); |
852 | vperm2i128(xmm6, xmm1, xmm3, 0x20); |
853 | vperm2i128(xmm7, xmm1, xmm3, 0x31); |
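  // After the transpose, ymm4..ymm7 each hold four message dwords of the first
  // block in their low 128-bit lane and the corresponding dwords of the second
  // block in the high lane, so one pass of the schedule below covers both blocks;
  // the second block's rounds are then replayed in loop3 from the W+K values
  // saved in the _XFER area.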
854 | |
855 | bind(last_block_enter); |
856 | addq(INP, 64); |
857 | movq(Address(rsp, _INP), INP); |
858 | |
  //;; schedule 48 input dwords, in 3 iterations of 16 rounds each
860 | xorq(SRND, SRND); |
861 | |
862 | align(16); |
863 | bind(loop1); |
864 | vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); |
865 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); |
866 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0); |
867 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1); |
868 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2); |
869 | sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3); |
870 | |
871 | vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); |
872 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); |
873 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0); |
874 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1); |
875 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2); |
876 | sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3); |
877 | |
878 | vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit); |
879 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9); |
880 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0); |
881 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1); |
882 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2); |
883 | sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3); |
884 | |
885 | vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit); |
886 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9); |
887 | |
888 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0); |
889 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1); |
890 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2); |
891 | sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3); |
892 | |
893 | addq(SRND, 4*32); |
894 | cmpq(SRND, 3 * 4*32); |
895 | jcc(Assembler::below, loop1); |
896 | |
897 | bind(loop2); |
898 | // Do last 16 rounds with no scheduling |
899 | vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); |
900 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); |
901 | sha256_AVX2_four_rounds_compute_first(0); |
902 | |
903 | vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); |
904 | vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); |
905 | sha256_AVX2_four_rounds_compute_last(0 + 8); |
906 | |
907 | addq(SRND, 2*32); |
908 | |
909 | vmovdqu(xmm4, xmm6); |
910 | vmovdqu(xmm5, xmm7); |
911 | |
912 | cmpq(SRND, 4 * 4*32); |
913 | jcc(Assembler::below, loop2); |
914 | |
915 | movq(CTX, Address(rsp, _CTX)); |
916 | movq(INP, Address(rsp, _INP)); |
917 | |
918 | addm(4*0, CTX, a); |
919 | addm(4*1, CTX, b); |
920 | addm(4*2, CTX, c); |
921 | addm(4*3, CTX, d); |
922 | addm(4*4, CTX, e); |
923 | addm(4*5, CTX, f); |
924 | addm(4*6, CTX, g); |
925 | addm(4*7, CTX, h); |
926 | |
927 | cmpq(INP, Address(rsp, _INP_END)); |
928 | jcc(Assembler::above, done_hash); |
929 | |
930 | //Do second block using previously scheduled results |
931 | xorq(SRND, SRND); |
932 | align(16); |
933 | bind(loop3); |
934 | sha256_AVX2_four_rounds_compute_first(4); |
935 | sha256_AVX2_four_rounds_compute_last(4+8); |
936 | |
937 | addq(SRND, 2*32); |
938 | cmpq(SRND, 4 * 4*32); |
939 | jcc(Assembler::below, loop3); |
940 | |
941 | movq(CTX, Address(rsp, _CTX)); |
942 | movq(INP, Address(rsp, _INP)); |
943 | addq(INP, 64); |
944 | |
945 | addm(4*0, CTX, a); |
946 | addm(4*1, CTX, b); |
947 | addm(4*2, CTX, c); |
948 | addm(4*3, CTX, d); |
949 | addm(4*4, CTX, e); |
950 | addm(4*5, CTX, f); |
951 | addm(4*6, CTX, g); |
952 | addm(4*7, CTX, h); |
953 | |
954 | cmpq(INP, Address(rsp, _INP_END)); |
955 | jcc(Assembler::below, loop0); |
956 | jccb(Assembler::above, done_hash); |
957 | |
958 | bind(do_last_block); |
959 | lea(TBL, ExternalAddress(K256_W)); |
960 | |
961 | movdqu(xmm4, Address(INP, 0*16)); |
962 | movdqu(xmm5, Address(INP, 1*16)); |
963 | movdqu(xmm6, Address(INP, 2*16)); |
964 | movdqu(xmm7, Address(INP, 3*16)); |
965 | |
966 | vpshufb(xmm4, xmm4, xmm13, AVX_128bit); |
967 | vpshufb(xmm5, xmm5, xmm13, AVX_128bit); |
968 | vpshufb(xmm6, xmm6, xmm13, AVX_128bit); |
969 | vpshufb(xmm7, xmm7, xmm13, AVX_128bit); |
970 | |
971 | jmp(last_block_enter); |
972 | |
973 | bind(only_one_block); |
974 | |
  // load initial digest ;; for the very first block the state array holds the SHA-256 initial hash values listed below
976 | movl(a, Address(CTX, 4*0)); // 0x6a09e667 |
977 | movl(b, Address(CTX, 4*1)); // 0xbb67ae85 |
978 | movl(c, Address(CTX, 4*2)); // 0x3c6ef372 |
979 | movl(d, Address(CTX, 4*3)); // 0xa54ff53a |
980 | movl(e, Address(CTX, 4*4)); // 0x510e527f |
981 | movl(f, Address(CTX, 4*5)); // 0x9b05688c |
982 | // load g - r10 after use as scratch |
983 | movl(h, Address(CTX, 4*7)); // 0x5be0cd19 |
984 | |
985 | |
986 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; |
987 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] |
988 | vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] |
989 | vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] |
990 | |
991 | movl(g, Address(CTX, 4*6)); // 0x1f83d9ab |
992 | |
993 | movq(Address(rsp, _CTX), CTX); |
994 | jmpb(do_last_block); |
995 | |
996 | bind(done_hash); |
997 | |
998 | movq(rsp, Address(rsp, _RSP)); |
999 | |
1000 | pop(r15); |
1001 | pop(r14); |
1002 | pop(r13); |
1003 | pop(r12); |
1004 | pop(rbp); |
1005 | #ifdef _WIN64 |
1006 | pop(rdi); |
1007 | pop(rsi); |
1008 | #endif |
1009 | pop(rbx); |
1010 | |
1011 | #ifdef _WIN64 |
1012 | pop(r9); |
1013 | pop(r8); |
1014 | #else |
1015 | pop(rdx); |
1016 | pop(rcx); |
1017 | #endif |
1018 | |
1019 | if (multi_block) { |
1020 | #ifdef _WIN64 |
1021 | const Register& limit_end = r9; |
1022 | const Register& ofs_end = r8; |
1023 | #else |
1024 | const Register& limit_end = rcx; |
1025 | const Register& ofs_end = rdx; |
1026 | #endif |
1027 | movq(rax, ofs_end); |
1028 | |
1029 | bind(compute_size1); |
1030 | cmpptr(rax, limit_end); // assume the original ofs <= limit |
1031 | jccb(Assembler::aboveEqual, compute_size_end1); |
1032 | addq(rax, 64); |
1033 | jmpb(compute_size1); |
1034 | |
1035 | bind(compute_size_end1); |
1036 | } |
1037 | } |
1038 | |
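// The SHA-512 helpers below mirror the SHA-256 round logic with 64-bit arithmetic
// and the FIPS 180-4 SHA-512 rotation amounts:
//   S1 = (e ror 14) ^ (e ror 18) ^ (e ror 41)
//   S0 = (a ror 28) ^ (a ror 34) ^ (a ror 39)
// CH, MAJ and the t1/h/d updates are formed exactly as in the SHA-256 case.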
1039 | void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, |
1040 | Register d, Register e, Register f, Register g, Register h, |
1041 | int iteration) |
1042 | { |
1043 | |
1044 | const Register& y0 = r13; |
1045 | const Register& y1 = r14; |
1046 | const Register& y2 = r15; |
1047 | #ifdef _WIN64 |
1048 | const Register& y3 = rcx; |
1049 | #else |
1050 | const Register& y3 = rdi; |
1051 | #endif |
1052 | const Register& T1 = r12; |
1053 | |
1054 | if (iteration % 4 > 0) { |
1055 | addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; |
1056 | } |
1057 | movq(y2, f); //y2 = f; CH |
1058 | rorxq(y0, e, 41); //y0 = e >> 41; S1A |
1059 | rorxq(y1, e, 18); //y1 = e >> 18; S1B |
1060 | xorq(y2, g); //y2 = f^g; CH |
1061 | |
1062 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1 |
1063 | rorxq(y1, e, 14); //y1 = (e >> 14); S1 |
1064 | andq(y2, e); //y2 = (f^g)&e; CH |
1065 | |
1066 | if (iteration % 4 > 0 ) { |
1067 | addq(old_h, y3); //h = t1 + S0 + MAJ |
1068 | } |
1069 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1 |
1070 | rorxq(T1, a, 34); //T1 = a >> 34; S0B |
1071 | xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH |
1072 | rorxq(y1, a, 39); //y1 = a >> 39; S0A |
1073 | movq(y3, a); //y3 = a; MAJA |
1074 | |
1075 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0 |
1076 | rorxq(T1, a, 28); //T1 = (a >> 28); S0 |
1077 | addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; -- |
1078 | orq(y3, c); //y3 = a | c; MAJA |
1079 | |
1080 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0 |
1081 | movq(T1, a); //T1 = a; MAJB |
1082 | andq(y3, b); //y3 = (a | c)&b; MAJA |
1083 | andq(T1, c); //T1 = a&c; MAJB |
1084 | addq(y2, y0); //y2 = S1 + CH; -- |
1085 | |
1086 | addq(d, h); //d = k + w + h + d; -- |
1087 | orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ |
1088 | addq(h, y1); //h = k + w + h + S0; -- |
1089 | |
1090 | addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; -- |
1091 | |
1092 | if (iteration % 4 == 3) { |
1093 | addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; -- |
1094 | addq(h, y3); //h = t1 + S0 + MAJ; -- |
1095 | } |
1096 | } |
1097 | |
1098 | void MacroAssembler::sha512_AVX2_one_round_and_schedule( |
1099 | XMMRegister xmm4, // ymm4 |
1100 | XMMRegister xmm5, // ymm5 |
1101 | XMMRegister xmm6, // ymm6 |
1102 | XMMRegister xmm7, // ymm7 |
1103 | Register a, //rax |
1104 | Register b, //rbx |
1105 | Register c, //rdi |
1106 | Register d, //rsi |
1107 | Register e, //r8 |
1108 | Register f, //r9 |
1109 | Register g, //r10 |
1110 | Register h, //r11 |
1111 | int iteration) |
1112 | { |
1113 | |
1114 | const Register& y0 = r13; |
1115 | const Register& y1 = r14; |
1116 | const Register& y2 = r15; |
1117 | #ifdef _WIN64 |
1118 | const Register& y3 = rcx; |
1119 | #else |
1120 | const Register& y3 = rdi; |
1121 | #endif |
1122 | const Register& T1 = r12; |
1123 | |
1124 | if (iteration % 4 == 0) { |
1125 | // Extract w[t - 7] |
1126 | // xmm0 = W[-7] |
1127 | vperm2f128(xmm0, xmm7, xmm6, 3); |
1128 | vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit); |
1129 | |
1130 | // Calculate w[t - 16] + w[t - 7] |
1131 | vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16] |
1132 | // Extract w[t - 15] |
1133 | //xmm1 = W[-15] |
1134 | vperm2f128(xmm1, xmm5, xmm4, 3); |
1135 | vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit); |
1136 | |
1137 | // Calculate sigma0 |
1138 | // Calculate w[t - 15] ror 1 |
1139 | vpsrlq(xmm2, xmm1, 1, AVX_256bit); |
1140 | vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit); |
1141 | vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1 |
1142 | // Calculate w[t - 15] shr 7 |
1143 | vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7 |
1144 | |
1145 | } else if (iteration % 4 == 1) { |
1146 | //Calculate w[t - 15] ror 8 |
1147 | vpsrlq(xmm2, xmm1, 8, AVX_256bit); |
1148 | vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit); |
1149 | vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8 |
1150 | |
1151 | //XOR the three components |
1152 | vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7 |
1153 | vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0 |
1154 | |
1155 | //Add three components, w[t - 16], w[t - 7] and sigma0 |
1156 | vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0 |
1157 | |
1158 | // Move to appropriate lanes for calculating w[16] and w[17] |
1159 | vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA } |
1160 | |
1161 | //Move to appropriate lanes for calculating w[18] and w[19] |
1162 | vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 } |
1163 | //Calculate w[16] and w[17] in both 128 bit lanes |
1164 | //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes |
1165 | vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA} |
1166 | vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA} |
1167 | |
1168 | } else if (iteration % 4 == 2) { |
1169 | vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA} |
1170 | vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA} |
1171 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA} |
1172 | vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} |
1173 | vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA} |
1174 | vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA} |
1175 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA} |
1176 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA } |
1177 | |
1178 | //Add sigma1 to the other components to get w[16] and w[17] |
1179 | vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] } |
1180 | |
1181 | //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane |
1182 | vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--} |
1183 | |
1184 | } else if (iteration % 4 == 3){ |
1185 | vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--} |
1186 | vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--} |
1187 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--} |
1188 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} |
1189 | vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--} |
1190 | vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--} |
1191 | vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--} |
1192 | vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- } |
1193 | |
1194 | //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] |
1195 | vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- } |
1196 | |
    //Form w[19], w[18], w[17], w[16]
1198 | vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] } |
1199 | } |
1200 | |
1201 | movq(y3, a); //y3 = a; MAJA |
1202 | rorxq(y0, e, 41); // y0 = e >> 41; S1A |
1203 | rorxq(y1, e, 18); //y1 = e >> 18; S1B |
1204 | addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; -- |
1205 | orq(y3, c); //y3 = a | c; MAJA |
1206 | movq(y2, f); //y2 = f; CH |
1207 | |
1208 | xorq(y2, g); //y2 = f^g; CH |
1209 | |
1210 | rorxq(T1, a, 34); //T1 = a >> 34; S0B |
1211 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1 |
1212 | |
1213 | rorxq(y1, e, 14); //y1 = (e >> 14); S1 |
1214 | |
1215 | andq(y2, e); //y2 = (f^g) & e; CH |
1216 | addq(d, h); //d = k + w + h + d; -- |
1217 | |
1218 | andq(y3, b); //y3 = (a | c)&b; MAJA |
1219 | xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1 |
1220 | rorxq(y1, a, 39); //y1 = a >> 39; S0A |
1221 | |
1222 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0 |
1223 | rorxq(T1, a, 28); //T1 = (a >> 28); S0 |
1224 | xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH |
1225 | |
1226 | xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0 |
1227 | movq(T1, a); //T1 = a; MAJB |
1228 | |
1229 | andq(T1, c); //T1 = a&c; MAJB |
1230 | addq(y2, y0); //y2 = S1 + CH; -- |
1231 | |
1232 | orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ |
1233 | addq(h, y1); //h = k + w + h + S0; -- |
1234 | |
1235 | addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; -- |
1236 | addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; -- |
1237 | addq(h, y3); //h = t1 + S0 + MAJ; -- |
1238 | } |
1239 | |
1240 | void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, |
1241 | XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, |
1242 | Register buf, Register state, Register ofs, Register limit, Register rsp, |
1243 | bool multi_block, XMMRegister shuf_mask) |
1244 | { |
1245 | |
1246 | Label loop0, loop1, loop2, done_hash, |
1247 | compute_block_size, compute_size, |
1248 | compute_block_size_end, compute_size_end; |
1249 | |
1250 | address K512_W = StubRoutines::x86::k512_W_addr(); |
1251 | address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512(); |
1252 | address pshuffle_byte_flip_mask_addr = 0; |
1253 | |
1254 | const XMMRegister& XFER = xmm0; // YTMP0 |
1255 | const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9 |
1256 | const XMMRegister& YMM_MASK_LO = xmm10; // ymm10 |
1257 | #ifdef _WIN64 |
1258 | const Register& INP = rcx; //1st arg |
1259 | const Register& CTX = rdx; //2nd arg |
1260 | const Register& NUM_BLKS = r8; //3rd arg |
1261 | const Register& c = rdi; |
1262 | const Register& d = rsi; |
1263 | const Register& e = r8; |
1264 | const Register& y3 = rcx; |
1265 | const Register& offset = r8; |
1266 | const Register& input_limit = r9; |
1267 | #else |
1268 | const Register& INP = rdi; //1st arg |
1269 | const Register& CTX = rsi; //2nd arg |
1270 | const Register& NUM_BLKS = rdx; //3rd arg |
1271 | const Register& c = rcx; |
1272 | const Register& d = r8; |
1273 | const Register& e = rdx; |
1274 | const Register& y3 = rdi; |
1275 | const Register& offset = rdx; |
1276 | const Register& input_limit = rcx; |
1277 | #endif |
1278 | |
1279 | const Register& TBL = rbp; |
1280 | |
1281 | const Register& a = rax; |
1282 | const Register& b = rbx; |
1283 | |
1284 | const Register& f = r9; |
1285 | const Register& g = r10; |
1286 | const Register& h = r11; |
1287 | |
1288 | //Local variables as defined in assembly file. |
1289 | enum |
1290 | { |
1291 | _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8 |
1292 | _SRND_SIZE = 8, // resq 1 |
1293 | _INP_SIZE = 8, |
1294 | _INP_END_SIZE = 8, |
1295 | _RSP_SAVE_SIZE = 8, // defined as resq 1 |
1296 | |
1297 | #ifdef _WIN64 |
1298 | _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8 |
1299 | #else |
1300 | _GPR_SAVE_SIZE = 6 * 8 // resq 6 |
1301 | #endif |
1302 | }; |
1303 | |
1304 | enum |
1305 | { |
1306 | _XFER = 0, |
1307 | _SRND = _XFER + _XFER_SIZE, // 32 |
1308 | _INP = _SRND + _SRND_SIZE, // 40 |
1309 | _INP_END = _INP + _INP_SIZE, // 48 |
1310 | _RSP = _INP_END + _INP_END_SIZE, // 56 |
1311 | _GPR = _RSP + _RSP_SAVE_SIZE, // 64 |
1312 | _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux. |
1313 | }; |
1314 | |
  //Save offset and limit; they are needed for the block-size calculation in the multi-block SHA-512 case.
1316 | #ifdef _WIN64 |
1317 | push(r8); // win64: this is ofs |
1318 | push(r9); // win64: this is limit, we need them again at the very end. |
1319 | #else |
  push(rdx); // linux: this is ofs, needed at the end for the multi-block calculation
  push(rcx); // linux: this is the limit
1322 | #endif |
1323 | |
1324 | //Allocate Stack Space |
1325 | movq(rax, rsp); |
1326 | subq(rsp, _STACK_SIZE); |
1327 | andq(rsp, -32); |
1328 | movq(Address(rsp, _RSP), rax); |
1329 | |
1330 | //Save GPRs |
1331 | movq(Address(rsp, _GPR), rbp); |
1332 | movq(Address(rsp, (_GPR + 8)), rbx); |
1333 | movq(Address(rsp, (_GPR + 16)), r12); |
1334 | movq(Address(rsp, (_GPR + 24)), r13); |
1335 | movq(Address(rsp, (_GPR + 32)), r14); |
1336 | movq(Address(rsp, (_GPR + 40)), r15); |
1337 | |
1338 | #ifdef _WIN64 |
1339 | movq(Address(rsp, (_GPR + 48)), rsi); |
1340 | movq(Address(rsp, (_GPR + 56)), rdi); |
1341 | #endif |
1342 | |
1343 | vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit); |
1344 | vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit); |
1345 | |
1346 | if (multi_block) { |
1347 | xorq(rax, rax); |
1348 | bind(compute_block_size); |
1349 | cmpptr(offset, input_limit); // Assuming that offset is less than limit. |
1350 | jccb(Assembler::aboveEqual, compute_block_size_end); |
1351 | addq(offset, 128); |
1352 | addq(rax, 128); |
1353 | jmpb(compute_block_size); |
1354 | |
1355 | bind(compute_block_size_end); |
1356 | movq(NUM_BLKS, rax); |
1357 | |
1358 | cmpq(NUM_BLKS, 0); |
1359 | jcc(Assembler::equal, done_hash); |
1360 | } else { |
1361 | xorq(NUM_BLKS, NUM_BLKS); //If single block. |
1362 | addq(NUM_BLKS, 128); |
1363 | } |
1364 | |
1365 | addq(NUM_BLKS, INP); //pointer to end of data |
1366 | movq(Address(rsp, _INP_END), NUM_BLKS); |
1367 | |
1368 | //load initial digest |
1369 | movq(a, Address(CTX, 8 * 0)); |
1370 | movq(b, Address(CTX, 8 * 1)); |
1371 | movq(c, Address(CTX, 8 * 2)); |
1372 | movq(d, Address(CTX, 8 * 3)); |
1373 | movq(e, Address(CTX, 8 * 4)); |
1374 | movq(f, Address(CTX, 8 * 5)); |
1375 | // load g - r10 after it is used as scratch |
1376 | movq(h, Address(CTX, 8 * 7)); |
1377 | |
1378 | pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512; |
1379 | vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip |
1380 | vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); |
1381 | |
1382 | movq(g, Address(CTX, 8 * 6)); |
1383 | |
1384 | bind(loop0); |
1385 | lea(TBL, ExternalAddress(K512_W)); |
1386 | |
  //byte swap first 16 qwords
1388 | vmovdqu(xmm4, Address(INP, 32 * 0)); |
1389 | vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit); |
1390 | vmovdqu(xmm5, Address(INP, 32 * 1)); |
1391 | vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit); |
1392 | vmovdqu(xmm6, Address(INP, 32 * 2)); |
1393 | vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit); |
1394 | vmovdqu(xmm7, Address(INP, 32 * 3)); |
1395 | vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit); |
1396 | |
1397 | movq(Address(rsp, _INP), INP); |
1398 | |
1399 | movslq(Address(rsp, _SRND), 4); |
1400 | align(16); |
1401 | |
  //Schedule 64 input qwords by calling sha512_AVX2_one_round_and_schedule
1403 | bind(loop1); |
1404 | vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit); |
1405 | vmovdqu(Address(rsp, _XFER), xmm0); |
1406 | //four rounds and schedule |
1407 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0); |
1408 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1); |
1409 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2); |
1410 | sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3); |
1411 | |
1412 | vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit); |
1413 | vmovdqu(Address(rsp, _XFER), xmm0); |
1414 | //four rounds and schedule |
1415 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0); |
1416 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1); |
1417 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2); |
1418 | sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3); |
1419 | |
1420 | vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit); |
1421 | vmovdqu(Address(rsp, _XFER), xmm0); |
1422 | //four rounds and schedule |
1423 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0); |
1424 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1); |
1425 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2); |
1426 | sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3); |
1427 | |
1428 | vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit); |
1429 | vmovdqu(Address(rsp, _XFER), xmm0); |
1430 | addq(TBL, 4 * 32); |
1431 | //four rounds and schedule |
1432 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0); |
1433 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1); |
1434 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2); |
1435 | sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3); |
1436 | |
1437 | subq(Address(rsp, _SRND), 1); |
1438 | jcc(Assembler::notEqual, loop1); |
1439 | |
1440 | movslq(Address(rsp, _SRND), 2); |
1441 | |
1442 | bind(loop2); |
1443 | vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit); |
1444 | vmovdqu(Address(rsp, _XFER), xmm0); |
1445 | //four rounds and compute. |
1446 | sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0); |
1447 | sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1); |
1448 | sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2); |
1449 | sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3); |
1450 | |
1451 | vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit); |
1452 | vmovdqu(Address(rsp, _XFER), xmm0); |
1453 | addq(TBL, 2 * 32); |
1454 | // four rounds and compute. |
1455 | sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0); |
1456 | sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1); |
1457 | sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2); |
1458 | sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3); |
1459 | |
1460 | vmovdqu(xmm4, xmm6); |
1461 | vmovdqu(xmm5, xmm7); |
1462 | |
1463 | subq(Address(rsp, _SRND), 1); |
1464 | jcc(Assembler::notEqual, loop2); |
1465 | |
1466 | addmq(8 * 0, CTX, a); |
1467 | addmq(8 * 1, CTX, b); |
1468 | addmq(8 * 2, CTX, c); |
1469 | addmq(8 * 3, CTX, d); |
1470 | addmq(8 * 4, CTX, e); |
1471 | addmq(8 * 5, CTX, f); |
1472 | addmq(8 * 6, CTX, g); |
1473 | addmq(8 * 7, CTX, h); |
1474 | |
1475 | movq(INP, Address(rsp, _INP)); |
1476 | addq(INP, 128); |
1477 | cmpq(INP, Address(rsp, _INP_END)); |
1478 | jcc(Assembler::notEqual, loop0); |
1479 | |
1480 | bind(done_hash); |
1481 | |
1482 | //Restore GPRs |
1483 | movq(rbp, Address(rsp, (_GPR + 0))); |
1484 | movq(rbx, Address(rsp, (_GPR + 8))); |
1485 | movq(r12, Address(rsp, (_GPR + 16))); |
1486 | movq(r13, Address(rsp, (_GPR + 24))); |
1487 | movq(r14, Address(rsp, (_GPR + 32))); |
1488 | movq(r15, Address(rsp, (_GPR + 40))); |
1489 | |
1490 | #ifdef _WIN64 |
1491 | movq(rsi, Address(rsp, (_GPR + 48))); |
1492 | movq(rdi, Address(rsp, (_GPR + 56))); |
1493 | #endif |
1494 | |
1495 | //Restore Stack Pointer |
1496 | movq(rsp, Address(rsp, _RSP)); |
1497 | |
1498 | #ifdef _WIN64 |
1499 | pop(r9); |
1500 | pop(r8); |
1501 | #else |
1502 | pop(rcx); |
1503 | pop(rdx); |
1504 | #endif |
1505 | |
1506 | if (multi_block) { |
1507 | #ifdef _WIN64 |
1508 | const Register& limit_end = r9; |
1509 | const Register& ofs_end = r8; |
1510 | #else |
1511 | const Register& limit_end = rcx; |
1512 | const Register& ofs_end = rdx; |
1513 | #endif |
1514 | movq(rax, ofs_end); |
1515 | bind(compute_size); |
1516 | cmpptr(rax, limit_end); |
1517 | jccb(Assembler::aboveEqual, compute_size_end); |
1518 | addq(rax, 128); |
1519 | jmpb(compute_size); |
1520 | bind(compute_size_end); |
1521 | } |
1522 | } |
1523 | |
1524 | #endif //#ifdef _LP64 |
1525 | |
1526 | |