# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern	OPENSSL_ia32cap_P
.hidden	OPENSSL_ia32cap_P

# bn_mul_mont(rp, ap, bp, np, n0p, num) -- Montgomery multiplication:
# rp[] = ap[] * bp[] * R^-1 mod np[], R = 2^(64*num).
#
# SysV AMD64 arguments, as used below:
#   %rdi  rp    result, num 64-bit limbs
#   %rsi  ap    first operand
#   %rdx  bp    second operand (kept in %r12; compared with ap to spot squaring)
#   %rcx  np    modulus
#   %r8   n0p   pointer to the Montgomery constant n0 (conventionally
#               -np^-1 mod 2^64; only *n0p is read)
#   %r9d  num   limb count
#
# Entry dispatch: num divisible by 4 and >= 8 goes to the 4-way unrolled
# .Lmul4x_enter (with the CPU capability word preloaded in %r11d), and when
# additionally ap == bp and 8 divides num, to the dedicated squaring path
# .Lsqr8x_enter.  Everything else takes the generic loop below.
# Returns 1 in %rax.
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# keep caller %rsp for frame bookkeeping
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter		# num not a multiple of 4: generic path
	cmpl	$8,%r9d
	jb	.Lmul_enter		# num < 8: generic path
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d		# capability dword, consumed in .Lmul4x_enter
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter		# ap != bp: 4-way multiply
	testl	$7,%r9d
	jz	.Lsqr8x_enter		# ap == bp and 8 | num: squaring path
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Reserve num+2 qwords of scratch (tp[]) below %rsp, 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10	# %r10 = prospective new %rsp
	negq	%r9			# restore num
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe the current page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

# Stack probing: touch every 4K page while lowering %rsp so a guard page
# can never be jumped over.
.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# stash caller %rsp above tp[num]
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		# bp (mulq will clobber %rdx)
	movq	(%r8),%r8		# n0 = *n0p
	movq	(%r12),%rbx		# m = bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0 (outer index)
	xorq	%r15,%r15		# j = 0 (inner index)

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# np[0]

	imulq	%r10,%rbp		# m1 = (t0 * n0) mod 2^64
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m1
	addq	%rax,%r10		# low limb annihilated by construction of m1
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.L1st_enter

# First pass: tp[] = (ap[] * bp[0] + m1 * np[]) / 2^64, two interleaved
# carry chains (%r10/%r11 for the ap chain, %r13 for the np chain).
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax	# ap[j]
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax	# np[j]
	adcq	$0,%rdx
	leaq	1(%r15),%r15		# j++
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m1
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax		# ap[0], ready for the next outer round
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[num-1]
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num]
	movq	%rdx,(%rsp,%r9,8)	# tp[num+1] = carry out

	leaq	1(%r14),%r14		# i = 1
	jmp	.Louter
# Outer loop, i = 1..num-1: tp[] = (tp[] + ap[]*bp[i] + m1*np[]) / 2^64.
.align	16
.Louter:
	movq	(%r12,%r14,8),%rbx	# m = bp[i]
	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx			# ap[0] * bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax		# np[0]
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m1 = (tp[0] * n0) mod 2^64
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m1
	addq	%rax,%r10		# low limb annihilated
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax	# ap[j]
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# tp[j]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j] * bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax	# np[j]
	adcq	$0,%rdx
	addq	%r11,%r10		# tp[j] += ap[j]*bp[i]
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15		# j++

	mulq	%rbp			# np[j] * m1
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax		# ap[0]
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# tp[num] (previous carry word)
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# fold in previous top carry
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num]
	movq	%rdx,(%rsp,%r9,8)	# new top carry

	leaq	1(%r14),%r14		# i++
	cmpq	%r9,%r14
	jb	.Louter

	# Final reduction: compute rp[] = tp[] - np[]; the result is kept
	# only when the subtraction does not borrow.
	xorq	%r14,%r14		# index = 0; also clears CF for the sbb chain
	movq	(%rsp),%rax		# tp[0]
	movq	%r9,%r15		# counter = num

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[i] = tp[i] - np[i] - borrow
	movq	8(%rsp,%r14,8),%rax	# tp[i+1]
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# include top carry word: %rax = 0 or -1
	movq	$-1,%rbx
	xorq	%rax,%rbx		# %rbx = ~%rax: mask for the difference
	xorq	%r14,%r14
	movq	%r9,%r15		# counter = num

# Constant-time select between the difference (in rp[]) and the unreduced
# tp[], scrubbing the scratch area as we go.
.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx		# keep difference when no borrow
	andq	%rax,%rdx		# keep tp when the subtraction borrowed
	movq	%r9,(%rsp,%r14,8)	# wipe tp[i]
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# reload saved caller %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
# bn_mul4x_mont -- 4-way unrolled Montgomery multiplication.  Same contract
# and register arguments as bn_mul_mont; normally entered at .Lmul4x_enter
# from bn_mul_mont's dispatch with the OPENSSL_ia32cap_P capability dword
# already in %r11d.  When both feature bits in 0x80100 are set (presumably
# the BMI2 and ADX bits of the capability word -- confirm against the
# OPENSSL_ia32cap_P layout) control transfers to the MULX-based
# .Lmulx4x_enter instead.  Requires 4 | num.  Returns 1 in %rax.
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d		# isolate the two feature bits
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter		# both present: MULX/ADX implementation
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Reserve num+4 qwords of scratch (tp[]), 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

# Stack probing, as in bn_mul_mont.
.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# stash caller %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# stash rp; %rdi is reused as a limb register
	movq	%rdx,%r12		# bp
	movq	(%r8),%r8		# n0
	movq	(%r12),%rbx		# m = bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0
	xorq	%r15,%r15		# j = 0

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# np[0]

	imulq	%r10,%rbp		# m1 = (t0 * n0) mod 2^64
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m1, annihilates low limb
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * bp[0]
	addq	%rax,%r11
	movq	8(%rcx),%rax		# np[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m1
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# first result limb of this pass
	movq	%rdx,%r13
	jmp	.L1st4x
# First pass, 4 limbs per iteration: tp[] = (ap[]*bp[0] + m1*np[]) / 2^64.
.align	16
.L1st4x:
	mulq	%rbx			# ap[j-2] * bp[0]
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j-2] * m1
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx			# ap[j-1] * bp[0]
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[j-1] * m1
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j] * m1
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx			# ap[j+1] * bp[0]
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp			# np[j+1] * m1
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	# Peeled tail of the first pass (last two limbs plus carry word).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# ap[0], for the next outer round
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# top carry word

	leaq	1(%r14),%r14		# i = 1
# Outer loop, i = 1..num-1: tp[] = (tp[] + ap[]*bp[i] + m1*np[]) / 2^64.
.align	4
.Louter4x:
	movq	(%r12,%r14,8),%rbx	# m = bp[i]
	xorq	%r15,%r15		# j = 0
	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m1 = (tp[0] * n0) mod 2^64
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m1, annihilates low limb
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * bp[i]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# += tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m1
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x

.align	16
.Linner4x:
	mulq	%rbx			# ap[j-2] * bp[i]
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# += tp[j-2]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j-2] * m1
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx			# ap[j-1] * bp[i]
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11	# += tp[j-1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[j-1] * m1
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx			# ap[j] * bp[i]
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10	# += tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j] * m1
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx			# ap[j+1] * bp[i]
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11	# += tp[j+1]
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp			# np[j+1] * m1
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	# Peeled tail of the inner loop.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# fold in previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# new top carry

	cmpq	%r9,%r14
	jb	.Louter4x

	# Final reduction: rp[] = tp[] - np[], four limbs per iteration.
	movq	16(%rsp,%r9,8),%rdi	# restore rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax		# tp[0]
	movq	8(%rsp),%rdx		# tp[1]
	shrq	$2,%r15			# (num-4)/4 loop iterations
	leaq	(%rsp),%rsi		# %rsi = tp
	xorq	%r14,%r14		# index = 0

	subq	0(%rcx),%rax		# start the borrow chain
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	# top carry word
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# %rax = -1 if the subtraction borrowed
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0		# zero, used to scrub tp[]
.byte	102,72,15,110,224		# movq %rax,%xmm4 (raw bytes from the generator)
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4		# broadcast borrow mask
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		# %xmm5 = ~mask
	shrq	$2,%r15			# num/4 iterations of 4 limbs each
	xorl	%eax,%eax		# byte offset

	jmp	.Lcopy4x
# Constant-time select between the difference (already in rp[]) and the
# unreduced tp[], wiping tp[] with zeros on the way.
.align	16
.Lcopy4x:
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1		# keep tp when the subtraction borrowed
	pand	%xmm5,%xmm2		# keep the difference otherwise
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)	# wipe tp
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi	# reload saved caller %rsp
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
.extern	bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.extern	bn_sqr8x_internal
.hidden	bn_sqr8x_internal

# bn_sqr8x_mont -- Montgomery squaring, reached via .Lsqr8x_enter from
# bn_mul_mont when ap == bp and 8 | num.  Sets up a 2*num-limb frame, then
# delegates the squaring and reduction to bn_sqr8x_internal (or to
# bn_sqrx8x_internal when both feature bits of 0x80100 are present in the
# capability word), and finishes with the conditional subtraction of np[]
# here.  Same register arguments as bn_mul_mont; returns 1 in %rax.
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# %r9 = num in bytes
	shlq	$3+2,%r10		# %r10 = 4*num*8
	negq	%r9			# %r9 = -num*8

	# Pick a frame of 2*num limbs + 64 bytes and, when it is cheap,
	# nudge it so the frame and ap do not collide modulo 4K
	# (a cache-aliasing avoidance trick from the perlasm source).
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0
	subq	%rsi,%r11
	andq	$4095,%r11		# (frame - ap) mod 4K
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	# Frame smaller than the wanted displacement: clamp the adjustment.
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		# 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

# Stack probing, as in bn_mul_mont.
.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		# %r10 = -num*8
	negq	%r9			# %r9 = num*8

	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash caller %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

	# Park np, rp and -num*8 in xmm registers across the internal call
	# (raw .byte encodings of movq GPR->XMM emitted by the generator).
.byte	102,72,15,110,209		# movq %rcx,%xmm2 (np)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1 (rp)
.byte	102,73,15,110,218		# movq %r10,%xmm3 (-num*8)
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movl	8(%rax),%eax
	andl	$0x80100,%eax		# same feature bits as the mul4x dispatch
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# MULX-based squaring

	# NOTE(review): the tail relies on the register state bn_sqrx8x_internal
	# leaves behind (%rcx = negative byte count, %r8/%rbp positioned for the
	# subtraction) -- defined elsewhere; confirm against its source.
	leaq	(%r8,%rcx,1),%rbx	# end of the value to reduce
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (rp)
	sarq	$3+2,%rcx		# negative count of 32-byte chunks
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	# classic mul-based squaring

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (rp)
	sarq	$3+2,%rcx		# count of 32-byte chunks
	jmp	.Lsqr8x_sub

# Subtract np[] from the upper half of the squaring result, 4 limbs per
# iteration; %rcx climbs to zero via incq.
.align	32
.Lsqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# %rax = -1 when the subtraction borrowed
	leaq	(%rbx,%r9,1),%rbx	# rewind source
	leaq	(%rdi,%r9,1),%rdi	# rewind rp

.byte	102,72,15,110,200		# movq %rax,%xmm1 (borrow mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# reload saved caller %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

# Constant-time select between the difference and the unreduced value,
# wiping the scratch area; the counter advances by 32 bytes until zero.
.align	32
.Lsqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# wipe
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# %xmm0 = complement of the mask
	pand	%xmm1,%xmm2		# keep unreduced value on borrow
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4		# keep difference otherwise
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0		# re-zero for the next wipe
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
# bn_mulx4x_mont -- Montgomery multiplication using MULX/ADCX/ADOX
# (BMI2 + ADX), reached via .Lmulx4x_enter from the bn_mul4x_mont dispatch.
# Same register arguments as bn_mul_mont; requires 4 | num.
# Returns 1 in %rax.
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# num in bytes
	xorq	%r10,%r10
	subq	%r9,%r10		# -num*8
	movq	(%r8),%r8		# n0
	leaq	-72(%rsp,%r10,1),%rbp	# frame: num*8 + 72 bytes, 128-byte aligned
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

# Stack probing, as in bn_mul_mont.
.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# &bp[num], end-of-bp sentinel

	# Frame layout (offsets from %rsp):
	#   0: num*8    8: bp cursor      16: &bp[num]    24: n0
	#  32: rp      40: caller %rsp    48: inner-loop counter    64+: tp[]
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9			# num/4 - 1 inner iterations
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	leaq	8(%rdx),%rdi		# bp cursor = &bp[1]
	movq	(%rdx),%rdx		# %rdx = bp[0], the implicit mulx operand
	leaq	64+32(%rsp),%rbx	# tp write pointer
	movq	%rdx,%r9		# keep bp[0] while %rdx holds m1

	# First 4 limbs of ap[] * bp[0].
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)		# save bp cursor
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi		# t0, the limb to be annihilated
	imulq	24(%rsp),%r8		# m1 = (t0 * n0) mod 2^64
	xorq	%rbp,%rbp		# %rbp = 0; also clears CF and OF

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# switch mulx operand to m1
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14		# collect top carry

	# Add np[0..3] * m1; ADCX/ADOX keep two independent carry chains.
	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi		# annihilates t0, leaves only carry
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulxq 16(%rcx),%rax,%r12
	movq	48(%rsp),%rdi		# inner-loop counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[0]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15		# %rbp == 0
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

# First pass, 4 limbs per iteration: tp[] = (ap[]*bp[0] + m1*np[]) / 2^64.
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15		# absorb pending carry
	mulxq	0(%rsi),%r10,%rax	# next 4 limbs of ap[]*bp[0]
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# address-size prefixes: padding from the generator
	movq	%r8,%rdx		# m1
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	# np[] * m1 column, merged with the ADOX chain.
	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# bp[0]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		# num*8
	movq	8(%rsp),%rdi		# bp cursor
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		# top word as 0 / -1
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

# Outer loop: fold each subsequent bp[i] into tp[].
.align	32
.Lmulx4x_outer:
	movq	(%rdi),%rdx		# bp[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		# rewind ap
	movq	%r15,(%rbx)		# store previous top word
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		# %rbp = 0; clears CF and OF
	movq	%rdx,%r9		# keep bp[i]
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# add previous tp[] via the ADOX chain
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)		# save bp cursor
	movq	%r8,%r15		# t0
	imulq	24(%rsp),%r8		# m1
	xorl	%ebp,%ebp		# reset CF/OF

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# m1
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# annihilates t0
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# bp[i]
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		# inner-loop counter
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15		# absorb pending carry
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10		# += tp[]
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		# m1
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# bp[i]
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		# num*8
	movq	8(%rsp),%rdi		# bp cursor
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		# %rbp stays 0; regenerates CF from top tp word
	adcq	%r15,%r14
	sbbq	%r15,%r15		# top word as 0 / -1
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		# reached &bp[num]?
	jne	.Lmulx4x_outer

	# Final reduction: rp[] = tp[] - np[], 4 limbs per iteration; the
	# negated top word seeds the borrow chain via the CF set by negq.
	leaq	64(%rsp),%rbx		# tp base
	subq	%rax,%rcx		# rewind np
	negq	%r15			# CF = top word (0 or 1)
	movq	%rax,%rdx		# num*8, reused as byte counter below
	shrq	$3+2,%rax		# num/4 chunks
	movq	32(%rsp),%rdi		# rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			# %r15 = -1 when the subtraction borrowed
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi		# rewind rp

.byte	102,73,15,110,207		# movq %r15,%xmm1 (borrow mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# reload saved caller %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

# Constant-time select between the difference (in rp[]) and the unreduced
# tp[], wiping tp[]; %rdx counts down num*8 bytes.
.align	32
.Lmulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# wipe tp
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0		# complement of the mask
	pand	%xmm1,%xmm2		# keep unreduced value on borrow
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4		# keep difference otherwise
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0		# re-zero for the next wipe
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		# %rdx == 0 here: scrub one more tp word

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
# "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif