# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

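# bn_mul_mont_gather5(rp, ap, table, np, n0, num, power): Montgomery
# multiplication with the multiplier gathered in constant time from entry
# |power| of a pre-scattered table (annotation; argument order assumed from
# the generating Perl script). When num is divisible by 8, control
# transfers to the 4x or MULX/ADX paths below.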
.globl bn_mul_mont_gather5
.hidden bn_mul_mont_gather5
.type bn_mul_mont_gather5,@function
.align 64
bn_mul_mont_gather5:
.cfi_startproc
 movl %r9d,%r9d
 movq %rsp,%rax
.cfi_def_cfa_register %rax
 testl $7,%r9d
 jnz .Lmul_enter
 leaq OPENSSL_ia32cap_P(%rip),%r11
 movl 8(%r11),%r11d
 jmp .Lmul4x_enter

.align 16
.Lmul_enter:
 movd 8(%rsp),%xmm5
 pushq %rbx
.cfi_offset %rbx,-16
 pushq %rbp
.cfi_offset %rbp,-24
 pushq %r12
.cfi_offset %r12,-32
 pushq %r13
.cfi_offset %r13,-40
 pushq %r14
.cfi_offset %r14,-48
 pushq %r15
.cfi_offset %r15,-56

 negq %r9
 movq %rsp,%r11
 leaq -280(%rsp,%r9,8),%r10
 negq %r9
 andq $-1024,%r10









 subq %r10,%r11
 andq $-4096,%r11
 leaq (%r10,%r11,1),%rsp
 movq (%rsp),%r11
 cmpq %r10,%rsp
 ja .Lmul_page_walk
 jmp .Lmul_page_walk_done

.Lmul_page_walk:
 leaq -4096(%rsp),%rsp
 movq (%rsp),%r11
 cmpq %r10,%rsp
 ja .Lmul_page_walk
.Lmul_page_walk_done:

 leaq .Linc(%rip),%r10
 movq %rax,8(%rsp,%r9,8)
.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

 leaq 128(%rdx),%r12
 movdqa 0(%r10),%xmm0
 movdqa 16(%r10),%xmm1
 leaq 24-112(%rsp,%r9,8),%r10
 andq $-16,%r10

 pshufd $0,%xmm5,%xmm5
 movdqa %xmm1,%xmm4
 movdqa %xmm1,%xmm2
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
.byte 0x67
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,112(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,128(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,144(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,160(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,176(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,192(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,208(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,224(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,240(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,256(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,272(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,288(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,304(%r10)

 paddd %xmm2,%xmm3
.byte 0x67
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,320(%r10)

 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,336(%r10)
 pand 64(%r12),%xmm0

 pand 80(%r12),%xmm1
 pand 96(%r12),%xmm2
 movdqa %xmm3,352(%r10)
 pand 112(%r12),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa -128(%r12),%xmm4
 movdqa -112(%r12),%xmm5
 movdqa -96(%r12),%xmm2
 pand 112(%r10),%xmm4
 movdqa -80(%r12),%xmm3
 pand 128(%r10),%xmm5
 por %xmm4,%xmm0
 pand 144(%r10),%xmm2
 por %xmm5,%xmm1
 pand 160(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa -64(%r12),%xmm4
 movdqa -48(%r12),%xmm5
 movdqa -32(%r12),%xmm2
 pand 176(%r10),%xmm4
 movdqa -16(%r12),%xmm3
 pand 192(%r10),%xmm5
 por %xmm4,%xmm0
 pand 208(%r10),%xmm2
 por %xmm5,%xmm1
 pand 224(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa 0(%r12),%xmm4
 movdqa 16(%r12),%xmm5
 movdqa 32(%r12),%xmm2
 pand 240(%r10),%xmm4
 movdqa 48(%r12),%xmm3
 pand 256(%r10),%xmm5
 por %xmm4,%xmm0
 pand 272(%r10),%xmm2
 por %xmm5,%xmm1
 pand 288(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 por %xmm1,%xmm0
 pshufd $0x4e,%xmm0,%xmm1
 por %xmm1,%xmm0
 leaq 256(%r12),%r12
.byte 102,72,15,126,195

 movq (%r8),%r8
 movq (%rsi),%rax

 xorq %r14,%r14
 xorq %r15,%r15

 movq %r8,%rbp
 mulq %rbx
 movq %rax,%r10
 movq (%rcx),%rax

 imulq %r10,%rbp
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r10
 movq 8(%rsi),%rax
 adcq $0,%rdx
 movq %rdx,%r13

 leaq 1(%r15),%r15
 jmp .L1st_enter

.align 16
.L1st:
 addq %rax,%r13
 movq (%rsi,%r15,8),%rax
 adcq $0,%rdx
 addq %r11,%r13
 movq %r10,%r11
 adcq $0,%rdx
 movq %r13,-16(%rsp,%r15,8)
 movq %rdx,%r13

.L1st_enter:
 mulq %rbx
 addq %rax,%r11
 movq (%rcx,%r15,8),%rax
 adcq $0,%rdx
 leaq 1(%r15),%r15
 movq %rdx,%r10

 mulq %rbp
 cmpq %r9,%r15
 jne .L1st


 addq %rax,%r13
 adcq $0,%rdx
 addq %r11,%r13
 adcq $0,%rdx
 movq %r13,-16(%rsp,%r9,8)
 movq %rdx,%r13
 movq %r10,%r11

 xorq %rdx,%rdx
 addq %r11,%r13
 adcq $0,%rdx
 movq %r13,-8(%rsp,%r9,8)
 movq %rdx,(%rsp,%r9,8)

 leaq 1(%r14),%r14
 jmp .Louter
.align 16
.Louter:
 leaq 24+128(%rsp,%r9,8),%rdx
 andq $-16,%rdx
 pxor %xmm4,%xmm4
 pxor %xmm5,%xmm5
 movdqa -128(%r12),%xmm0
 movdqa -112(%r12),%xmm1
 movdqa -96(%r12),%xmm2
 movdqa -80(%r12),%xmm3
 pand -128(%rdx),%xmm0
 pand -112(%rdx),%xmm1
 por %xmm0,%xmm4
 pand -96(%rdx),%xmm2
 por %xmm1,%xmm5
 pand -80(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa -64(%r12),%xmm0
 movdqa -48(%r12),%xmm1
 movdqa -32(%r12),%xmm2
 movdqa -16(%r12),%xmm3
 pand -64(%rdx),%xmm0
 pand -48(%rdx),%xmm1
 por %xmm0,%xmm4
 pand -32(%rdx),%xmm2
 por %xmm1,%xmm5
 pand -16(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 0(%r12),%xmm0
 movdqa 16(%r12),%xmm1
 movdqa 32(%r12),%xmm2
 movdqa 48(%r12),%xmm3
 pand 0(%rdx),%xmm0
 pand 16(%rdx),%xmm1
 por %xmm0,%xmm4
 pand 32(%rdx),%xmm2
 por %xmm1,%xmm5
 pand 48(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 64(%r12),%xmm0
 movdqa 80(%r12),%xmm1
 movdqa 96(%r12),%xmm2
 movdqa 112(%r12),%xmm3
 pand 64(%rdx),%xmm0
 pand 80(%rdx),%xmm1
 por %xmm0,%xmm4
 pand 96(%rdx),%xmm2
 por %xmm1,%xmm5
 pand 112(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 por %xmm5,%xmm4
 pshufd $0x4e,%xmm4,%xmm0
 por %xmm4,%xmm0
 leaq 256(%r12),%r12

 movq (%rsi),%rax
.byte 102,72,15,126,195

 xorq %r15,%r15
 movq %r8,%rbp
 movq (%rsp),%r10

 mulq %rbx
 addq %rax,%r10
 movq (%rcx),%rax
 adcq $0,%rdx

 imulq %r10,%rbp
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r10
 movq 8(%rsi),%rax
 adcq $0,%rdx
 movq 8(%rsp),%r10
 movq %rdx,%r13

 leaq 1(%r15),%r15
 jmp .Linner_enter

.align 16
.Linner:
 addq %rax,%r13
 movq (%rsi,%r15,8),%rax
 adcq $0,%rdx
 addq %r10,%r13
 movq (%rsp,%r15,8),%r10
 adcq $0,%rdx
 movq %r13,-16(%rsp,%r15,8)
 movq %rdx,%r13

.Linner_enter:
 mulq %rbx
 addq %rax,%r11
 movq (%rcx,%r15,8),%rax
 adcq $0,%rdx
 addq %r11,%r10
 movq %rdx,%r11
 adcq $0,%r11
 leaq 1(%r15),%r15

 mulq %rbp
 cmpq %r9,%r15
 jne .Linner

 addq %rax,%r13
 adcq $0,%rdx
 addq %r10,%r13
 movq (%rsp,%r9,8),%r10
 adcq $0,%rdx
 movq %r13,-16(%rsp,%r9,8)
 movq %rdx,%r13

 xorq %rdx,%rdx
 addq %r11,%r13
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %r13,-8(%rsp,%r9,8)
 movq %rdx,(%rsp,%r9,8)

 leaq 1(%r14),%r14
 cmpq %r9,%r14
 jb .Louter

 xorq %r14,%r14
 movq (%rsp),%rax
 leaq (%rsp),%rsi
 movq %r9,%r15
 jmp .Lsub
.align 16
.Lsub: sbbq (%rcx,%r14,8),%rax
 movq %rax,(%rdi,%r14,8)
 movq 8(%rsi,%r14,8),%rax
 leaq 1(%r14),%r14
 decq %r15
 jnz .Lsub

 sbbq $0,%rax
 movq $-1,%rbx
 xorq %rax,%rbx
 xorq %r14,%r14
 movq %r9,%r15

.Lcopy:
 movq (%rdi,%r14,8),%rcx
 movq (%rsp,%r14,8),%rdx
 andq %rbx,%rcx
 andq %rax,%rdx
 movq %r14,(%rsp,%r14,8)
 orq %rcx,%rdx
 movq %rdx,(%rdi,%r14,8)
 leaq 1(%r14),%r14
 subq $1,%r15
 jnz .Lcopy

 movq 8(%rsp,%r9,8),%rsi
.cfi_def_cfa %rsi,8
 movq $1,%rax

 movq -48(%rsi),%r15
.cfi_restore %r15
 movq -40(%rsi),%r14
.cfi_restore %r14
 movq -32(%rsi),%r13
.cfi_restore %r13
 movq -24(%rsi),%r12
.cfi_restore %r12
 movq -16(%rsi),%rbp
.cfi_restore %rbp
 movq -8(%rsi),%rbx
.cfi_restore %rbx
 leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
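# 4-way unrolled path, entered from bn_mul_mont_gather5 when num is a
# multiple of 8; dispatches to the MULX/ADX variant when the capability
# word fetched above has the BMI1, BMI2 and ADX feature bits set.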
.type bn_mul4x_mont_gather5,@function
.align 32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte 0x67
 movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
 andl $0x80108,%r11d
 cmpl $0x80108,%r11d
 je .Lmulx4x_enter
 pushq %rbx
.cfi_offset %rbx,-16
 pushq %rbp
.cfi_offset %rbp,-24
 pushq %r12
.cfi_offset %r12,-32
 pushq %r13
.cfi_offset %r13,-40
 pushq %r14
.cfi_offset %r14,-48
 pushq %r15
.cfi_offset %r15,-56
.Lmul4x_prologue:

.byte 0x67
 shll $3,%r9d
 leaq (%r9,%r9,2),%r10
 negq %r9










 leaq -320(%rsp,%r9,2),%r11
 movq %rsp,%rbp
 subq %rdi,%r11
 andq $4095,%r11
 cmpq %r11,%r10
 jb .Lmul4xsp_alt
 subq %r11,%rbp
 leaq -320(%rbp,%r9,2),%rbp
 jmp .Lmul4xsp_done

.align 32
.Lmul4xsp_alt:
 leaq 4096-320(,%r9,2),%r10
 leaq -320(%rbp,%r9,2),%rbp
 subq %r10,%r11
 movq $0,%r10
 cmovcq %r10,%r11
 subq %r11,%rbp
.Lmul4xsp_done:
 andq $-64,%rbp
 movq %rsp,%r11
 subq %rbp,%r11
 andq $-4096,%r11
 leaq (%r11,%rbp,1),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lmul4x_page_walk
 jmp .Lmul4x_page_walk_done

.Lmul4x_page_walk:
 leaq -4096(%rsp),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:

 negq %r9

 movq %rax,40(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

 call mul4x_internal

 movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
 movq $1,%rax

 movq -48(%rsi),%r15
.cfi_restore %r15
 movq -40(%rsi),%r14
.cfi_restore %r14
 movq -32(%rsi),%r13
.cfi_restore %r13
 movq -24(%rsi),%r12
.cfi_restore %r12
 movq -16(%rsi),%rbp
.cfi_restore %rbp
 movq -8(%rsi),%rbx
.cfi_restore %rbx
 leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

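# Worker shared by bn_mul4x_mont_gather5 and bn_power5: gathers the
# multiplier from the table at %rdx, then runs the 4-way multiply and
# Montgomery reduction.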
.type mul4x_internal,@function
.align 32
mul4x_internal:
.cfi_startproc
 shlq $5,%r9
 movd 8(%rax),%xmm5
 leaq .Linc(%rip),%rax
 leaq 128(%rdx,%r9,1),%r13
 shrq $5,%r9
 movdqa 0(%rax),%xmm0
 movdqa 16(%rax),%xmm1
 leaq 88-112(%rsp,%r9,1),%r10
 leaq 128(%rdx),%r12

 pshufd $0,%xmm5,%xmm5
 movdqa %xmm1,%xmm4
.byte 0x67,0x67
 movdqa %xmm1,%xmm2
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
.byte 0x67
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,112(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,128(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,144(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,160(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,176(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,192(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,208(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,224(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,240(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,256(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,272(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,288(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,304(%r10)

 paddd %xmm2,%xmm3
.byte 0x67
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,320(%r10)

 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,336(%r10)
 pand 64(%r12),%xmm0

 pand 80(%r12),%xmm1
 pand 96(%r12),%xmm2
 movdqa %xmm3,352(%r10)
 pand 112(%r12),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa -128(%r12),%xmm4
 movdqa -112(%r12),%xmm5
 movdqa -96(%r12),%xmm2
 pand 112(%r10),%xmm4
 movdqa -80(%r12),%xmm3
 pand 128(%r10),%xmm5
 por %xmm4,%xmm0
 pand 144(%r10),%xmm2
 por %xmm5,%xmm1
 pand 160(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa -64(%r12),%xmm4
 movdqa -48(%r12),%xmm5
 movdqa -32(%r12),%xmm2
 pand 176(%r10),%xmm4
 movdqa -16(%r12),%xmm3
 pand 192(%r10),%xmm5
 por %xmm4,%xmm0
 pand 208(%r10),%xmm2
 por %xmm5,%xmm1
 pand 224(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa 0(%r12),%xmm4
 movdqa 16(%r12),%xmm5
 movdqa 32(%r12),%xmm2
 pand 240(%r10),%xmm4
 movdqa 48(%r12),%xmm3
 pand 256(%r10),%xmm5
 por %xmm4,%xmm0
 pand 272(%r10),%xmm2
 por %xmm5,%xmm1
 pand 288(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 por %xmm1,%xmm0
 pshufd $0x4e,%xmm0,%xmm1
 por %xmm1,%xmm0
 leaq 256(%r12),%r12
.byte 102,72,15,126,195

 movq %r13,16+8(%rsp)
 movq %rdi,56+8(%rsp)

 movq (%r8),%r8
 movq (%rsi),%rax
 leaq (%rsi,%r9,1),%rsi
 negq %r9

 movq %r8,%rbp
 mulq %rbx
 movq %rax,%r10
 movq (%rcx),%rax

 imulq %r10,%rbp
 leaq 64+8(%rsp),%r14
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r10
 movq 8(%rsi,%r9,1),%rax
 adcq $0,%rdx
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq 8(%rcx),%rax
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq 16(%rsi,%r9,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 leaq 32(%r9),%r15
 leaq 32(%rcx),%rcx
 adcq $0,%rdx
 movq %rdi,(%r14)
 movq %rdx,%r13
 jmp .L1st4x

.align 32
.L1st4x:
 mulq %rbx
 addq %rax,%r10
 movq -16(%rcx),%rax
 leaq 32(%r14),%r14
 adcq $0,%rdx
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r13
 movq -8(%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %r13,-24(%r14)
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq -8(%rcx),%rax
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq (%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 adcq $0,%rdx
 movq %rdi,-16(%r14)
 movq %rdx,%r13

 mulq %rbx
 addq %rax,%r10
 movq 0(%rcx),%rax
 adcq $0,%rdx
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r13
 movq 8(%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %r13,-8(%r14)
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq 8(%rcx),%rax
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq 16(%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 leaq 32(%rcx),%rcx
 adcq $0,%rdx
 movq %rdi,(%r14)
 movq %rdx,%r13

 addq $32,%r15
 jnz .L1st4x

 mulq %rbx
 addq %rax,%r10
 movq -16(%rcx),%rax
 leaq 32(%r14),%r14
 adcq $0,%rdx
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r13
 movq -8(%rsi),%rax
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %r13,-24(%r14)
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq -8(%rcx),%rax
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq (%rsi,%r9,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 adcq $0,%rdx
 movq %rdi,-16(%r14)
 movq %rdx,%r13

 leaq (%rcx,%r9,1),%rcx

 xorq %rdi,%rdi
 addq %r10,%r13
 adcq $0,%rdi
 movq %r13,-8(%r14)

 jmp .Louter4x

.align 32
.Louter4x:
 leaq 16+128(%r14),%rdx
 pxor %xmm4,%xmm4
 pxor %xmm5,%xmm5
 movdqa -128(%r12),%xmm0
 movdqa -112(%r12),%xmm1
 movdqa -96(%r12),%xmm2
 movdqa -80(%r12),%xmm3
 pand -128(%rdx),%xmm0
 pand -112(%rdx),%xmm1
 por %xmm0,%xmm4
 pand -96(%rdx),%xmm2
 por %xmm1,%xmm5
 pand -80(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa -64(%r12),%xmm0
 movdqa -48(%r12),%xmm1
 movdqa -32(%r12),%xmm2
 movdqa -16(%r12),%xmm3
 pand -64(%rdx),%xmm0
 pand -48(%rdx),%xmm1
 por %xmm0,%xmm4
 pand -32(%rdx),%xmm2
 por %xmm1,%xmm5
 pand -16(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 0(%r12),%xmm0
 movdqa 16(%r12),%xmm1
 movdqa 32(%r12),%xmm2
 movdqa 48(%r12),%xmm3
 pand 0(%rdx),%xmm0
 pand 16(%rdx),%xmm1
 por %xmm0,%xmm4
 pand 32(%rdx),%xmm2
 por %xmm1,%xmm5
 pand 48(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 64(%r12),%xmm0
 movdqa 80(%r12),%xmm1
 movdqa 96(%r12),%xmm2
 movdqa 112(%r12),%xmm3
 pand 64(%rdx),%xmm0
 pand 80(%rdx),%xmm1
 por %xmm0,%xmm4
 pand 96(%rdx),%xmm2
 por %xmm1,%xmm5
 pand 112(%rdx),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 por %xmm5,%xmm4
 pshufd $0x4e,%xmm4,%xmm0
 por %xmm4,%xmm0
 leaq 256(%r12),%r12
.byte 102,72,15,126,195

 movq (%r14,%r9,1),%r10
 movq %r8,%rbp
 mulq %rbx
 addq %rax,%r10
 movq (%rcx),%rax
 adcq $0,%rdx

 imulq %r10,%rbp
 movq %rdx,%r11
 movq %rdi,(%r14)

 leaq (%r14,%r9,1),%r14

 mulq %rbp
 addq %rax,%r10
 movq 8(%rsi,%r9,1),%rax
 adcq $0,%rdx
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq 8(%rcx),%rax
 adcq $0,%rdx
 addq 8(%r14),%r11
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq 16(%rsi,%r9,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 leaq 32(%r9),%r15
 leaq 32(%rcx),%rcx
 adcq $0,%rdx
 movq %rdx,%r13
 jmp .Linner4x

.align 32
.Linner4x:
 mulq %rbx
 addq %rax,%r10
 movq -16(%rcx),%rax
 adcq $0,%rdx
 addq 16(%r14),%r10
 leaq 32(%r14),%r14
 adcq $0,%rdx
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r13
 movq -8(%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %rdi,-32(%r14)
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq -8(%rcx),%rax
 adcq $0,%rdx
 addq -8(%r14),%r11
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq (%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 adcq $0,%rdx
 movq %r13,-24(%r14)
 movq %rdx,%r13

 mulq %rbx
 addq %rax,%r10
 movq 0(%rcx),%rax
 adcq $0,%rdx
 addq (%r14),%r10
 adcq $0,%rdx
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r13
 movq 8(%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %rdi,-16(%r14)
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq 8(%rcx),%rax
 adcq $0,%rdx
 addq 8(%r14),%r11
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq 16(%rsi,%r15,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 leaq 32(%rcx),%rcx
 adcq $0,%rdx
 movq %r13,-8(%r14)
 movq %rdx,%r13

 addq $32,%r15
 jnz .Linner4x

 mulq %rbx
 addq %rax,%r10
 movq -16(%rcx),%rax
 adcq $0,%rdx
 addq 16(%r14),%r10
 leaq 32(%r14),%r14
 adcq $0,%rdx
 movq %rdx,%r11

 mulq %rbp
 addq %rax,%r13
 movq -8(%rsi),%rax
 adcq $0,%rdx
 addq %r10,%r13
 adcq $0,%rdx
 movq %rdi,-32(%r14)
 movq %rdx,%rdi

 mulq %rbx
 addq %rax,%r11
 movq %rbp,%rax
 movq -8(%rcx),%rbp
 adcq $0,%rdx
 addq -8(%r14),%r11
 adcq $0,%rdx
 movq %rdx,%r10

 mulq %rbp
 addq %rax,%rdi
 movq (%rsi,%r9,1),%rax
 adcq $0,%rdx
 addq %r11,%rdi
 adcq $0,%rdx
 movq %r13,-24(%r14)
 movq %rdx,%r13

 movq %rdi,-16(%r14)
 leaq (%rcx,%r9,1),%rcx

 xorq %rdi,%rdi
 addq %r10,%r13
 adcq $0,%rdi
 addq (%r14),%r13
 adcq $0,%rdi
 movq %r13,-8(%r14)

 cmpq 16+8(%rsp),%r12
 jb .Louter4x
 xorq %rax,%rax
 subq %r13,%rbp
 adcq %r15,%r15
 orq %r15,%rdi
 subq %rdi,%rax
 leaq (%r14,%r9,1),%rbx
 movq (%rcx),%r12
 leaq (%rcx),%rbp
 movq %r9,%rcx
 sarq $3+2,%rcx
 movq 56+8(%rsp),%rdi
 decq %r12
 xorq %r10,%r10
 movq 8(%rbp),%r13
 movq 16(%rbp),%r14
 movq 24(%rbp),%r15
 jmp .Lsqr4x_sub_entry
.cfi_endproc
.size mul4x_internal,.-mul4x_internal
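# bn_power5 performs one 5-bit window of fixed-window modular
# exponentiation: five Montgomery squarings followed by a single gathered
# Montgomery multiplication.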
.globl bn_power5
.hidden bn_power5
.type bn_power5,@function
.align 32
bn_power5:
.cfi_startproc
 movq %rsp,%rax
.cfi_def_cfa_register %rax
 leaq OPENSSL_ia32cap_P(%rip),%r11
 movl 8(%r11),%r11d
 andl $0x80108,%r11d
 cmpl $0x80108,%r11d
 je .Lpowerx5_enter
 pushq %rbx
.cfi_offset %rbx,-16
 pushq %rbp
.cfi_offset %rbp,-24
 pushq %r12
.cfi_offset %r12,-32
 pushq %r13
.cfi_offset %r13,-40
 pushq %r14
.cfi_offset %r14,-48
 pushq %r15
.cfi_offset %r15,-56
.Lpower5_prologue:

 shll $3,%r9d
 leal (%r9,%r9,2),%r10d
 negq %r9
 movq (%r8),%r8








 leaq -320(%rsp,%r9,2),%r11
 movq %rsp,%rbp
 subq %rdi,%r11
 andq $4095,%r11
 cmpq %r11,%r10
 jb .Lpwr_sp_alt
 subq %r11,%rbp
 leaq -320(%rbp,%r9,2),%rbp
 jmp .Lpwr_sp_done

.align 32
.Lpwr_sp_alt:
 leaq 4096-320(,%r9,2),%r10
 leaq -320(%rbp,%r9,2),%rbp
 subq %r10,%r11
 movq $0,%r10
 cmovcq %r10,%r11
 subq %r11,%rbp
.Lpwr_sp_done:
 andq $-64,%rbp
 movq %rsp,%r11
 subq %rbp,%r11
 andq $-4096,%r11
 leaq (%r11,%rbp,1),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lpwr_page_walk
 jmp .Lpwr_page_walk_done

.Lpwr_page_walk:
 leaq -4096(%rsp),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lpwr_page_walk
.Lpwr_page_walk_done:

 movq %r9,%r10
 negq %r9










 movq %r8,32(%rsp)
 movq %rax,40(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte 102,72,15,110,207
.byte 102,72,15,110,209
.byte 102,73,15,110,218
.byte 102,72,15,110,226

 call __bn_sqr8x_internal
 call __bn_post4x_internal
 call __bn_sqr8x_internal
 call __bn_post4x_internal
 call __bn_sqr8x_internal
 call __bn_post4x_internal
 call __bn_sqr8x_internal
 call __bn_post4x_internal
 call __bn_sqr8x_internal
 call __bn_post4x_internal

.byte 102,72,15,126,209
.byte 102,72,15,126,226
 movq %rsi,%rdi
 movq 40(%rsp),%rax
 leaq 32(%rsp),%r8

 call mul4x_internal

 movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
 movq $1,%rax
 movq -48(%rsi),%r15
.cfi_restore %r15
 movq -40(%rsi),%r14
.cfi_restore %r14
 movq -32(%rsi),%r13
.cfi_restore %r13
 movq -24(%rsi),%r12
.cfi_restore %r12
 movq -16(%rsi),%rbp
.cfi_restore %rbp
 movq -8(%rsi),%rbx
.cfi_restore %rbx
 leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lpower5_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_power5,.-bn_power5

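# Montgomery squaring: computes the cross products, doubles them via
# .Lsqr4x_shift_n_add while adding in the diagonal squares, then reduces
# the result in __bn_sqr8x_reduction.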
.globl bn_sqr8x_internal
.hidden bn_sqr8x_internal
.hidden bn_sqr8x_internal
.type bn_sqr8x_internal,@function
.align 32
bn_sqr8x_internal:
__bn_sqr8x_internal:
.cfi_startproc










































































 leaq 32(%r10),%rbp
 leaq (%rsi,%r9,1),%rsi

 movq %r9,%rcx


 movq -32(%rsi,%rbp,1),%r14
 leaq 48+8(%rsp,%r9,2),%rdi
 movq -24(%rsi,%rbp,1),%rax
 leaq -32(%rdi,%rbp,1),%rdi
 movq -16(%rsi,%rbp,1),%rbx
 movq %rax,%r15

 mulq %r14
 movq %rax,%r10
 movq %rbx,%rax
 movq %rdx,%r11
 movq %r10,-24(%rdi,%rbp,1)

 mulq %r14
 addq %rax,%r11
 movq %rbx,%rax
 adcq $0,%rdx
 movq %r11,-16(%rdi,%rbp,1)
 movq %rdx,%r10


 movq -8(%rsi,%rbp,1),%rbx
 mulq %r15
 movq %rax,%r12
 movq %rbx,%rax
 movq %rdx,%r13

 leaq (%rbp),%rcx
 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 movq %rdx,%r11
 adcq $0,%r11
 addq %r12,%r10
 adcq $0,%r11
 movq %r10,-8(%rdi,%rcx,1)
 jmp .Lsqr4x_1st

.align 32
.Lsqr4x_1st:
 movq (%rsi,%rcx,1),%rbx
 mulq %r15
 addq %rax,%r13
 movq %rbx,%rax
 movq %rdx,%r12
 adcq $0,%r12

 mulq %r14
 addq %rax,%r11
 movq %rbx,%rax
 movq 8(%rsi,%rcx,1),%rbx
 movq %rdx,%r10
 adcq $0,%r10
 addq %r13,%r11
 adcq $0,%r10


 mulq %r15
 addq %rax,%r12
 movq %rbx,%rax
 movq %r11,(%rdi,%rcx,1)
 movq %rdx,%r13
 adcq $0,%r13

 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 movq 16(%rsi,%rcx,1),%rbx
 movq %rdx,%r11
 adcq $0,%r11
 addq %r12,%r10
 adcq $0,%r11

 mulq %r15
 addq %rax,%r13
 movq %rbx,%rax
 movq %r10,8(%rdi,%rcx,1)
 movq %rdx,%r12
 adcq $0,%r12

 mulq %r14
 addq %rax,%r11
 movq %rbx,%rax
 movq 24(%rsi,%rcx,1),%rbx
 movq %rdx,%r10
 adcq $0,%r10
 addq %r13,%r11
 adcq $0,%r10


 mulq %r15
 addq %rax,%r12
 movq %rbx,%rax
 movq %r11,16(%rdi,%rcx,1)
 movq %rdx,%r13
 adcq $0,%r13
 leaq 32(%rcx),%rcx

 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 movq %rdx,%r11
 adcq $0,%r11
 addq %r12,%r10
 adcq $0,%r11
 movq %r10,-8(%rdi,%rcx,1)

 cmpq $0,%rcx
 jne .Lsqr4x_1st

 mulq %r15
 addq %rax,%r13
 leaq 16(%rbp),%rbp
 adcq $0,%rdx
 addq %r11,%r13
 adcq $0,%rdx

 movq %r13,(%rdi)
 movq %rdx,%r12
 movq %rdx,8(%rdi)
 jmp .Lsqr4x_outer

.align 32
.Lsqr4x_outer:
 movq -32(%rsi,%rbp,1),%r14
 leaq 48+8(%rsp,%r9,2),%rdi
 movq -24(%rsi,%rbp,1),%rax
 leaq -32(%rdi,%rbp,1),%rdi
 movq -16(%rsi,%rbp,1),%rbx
 movq %rax,%r15

 mulq %r14
 movq -24(%rdi,%rbp,1),%r10
 addq %rax,%r10
 movq %rbx,%rax
 adcq $0,%rdx
 movq %r10,-24(%rdi,%rbp,1)
 movq %rdx,%r11

 mulq %r14
 addq %rax,%r11
 movq %rbx,%rax
 adcq $0,%rdx
 addq -16(%rdi,%rbp,1),%r11
 movq %rdx,%r10
 adcq $0,%r10
 movq %r11,-16(%rdi,%rbp,1)

 xorq %r12,%r12

 movq -8(%rsi,%rbp,1),%rbx
 mulq %r15
 addq %rax,%r12
 movq %rbx,%rax
 adcq $0,%rdx
 addq -8(%rdi,%rbp,1),%r12
 movq %rdx,%r13
 adcq $0,%r13

 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 adcq $0,%rdx
 addq %r12,%r10
 movq %rdx,%r11
 adcq $0,%r11
 movq %r10,-8(%rdi,%rbp,1)

 leaq (%rbp),%rcx
 jmp .Lsqr4x_inner

.align 32
.Lsqr4x_inner:
 movq (%rsi,%rcx,1),%rbx
 mulq %r15
 addq %rax,%r13
 movq %rbx,%rax
 movq %rdx,%r12
 adcq $0,%r12
 addq (%rdi,%rcx,1),%r13
 adcq $0,%r12

.byte 0x67
 mulq %r14
 addq %rax,%r11
 movq %rbx,%rax
 movq 8(%rsi,%rcx,1),%rbx
 movq %rdx,%r10
 adcq $0,%r10
 addq %r13,%r11
 adcq $0,%r10

 mulq %r15
 addq %rax,%r12
 movq %r11,(%rdi,%rcx,1)
 movq %rbx,%rax
 movq %rdx,%r13
 adcq $0,%r13
 addq 8(%rdi,%rcx,1),%r12
 leaq 16(%rcx),%rcx
 adcq $0,%r13

 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 adcq $0,%rdx
 addq %r12,%r10
 movq %rdx,%r11
 adcq $0,%r11
 movq %r10,-8(%rdi,%rcx,1)

 cmpq $0,%rcx
 jne .Lsqr4x_inner

.byte 0x67
 mulq %r15
 addq %rax,%r13
 adcq $0,%rdx
 addq %r11,%r13
 adcq $0,%rdx

 movq %r13,(%rdi)
 movq %rdx,%r12
 movq %rdx,8(%rdi)

 addq $16,%rbp
 jnz .Lsqr4x_outer


 movq -32(%rsi),%r14
 leaq 48+8(%rsp,%r9,2),%rdi
 movq -24(%rsi),%rax
 leaq -32(%rdi,%rbp,1),%rdi
 movq -16(%rsi),%rbx
 movq %rax,%r15

 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 movq %rdx,%r11
 adcq $0,%r11

 mulq %r14
 addq %rax,%r11
 movq %rbx,%rax
 movq %r10,-24(%rdi)
 movq %rdx,%r10
 adcq $0,%r10
 addq %r13,%r11
 movq -8(%rsi),%rbx
 adcq $0,%r10

 mulq %r15
 addq %rax,%r12
 movq %rbx,%rax
 movq %r11,-16(%rdi)
 movq %rdx,%r13
 adcq $0,%r13

 mulq %r14
 addq %rax,%r10
 movq %rbx,%rax
 movq %rdx,%r11
 adcq $0,%r11
 addq %r12,%r10
 adcq $0,%r11
 movq %r10,-8(%rdi)

 mulq %r15
 addq %rax,%r13
 movq -16(%rsi),%rax
 adcq $0,%rdx
 addq %r11,%r13
 adcq $0,%rdx

 movq %r13,(%rdi)
 movq %rdx,%r12
 movq %rdx,8(%rdi)

 mulq %rbx
 addq $16,%rbp
 xorq %r14,%r14
 subq %r9,%rbp
 xorq %r15,%r15

 addq %r12,%rax
 adcq $0,%rdx
 movq %rax,8(%rdi)
 movq %rdx,16(%rdi)
 movq %r15,24(%rdi)

 movq -16(%rsi,%rbp,1),%rax
 leaq 48+8(%rsp),%rdi
 xorq %r10,%r10
 movq 8(%rdi),%r11

 leaq (%r14,%r10,2),%r12
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r13
 shrq $63,%r11
 orq %r10,%r13
 movq 16(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq 24(%rdi),%r11
 adcq %rax,%r12
 movq -8(%rsi,%rbp,1),%rax
 movq %r12,(%rdi)
 adcq %rdx,%r13

 leaq (%r14,%r10,2),%rbx
 movq %r13,8(%rdi)
 sbbq %r15,%r15
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r8
 shrq $63,%r11
 orq %r10,%r8
 movq 32(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq 40(%rdi),%r11
 adcq %rax,%rbx
 movq 0(%rsi,%rbp,1),%rax
 movq %rbx,16(%rdi)
 adcq %rdx,%r8
 leaq 16(%rbp),%rbp
 movq %r8,24(%rdi)
 sbbq %r15,%r15
 leaq 64(%rdi),%rdi
 jmp .Lsqr4x_shift_n_add

.align 32
.Lsqr4x_shift_n_add:
 leaq (%r14,%r10,2),%r12
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r13
 shrq $63,%r11
 orq %r10,%r13
 movq -16(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq -8(%rdi),%r11
 adcq %rax,%r12
 movq -8(%rsi,%rbp,1),%rax
 movq %r12,-32(%rdi)
 adcq %rdx,%r13

 leaq (%r14,%r10,2),%rbx
 movq %r13,-24(%rdi)
 sbbq %r15,%r15
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r8
 shrq $63,%r11
 orq %r10,%r8
 movq 0(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq 8(%rdi),%r11
 adcq %rax,%rbx
 movq 0(%rsi,%rbp,1),%rax
 movq %rbx,-16(%rdi)
 adcq %rdx,%r8

 leaq (%r14,%r10,2),%r12
 movq %r8,-8(%rdi)
 sbbq %r15,%r15
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r13
 shrq $63,%r11
 orq %r10,%r13
 movq 16(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq 24(%rdi),%r11
 adcq %rax,%r12
 movq 8(%rsi,%rbp,1),%rax
 movq %r12,0(%rdi)
 adcq %rdx,%r13

 leaq (%r14,%r10,2),%rbx
 movq %r13,8(%rdi)
 sbbq %r15,%r15
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r8
 shrq $63,%r11
 orq %r10,%r8
 movq 32(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq 40(%rdi),%r11
 adcq %rax,%rbx
 movq 16(%rsi,%rbp,1),%rax
 movq %rbx,16(%rdi)
 adcq %rdx,%r8
 movq %r8,24(%rdi)
 sbbq %r15,%r15
 leaq 64(%rdi),%rdi
 addq $32,%rbp
 jnz .Lsqr4x_shift_n_add

 leaq (%r14,%r10,2),%r12
.byte 0x67
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r13
 shrq $63,%r11
 orq %r10,%r13
 movq -16(%rdi),%r10
 movq %r11,%r14
 mulq %rax
 negq %r15
 movq -8(%rdi),%r11
 adcq %rax,%r12
 movq -8(%rsi),%rax
 movq %r12,-32(%rdi)
 adcq %rdx,%r13

 leaq (%r14,%r10,2),%rbx
 movq %r13,-24(%rdi)
 sbbq %r15,%r15
 shrq $63,%r10
 leaq (%rcx,%r11,2),%r8
 shrq $63,%r11
 orq %r10,%r8
 mulq %rax
 negq %r15
 adcq %rax,%rbx
 adcq %rdx,%r8
 movq %rbx,-16(%rdi)
 movq %r8,-8(%rdi)
.byte 102,72,15,126,213
__bn_sqr8x_reduction:
 xorq %rax,%rax
 leaq (%r9,%rbp,1),%rcx
 leaq 48+8(%rsp,%r9,2),%rdx
 movq %rcx,0+8(%rsp)
 leaq 48+8(%rsp,%r9,1),%rdi
 movq %rdx,8+8(%rsp)
 negq %r9
 jmp .L8x_reduction_loop

.align 32
.L8x_reduction_loop:
 leaq (%rdi,%r9,1),%rdi
.byte 0x66
 movq 0(%rdi),%rbx
 movq 8(%rdi),%r9
 movq 16(%rdi),%r10
 movq 24(%rdi),%r11
 movq 32(%rdi),%r12
 movq 40(%rdi),%r13
 movq 48(%rdi),%r14
 movq 56(%rdi),%r15
 movq %rax,(%rdx)
 leaq 64(%rdi),%rdi

.byte 0x67
 movq %rbx,%r8
 imulq 32+8(%rsp),%rbx
 movq 0(%rbp),%rax
 movl $8,%ecx
 jmp .L8x_reduce

.align 32
.L8x_reduce:
 mulq %rbx
 movq 8(%rbp),%rax
 negq %r8
 movq %rdx,%r8
 adcq $0,%r8

 mulq %rbx
 addq %rax,%r9
 movq 16(%rbp),%rax
 adcq $0,%rdx
 addq %r9,%r8
 movq %rbx,48-8+8(%rsp,%rcx,8)
 movq %rdx,%r9
 adcq $0,%r9

 mulq %rbx
 addq %rax,%r10
 movq 24(%rbp),%rax
 adcq $0,%rdx
 addq %r10,%r9
 movq 32+8(%rsp),%rsi
 movq %rdx,%r10
 adcq $0,%r10

 mulq %rbx
 addq %rax,%r11
 movq 32(%rbp),%rax
 adcq $0,%rdx
 imulq %r8,%rsi
 addq %r11,%r10
 movq %rdx,%r11
 adcq $0,%r11

 mulq %rbx
 addq %rax,%r12
 movq 40(%rbp),%rax
 adcq $0,%rdx
 addq %r12,%r11
 movq %rdx,%r12
 adcq $0,%r12

 mulq %rbx
 addq %rax,%r13
 movq 48(%rbp),%rax
 adcq $0,%rdx
 addq %r13,%r12
 movq %rdx,%r13
 adcq $0,%r13

 mulq %rbx
 addq %rax,%r14
 movq 56(%rbp),%rax
 adcq $0,%rdx
 addq %r14,%r13
 movq %rdx,%r14
 adcq $0,%r14

 mulq %rbx
 movq %rsi,%rbx
 addq %rax,%r15
 movq 0(%rbp),%rax
 adcq $0,%rdx
 addq %r15,%r14
 movq %rdx,%r15
 adcq $0,%r15

 decl %ecx
 jnz .L8x_reduce

 leaq 64(%rbp),%rbp
 xorq %rax,%rax
 movq 8+8(%rsp),%rdx
 cmpq 0+8(%rsp),%rbp
 jae .L8x_no_tail

.byte 0x66
 addq 0(%rdi),%r8
 adcq 8(%rdi),%r9
 adcq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 sbbq %rsi,%rsi

 movq 48+56+8(%rsp),%rbx
 movl $8,%ecx
 movq 0(%rbp),%rax
 jmp .L8x_tail

.align 32
.L8x_tail:
 mulq %rbx
 addq %rax,%r8
 movq 8(%rbp),%rax
 movq %r8,(%rdi)
 movq %rdx,%r8
 adcq $0,%r8

 mulq %rbx
 addq %rax,%r9
 movq 16(%rbp),%rax
 adcq $0,%rdx
 addq %r9,%r8
 leaq 8(%rdi),%rdi
 movq %rdx,%r9
 adcq $0,%r9

 mulq %rbx
 addq %rax,%r10
 movq 24(%rbp),%rax
 adcq $0,%rdx
 addq %r10,%r9
 movq %rdx,%r10
 adcq $0,%r10

 mulq %rbx
 addq %rax,%r11
 movq 32(%rbp),%rax
 adcq $0,%rdx
 addq %r11,%r10
 movq %rdx,%r11
 adcq $0,%r11

 mulq %rbx
 addq %rax,%r12
 movq 40(%rbp),%rax
 adcq $0,%rdx
 addq %r12,%r11
 movq %rdx,%r12
 adcq $0,%r12

 mulq %rbx
 addq %rax,%r13
 movq 48(%rbp),%rax
 adcq $0,%rdx
 addq %r13,%r12
 movq %rdx,%r13
 adcq $0,%r13

 mulq %rbx
 addq %rax,%r14
 movq 56(%rbp),%rax
 adcq $0,%rdx
 addq %r14,%r13
 movq %rdx,%r14
 adcq $0,%r14

 mulq %rbx
 movq 48-16+8(%rsp,%rcx,8),%rbx
 addq %rax,%r15
 adcq $0,%rdx
 addq %r15,%r14
 movq 0(%rbp),%rax
 movq %rdx,%r15
 adcq $0,%r15

 decl %ecx
 jnz .L8x_tail

 leaq 64(%rbp),%rbp
 movq 8+8(%rsp),%rdx
 cmpq 0+8(%rsp),%rbp
 jae .L8x_tail_done

 movq 48+56+8(%rsp),%rbx
 negq %rsi
 movq 0(%rbp),%rax
 adcq 0(%rdi),%r8
 adcq 8(%rdi),%r9
 adcq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 sbbq %rsi,%rsi

 movl $8,%ecx
 jmp .L8x_tail

.align 32
.L8x_tail_done:
 xorq %rax,%rax
 addq (%rdx),%r8
 adcq $0,%r9
 adcq $0,%r10
 adcq $0,%r11
 adcq $0,%r12
 adcq $0,%r13
 adcq $0,%r14
 adcq $0,%r15
 adcq $0,%rax

 negq %rsi
.L8x_no_tail:
 adcq 0(%rdi),%r8
 adcq 8(%rdi),%r9
 adcq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 adcq $0,%rax
 movq -8(%rbp),%rcx
 xorq %rsi,%rsi

.byte 102,72,15,126,213

 movq %r8,0(%rdi)
 movq %r9,8(%rdi)
.byte 102,73,15,126,217
 movq %r10,16(%rdi)
 movq %r11,24(%rdi)
 movq %r12,32(%rdi)
 movq %r13,40(%rdi)
 movq %r14,48(%rdi)
 movq %r15,56(%rdi)
 leaq 64(%rdi),%rdi

 cmpq %rdx,%rdi
 jb .L8x_reduction_loop
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_sqr8x_internal,.-bn_sqr8x_internal
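# Final conditional subtraction of the modulus, done with masks rather
# than branches so the timing does not depend on the comparison result.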
.type __bn_post4x_internal,@function
.align 32
__bn_post4x_internal:
.cfi_startproc
 movq 0(%rbp),%r12
 leaq (%rdi,%r9,1),%rbx
 movq %r9,%rcx
.byte 102,72,15,126,207
 negq %rax
.byte 102,72,15,126,206
 sarq $3+2,%rcx
 decq %r12
 xorq %r10,%r10
 movq 8(%rbp),%r13
 movq 16(%rbp),%r14
 movq 24(%rbp),%r15
 jmp .Lsqr4x_sub_entry

.align 16
.Lsqr4x_sub:
 movq 0(%rbp),%r12
 movq 8(%rbp),%r13
 movq 16(%rbp),%r14
 movq 24(%rbp),%r15
.Lsqr4x_sub_entry:
 leaq 32(%rbp),%rbp
 notq %r12
 notq %r13
 notq %r14
 notq %r15
 andq %rax,%r12
 andq %rax,%r13
 andq %rax,%r14
 andq %rax,%r15

 negq %r10
 adcq 0(%rbx),%r12
 adcq 8(%rbx),%r13
 adcq 16(%rbx),%r14
 adcq 24(%rbx),%r15
 movq %r12,0(%rdi)
 leaq 32(%rbx),%rbx
 movq %r13,8(%rdi)
 sbbq %r10,%r10
 movq %r14,16(%rdi)
 movq %r15,24(%rdi)
 leaq 32(%rdi),%rdi

 incq %rcx
 jnz .Lsqr4x_sub

 movq %r9,%r10
 negq %r9
 .byte 0xf3,0xc3
.cfi_endproc
.size __bn_post4x_internal,.-__bn_post4x_internal
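# bn_from_montgomery converts out of Montgomery form (multiply by 1, then
# reduce). Only num divisible by 8 is supported; otherwise it returns 0 and
# the caller falls back to generic code.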
.globl bn_from_montgomery
.hidden bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
bn_from_montgomery:
.cfi_startproc
 testl $7,%r9d
 jz bn_from_mont8x
 xorl %eax,%eax
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_from_montgomery,.-bn_from_montgomery

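# 8-way worker for bn_from_montgomery: copies the input into the scratch
# frame (.Lmul_by_1), reduces it with the sqr8x/sqrx8x reduction, and wipes
# the frame (.Lfrom_mont_zero) before returning.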
.type bn_from_mont8x,@function
.align 32
bn_from_mont8x:
.cfi_startproc
.byte 0x67
 movq %rsp,%rax
.cfi_def_cfa_register %rax
 pushq %rbx
.cfi_offset %rbx,-16
 pushq %rbp
.cfi_offset %rbp,-24
 pushq %r12
.cfi_offset %r12,-32
 pushq %r13
.cfi_offset %r13,-40
 pushq %r14
.cfi_offset %r14,-48
 pushq %r15
.cfi_offset %r15,-56
.Lfrom_prologue:

 shll $3,%r9d
 leaq (%r9,%r9,2),%r10
 negq %r9
 movq (%r8),%r8








 leaq -320(%rsp,%r9,2),%r11
 movq %rsp,%rbp
 subq %rdi,%r11
 andq $4095,%r11
 cmpq %r11,%r10
 jb .Lfrom_sp_alt
 subq %r11,%rbp
 leaq -320(%rbp,%r9,2),%rbp
 jmp .Lfrom_sp_done

.align 32
.Lfrom_sp_alt:
 leaq 4096-320(,%r9,2),%r10
 leaq -320(%rbp,%r9,2),%rbp
 subq %r10,%r11
 movq $0,%r10
 cmovcq %r10,%r11
 subq %r11,%rbp
.Lfrom_sp_done:
 andq $-64,%rbp
 movq %rsp,%r11
 subq %rbp,%r11
 andq $-4096,%r11
 leaq (%r11,%rbp,1),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lfrom_page_walk
 jmp .Lfrom_page_walk_done

.Lfrom_page_walk:
 leaq -4096(%rsp),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lfrom_page_walk
.Lfrom_page_walk_done:

 movq %r9,%r10
 negq %r9










 movq %r8,32(%rsp)
 movq %rax,40(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
 movq %r9,%r11
 leaq 48(%rsp),%rax
 pxor %xmm0,%xmm0
 jmp .Lmul_by_1

.align 32
.Lmul_by_1:
 movdqu (%rsi),%xmm1
 movdqu 16(%rsi),%xmm2
 movdqu 32(%rsi),%xmm3
 movdqa %xmm0,(%rax,%r9,1)
 movdqu 48(%rsi),%xmm4
 movdqa %xmm0,16(%rax,%r9,1)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
 movdqa %xmm1,(%rax)
 movdqa %xmm0,32(%rax,%r9,1)
 movdqa %xmm2,16(%rax)
 movdqa %xmm0,48(%rax,%r9,1)
 movdqa %xmm3,32(%rax)
 movdqa %xmm4,48(%rax)
 leaq 64(%rax),%rax
 subq $64,%r11
 jnz .Lmul_by_1

.byte 102,72,15,110,207
.byte 102,72,15,110,209
.byte 0x67
 movq %rcx,%rbp
.byte 102,73,15,110,218
 leaq OPENSSL_ia32cap_P(%rip),%r11
 movl 8(%r11),%r11d
 andl $0x80108,%r11d
 cmpl $0x80108,%r11d
 jne .Lfrom_mont_nox

 leaq (%rax,%r9,1),%rdi
 call __bn_sqrx8x_reduction
 call __bn_postx4x_internal

 pxor %xmm0,%xmm0
 leaq 48(%rsp),%rax
 jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_nox:
 call __bn_sqr8x_reduction
 call __bn_post4x_internal

 pxor %xmm0,%xmm0
 leaq 48(%rsp),%rax
 jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_zero:
 movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
 movdqa %xmm0,0(%rax)
 movdqa %xmm0,16(%rax)
 movdqa %xmm0,32(%rax)
 movdqa %xmm0,48(%rax)
 leaq 64(%rax),%rax
 subq $32,%r9
 jnz .Lfrom_mont_zero

 movq $1,%rax
 movq -48(%rsi),%r15
.cfi_restore %r15
 movq -40(%rsi),%r14
.cfi_restore %r14
 movq -32(%rsi),%r13
.cfi_restore %r13
 movq -24(%rsi),%r12
.cfi_restore %r12
 movq -16(%rsi),%rbp
.cfi_restore %rbp
 movq -8(%rsi),%rbx
.cfi_restore %rbx
 leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
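# MULX/ADX flavour of the 4-way multiply, reached from .Lmul4x_enter when
# the CPU advertises BMI1/BMI2/ADX.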
.type bn_mulx4x_mont_gather5,@function
.align 32
bn_mulx4x_mont_gather5:
.cfi_startproc
 movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmulx4x_enter:
 pushq %rbx
.cfi_offset %rbx,-16
 pushq %rbp
.cfi_offset %rbp,-24
 pushq %r12
.cfi_offset %r12,-32
 pushq %r13
.cfi_offset %r13,-40
 pushq %r14
.cfi_offset %r14,-48
 pushq %r15
.cfi_offset %r15,-56
.Lmulx4x_prologue:

 shll $3,%r9d
 leaq (%r9,%r9,2),%r10
 negq %r9
 movq (%r8),%r8










 leaq -320(%rsp,%r9,2),%r11
 movq %rsp,%rbp
 subq %rdi,%r11
 andq $4095,%r11
 cmpq %r11,%r10
 jb .Lmulx4xsp_alt
 subq %r11,%rbp
 leaq -320(%rbp,%r9,2),%rbp
 jmp .Lmulx4xsp_done

.Lmulx4xsp_alt:
 leaq 4096-320(,%r9,2),%r10
 leaq -320(%rbp,%r9,2),%rbp
 subq %r10,%r11
 movq $0,%r10
 cmovcq %r10,%r11
 subq %r11,%rbp
.Lmulx4xsp_done:
 andq $-64,%rbp
 movq %rsp,%r11
 subq %rbp,%r11
 andq $-4096,%r11
 leaq (%r11,%rbp,1),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lmulx4x_page_walk
 jmp .Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
 leaq -4096(%rsp),%rsp
 movq (%rsp),%r10
 cmpq %rbp,%rsp
 ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:













 movq %r8,32(%rsp)
 movq %rax,40(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
 call mulx4x_internal

 movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
 movq $1,%rax

 movq -48(%rsi),%r15
.cfi_restore %r15
 movq -40(%rsi),%r14
.cfi_restore %r14
 movq -32(%rsi),%r13
.cfi_restore %r13
 movq -24(%rsi),%r12
.cfi_restore %r12
 movq -16(%rsi),%rbp
.cfi_restore %rbp
 movq -8(%rsi),%rbx
.cfi_restore %rbx
 leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

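# Worker shared by bn_mulx4x_mont_gather5 and bn_powerx5: same gather and
# 4-way reduction as mul4x_internal, written with mulx/adcx/adox so the
# two carry chains proceed independently.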
.type mulx4x_internal,@function
.align 32
mulx4x_internal:
.cfi_startproc
 movq %r9,8(%rsp)
 movq %r9,%r10
 negq %r9
 shlq $5,%r9
 negq %r10
 leaq 128(%rdx,%r9,1),%r13
 shrq $5+5,%r9
 movd 8(%rax),%xmm5
 subq $1,%r9
 leaq .Linc(%rip),%rax
 movq %r13,16+8(%rsp)
 movq %r9,24+8(%rsp)
 movq %rdi,56+8(%rsp)
 movdqa 0(%rax),%xmm0
 movdqa 16(%rax),%xmm1
 leaq 88-112(%rsp,%r10,1),%r10
 leaq 128(%rdx),%rdi

 pshufd $0,%xmm5,%xmm5
 movdqa %xmm1,%xmm4
.byte 0x67
 movdqa %xmm1,%xmm2
.byte 0x67
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,112(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,128(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,144(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,160(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,176(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,192(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,208(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,224(%r10)
 movdqa %xmm4,%xmm3
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,240(%r10)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,256(%r10)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,272(%r10)
 movdqa %xmm4,%xmm2

 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,288(%r10)
 movdqa %xmm4,%xmm3
.byte 0x67
 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,304(%r10)

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,320(%r10)

 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,336(%r10)

 pand 64(%rdi),%xmm0
 pand 80(%rdi),%xmm1
 pand 96(%rdi),%xmm2
 movdqa %xmm3,352(%r10)
 pand 112(%rdi),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa -128(%rdi),%xmm4
 movdqa -112(%rdi),%xmm5
 movdqa -96(%rdi),%xmm2
 pand 112(%r10),%xmm4
 movdqa -80(%rdi),%xmm3
 pand 128(%r10),%xmm5
 por %xmm4,%xmm0
 pand 144(%r10),%xmm2
 por %xmm5,%xmm1
 pand 160(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa -64(%rdi),%xmm4
 movdqa -48(%rdi),%xmm5
 movdqa -32(%rdi),%xmm2
 pand 176(%r10),%xmm4
 movdqa -16(%rdi),%xmm3
 pand 192(%r10),%xmm5
 por %xmm4,%xmm0
 pand 208(%r10),%xmm2
 por %xmm5,%xmm1
 pand 224(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 movdqa 0(%rdi),%xmm4
 movdqa 16(%rdi),%xmm5
 movdqa 32(%rdi),%xmm2
 pand 240(%r10),%xmm4
 movdqa 48(%rdi),%xmm3
 pand 256(%r10),%xmm5
 por %xmm4,%xmm0
 pand 272(%r10),%xmm2
 por %xmm5,%xmm1
 pand 288(%r10),%xmm3
 por %xmm2,%xmm0
 por %xmm3,%xmm1
 pxor %xmm1,%xmm0
 pshufd $0x4e,%xmm0,%xmm1
 por %xmm1,%xmm0
 leaq 256(%rdi),%rdi
.byte 102,72,15,126,194
 leaq 64+32+8(%rsp),%rbx

 movq %rdx,%r9
 mulxq 0(%rsi),%r8,%rax
 mulxq 8(%rsi),%r11,%r12
 addq %rax,%r11
 mulxq 16(%rsi),%rax,%r13
 adcq %rax,%r12
 adcq $0,%r13
 mulxq 24(%rsi),%rax,%r14

 movq %r8,%r15
 imulq 32+8(%rsp),%r8
 xorq %rbp,%rbp
 movq %r8,%rdx

 movq %rdi,8+8(%rsp)

 leaq 32(%rsi),%rsi
 adcxq %rax,%r13
 adcxq %rbp,%r14

 mulxq 0(%rcx),%rax,%r10
 adcxq %rax,%r15
 adoxq %r11,%r10
 mulxq 8(%rcx),%rax,%r11
 adcxq %rax,%r10
 adoxq %r12,%r11
 mulxq 16(%rcx),%rax,%r12
 movq 24+8(%rsp),%rdi
 movq %r10,-32(%rbx)
 adcxq %rax,%r11
 adoxq %r13,%r12
 mulxq 24(%rcx),%rax,%r15
 movq %r9,%rdx
 movq %r11,-24(%rbx)
 adcxq %rax,%r12
 adoxq %rbp,%r15
 leaq 32(%rcx),%rcx
 movq %r12,-16(%rbx)
 jmp .Lmulx4x_1st

.align 32
.Lmulx4x_1st:
 adcxq %rbp,%r15
 mulxq 0(%rsi),%r10,%rax
 adcxq %r14,%r10
 mulxq 8(%rsi),%r11,%r14
 adcxq %rax,%r11
 mulxq 16(%rsi),%r12,%rax
 adcxq %r14,%r12
 mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67
 movq %r8,%rdx
 adcxq %rax,%r13
 adcxq %rbp,%r14
 leaq 32(%rsi),%rsi
 leaq 32(%rbx),%rbx

 adoxq %r15,%r10
 mulxq 0(%rcx),%rax,%r15
 adcxq %rax,%r10
 adoxq %r15,%r11
 mulxq 8(%rcx),%rax,%r15
 adcxq %rax,%r11
 adoxq %r15,%r12
 mulxq 16(%rcx),%rax,%r15
 movq %r10,-40(%rbx)
 adcxq %rax,%r12
 movq %r11,-32(%rbx)
 adoxq %r15,%r13
 mulxq 24(%rcx),%rax,%r15
 movq %r9,%rdx
 movq %r12,-24(%rbx)
 adcxq %rax,%r13
 adoxq %rbp,%r15
 leaq 32(%rcx),%rcx
 movq %r13,-16(%rbx)

 decq %rdi
 jnz .Lmulx4x_1st

 movq 8(%rsp),%rax
 adcq %rbp,%r15
 leaq (%rsi,%rax,1),%rsi
 addq %r15,%r14
 movq 8+8(%rsp),%rdi
 adcq %rbp,%rbp
 movq %r14,-8(%rbx)
 jmp .Lmulx4x_outer

.align 32
.Lmulx4x_outer:
 leaq 16-256(%rbx),%r10
 pxor %xmm4,%xmm4
.byte 0x67,0x67
 pxor %xmm5,%xmm5
 movdqa -128(%rdi),%xmm0
 movdqa -112(%rdi),%xmm1
 movdqa -96(%rdi),%xmm2
 pand 256(%r10),%xmm0
 movdqa -80(%rdi),%xmm3
 pand 272(%r10),%xmm1
 por %xmm0,%xmm4
 pand 288(%r10),%xmm2
 por %xmm1,%xmm5
 pand 304(%r10),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa -64(%rdi),%xmm0
 movdqa -48(%rdi),%xmm1
 movdqa -32(%rdi),%xmm2
 pand 320(%r10),%xmm0
 movdqa -16(%rdi),%xmm3
 pand 336(%r10),%xmm1
 por %xmm0,%xmm4
 pand 352(%r10),%xmm2
 por %xmm1,%xmm5
 pand 368(%r10),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 0(%rdi),%xmm0
 movdqa 16(%rdi),%xmm1
 movdqa 32(%rdi),%xmm2
 pand 384(%r10),%xmm0
 movdqa 48(%rdi),%xmm3
 pand 400(%r10),%xmm1
 por %xmm0,%xmm4
 pand 416(%r10),%xmm2
 por %xmm1,%xmm5
 pand 432(%r10),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 64(%rdi),%xmm0
 movdqa 80(%rdi),%xmm1
 movdqa 96(%rdi),%xmm2
 pand 448(%r10),%xmm0
 movdqa 112(%rdi),%xmm3
 pand 464(%r10),%xmm1
 por %xmm0,%xmm4
 pand 480(%r10),%xmm2
 por %xmm1,%xmm5
 pand 496(%r10),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 por %xmm5,%xmm4
 pshufd $0x4e,%xmm4,%xmm0
 por %xmm4,%xmm0
 leaq 256(%rdi),%rdi
.byte 102,72,15,126,194

 movq %rbp,(%rbx)
 leaq 32(%rbx,%rax,1),%rbx
 mulxq 0(%rsi),%r8,%r11
 xorq %rbp,%rbp
 movq %rdx,%r9
 mulxq 8(%rsi),%r14,%r12
 adoxq -32(%rbx),%r8
 adcxq %r14,%r11
 mulxq 16(%rsi),%r15,%r13
 adoxq -24(%rbx),%r11
 adcxq %r15,%r12
 mulxq 24(%rsi),%rdx,%r14
 adoxq -16(%rbx),%r12
 adcxq %rdx,%r13
 leaq (%rcx,%rax,1),%rcx
 leaq 32(%rsi),%rsi
 adoxq -8(%rbx),%r13
 adcxq %rbp,%r14
 adoxq %rbp,%r14

 movq %r8,%r15
 imulq 32+8(%rsp),%r8

 movq %r8,%rdx
 xorq %rbp,%rbp
 movq %rdi,8+8(%rsp)

 mulxq 0(%rcx),%rax,%r10
 adcxq %rax,%r15
 adoxq %r11,%r10
 mulxq 8(%rcx),%rax,%r11
 adcxq %rax,%r10
 adoxq %r12,%r11
 mulxq 16(%rcx),%rax,%r12
 adcxq %rax,%r11
 adoxq %r13,%r12
 mulxq 24(%rcx),%rax,%r15
 movq %r9,%rdx
 movq 24+8(%rsp),%rdi
 movq %r10,-32(%rbx)
 adcxq %rax,%r12
 movq %r11,-24(%rbx)
 adoxq %rbp,%r15
 movq %r12,-16(%rbx)
 leaq 32(%rcx),%rcx
 jmp .Lmulx4x_inner

.align 32
.Lmulx4x_inner:
 mulxq 0(%rsi),%r10,%rax
 adcxq %rbp,%r15
 adoxq %r14,%r10
 mulxq 8(%rsi),%r11,%r14
 adcxq 0(%rbx),%r10
 adoxq %rax,%r11
 mulxq 16(%rsi),%r12,%rax
 adcxq 8(%rbx),%r11
 adoxq %r14,%r12
 mulxq 24(%rsi),%r13,%r14
 movq %r8,%rdx
 adcxq 16(%rbx),%r12
 adoxq %rax,%r13
 adcxq 24(%rbx),%r13
 adoxq %rbp,%r14
 leaq 32(%rsi),%rsi
 leaq 32(%rbx),%rbx
 adcxq %rbp,%r14

 adoxq %r15,%r10
 mulxq 0(%rcx),%rax,%r15
 adcxq %rax,%r10
 adoxq %r15,%r11
 mulxq 8(%rcx),%rax,%r15
 adcxq %rax,%r11
 adoxq %r15,%r12
 mulxq 16(%rcx),%rax,%r15
 movq %r10,-40(%rbx)
 adcxq %rax,%r12
 adoxq %r15,%r13
 movq %r11,-32(%rbx)
 mulxq 24(%rcx),%rax,%r15
 movq %r9,%rdx
 leaq 32(%rcx),%rcx
 movq %r12,-24(%rbx)
 adcxq %rax,%r13
 adoxq %rbp,%r15
 movq %r13,-16(%rbx)

 decq %rdi
 jnz .Lmulx4x_inner

 movq 0+8(%rsp),%rax
 adcq %rbp,%r15
 subq 0(%rbx),%rdi
 movq 8+8(%rsp),%rdi
 movq 16+8(%rsp),%r10
 adcq %r15,%r14
 leaq (%rsi,%rax,1),%rsi
 adcq %rbp,%rbp
 movq %r14,-8(%rbx)

 cmpq %r10,%rdi
 jb .Lmulx4x_outer

 movq -8(%rcx),%r10
 movq %rbp,%r8
 movq (%rcx,%rax,1),%r12
 leaq (%rcx,%rax,1),%rbp
 movq %rax,%rcx
 leaq (%rbx,%rax,1),%rdi
 xorl %eax,%eax
 xorq %r15,%r15
 subq %r14,%r10
 adcq %r15,%r15
 orq %r15,%r8
 sarq $3+2,%rcx
 subq %r8,%rax
 movq 56+8(%rsp),%rdx
 decq %r12
 movq 8(%rbp),%r13
 xorq %r8,%r8
 movq 16(%rbp),%r14
 movq 24(%rbp),%r15
 jmp .Lsqrx4x_sub_entry
.cfi_endproc
.size mulx4x_internal,.-mulx4x_internal
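# MULX/ADX flavour of bn_power5: five squarings (__bn_sqrx8x_internal) and
# one gathered multiplication per 5-bit exponent window.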
2786.type bn_powerx5,@function
2787.align 32
2788bn_powerx5:
2789.cfi_startproc
2790 movq %rsp,%rax
2791.cfi_def_cfa_register %rax
2792.Lpowerx5_enter:
2793 pushq %rbx
2794.cfi_offset %rbx,-16
2795 pushq %rbp
2796.cfi_offset %rbp,-24
2797 pushq %r12
2798.cfi_offset %r12,-32
2799 pushq %r13
2800.cfi_offset %r13,-40
2801 pushq %r14
2802.cfi_offset %r14,-48
2803 pushq %r15
2804.cfi_offset %r15,-56
2805.Lpowerx5_prologue:
2806
2807 shll $3,%r9d
2808 leaq (%r9,%r9,2),%r10
2809 negq %r9
2810 movq (%r8),%r8
2811
2812
2813
2814
2815
2816
2817
2818
2819 leaq -320(%rsp,%r9,2),%r11
2820 movq %rsp,%rbp
2821 subq %rdi,%r11
2822 andq $4095,%r11
2823 cmpq %r11,%r10
2824 jb .Lpwrx_sp_alt
2825 subq %r11,%rbp
2826 leaq -320(%rbp,%r9,2),%rbp
2827 jmp .Lpwrx_sp_done
2828
2829.align 32
2830.Lpwrx_sp_alt:
2831 leaq 4096-320(,%r9,2),%r10
2832 leaq -320(%rbp,%r9,2),%rbp
2833 subq %r10,%r11
2834 movq $0,%r10
2835 cmovcq %r10,%r11
2836 subq %r11,%rbp
2837.Lpwrx_sp_done:
2838 andq $-64,%rbp
2839 movq %rsp,%r11
2840 subq %rbp,%r11
2841 andq $-4096,%r11
2842 leaq (%r11,%rbp,1),%rsp
2843 movq (%rsp),%r10
2844 cmpq %rbp,%rsp
2845 ja .Lpwrx_page_walk
2846 jmp .Lpwrx_page_walk_done
2847
2848.Lpwrx_page_walk:
2849 leaq -4096(%rsp),%rsp
2850 movq (%rsp),%r10
2851 cmpq %rbp,%rsp
2852 ja .Lpwrx_page_walk
2853.Lpwrx_page_walk_done:
2854
2855 movq %r9,%r10
2856 negq %r9
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869 pxor %xmm0,%xmm0
2870.byte 102,72,15,110,207
2871.byte 102,72,15,110,209
2872.byte 102,73,15,110,218
2873.byte 102,72,15,110,226
2874 movq %r8,32(%rsp)
2875 movq %rax,40(%rsp)
2876.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2877.Lpowerx5_body:

 call __bn_sqrx8x_internal
 call __bn_postx4x_internal
 call __bn_sqrx8x_internal
 call __bn_postx4x_internal
 call __bn_sqrx8x_internal
 call __bn_postx4x_internal
 call __bn_sqrx8x_internal
 call __bn_postx4x_internal
 call __bn_sqrx8x_internal
 call __bn_postx4x_internal

 movq %r10,%r9
 movq %rsi,%rdi
.byte 102,72,15,126,209 /* movq %xmm2,%rcx */
.byte 102,72,15,126,226 /* movq %xmm4,%rdx */
 movq 40(%rsp),%rax

 call mulx4x_internal

 movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
 movq $1,%rax

 movq -48(%rsi),%r15
.cfi_restore %r15
 movq -40(%rsi),%r14
.cfi_restore %r14
 movq -32(%rsi),%r13
.cfi_restore %r13
 movq -24(%rsi),%r12
.cfi_restore %r12
 movq -16(%rsi),%rbp
.cfi_restore %rbp
 movq -8(%rsi),%rbx
.cfi_restore %rbx
 leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lpowerx5_epilogue:
 .byte 0xf3,0xc3 /* rep ret */
.cfi_endproc
.size bn_powerx5,.-bn_powerx5
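
/* bn_sqrx8x_internal: MULX/ADCX/ADOX Montgomery squaring. The off-diagonal
   products a[i]*a[j] (i < j) are accumulated eight limbs per outer pass,
   then doubled and combined with the diagonal squares a[i]^2
   (.Lsqrx4x_shift_n_add), and finally the 2*num-limb result is
   Montgomery-reduced in place (__bn_sqrx8x_reduction). */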

.globl bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.type bn_sqrx8x_internal,@function
.align 32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
.cfi_startproc








































 leaq 48+8(%rsp),%rdi
 leaq (%rsi,%r9,1),%rbp
 movq %r9,0+8(%rsp)
 movq %rbp,8+8(%rsp)
 jmp .Lsqr8x_zero_start

.align 32
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 /* multi-byte nop */
.Lsqrx8x_zero:
.byte 0x3e /* padding */
 movdqa %xmm0,0(%rdi)
 movdqa %xmm0,16(%rdi)
 movdqa %xmm0,32(%rdi)
 movdqa %xmm0,48(%rdi)
.Lsqr8x_zero_start:
 movdqa %xmm0,64(%rdi)
 movdqa %xmm0,80(%rdi)
 movdqa %xmm0,96(%rdi)
 movdqa %xmm0,112(%rdi)
 leaq 128(%rdi),%rdi
 subq $64,%r9
 jnz .Lsqrx8x_zero

 movq 0(%rsi),%rdx

 xorq %r10,%r10
 xorq %r11,%r11
 xorq %r12,%r12
 xorq %r13,%r13
 xorq %r14,%r14
 xorq %r15,%r15
 leaq 48+8(%rsp),%rdi
 xorq %rbp,%rbp
 jmp .Lsqrx8x_outer_loop
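
/* Outer pass: cross products of the current eight source limbs against
   the rest of the vector; in this phase %rbp is kept at zero and serves
   as a carry sink for the adcx/adox chains. */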

.align 32
.Lsqrx8x_outer_loop:
 mulxq 8(%rsi),%r8,%rax
 adcxq %r9,%r8
 adoxq %rax,%r10
 mulxq 16(%rsi),%r9,%rax
 adcxq %r10,%r9
 adoxq %rax,%r11
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 /* mulxq 24(%rsi),%r10,%rax */
 adcxq %r11,%r10
 adoxq %rax,%r12
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 /* mulxq 32(%rsi),%r11,%rax */
 adcxq %r12,%r11
 adoxq %rax,%r13
 mulxq 40(%rsi),%r12,%rax
 adcxq %r13,%r12
 adoxq %rax,%r14
 mulxq 48(%rsi),%r13,%rax
 adcxq %r14,%r13
 adoxq %r15,%rax
 mulxq 56(%rsi),%r14,%r15
 movq 8(%rsi),%rdx
 adcxq %rax,%r14
 adoxq %rbp,%r15
 adcq 64(%rdi),%r15
 movq %r8,8(%rdi)
 movq %r9,16(%rdi)
 sbbq %rcx,%rcx
 xorq %rbp,%rbp


 mulxq 16(%rsi),%r8,%rbx
 mulxq 24(%rsi),%r9,%rax
 adcxq %r10,%r8
 adoxq %rbx,%r9
 mulxq 32(%rsi),%r10,%rbx
 adcxq %r11,%r9
 adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 /* mulxq 40(%rsi),%r11,%rax */
 adcxq %r12,%r10
 adoxq %rbx,%r11
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 /* mulxq 48(%rsi),%r12,%rbx */
 adcxq %r13,%r11
 adoxq %r14,%r12
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 /* mulxq 56(%rsi),%r13,%r14 */
 movq 16(%rsi),%rdx
 adcxq %rax,%r12
 adoxq %rbx,%r13
 adcxq %r15,%r13
 adoxq %rbp,%r14
 adcxq %rbp,%r14

 movq %r8,24(%rdi)
 movq %r9,32(%rdi)

 mulxq 24(%rsi),%r8,%rbx
 mulxq 32(%rsi),%r9,%rax
 adcxq %r10,%r8
 adoxq %rbx,%r9
 mulxq 40(%rsi),%r10,%rbx
 adcxq %r11,%r9
 adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 /* mulxq 48(%rsi),%r11,%rax */
 adcxq %r12,%r10
 adoxq %r13,%r11
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 /* mulxq 56(%rsi),%r12,%r13 */
.byte 0x3e /* padding */
 movq 24(%rsi),%rdx
 adcxq %rbx,%r11
 adoxq %rax,%r12
 adcxq %r14,%r12
 movq %r8,40(%rdi)
 movq %r9,48(%rdi)
 mulxq 32(%rsi),%r8,%rax
 adoxq %rbp,%r13
 adcxq %rbp,%r13

 mulxq 40(%rsi),%r9,%rbx
 adcxq %r10,%r8
 adoxq %rax,%r9
 mulxq 48(%rsi),%r10,%rax
 adcxq %r11,%r9
 adoxq %r12,%r10
 mulxq 56(%rsi),%r11,%r12
 movq 32(%rsi),%rdx
 movq 40(%rsi),%r14
 adcxq %rbx,%r10
 adoxq %rax,%r11
 movq 48(%rsi),%r15
 adcxq %r13,%r11
 adoxq %rbp,%r12
 adcxq %rbp,%r12

 movq %r8,56(%rdi)
 movq %r9,64(%rdi)

 mulxq %r14,%r9,%rax
 movq 56(%rsi),%r8
 adcxq %r10,%r9
 mulxq %r15,%r10,%rbx
 adoxq %rax,%r10
 adcxq %r11,%r10
 mulxq %r8,%r11,%rax
 movq %r14,%rdx
 adoxq %rbx,%r11
 adcxq %r12,%r11

 adcxq %rbp,%rax

 mulxq %r15,%r14,%rbx
 mulxq %r8,%r12,%r13
 movq %r15,%rdx
 leaq 64(%rsi),%rsi
 adcxq %r14,%r11
 adoxq %rbx,%r12
 adcxq %rax,%r12
 adoxq %rbp,%r13

.byte 0x67,0x67 /* padding */
 mulxq %r8,%r8,%r14
 adcxq %r8,%r13
 adcxq %rbp,%r14

 cmpq 8+8(%rsp),%rsi
 je .Lsqrx8x_outer_break

 negq %rcx
 movq $-8,%rcx
 movq %rbp,%r15
 movq 64(%rdi),%r8
 adcxq 72(%rdi),%r9
 adcxq 80(%rdi),%r10
 adcxq 88(%rdi),%r11
 adcq 96(%rdi),%r12
 adcq 104(%rdi),%r13
 adcq 112(%rdi),%r14
 adcq 120(%rdi),%r15
 leaq (%rsi),%rbp
 leaq 128(%rdi),%rdi
 sbbq %rax,%rax

 movq -64(%rsi),%rdx
 movq %rax,16+8(%rsp)
 movq %rdi,24+8(%rsp)


 xorl %eax,%eax
 jmp .Lsqrx8x_loop

.align 32
.Lsqrx8x_loop:
 movq %r8,%rbx
 mulxq 0(%rbp),%rax,%r8
 adcxq %rax,%rbx
 adoxq %r9,%r8

 mulxq 8(%rbp),%rax,%r9
 adcxq %rax,%r8
 adoxq %r10,%r9

 mulxq 16(%rbp),%rax,%r10
 adcxq %rax,%r9
 adoxq %r11,%r10

 mulxq 24(%rbp),%rax,%r11
 adcxq %rax,%r10
 adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 /* mulxq 32(%rbp),%rax,%r12 */
 adcxq %rax,%r11
 adoxq %r13,%r12

 mulxq 40(%rbp),%rax,%r13
 adcxq %rax,%r12
 adoxq %r14,%r13

 mulxq 48(%rbp),%rax,%r14
 movq %rbx,(%rdi,%rcx,8)
 movl $0,%ebx
 adcxq %rax,%r13
 adoxq %r15,%r14

.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 /* mulxq 56(%rbp),%rax,%r15 */
 movq 8(%rsi,%rcx,8),%rdx
 adcxq %rax,%r14
 adoxq %rbx,%r15
 adcxq %rbx,%r15

.byte 0x67 /* padding */
 incq %rcx
 jnz .Lsqrx8x_loop

 leaq 64(%rbp),%rbp
 movq $-8,%rcx
 cmpq 8+8(%rsp),%rbp
 je .Lsqrx8x_break

 subq 16+8(%rsp),%rbx
.byte 0x66 /* padding */
 movq -64(%rsi),%rdx
 adcxq 0(%rdi),%r8
 adcxq 8(%rdi),%r9
 adcq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 leaq 64(%rdi),%rdi
.byte 0x67 /* padding */
 sbbq %rax,%rax
 xorl %ebx,%ebx
 movq %rax,16+8(%rsp)
 jmp .Lsqrx8x_loop

.align 32
.Lsqrx8x_break:
 xorq %rbp,%rbp
 subq 16+8(%rsp),%rbx
 adcxq %rbp,%r8
 movq 24+8(%rsp),%rcx
 adcxq %rbp,%r9
 movq 0(%rsi),%rdx
 adcq $0,%r10
 movq %r8,0(%rdi)
 adcq $0,%r11
 adcq $0,%r12
 adcq $0,%r13
 adcq $0,%r14
 adcq $0,%r15
 cmpq %rcx,%rdi
 je .Lsqrx8x_outer_loop

 movq %r9,8(%rdi)
 movq 8(%rcx),%r9
 movq %r10,16(%rdi)
 movq 16(%rcx),%r10
 movq %r11,24(%rdi)
 movq 24(%rcx),%r11
 movq %r12,32(%rdi)
 movq 32(%rcx),%r12
 movq %r13,40(%rdi)
 movq 40(%rcx),%r13
 movq %r14,48(%rdi)
 movq 48(%rcx),%r14
 movq %r15,56(%rdi)
 movq 56(%rcx),%r15
 movq %rcx,%rdi
 jmp .Lsqrx8x_outer_loop

.align 32
.Lsqrx8x_outer_break:
 movq %r9,72(%rdi)
.byte 102,72,15,126,217 /* movq %xmm3,%rcx */
 movq %r10,80(%rdi)
 movq %r11,88(%rdi)
 movq %r12,96(%rdi)
 movq %r13,104(%rdi)
 movq %r14,112(%rdi)
 leaq 48+8(%rsp),%rdi
 movq (%rsi,%rcx,1),%rdx

 movq 8(%rdi),%r11
 xorq %r10,%r10
 movq 0+8(%rsp),%r9
 adoxq %r11,%r11
 movq 16(%rdi),%r12
 movq 24(%rdi),%r13


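/* Shift-and-add: double the accumulated cross products (each limb adox'd
   with itself) and fold in the diagonal squares a[i]*a[i], four source
   limbs (eight result limbs) per iteration. */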
.align 32
.Lsqrx4x_shift_n_add:
 mulxq %rdx,%rax,%rbx
 adoxq %r12,%r12
 adcxq %r10,%rax
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 /* movq 8(%rsi,%rcx,1),%rdx */
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 /* movq 32(%rdi),%r10 */
 adoxq %r13,%r13
 adcxq %r11,%rbx
 movq 40(%rdi),%r11
 movq %rax,0(%rdi)
 movq %rbx,8(%rdi)

 mulxq %rdx,%rax,%rbx
 adoxq %r10,%r10
 adcxq %r12,%rax
 movq 16(%rsi,%rcx,1),%rdx
 movq 48(%rdi),%r12
 adoxq %r11,%r11
 adcxq %r13,%rbx
 movq 56(%rdi),%r13
 movq %rax,16(%rdi)
 movq %rbx,24(%rdi)

 mulxq %rdx,%rax,%rbx
 adoxq %r12,%r12
 adcxq %r10,%rax
 movq 24(%rsi,%rcx,1),%rdx
 leaq 32(%rcx),%rcx
 movq 64(%rdi),%r10
 adoxq %r13,%r13
 adcxq %r11,%rbx
 movq 72(%rdi),%r11
 movq %rax,32(%rdi)
 movq %rbx,40(%rdi)

 mulxq %rdx,%rax,%rbx
 adoxq %r10,%r10
 adcxq %r12,%rax
 jrcxz .Lsqrx4x_shift_n_add_break
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 /* movq 0(%rsi,%rcx,1),%rdx */
 adoxq %r11,%r11
 adcxq %r13,%rbx
 movq 80(%rdi),%r12
 movq 88(%rdi),%r13
 movq %rax,48(%rdi)
 movq %rbx,56(%rdi)
 leaq 64(%rdi),%rdi
 nop
 jmp .Lsqrx4x_shift_n_add

.align 32
.Lsqrx4x_shift_n_add_break:
 adcxq %r13,%rbx
 movq %rax,48(%rdi)
 movq %rbx,56(%rdi)
 leaq 64(%rdi),%rdi
.byte 102,72,15,126,213 /* movq %xmm2,%rbp */
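/* __bn_sqrx8x_reduction: Montgomery reduction of the 2*num-limb square.
   For each 8-limb group the multiplier is formed by imul with n0 (kept at
   32+8(%rsp)) and the corresponding multiple of the modulus is accumulated;
   the carries are then propagated through the upper half in .Lsqrx8x_tail. */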
__bn_sqrx8x_reduction:
 xorl %eax,%eax
 movq 32+8(%rsp),%rbx
 movq 48+8(%rsp),%rdx
 leaq -64(%rbp,%r9,1),%rcx

 movq %rcx,0+8(%rsp)
 movq %rdi,8+8(%rsp)

 leaq 48+8(%rsp),%rdi
 jmp .Lsqrx8x_reduction_loop

.align 32
.Lsqrx8x_reduction_loop:
 movq 8(%rdi),%r9
 movq 16(%rdi),%r10
 movq 24(%rdi),%r11
 movq 32(%rdi),%r12
 movq %rdx,%r8
 imulq %rbx,%rdx
 movq 40(%rdi),%r13
 movq 48(%rdi),%r14
 movq 56(%rdi),%r15
 movq %rax,24+8(%rsp)

 leaq 64(%rdi),%rdi
 xorq %rsi,%rsi
 movq $-8,%rcx
 jmp .Lsqrx8x_reduce

.align 32
.Lsqrx8x_reduce:
 movq %r8,%rbx
 mulxq 0(%rbp),%rax,%r8
 adcxq %rbx,%rax
 adoxq %r9,%r8

 mulxq 8(%rbp),%rbx,%r9
 adcxq %rbx,%r8
 adoxq %r10,%r9

 mulxq 16(%rbp),%rbx,%r10
 adcxq %rbx,%r9
 adoxq %r11,%r10

 mulxq 24(%rbp),%rbx,%r11
 adcxq %rbx,%r10
 adoxq %r12,%r11

.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 /* mulxq 32(%rbp),%rbx,%r12 */
 movq %rdx,%rax
 movq %r8,%rdx
 adcxq %rbx,%r11
 adoxq %r13,%r12

 mulxq 32+8(%rsp),%rbx,%rdx
 movq %rax,%rdx
 movq %rax,64+48+8(%rsp,%rcx,8)

 mulxq 40(%rbp),%rax,%r13
 adcxq %rax,%r12
 adoxq %r14,%r13

 mulxq 48(%rbp),%rax,%r14
 adcxq %rax,%r13
 adoxq %r15,%r14

 mulxq 56(%rbp),%rax,%r15
 movq %rbx,%rdx
 adcxq %rax,%r14
 adoxq %rsi,%r15
 adcxq %rsi,%r15

.byte 0x67,0x67,0x67 /* padding */
 incq %rcx
 jnz .Lsqrx8x_reduce

 movq %rsi,%rax
 cmpq 0+8(%rsp),%rbp
 jae .Lsqrx8x_no_tail

 movq 48+8(%rsp),%rdx
 addq 0(%rdi),%r8
 leaq 64(%rbp),%rbp
 movq $-8,%rcx
 adcxq 8(%rdi),%r9
 adcxq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 leaq 64(%rdi),%rdi
 sbbq %rax,%rax

 xorq %rsi,%rsi
 movq %rax,16+8(%rsp)
 jmp .Lsqrx8x_tail

.align 32
.Lsqrx8x_tail:
 movq %r8,%rbx
 mulxq 0(%rbp),%rax,%r8
 adcxq %rax,%rbx
 adoxq %r9,%r8

 mulxq 8(%rbp),%rax,%r9
 adcxq %rax,%r8
 adoxq %r10,%r9

 mulxq 16(%rbp),%rax,%r10
 adcxq %rax,%r9
 adoxq %r11,%r10

 mulxq 24(%rbp),%rax,%r11
 adcxq %rax,%r10
 adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 /* mulxq 32(%rbp),%rax,%r12 */
 adcxq %rax,%r11
 adoxq %r13,%r12

 mulxq 40(%rbp),%rax,%r13
 adcxq %rax,%r12
 adoxq %r14,%r13

 mulxq 48(%rbp),%rax,%r14
 adcxq %rax,%r13
 adoxq %r15,%r14

 mulxq 56(%rbp),%rax,%r15
 movq 72+48+8(%rsp,%rcx,8),%rdx
 adcxq %rax,%r14
 adoxq %rsi,%r15
 movq %rbx,(%rdi,%rcx,8)
 movq %r8,%rbx
 adcxq %rsi,%r15

 incq %rcx
 jnz .Lsqrx8x_tail

 cmpq 0+8(%rsp),%rbp
 jae .Lsqrx8x_tail_done

 subq 16+8(%rsp),%rsi
 movq 48+8(%rsp),%rdx
 leaq 64(%rbp),%rbp
 adcq 0(%rdi),%r8
 adcq 8(%rdi),%r9
 adcq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 leaq 64(%rdi),%rdi
 sbbq %rax,%rax
 subq $8,%rcx

 xorq %rsi,%rsi
 movq %rax,16+8(%rsp)
 jmp .Lsqrx8x_tail

.align 32
.Lsqrx8x_tail_done:
 xorq %rax,%rax
 addq 24+8(%rsp),%r8
 adcq $0,%r9
 adcq $0,%r10
 adcq $0,%r11
 adcq $0,%r12
 adcq $0,%r13
 adcq $0,%r14
 adcq $0,%r15
 adcq $0,%rax

 subq 16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
 adcq 0(%rdi),%r8
.byte 102,72,15,126,217 /* movq %xmm3,%rcx */
 adcq 8(%rdi),%r9
 movq 56(%rbp),%rsi
.byte 102,72,15,126,213 /* movq %xmm2,%rbp */
 adcq 16(%rdi),%r10
 adcq 24(%rdi),%r11
 adcq 32(%rdi),%r12
 adcq 40(%rdi),%r13
 adcq 48(%rdi),%r14
 adcq 56(%rdi),%r15
 adcq $0,%rax

 movq 32+8(%rsp),%rbx
 movq 64(%rdi,%rcx,1),%rdx

 movq %r8,0(%rdi)
 leaq 64(%rdi),%r8
 movq %r9,8(%rdi)
 movq %r10,16(%rdi)
 movq %r11,24(%rdi)
 movq %r12,32(%rdi)
 movq %r13,40(%rdi)
 movq %r14,48(%rdi)
 movq %r15,56(%rdi)

 leaq 64(%rdi,%rcx,1),%rdi
 cmpq 8+8(%rsp),%r8
 jb .Lsqrx8x_reduction_loop
 .byte 0xf3,0xc3 /* rep ret */
.cfi_endproc
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
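
/* __bn_postx4x_internal: constant-time conditional final subtraction.
   The top-word carry in %rax is turned into an all-zero/all-one mask so
   that ANDN selects either the (complemented) modulus limbs or zero, and
   the subtraction proceeds four limbs per iteration without branching,
   storing the result to rp (restored from %xmm1 below). */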
.align 32
.type __bn_postx4x_internal,@function
__bn_postx4x_internal:
.cfi_startproc
 movq 0(%rbp),%r12
 movq %rcx,%r10
 movq %rcx,%r9
 negq %rax
 sarq $3+2,%rcx

.byte 102,72,15,126,202 /* movq %xmm1,%rdx */
.byte 102,72,15,126,206 /* movq %xmm1,%rsi */
 decq %r12
 movq 8(%rbp),%r13
 xorq %r8,%r8
 movq 16(%rbp),%r14
 movq 24(%rbp),%r15
 jmp .Lsqrx4x_sub_entry

.align 16
.Lsqrx4x_sub:
 movq 0(%rbp),%r12
 movq 8(%rbp),%r13
 movq 16(%rbp),%r14
 movq 24(%rbp),%r15
.Lsqrx4x_sub_entry:
 andnq %rax,%r12,%r12
 leaq 32(%rbp),%rbp
 andnq %rax,%r13,%r13
 andnq %rax,%r14,%r14
 andnq %rax,%r15,%r15

 negq %r8
 adcq 0(%rdi),%r12
 adcq 8(%rdi),%r13
 adcq 16(%rdi),%r14
 adcq 24(%rdi),%r15
 movq %r12,0(%rdx)
 leaq 32(%rdi),%rdi
 movq %r13,8(%rdx)
 sbbq %r8,%r8
 movq %r14,16(%rdx)
 movq %r15,24(%rdx)
 leaq 32(%rdx),%rdx

 incq %rcx
 jnz .Lsqrx4x_sub

 negq %r9

 .byte 0xf3,0xc3 /* rep ret */
.cfi_endproc
.size __bn_postx4x_internal,.-__bn_postx4x_internal
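
/* bn_scatter5(inp, num, table, power): store the num limbs of inp into the
   table starting at table+power*8 with a 256-byte stride, interleaving the
   32 window entries so that a subsequent gather reads every entry. */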
.globl bn_scatter5
.hidden bn_scatter5
.type bn_scatter5,@function
.align 16
bn_scatter5:
.cfi_startproc
 cmpl $0,%esi
 jz .Lscatter_epilogue
 leaq (%rdx,%rcx,8),%rdx
.Lscatter:
 movq (%rdi),%rax
 leaq 8(%rdi),%rdi
 movq %rax,(%rdx)
 leaq 256(%rdx),%rdx
 subl $1,%esi
 jnz .Lscatter
.Lscatter_epilogue:
 .byte 0xf3,0xc3 /* rep ret */
.cfi_endproc
.size bn_scatter5,.-bn_scatter5
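
/* bn_gather5(out, num, table, power): constant-time table lookup. SSE2
   comparison masks for the requested power are built on the stack (using
   .Linc), then each limb is assembled by AND/OR-ing across all 32 table
   entries so the memory access pattern is independent of `power`. */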

.globl bn_gather5
.hidden bn_gather5
.type bn_gather5,@function
.align 32
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:

.byte 0x4c,0x8d,0x14,0x24 /* leaq (%rsp),%r10 */
.cfi_def_cfa_register %r10
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 /* subq $0x108,%rsp */
 leaq .Linc(%rip),%rax
 andq $-16,%rsp

 movd %ecx,%xmm5
 movdqa 0(%rax),%xmm0
 movdqa 16(%rax),%xmm1
 leaq 128(%rdx),%r11
 leaq 128(%rsp),%rax

 pshufd $0,%xmm5,%xmm5
 movdqa %xmm1,%xmm4
 movdqa %xmm1,%xmm2
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm4,%xmm3

 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,-128(%rax)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,-112(%rax)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,-96(%rax)
 movdqa %xmm4,%xmm2
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,-80(%rax)
 movdqa %xmm4,%xmm3

 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,-64(%rax)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,-48(%rax)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,-32(%rax)
 movdqa %xmm4,%xmm2
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,-16(%rax)
 movdqa %xmm4,%xmm3

 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,0(%rax)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,16(%rax)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,32(%rax)
 movdqa %xmm4,%xmm2
 paddd %xmm0,%xmm1
 pcmpeqd %xmm5,%xmm0
 movdqa %xmm3,48(%rax)
 movdqa %xmm4,%xmm3

 paddd %xmm1,%xmm2
 pcmpeqd %xmm5,%xmm1
 movdqa %xmm0,64(%rax)
 movdqa %xmm4,%xmm0

 paddd %xmm2,%xmm3
 pcmpeqd %xmm5,%xmm2
 movdqa %xmm1,80(%rax)
 movdqa %xmm4,%xmm1

 paddd %xmm3,%xmm0
 pcmpeqd %xmm5,%xmm3
 movdqa %xmm2,96(%rax)
 movdqa %xmm4,%xmm2
 movdqa %xmm3,112(%rax)
 jmp .Lgather

.align 32
.Lgather:
 pxor %xmm4,%xmm4
 pxor %xmm5,%xmm5
 movdqa -128(%r11),%xmm0
 movdqa -112(%r11),%xmm1
 movdqa -96(%r11),%xmm2
 pand -128(%rax),%xmm0
 movdqa -80(%r11),%xmm3
 pand -112(%rax),%xmm1
 por %xmm0,%xmm4
 pand -96(%rax),%xmm2
 por %xmm1,%xmm5
 pand -80(%rax),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa -64(%r11),%xmm0
 movdqa -48(%r11),%xmm1
 movdqa -32(%r11),%xmm2
 pand -64(%rax),%xmm0
 movdqa -16(%r11),%xmm3
 pand -48(%rax),%xmm1
 por %xmm0,%xmm4
 pand -32(%rax),%xmm2
 por %xmm1,%xmm5
 pand -16(%rax),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 0(%r11),%xmm0
 movdqa 16(%r11),%xmm1
 movdqa 32(%r11),%xmm2
 pand 0(%rax),%xmm0
 movdqa 48(%r11),%xmm3
 pand 16(%rax),%xmm1
 por %xmm0,%xmm4
 pand 32(%rax),%xmm2
 por %xmm1,%xmm5
 pand 48(%rax),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 movdqa 64(%r11),%xmm0
 movdqa 80(%r11),%xmm1
 movdqa 96(%r11),%xmm2
 pand 64(%rax),%xmm0
 movdqa 112(%r11),%xmm3
 pand 80(%rax),%xmm1
 por %xmm0,%xmm4
 pand 96(%rax),%xmm2
 por %xmm1,%xmm5
 pand 112(%rax),%xmm3
 por %xmm2,%xmm4
 por %xmm3,%xmm5
 por %xmm5,%xmm4
 leaq 256(%r11),%r11
 pshufd $0x4e,%xmm4,%xmm0
 por %xmm4,%xmm0
 movq %xmm0,(%rdi)
 leaq 8(%rdi),%rdi
 subl $1,%esi
 jnz .Lgather

 leaq (%r10),%rsp
.cfi_def_cfa_register %rsp
 .byte 0xf3,0xc3 /* rep ret */
.LSEH_end_bn_gather5:
.cfi_endproc
.size bn_gather5,.-bn_gather5
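
/* .Linc holds the {0,1} and {2,2} dword pairs used to step the per-power
   comparison masks built above. The trailing .byte string is the ASCII
   identifier "Montgomery Multiplication with scatter/gather for x86_64,
   CRYPTOGAMS by <appro@openssl.org>". */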
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif
