1 | # This file is generated from a similarly-named Perl script in the BoringSSL |
2 | # source tree. Do not edit by hand. |
3 | |
4 | #if defined(__has_feature) |
5 | #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) |
6 | #define OPENSSL_NO_ASM |
7 | #endif |
8 | #endif |
9 | |
10 | #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) |
11 | #if defined(BORINGSSL_PREFIX) |
12 | #include <boringssl_prefix_symbols_asm.h> |
13 | #endif |
14 | .text |
15 | |
16 | .extern OPENSSL_ia32cap_P |
17 | .hidden OPENSSL_ia32cap_P |
18 | |
.globl bn_mul_mont_gather5
.hidden bn_mul_mont_gather5
.type bn_mul_mont_gather5,@function
.align 64
/*
 * bn_mul_mont_gather5(rp=%rdi, ap=%rsi, table=%rdx, np=%rcx, n0=%r8,
 *                     num=%r9, power=8(%rsp))
 * NOTE(review): argument roles inferred from the OpenSSL bn_mul_mont
 * calling convention and this file's register usage -- confirm against
 * the C prototype.
 *
 * Montgomery multiplication where the multiplier operand is fetched from
 * a precomputed table with a cache-timing-safe "gather": all 32 table
 * lines are loaded and combined under equality masks, so the memory
 * access pattern does not depend on the secret index "power".
 */
bn_mul_mont_gather5:
.cfi_startproc
movl %r9d,%r9d	/* zero-extend num to 64 bits */
movq %rsp,%rax	/* remember caller's rsp (frame handle for epilogue/CFI) */
.cfi_def_cfa_register %rax
testl $7,%r9d
jnz .Lmul_enter	/* num not a multiple of 8: take the generic 1-way path */
leaq OPENSSL_ia32cap_P(%rip),%r11
movl 8(%r11),%r11d	/* capability word (CPUID.7:EBX) for the 4x/MULX dispatch */
jmp .Lmul4x_enter

.align 16
.Lmul_enter:
movd 8(%rsp),%xmm5	/* xmm5 = power (7th argument, passed on the stack) */
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56

/* Reserve a 1024-byte-aligned scratch frame of 8*num+280 bytes. */
negq %r9
movq %rsp,%r11
leaq -280(%rsp,%r9,8),%r10
negq %r9
andq $-1024,%r10

/*
 * Lower rsp onto the new frame, then touch each intervening 4K page so
 * the OS stack guard page is hit in order (no probe skipping).
 */
subq %r10,%r11
andq $-4096,%r11
leaq (%r10,%r11,1),%rsp
movq (%rsp),%r11	/* probe */
cmpq %r10,%rsp
ja .Lmul_page_walk
jmp .Lmul_page_walk_done

.Lmul_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r11	/* probe */
cmpq %r10,%rsp
ja .Lmul_page_walk
.Lmul_page_walk_done:

leaq .Linc(%rip),%r10	/* lane-index step constants (defined elsewhere in this file) */
movq %rax,8(%rsp,%r9,8)	/* saved caller rsp at the top of the frame */
.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
/* ^ DWARF expression: CFA = *(%rsp + 8*%r9 + 8) + 8 */
.Lmul_body:

/*
 * Build 16 xmm equality masks at 112(%r10).. (mask[i] is all-ones iff
 * its lane index equals power): step a lane counter through
 * xmm0..xmm3 by xmm4 and pcmpeqd against the broadcast index in xmm5.
 */
leaq 128(%rdx),%r12	/* r12 = table+128; lines addressed at -128(%r12)..112(%r12) */
movdqa 0(%r10),%xmm0
movdqa 16(%r10),%xmm1
leaq 24-112(%rsp,%r9,8),%r10
andq $-16,%r10

pshufd $0,%xmm5,%xmm5	/* broadcast power to all four dwords */
movdqa %xmm1,%xmm4
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
.byte 0x67	/* addr-size prefix used as alignment padding */
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,112(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,128(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,144(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,160(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,176(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,192(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,208(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,224(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,240(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,256(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,272(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,288(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,304(%r10)

paddd %xmm2,%xmm3
.byte 0x67	/* alignment padding */
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,320(%r10)

pcmpeqd %xmm5,%xmm3
movdqa %xmm2,336(%r10)
/* Gather the selected word: AND every table line with its mask, OR all. */
pand 64(%r12),%xmm0

pand 80(%r12),%xmm1
pand 96(%r12),%xmm2
movdqa %xmm3,352(%r10)
pand 112(%r12),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -128(%r12),%xmm4
movdqa -112(%r12),%xmm5
movdqa -96(%r12),%xmm2
pand 112(%r10),%xmm4
movdqa -80(%r12),%xmm3
pand 128(%r10),%xmm5
por %xmm4,%xmm0
pand 144(%r10),%xmm2
por %xmm5,%xmm1
pand 160(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -64(%r12),%xmm4
movdqa -48(%r12),%xmm5
movdqa -32(%r12),%xmm2
pand 176(%r10),%xmm4
movdqa -16(%r12),%xmm3
pand 192(%r10),%xmm5
por %xmm4,%xmm0
pand 208(%r10),%xmm2
por %xmm5,%xmm1
pand 224(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa 0(%r12),%xmm4
movdqa 16(%r12),%xmm5
movdqa 32(%r12),%xmm2
pand 240(%r10),%xmm4
movdqa 48(%r12),%xmm3
pand 256(%r10),%xmm5
por %xmm4,%xmm0
pand 272(%r10),%xmm2
por %xmm5,%xmm1
pand 288(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
por %xmm1,%xmm0
pshufd $0x4e,%xmm0,%xmm1	/* fold high qword onto low */
por %xmm1,%xmm0
leaq 256(%r12),%r12	/* advance to the next table chunk */
.byte 102,72,15,126,195	/* movq %xmm0,%rbx -- rbx = gathered multiplier word */

movq (%r8),%r8	/* n0 = *n0p (Montgomery reduction constant) */
movq (%rsi),%rax	/* ap[0] */

xorq %r14,%r14	/* i = 0 (outer-loop counter) */
xorq %r15,%r15	/* j = 0 (inner index) */

/* First pass: tp = ap*b + m*np, m chosen so the low word cancels. */
movq %r8,%rbp
mulq %rbx	/* ap[0]*b */
movq %rax,%r10
movq (%rcx),%rax	/* np[0] */

imulq %r10,%rbp	/* m = lo(ap[0]*b) * n0 */
movq %rdx,%r11

mulq %rbp	/* np[0]*m */
addq %rax,%r10	/* low word becomes 0 mod 2^64 (carry only) */
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%r13

leaq 1(%r15),%r15
jmp .L1st_enter

.align 16
.L1st:	/* tp[j-1] = lo(ap[j-1]*b + np[j-1]*m + carries) */
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r13
movq %r10,%r11
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13

.L1st_enter:
mulq %rbx	/* ap[j]*b */
addq %rax,%r11
movq (%rcx,%r15,8),%rax	/* np[j] */
adcq $0,%rdx
leaq 1(%r15),%r15
movq %rdx,%r10

mulq %rbp	/* np[j]*m */
cmpq %r9,%r15
jne .L1st


/* Flush the last partial products and the top carry word. */
addq %rax,%r13
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11

xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)	/* tp[num] = final carry */

leaq 1(%r14),%r14
jmp .Louter
.align 16
.Louter:
/*
 * Outer loop: gather the next multiplier word b[i] using the masks that
 * were stored in the frame above, then tp = (tp + ap*b[i] + m*np)/2^64.
 */
leaq 24+128(%rsp,%r9,8),%rdx	/* rdx -> saved equality masks */
andq $-16,%rdx
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r12),%xmm0
movdqa -112(%r12),%xmm1
movdqa -96(%r12),%xmm2
movdqa -80(%r12),%xmm3
pand -128(%rdx),%xmm0
pand -112(%rdx),%xmm1
por %xmm0,%xmm4
pand -96(%rdx),%xmm2
por %xmm1,%xmm5
pand -80(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r12),%xmm0
movdqa -48(%r12),%xmm1
movdqa -32(%r12),%xmm2
movdqa -16(%r12),%xmm3
pand -64(%rdx),%xmm0
pand -48(%rdx),%xmm1
por %xmm0,%xmm4
pand -32(%rdx),%xmm2
por %xmm1,%xmm5
pand -16(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r12),%xmm0
movdqa 16(%r12),%xmm1
movdqa 32(%r12),%xmm2
movdqa 48(%r12),%xmm3
pand 0(%rdx),%xmm0
pand 16(%rdx),%xmm1
por %xmm0,%xmm4
pand 32(%rdx),%xmm2
por %xmm1,%xmm5
pand 48(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r12),%xmm0
movdqa 80(%r12),%xmm1
movdqa 96(%r12),%xmm2
movdqa 112(%r12),%xmm3
pand 64(%rdx),%xmm0
pand 80(%rdx),%xmm1
por %xmm0,%xmm4
pand 96(%rdx),%xmm2
por %xmm1,%xmm5
pand 112(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
pshufd $0x4e,%xmm4,%xmm0	/* fold high qword onto low */
por %xmm4,%xmm0
leaq 256(%r12),%r12

movq (%rsi),%rax	/* ap[0] */
.byte 102,72,15,126,195	/* movq %xmm0,%rbx -- rbx = b[i] */

xorq %r15,%r15	/* j = 0 */
movq %r8,%rbp
movq (%rsp),%r10	/* tp[0] */

mulq %rbx
addq %rax,%r10	/* tp[0] += ap[0]*b[i] */
movq (%rcx),%rax
adcq $0,%rdx

imulq %r10,%rbp	/* m = tp[0] * n0 */
movq %rdx,%r11

mulq %rbp
addq %rax,%r10	/* low word cancels */
movq 8(%rsi),%rax
adcq $0,%rdx
movq 8(%rsp),%r10	/* tp[1] */
movq %rdx,%r13

leaq 1(%r15),%r15
jmp .Linner_enter

.align 16
.Linner:	/* tp[j-1] = lo(tp[j] + ap[j]*b[i] + np[j]*m + carries) */
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13

.Linner_enter:
mulq %rbx	/* ap[j]*b[i] */
addq %rax,%r11
movq (%rcx,%r15,8),%rax	/* np[j] */
adcq $0,%rdx
addq %r11,%r10	/* += tp[j] */
movq %rdx,%r11
adcq $0,%r11
leaq 1(%r15),%r15

mulq %rbp	/* np[j]*m */
cmpq %r9,%r15
jne .Linner

/* Flush the final partial products and the running top word. */
addq %rax,%r13
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r9,8),%r10	/* previous top carry word */
adcq $0,%rdx
movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13

xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)	/* tp[num] = new top carry */

leaq 1(%r14),%r14
cmpq %r9,%r14
jb .Louter	/* next multiplier word until i == num */

/*
 * Final reduction: rp = tp - np (with borrow), then a constant-time
 * select between tp and tp-np based on the borrow mask.
 */
xorq %r14,%r14	/* clears CF for the first sbb */
movq (%rsp),%rax
leaq (%rsp),%rsi	/* rsi -> tp */
movq %r9,%r15
jmp .Lsub
.align 16
.Lsub: sbbq (%rcx,%r14,8),%rax
movq %rax,(%rdi,%r14,8)	/* rp[i] = tp[i] - np[i] - borrow */
movq 8(%rsi,%r14,8),%rax
leaq 1(%r14),%r14
decq %r15
jnz .Lsub

sbbq $0,%rax	/* include top carry word: rax = 0 or -1 (borrow mask) */
movq $-1,%rbx
xorq %rax,%rbx	/* rbx = ~mask */
xorq %r14,%r14
movq %r9,%r15

.Lcopy:	/* rp[i] = (rp[i] & ~mask) | (tp[i] & mask); also scrub tp */
movq (%rdi,%r14,8),%rcx
movq (%rsp,%r14,8),%rdx
andq %rbx,%rcx
andq %rax,%rdx
movq %r14,(%rsp,%r14,8)	/* overwrite secret scratch word */
orq %rcx,%rdx
movq %rdx,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy

/* Epilogue: recover caller rsp saved at top of frame, restore regs. */
movq 8(%rsp,%r9,8),%rsi
.cfi_def_cfa %rsi,8
movq $1,%rax	/* return 1 (success) */

movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
.byte 0xf3,0xc3	/* repz ret */
.cfi_endproc
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
.type bn_mul4x_mont_gather5,@function
.align 32
/*
 * 4-way unrolled Montgomery-multiply front end.  Reached either directly
 * or via the tail-jump from bn_mul_mont_gather5 (entering at
 * .Lmul4x_enter with %r11d = CPUID.7:EBX capability word).  Sets up a
 * 64-byte-aligned scratch frame -- nudged so its page offset avoids
 * aliasing with the output pointer %rdi -- and calls mul4x_internal.
 */
bn_mul4x_mont_gather5:
.cfi_startproc
.byte 0x67	/* addr-size prefix used as alignment padding */
movq %rsp,%rax	/* remember caller's rsp */
.cfi_def_cfa_register %rax
.Lmul4x_enter:
andl $0x80108,%r11d	/* BMI1|BMI2|ADX -- NOTE(review): per OpenSSL ia32cap layout, confirm */
cmpl $0x80108,%r11d
je .Lmulx4x_enter	/* all present: use the MULX/ADX implementation */
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lmul4x_prologue:

.byte 0x67	/* alignment padding */
shll $3,%r9d	/* r9 = num*8 (bytes) */
leaq (%r9,%r9,2),%r10	/* r10 = num*24 */
negq %r9	/* r9 = -num*8 */

/*
 * Choose a frame base roughly 2*num*8+320 bytes below rsp, adjusted so
 * its offset within a 4K page differs from %rdi's (avoids cache/page
 * aliasing between the scratch area and the output).
 */
leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmul4xsp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp .Lmul4xsp_done

.align 32
.Lmul4xsp_alt:	/* not enough headroom: bias by a whole page instead */
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lmul4xsp_done:
andq $-64,%rbp	/* 64-byte align the frame */
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10	/* probe */
cmpq %rbp,%rsp
ja .Lmul4x_page_walk
jmp .Lmul4x_page_walk_done

.Lmul4x_page_walk:	/* touch each 4K page down to the new frame */
leaq -4096(%rsp),%rsp
movq (%rsp),%r10	/* probe */
cmpq %rbp,%rsp
ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:

negq %r9	/* r9 = num*8 (positive again) */

movq %rax,40(%rsp)	/* saved caller rsp */
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
/* ^ DWARF expression: CFA = *(%rsp + 40) + 8 */
.Lmul4x_body:

call mul4x_internal

/* Epilogue: recover caller rsp, restore callee-saved registers. */
movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
movq $1,%rax	/* return 1 (success) */

movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
.byte 0xf3,0xc3	/* repz ret */
.cfi_endproc
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
565 | |
.type mul4x_internal,@function
.align 32
/*
 * mul4x_internal: 4-way unrolled Montgomery multiplication core, shared
 * by bn_mul4x_mont_gather5 and bn_power5.  Expects the callers' setup:
 * %r9 = num*8, %rax = caller's original rsp (so 8(%rax) is the stack
 * "power" argument), %rdx = gather table, and %rdi/%rsi/%rcx/%r8 as in
 * bn_mul_mont_gather5.  NOTE(review): inferred from the call sites in
 * this file -- confirm against the perlasm source.  Finishes by
 * tail-jumping to .Lsqr4x_sub_entry (defined further down in this file)
 * for the final conditional subtraction into rp.
 */
mul4x_internal:
.cfi_startproc
shlq $5,%r9	/* scale num for the table-extent computation */
movd 8(%rax),%xmm5	/* xmm5 = power argument from the caller's stack */
leaq .Linc(%rip),%rax	/* lane-index step constants */
leaq 128(%rdx,%r9,1),%r13	/* r13 = end-of-table sentinel (outer-loop bound) */
shrq $5,%r9	/* restore num*8 */
movdqa 0(%rax),%xmm0
movdqa 16(%rax),%xmm1
leaq 88-112(%rsp,%r9,1),%r10	/* mask scratch area */
leaq 128(%rdx),%r12	/* r12 = table+128; lines at -128(%r12)..112(%r12) */

/*
 * Build the 16 constant-time equality masks at 112(%r10).., exactly as
 * in bn_mul_mont_gather5 above.
 */
pshufd $0,%xmm5,%xmm5	/* broadcast power */
movdqa %xmm1,%xmm4
.byte 0x67,0x67	/* alignment padding */
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
.byte 0x67	/* alignment padding */
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,112(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,128(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,144(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,160(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,176(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,192(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,208(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,224(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,240(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,256(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,272(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,288(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,304(%r10)

paddd %xmm2,%xmm3
.byte 0x67	/* alignment padding */
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,320(%r10)

pcmpeqd %xmm5,%xmm3
movdqa %xmm2,336(%r10)
/* Gather the first multiplier word: mask and OR all 32 table lines. */
pand 64(%r12),%xmm0

pand 80(%r12),%xmm1
pand 96(%r12),%xmm2
movdqa %xmm3,352(%r10)
pand 112(%r12),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -128(%r12),%xmm4
movdqa -112(%r12),%xmm5
movdqa -96(%r12),%xmm2
pand 112(%r10),%xmm4
movdqa -80(%r12),%xmm3
pand 128(%r10),%xmm5
por %xmm4,%xmm0
pand 144(%r10),%xmm2
por %xmm5,%xmm1
pand 160(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -64(%r12),%xmm4
movdqa -48(%r12),%xmm5
movdqa -32(%r12),%xmm2
pand 176(%r10),%xmm4
movdqa -16(%r12),%xmm3
pand 192(%r10),%xmm5
por %xmm4,%xmm0
pand 208(%r10),%xmm2
por %xmm5,%xmm1
pand 224(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa 0(%r12),%xmm4
movdqa 16(%r12),%xmm5
movdqa 32(%r12),%xmm2
pand 240(%r10),%xmm4
movdqa 48(%r12),%xmm3
pand 256(%r10),%xmm5
por %xmm4,%xmm0
pand 272(%r10),%xmm2
por %xmm5,%xmm1
pand 288(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
por %xmm1,%xmm0
pshufd $0x4e,%xmm0,%xmm1	/* fold high qword onto low */
por %xmm1,%xmm0
leaq 256(%r12),%r12
.byte 102,72,15,126,195	/* movq %xmm0,%rbx -- rbx = gathered multiplier */

movq %r13,16+8(%rsp)	/* save outer-loop sentinel (end of table) */
movq %rdi,56+8(%rsp)	/* save rp for the final subtraction */

movq (%r8),%r8	/* n0 (Montgomery reduction constant) */
movq (%rsi),%rax	/* ap[0] */
leaq (%rsi,%r9,1),%rsi	/* rsi = ap + num*8: index from the end... */
negq %r9	/* ...with negative offsets in r9/r15 */

/* First pass, 4 words at a time: tp = ap*b + m*np. */
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax

imulq %r10,%rbp	/* m = lo(ap[0]*b) * n0 */
leaq 64+8(%rsp),%r14	/* r14 -> tp cursor */
movq %rdx,%r11

mulq %rbp
addq %rax,%r10	/* low word cancels (carry only) */
movq 8(%rsi,%r9,1),%rax
adcq $0,%rdx
movq %rdx,%rdi	/* rdi doubles as a carry register in these loops */

mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq 16(%rsi,%r9,1),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15	/* inner index, counts up toward 0 */
leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
jmp .L1st4x

.align 32
.L1st4x:	/* four limbs of ap*b + np*m per iteration */
mulq %rbx
addq %rax,%r10
movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-16(%r14)
movq %rdx,%r13

mulq %rbx
addq %rax,%r10
movq 0(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq 8(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13

addq $32,%r15
jnz .L1st4x

/* Tail: final two limbs of the first pass plus carry fold. */
mulq %rbx
addq %rax,%r10
movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq -8(%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq (%rsi,%r9,1),%rax	/* ap[0] for the next outer pass */
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-16(%r14)
movq %rdx,%r13

leaq (%rcx,%r9,1),%rcx	/* rewind np */

xorq %rdi,%rdi	/* rdi = top carry for the next pass */
addq %r10,%r13
adcq $0,%rdi
movq %r13,-8(%r14)

jmp .Louter4x

.align 32
.Louter4x:
/*
 * Outer loop: gather the next multiplier word from the table (masks were
 * saved relative to the tp cursor), then tp = (tp + ap*b + m*np)/2^64.
 */
leaq 16+128(%r14),%rdx	/* rdx -> saved equality masks */
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r12),%xmm0
movdqa -112(%r12),%xmm1
movdqa -96(%r12),%xmm2
movdqa -80(%r12),%xmm3
pand -128(%rdx),%xmm0
pand -112(%rdx),%xmm1
por %xmm0,%xmm4
pand -96(%rdx),%xmm2
por %xmm1,%xmm5
pand -80(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r12),%xmm0
movdqa -48(%r12),%xmm1
movdqa -32(%r12),%xmm2
movdqa -16(%r12),%xmm3
pand -64(%rdx),%xmm0
pand -48(%rdx),%xmm1
por %xmm0,%xmm4
pand -32(%rdx),%xmm2
por %xmm1,%xmm5
pand -16(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r12),%xmm0
movdqa 16(%r12),%xmm1
movdqa 32(%r12),%xmm2
movdqa 48(%r12),%xmm3
pand 0(%rdx),%xmm0
pand 16(%rdx),%xmm1
por %xmm0,%xmm4
pand 32(%rdx),%xmm2
por %xmm1,%xmm5
pand 48(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r12),%xmm0
movdqa 80(%r12),%xmm1
movdqa 96(%r12),%xmm2
movdqa 112(%r12),%xmm3
pand 64(%rdx),%xmm0
pand 80(%rdx),%xmm1
por %xmm0,%xmm4
pand 96(%rdx),%xmm2
por %xmm1,%xmm5
pand 112(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
pshufd $0x4e,%xmm4,%xmm0	/* fold high qword onto low */
por %xmm4,%xmm0
leaq 256(%r12),%r12
.byte 102,72,15,126,195	/* movq %xmm0,%rbx -- rbx = next multiplier word */

movq (%r14,%r9,1),%r10	/* tp[0] */
movq %r8,%rbp
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx

imulq %r10,%rbp	/* m = tp[0] * n0 */
movq %rdx,%r11
movq %rdi,(%r14)	/* store last pass's top carry */

leaq (%r14,%r9,1),%r14	/* rewind tp cursor */

mulq %rbp
addq %rax,%r10	/* low word cancels */
movq 8(%rsi,%r9,1),%rax
adcq $0,%rdx
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11	/* += tp[1] */
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq 16(%rsi,%r9,1),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x

.align 32
.Linner4x:	/* four limbs of tp + ap*b + np*m per iteration */
mulq %rbx
addq %rax,%r10
movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-32(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-24(%r14)
movq %rdx,%r13

mulq %rbx
addq %rax,%r10
movq 0(%rcx),%rax
adcq $0,%rdx
addq (%r14),%r10
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq 8(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-16(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13

addq $32,%r15
jnz .Linner4x

/* Tail: last two limbs of this pass plus carry fold. */
mulq %rbx
addq %rax,%r10
movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq -8(%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-32(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq %rbp,%rax	/* juggle: rax <- m while loading np[-1] into rbp */
movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp	/* m * np[last] */
addq %rax,%rdi
movq (%rsi,%r9,1),%rax	/* ap[0] for the next pass */
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-24(%r14)
movq %rdx,%r13

movq %rdi,-16(%r14)
leaq (%rcx,%r9,1),%rcx	/* rewind np */

xorq %rdi,%rdi	/* rdi = top carry for next pass */
addq %r10,%r13
adcq $0,%rdi
addq (%r14),%r13	/* += previous top carry */
adcq $0,%rdi
movq %r13,-8(%r14)

cmpq 16+8(%rsp),%r12	/* consumed the whole table? */
jb .Louter4x
/*
 * Done multiplying.  Fold the final carries into a 0/-1 borrow mask in
 * %rax and set up (tp, np, rp, count) for the shared subtract-and-select
 * tail .Lsqr4x_sub_entry (defined later in this file).
 */
xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
subq %rdi,%rax	/* rax = 0 or -1 -- selects tp vs tp-np */
leaq (%r14,%r9,1),%rbx	/* rbx -> tp */
movq (%rcx),%r12	/* np[0] */
leaq (%rcx),%rbp	/* rbp -> np */
movq %r9,%rcx
sarq $3+2,%rcx	/* rcx = num/4 (loop count for the 4-wide tail) */
movq 56+8(%rsp),%rdi	/* rp saved at entry */
decq %r12	/* pre-bias np[0] for the entry point's carry scheme */
xorq %r10,%r10
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqr4x_sub_entry
.cfi_endproc
.size mul4x_internal,.-mul4x_internal
.globl bn_power5
.hidden bn_power5
.type bn_power5,@function
.align 32
/*
 * bn_power5: five consecutive Montgomery squarings followed by one
 * Montgomery multiplication by a table-gathered value -- i.e. one
 * fixed-window step r = r^(2^5) * table[power] / R^k mod np.
 * Register arguments follow bn_mul_mont_gather5 (NOTE(review): inferred
 * from the shared mul4x_internal call -- confirm).  Dispatches to the
 * MULX/ADX variant when BMI1/BMI2/ADX are all available.
 */
bn_power5:
.cfi_startproc
movq %rsp,%rax	/* remember caller's rsp */
.cfi_def_cfa_register %rax
leaq OPENSSL_ia32cap_P(%rip),%r11
movl 8(%r11),%r11d	/* capability word (CPUID.7:EBX) */
andl $0x80108,%r11d	/* BMI1|BMI2|ADX -- NOTE(review): per OpenSSL ia32cap layout */
cmpl $0x80108,%r11d
je .Lpowerx5_enter	/* all present: MULX/ADX implementation */
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lpower5_prologue:

shll $3,%r9d	/* r9 = num*8 (bytes) */
leal (%r9,%r9,2),%r10d	/* r10 = num*24 */
negq %r9
movq (%r8),%r8	/* n0 = *n0p */

/*
 * Frame selection identical to bn_mul4x_mont_gather5: pick a 64-byte
 * aligned base ~2*num*8+320 below rsp whose page offset avoids aliasing
 * with the output pointer, then page-walk down to it.
 */
leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwr_sp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp .Lpwr_sp_done

.align 32
.Lpwr_sp_alt:	/* not enough headroom: bias by a whole page instead */
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lpwr_sp_done:
andq $-64,%rbp	/* 64-byte align */
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10	/* probe */
cmpq %rbp,%rsp
ja .Lpwr_page_walk
jmp .Lpwr_page_walk_done

.Lpwr_page_walk:	/* touch each 4K page down to the new frame */
leaq -4096(%rsp),%rsp
movq (%rsp),%r10	/* probe */
cmpq %rbp,%rsp
ja .Lpwr_page_walk
.Lpwr_page_walk_done:

movq %r9,%r10	/* keep -num*8 in r10 for the xmm3 stash below */
negq %r9	/* r9 = num*8 (positive) */

movq %r8,32(%rsp)	/* frame slot: n0 */
movq %rax,40(%rsp)	/* frame slot: saved caller rsp */
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
/* ^ DWARF expression: CFA = *(%rsp + 40) + 8 */
.Lpower5_body:
/* Stash pointer args in xmm registers across the internal calls. */
.byte 102,72,15,110,207	/* movq %rdi,%xmm1 */
.byte 102,72,15,110,209	/* movq %rcx,%xmm2 */
.byte 102,73,15,110,218	/* movq %r10,%xmm3 */
.byte 102,72,15,110,226	/* movq %rdx,%xmm4 */

/*
 * Five Montgomery squarings; __bn_post4x_internal presumably completes
 * the reduction after each squaring (defined elsewhere in this file) --
 * confirm against the perlasm source.
 */
call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal
call __bn_post4x_internal

/* Restore the stashed pointers and do the final gathered multiply. */
.byte 102,72,15,126,209	/* movq %xmm2,%rcx */
.byte 102,72,15,126,226	/* movq %xmm4,%rdx */
movq %rsi,%rdi
movq 40(%rsp),%rax	/* caller rsp: mul4x_internal reads power at 8(%rax) */
leaq 32(%rsp),%r8	/* &saved n0 */

call mul4x_internal

/* Epilogue: recover caller rsp, restore callee-saved registers. */
movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
movq $1,%rax	/* return 1 (success) */
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lpower5_epilogue:
.byte 0xf3,0xc3	/* repz ret */
.cfi_endproc
.size bn_power5,.-bn_power5
1229 | |
1230 | .globl bn_sqr8x_internal |
1231 | .hidden bn_sqr8x_internal |
1232 | .hidden bn_sqr8x_internal |
1233 | .type bn_sqr8x_internal,@function |
1234 | .align 32 |
1235 | bn_sqr8x_internal: |
1236 | __bn_sqr8x_internal: |
1237 | .cfi_startproc |
1238 | |
1239 | |
1240 | |
1241 | |
1242 | |
1243 | |
1244 | |
1245 | |
1246 | |
1247 | |
1248 | |
1249 | |
1250 | |
1251 | |
1252 | |
1253 | |
1254 | |
1255 | |
1256 | |
1257 | |
1258 | |
1259 | |
1260 | |
1261 | |
1262 | |
1263 | |
1264 | |
1265 | |
1266 | |
1267 | |
1268 | |
1269 | |
1270 | |
1271 | |
1272 | |
1273 | |
1274 | |
1275 | |
1276 | |
1277 | |
1278 | |
1279 | |
1280 | |
1281 | |
1282 | |
1283 | |
1284 | |
1285 | |
1286 | |
1287 | |
1288 | |
1289 | |
1290 | |
1291 | |
1292 | |
1293 | |
1294 | |
1295 | |
1296 | |
1297 | |
1298 | |
1299 | |
1300 | |
1301 | |
1302 | |
1303 | |
1304 | |
1305 | |
1306 | |
1307 | |
1308 | |
1309 | |
1310 | |
1311 | leaq 32(%r10),%rbp |
1312 | leaq (%rsi,%r9,1),%rsi |
1313 | |
1314 | movq %r9,%rcx |
1315 | |
1316 | |
1317 | movq -32(%rsi,%rbp,1),%r14 |
1318 | leaq 48+8(%rsp,%r9,2),%rdi |
1319 | movq -24(%rsi,%rbp,1),%rax |
1320 | leaq -32(%rdi,%rbp,1),%rdi |
1321 | movq -16(%rsi,%rbp,1),%rbx |
1322 | movq %rax,%r15 |
1323 | |
1324 | mulq %r14 |
1325 | movq %rax,%r10 |
1326 | movq %rbx,%rax |
1327 | movq %rdx,%r11 |
1328 | movq %r10,-24(%rdi,%rbp,1) |
1329 | |
1330 | mulq %r14 |
1331 | addq %rax,%r11 |
1332 | movq %rbx,%rax |
1333 | adcq $0,%rdx |
1334 | movq %r11,-16(%rdi,%rbp,1) |
1335 | movq %rdx,%r10 |
1336 | |
1337 | |
1338 | movq -8(%rsi,%rbp,1),%rbx |
1339 | mulq %r15 |
1340 | movq %rax,%r12 |
1341 | movq %rbx,%rax |
1342 | movq %rdx,%r13 |
1343 | |
1344 | leaq (%rbp),%rcx |
1345 | mulq %r14 |
1346 | addq %rax,%r10 |
1347 | movq %rbx,%rax |
1348 | movq %rdx,%r11 |
1349 | adcq $0,%r11 |
1350 | addq %r12,%r10 |
1351 | adcq $0,%r11 |
1352 | movq %r10,-8(%rdi,%rcx,1) |
1353 | jmp .Lsqr4x_1st |
1354 | |
1355 | .align 32 |
1356 | .Lsqr4x_1st: |
1357 | movq (%rsi,%rcx,1),%rbx |
1358 | mulq %r15 |
1359 | addq %rax,%r13 |
1360 | movq %rbx,%rax |
1361 | movq %rdx,%r12 |
1362 | adcq $0,%r12 |
1363 | |
1364 | mulq %r14 |
1365 | addq %rax,%r11 |
1366 | movq %rbx,%rax |
1367 | movq 8(%rsi,%rcx,1),%rbx |
1368 | movq %rdx,%r10 |
1369 | adcq $0,%r10 |
1370 | addq %r13,%r11 |
1371 | adcq $0,%r10 |
1372 | |
1373 | |
1374 | mulq %r15 |
1375 | addq %rax,%r12 |
1376 | movq %rbx,%rax |
1377 | movq %r11,(%rdi,%rcx,1) |
1378 | movq %rdx,%r13 |
1379 | adcq $0,%r13 |
1380 | |
1381 | mulq %r14 |
1382 | addq %rax,%r10 |
1383 | movq %rbx,%rax |
1384 | movq 16(%rsi,%rcx,1),%rbx |
1385 | movq %rdx,%r11 |
1386 | adcq $0,%r11 |
1387 | addq %r12,%r10 |
1388 | adcq $0,%r11 |
1389 | |
1390 | mulq %r15 |
1391 | addq %rax,%r13 |
1392 | movq %rbx,%rax |
1393 | movq %r10,8(%rdi,%rcx,1) |
1394 | movq %rdx,%r12 |
1395 | adcq $0,%r12 |
1396 | |
1397 | mulq %r14 |
1398 | addq %rax,%r11 |
1399 | movq %rbx,%rax |
1400 | movq 24(%rsi,%rcx,1),%rbx |
1401 | movq %rdx,%r10 |
1402 | adcq $0,%r10 |
1403 | addq %r13,%r11 |
1404 | adcq $0,%r10 |
1405 | |
1406 | |
1407 | mulq %r15 |
1408 | addq %rax,%r12 |
1409 | movq %rbx,%rax |
1410 | movq %r11,16(%rdi,%rcx,1) |
1411 | movq %rdx,%r13 |
1412 | adcq $0,%r13 |
1413 | leaq 32(%rcx),%rcx |
1414 | |
1415 | mulq %r14 |
1416 | addq %rax,%r10 |
1417 | movq %rbx,%rax |
1418 | movq %rdx,%r11 |
1419 | adcq $0,%r11 |
1420 | addq %r12,%r10 |
1421 | adcq $0,%r11 |
1422 | movq %r10,-8(%rdi,%rcx,1) |
1423 | |
1424 | cmpq $0,%rcx |
1425 | jne .Lsqr4x_1st |
1426 | |
1427 | mulq %r15 |
1428 | addq %rax,%r13 |
1429 | leaq 16(%rbp),%rbp |
1430 | adcq $0,%rdx |
1431 | addq %r11,%r13 |
1432 | adcq $0,%rdx |
1433 | |
1434 | movq %r13,(%rdi) |
1435 | movq %rdx,%r12 |
1436 | movq %rdx,8(%rdi) |
1437 | jmp .Lsqr4x_outer |
1438 | |
1439 | .align 32 |
1440 | .Lsqr4x_outer: |
1441 | movq -32(%rsi,%rbp,1),%r14 |
1442 | leaq 48+8(%rsp,%r9,2),%rdi |
1443 | movq -24(%rsi,%rbp,1),%rax |
1444 | leaq -32(%rdi,%rbp,1),%rdi |
1445 | movq -16(%rsi,%rbp,1),%rbx |
1446 | movq %rax,%r15 |
1447 | |
1448 | mulq %r14 |
1449 | movq -24(%rdi,%rbp,1),%r10 |
1450 | addq %rax,%r10 |
1451 | movq %rbx,%rax |
1452 | adcq $0,%rdx |
1453 | movq %r10,-24(%rdi,%rbp,1) |
1454 | movq %rdx,%r11 |
1455 | |
1456 | mulq %r14 |
1457 | addq %rax,%r11 |
1458 | movq %rbx,%rax |
1459 | adcq $0,%rdx |
1460 | addq -16(%rdi,%rbp,1),%r11 |
1461 | movq %rdx,%r10 |
1462 | adcq $0,%r10 |
1463 | movq %r11,-16(%rdi,%rbp,1) |
1464 | |
1465 | xorq %r12,%r12 |
1466 | |
1467 | movq -8(%rsi,%rbp,1),%rbx |
1468 | mulq %r15 |
1469 | addq %rax,%r12 |
1470 | movq %rbx,%rax |
1471 | adcq $0,%rdx |
1472 | addq -8(%rdi,%rbp,1),%r12 |
1473 | movq %rdx,%r13 |
1474 | adcq $0,%r13 |
1475 | |
1476 | mulq %r14 |
1477 | addq %rax,%r10 |
1478 | movq %rbx,%rax |
1479 | adcq $0,%rdx |
1480 | addq %r12,%r10 |
1481 | movq %rdx,%r11 |
1482 | adcq $0,%r11 |
1483 | movq %r10,-8(%rdi,%rbp,1) |
1484 | |
1485 | leaq (%rbp),%rcx |
1486 | jmp .Lsqr4x_inner |
1487 | |
1488 | .align 32 |
1489 | .Lsqr4x_inner: |
1490 | movq (%rsi,%rcx,1),%rbx |
1491 | mulq %r15 |
1492 | addq %rax,%r13 |
1493 | movq %rbx,%rax |
1494 | movq %rdx,%r12 |
1495 | adcq $0,%r12 |
1496 | addq (%rdi,%rcx,1),%r13 |
1497 | adcq $0,%r12 |
1498 | |
1499 | .byte 0x67 |
1500 | mulq %r14 |
1501 | addq %rax,%r11 |
1502 | movq %rbx,%rax |
1503 | movq 8(%rsi,%rcx,1),%rbx |
1504 | movq %rdx,%r10 |
1505 | adcq $0,%r10 |
1506 | addq %r13,%r11 |
1507 | adcq $0,%r10 |
1508 | |
1509 | mulq %r15 |
1510 | addq %rax,%r12 |
1511 | movq %r11,(%rdi,%rcx,1) |
1512 | movq %rbx,%rax |
1513 | movq %rdx,%r13 |
1514 | adcq $0,%r13 |
1515 | addq 8(%rdi,%rcx,1),%r12 |
1516 | leaq 16(%rcx),%rcx |
1517 | adcq $0,%r13 |
1518 | |
1519 | mulq %r14 |
1520 | addq %rax,%r10 |
1521 | movq %rbx,%rax |
1522 | adcq $0,%rdx |
1523 | addq %r12,%r10 |
1524 | movq %rdx,%r11 |
1525 | adcq $0,%r11 |
1526 | movq %r10,-8(%rdi,%rcx,1) |
1527 | |
1528 | cmpq $0,%rcx |
1529 | jne .Lsqr4x_inner |
1530 | |
1531 | .byte 0x67 |
1532 | mulq %r15 |
1533 | addq %rax,%r13 |
1534 | adcq $0,%rdx |
1535 | addq %r11,%r13 |
1536 | adcq $0,%rdx |
1537 | |
1538 | movq %r13,(%rdi) |
1539 | movq %rdx,%r12 |
1540 | movq %rdx,8(%rdi) |
1541 | |
1542 | addq $16,%rbp |
1543 | jnz .Lsqr4x_outer |
1544 | |
1545 | |
1546 | movq -32(%rsi),%r14 |
1547 | leaq 48+8(%rsp,%r9,2),%rdi |
1548 | movq -24(%rsi),%rax |
1549 | leaq -32(%rdi,%rbp,1),%rdi |
1550 | movq -16(%rsi),%rbx |
1551 | movq %rax,%r15 |
1552 | |
1553 | mulq %r14 |
1554 | addq %rax,%r10 |
1555 | movq %rbx,%rax |
1556 | movq %rdx,%r11 |
1557 | adcq $0,%r11 |
1558 | |
1559 | mulq %r14 |
1560 | addq %rax,%r11 |
1561 | movq %rbx,%rax |
1562 | movq %r10,-24(%rdi) |
1563 | movq %rdx,%r10 |
1564 | adcq $0,%r10 |
1565 | addq %r13,%r11 |
1566 | movq -8(%rsi),%rbx |
1567 | adcq $0,%r10 |
1568 | |
1569 | mulq %r15 |
1570 | addq %rax,%r12 |
1571 | movq %rbx,%rax |
1572 | movq %r11,-16(%rdi) |
1573 | movq %rdx,%r13 |
1574 | adcq $0,%r13 |
1575 | |
1576 | mulq %r14 |
1577 | addq %rax,%r10 |
1578 | movq %rbx,%rax |
1579 | movq %rdx,%r11 |
1580 | adcq $0,%r11 |
1581 | addq %r12,%r10 |
1582 | adcq $0,%r11 |
1583 | movq %r10,-8(%rdi) |
1584 | |
1585 | mulq %r15 |
1586 | addq %rax,%r13 |
1587 | movq -16(%rsi),%rax |
1588 | adcq $0,%rdx |
1589 | addq %r11,%r13 |
1590 | adcq $0,%rdx |
1591 | |
1592 | movq %r13,(%rdi) |
1593 | movq %rdx,%r12 |
1594 | movq %rdx,8(%rdi) |
1595 | |
1596 | mulq %rbx |
1597 | addq $16,%rbp |
1598 | xorq %r14,%r14 |
1599 | subq %r9,%rbp |
1600 | xorq %r15,%r15 |
1601 | |
1602 | addq %r12,%rax |
1603 | adcq $0,%rdx |
1604 | movq %rax,8(%rdi) |
1605 | movq %rdx,16(%rdi) |
1606 | movq %r15,24(%rdi) |
1607 | |
1608 | movq -16(%rsi,%rbp,1),%rax |
1609 | leaq 48+8(%rsp),%rdi |
1610 | xorq %r10,%r10 |
1611 | movq 8(%rdi),%r11 |
1612 | |
1613 | leaq (%r14,%r10,2),%r12 |
1614 | shrq $63,%r10 |
1615 | leaq (%rcx,%r11,2),%r13 |
1616 | shrq $63,%r11 |
1617 | orq %r10,%r13 |
1618 | movq 16(%rdi),%r10 |
1619 | movq %r11,%r14 |
1620 | mulq %rax |
1621 | negq %r15 |
1622 | movq 24(%rdi),%r11 |
1623 | adcq %rax,%r12 |
1624 | movq -8(%rsi,%rbp,1),%rax |
1625 | movq %r12,(%rdi) |
1626 | adcq %rdx,%r13 |
1627 | |
1628 | leaq (%r14,%r10,2),%rbx |
1629 | movq %r13,8(%rdi) |
1630 | sbbq %r15,%r15 |
1631 | shrq $63,%r10 |
1632 | leaq (%rcx,%r11,2),%r8 |
1633 | shrq $63,%r11 |
1634 | orq %r10,%r8 |
1635 | movq 32(%rdi),%r10 |
1636 | movq %r11,%r14 |
1637 | mulq %rax |
1638 | negq %r15 |
1639 | movq 40(%rdi),%r11 |
1640 | adcq %rax,%rbx |
1641 | movq 0(%rsi,%rbp,1),%rax |
1642 | movq %rbx,16(%rdi) |
1643 | adcq %rdx,%r8 |
1644 | leaq 16(%rbp),%rbp |
1645 | movq %r8,24(%rdi) |
1646 | sbbq %r15,%r15 |
1647 | leaq 64(%rdi),%rdi |
1648 | jmp .Lsqr4x_shift_n_add |
1649 | |
1650 | .align 32 |
1651 | .Lsqr4x_shift_n_add: |
1652 | leaq (%r14,%r10,2),%r12 |
1653 | shrq $63,%r10 |
1654 | leaq (%rcx,%r11,2),%r13 |
1655 | shrq $63,%r11 |
1656 | orq %r10,%r13 |
1657 | movq -16(%rdi),%r10 |
1658 | movq %r11,%r14 |
1659 | mulq %rax |
1660 | negq %r15 |
1661 | movq -8(%rdi),%r11 |
1662 | adcq %rax,%r12 |
1663 | movq -8(%rsi,%rbp,1),%rax |
1664 | movq %r12,-32(%rdi) |
1665 | adcq %rdx,%r13 |
1666 | |
1667 | leaq (%r14,%r10,2),%rbx |
1668 | movq %r13,-24(%rdi) |
1669 | sbbq %r15,%r15 |
1670 | shrq $63,%r10 |
1671 | leaq (%rcx,%r11,2),%r8 |
1672 | shrq $63,%r11 |
1673 | orq %r10,%r8 |
1674 | movq 0(%rdi),%r10 |
1675 | movq %r11,%r14 |
1676 | mulq %rax |
1677 | negq %r15 |
1678 | movq 8(%rdi),%r11 |
1679 | adcq %rax,%rbx |
1680 | movq 0(%rsi,%rbp,1),%rax |
1681 | movq %rbx,-16(%rdi) |
1682 | adcq %rdx,%r8 |
1683 | |
1684 | leaq (%r14,%r10,2),%r12 |
1685 | movq %r8,-8(%rdi) |
1686 | sbbq %r15,%r15 |
1687 | shrq $63,%r10 |
1688 | leaq (%rcx,%r11,2),%r13 |
1689 | shrq $63,%r11 |
1690 | orq %r10,%r13 |
1691 | movq 16(%rdi),%r10 |
1692 | movq %r11,%r14 |
1693 | mulq %rax |
1694 | negq %r15 |
1695 | movq 24(%rdi),%r11 |
1696 | adcq %rax,%r12 |
1697 | movq 8(%rsi,%rbp,1),%rax |
1698 | movq %r12,0(%rdi) |
1699 | adcq %rdx,%r13 |
1700 | |
1701 | leaq (%r14,%r10,2),%rbx |
1702 | movq %r13,8(%rdi) |
1703 | sbbq %r15,%r15 |
1704 | shrq $63,%r10 |
1705 | leaq (%rcx,%r11,2),%r8 |
1706 | shrq $63,%r11 |
1707 | orq %r10,%r8 |
1708 | movq 32(%rdi),%r10 |
1709 | movq %r11,%r14 |
1710 | mulq %rax |
1711 | negq %r15 |
1712 | movq 40(%rdi),%r11 |
1713 | adcq %rax,%rbx |
1714 | movq 16(%rsi,%rbp,1),%rax |
1715 | movq %rbx,16(%rdi) |
1716 | adcq %rdx,%r8 |
1717 | movq %r8,24(%rdi) |
1718 | sbbq %r15,%r15 |
1719 | leaq 64(%rdi),%rdi |
1720 | addq $32,%rbp |
1721 | jnz .Lsqr4x_shift_n_add |
1722 | |
1723 | leaq (%r14,%r10,2),%r12 |
1724 | .byte 0x67 |
1725 | shrq $63,%r10 |
1726 | leaq (%rcx,%r11,2),%r13 |
1727 | shrq $63,%r11 |
1728 | orq %r10,%r13 |
1729 | movq -16(%rdi),%r10 |
1730 | movq %r11,%r14 |
1731 | mulq %rax |
1732 | negq %r15 |
1733 | movq -8(%rdi),%r11 |
1734 | adcq %rax,%r12 |
1735 | movq -8(%rsi),%rax |
1736 | movq %r12,-32(%rdi) |
1737 | adcq %rdx,%r13 |
1738 | |
1739 | leaq (%r14,%r10,2),%rbx |
1740 | movq %r13,-24(%rdi) |
1741 | sbbq %r15,%r15 |
1742 | shrq $63,%r10 |
1743 | leaq (%rcx,%r11,2),%r8 |
1744 | shrq $63,%r11 |
1745 | orq %r10,%r8 |
1746 | mulq %rax |
1747 | negq %r15 |
1748 | adcq %rax,%rbx |
1749 | adcq %rdx,%r8 |
1750 | movq %rbx,-16(%rdi) |
1751 | movq %r8,-8(%rdi) |
1752 | .byte 102,72,15,126,213 |
# __bn_sqr8x_reduction: Montgomery reduction of the double-width vector
# produced by the squaring code above, eight limbs per pass.
# Register contract (as visible at the fall-through and call sites in this
# file): %rbp = modulus n, %r9 = byte length of the operand, n0 = -n^-1 mod
# 2^64 is kept at 32+8(%rsp); %xmm2/%xmm3 carry saved pointers that are
# restored with the raw movq encodings below.
# NOTE(review): roles inferred from this file only — confirm against the
# generating perlasm script.
__bn_sqr8x_reduction:
xorq %rax,%rax
leaq (%r9,%rbp,1),%rcx              # %rcx = end of modulus
leaq 48+8(%rsp,%r9,2),%rdx          # %rdx = end of the temporary vector
movq %rcx,0+8(%rsp)                 # stash end-of-modulus for loop bound
leaq 48+8(%rsp,%r9,1),%rdi
movq %rdx,8+8(%rsp)                 # stash end-of-vector
negq %r9
jmp .L8x_reduction_loop

.align 32
.L8x_reduction_loop:
leaq (%rdi,%r9,1),%rdi
.byte 0x66                          # prefix byte used purely for code alignment
movq 0(%rdi),%rbx                   # load eight limbs of the running vector
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq 40(%rdi),%r13
movq 48(%rdi),%r14
movq 56(%rdi),%r15
movq %rax,(%rdx)                    # store previous top-word carry
leaq 64(%rdi),%rdi

.byte 0x67                          # alignment prefix
movq %rbx,%r8
imulq 32+8(%rsp),%rbx               # %rbx = m = t[0] * n0 (reduction multiplier)
movq 0(%rbp),%rax
movl $8,%ecx                        # eight reduction rounds
jmp .L8x_reduce

.align 32
.L8x_reduce:
# One round: add m * n[0..7] to the eight limbs; limb 0 is annihilated by
# construction of m, and m itself is saved on the stack for the tail pass.
mulq %rbx
movq 8(%rbp),%rax
negq %r8                            # generate carry from the cancelled low limb
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)       # save multiplier m for .L8x_tail
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi                # reload n0 for the next multiplier
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi                      # next m = t[0] * n0, computed early
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
movq %rsi,%rbx                      # rotate in the pre-computed next multiplier
addq %rax,%r15
movq 0(%rbp),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15

decl %ecx
jnz .L8x_reduce

leaq 64(%rbp),%rbp                  # advance to next 8 limbs of the modulus
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp                 # past the end of the modulus?
jae .L8x_no_tail

# Fold in the next eight limbs of the vector before running the tail
# multiplications with the eight saved multipliers.
.byte 0x66
addq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
sbbq %rsi,%rsi                      # %rsi = -carry (preserved across the tail)

movq 48+56+8(%rsp),%rbx             # first saved multiplier
movl $8,%ecx
movq 0(%rbp),%rax
jmp .L8x_tail

.align 32
.L8x_tail:
# Multiply the remaining modulus limbs by each saved multiplier in turn,
# storing one finished limb per round.
mulq %rbx
addq %rax,%r8
movq 8(%rbp),%rax
movq %r8,(%rdi)                     # emit finished limb
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
movq 48-16+8(%rsp,%rcx,8),%rbx      # next saved multiplier
addq %rax,%r15
adcq $0,%rdx
addq %r15,%r14
movq 0(%rbp),%rax
movq %rdx,%r15
adcq $0,%r15

decl %ecx
jnz .L8x_tail

leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp                 # more modulus limbs to process?
jae .L8x_tail_done

movq 48+56+8(%rsp),%rbx
negq %rsi                           # restore the saved carry
movq 0(%rbp),%rax
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
sbbq %rsi,%rsi

movl $8,%ecx
jmp .L8x_tail

.align 32
.L8x_tail_done:
# Absorb the top-word carry stored at the start of this pass.
xorq %rax,%rax
addq (%rdx),%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
adcq $0,%rax

negq %rsi                           # restore the saved carry
.L8x_no_tail:
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax                        # top-most carry word
movq -8(%rbp),%rcx
xorq %rsi,%rsi

.byte 102,72,15,126,213             # movq %xmm2,%rbp (restore saved pointer)

movq %r8,0(%rdi)                    # write back the eight reduced limbs
movq %r9,8(%rdi)
.byte 102,73,15,126,217             # movq %xmm3,%r9 (restore saved length)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
leaq 64(%rdi),%rdi

cmpq %rdx,%rdi                      # reached the end of the vector?
jb .L8x_reduction_loop
.byte 0xf3,0xc3                     # rep ret
.cfi_endproc
.size bn_sqr8x_internal,.-bn_sqr8x_internal
# __bn_post4x_internal: constant-time final conditional subtraction of the
# modulus, four limbs per iteration.
# Inputs (as set up by the visible callers): %rbp = modulus n, %r9 = byte
# length, %rax = select mask (0 or all-ones borrow mask); destination and
# source pointers arrive in %xmm1 and are unpacked via the raw movq
# encodings below.  Computes rp[i] = tp[i] + (~n[i] & mask) + carry, i.e.
# subtracts n exactly when the mask is all-ones, without branching on
# secret data.
# NOTE(review): register roles inferred from this file; confirm against the
# generating perlasm script.
.type __bn_post4x_internal,@function
.align 32
__bn_post4x_internal:
.cfi_startproc
movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx              # %rbx = source (top of the temporary)
movq %r9,%rcx
.byte 102,72,15,126,207             # movq %xmm1,%rdi (destination rp)
negq %rax
.byte 102,72,15,126,206             # movq %xmm1,%rsi
sarq $3+2,%rcx                      # %rcx = -(num limbs)/4 (loop counter)
decq %r12                           # pre-complement first limb (= notq %r12 - borrow trick)
xorq %r10,%r10
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqr4x_sub_entry

.align 16
.Lsqr4x_sub:
movq 0(%rbp),%r12                   # next four modulus limbs
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
.Lsqr4x_sub_entry:
leaq 32(%rbp),%rbp
notq %r12                           # ~n[i] ...
notq %r13
notq %r14
notq %r15
andq %rax,%r12                      # ... masked by the borrow/select mask
andq %rax,%r13
andq %rax,%r14
andq %rax,%r15

negq %r10                           # restore carry saved in %r10
adcq 0(%rbx),%r12                   # tp[i] + (~n[i] & mask) + carry
adcq 8(%rbx),%r13
adcq 16(%rbx),%r14
adcq 24(%rbx),%r15
movq %r12,0(%rdi)
leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
sbbq %r10,%r10                      # save carry across the iteration
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi

incq %rcx
jnz .Lsqr4x_sub

movq %r9,%r10
negq %r9                            # restore negated length for the caller
.byte 0xf3,0xc3                     # rep ret
.cfi_endproc
.size __bn_post4x_internal,.-__bn_post4x_internal
# bn_from_montgomery: public entry point for converting out of Montgomery
# form.  Tail-calls the 8x implementation when the limb count (%r9d) is a
# multiple of 8; otherwise returns 0 in %eax to signal "not handled".
.globl bn_from_montgomery
.hidden bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
bn_from_montgomery:
.cfi_startproc
testl $7,%r9d                       # num % 8 == 0 ?
jz bn_from_mont8x                   # tail call; same argument registers
xorl %eax,%eax                      # unsupported size: return 0
.byte 0xf3,0xc3                     # rep ret
.cfi_endproc
.size bn_from_montgomery,.-bn_from_montgomery
2082 | |
# bn_from_mont8x: convert from Montgomery form for num % 8 == 0.
# Copies the input into a scratch area while zero-padding the upper half,
# then runs a Montgomery reduction (ADX/BMI2 path when available) and the
# final conditional subtraction, and finally zeroizes the scratch area.
# Arguments follow the SysV bn_mul_mont convention: %rdi=rp, %rsi=ap,
# %rdx=bp, %rcx=np, %r8=n0, %r9=num.
.type bn_from_mont8x,@function
.align 32
bn_from_mont8x:
.cfi_startproc
.byte 0x67                          # alignment prefix
movq %rsp,%rax                      # keep original %rsp for the epilogue
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lfrom_prologue:

shll $3,%r9d                        # num in bytes
leaq (%r9,%r9,2),%r10               # 3*num
negq %r9
movq (%r8),%r8                      # dereference n0

# Choose a stack frame location; the extra games below randomize the frame
# position relative to the output pointer to reduce cache-bank conflicts
# between the scratch area and rp.

leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp .Lfrom_sp_done

.align 32
.Lfrom_sp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lfrom_sp_done:
andq $-64,%rbp                      # 64-byte align the frame
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10                    # probe
cmpq %rbp,%rsp
ja .Lfrom_page_walk
jmp .Lfrom_page_walk_done

.Lfrom_page_walk:
# Touch every page between old and new %rsp so the guard page is never
# skipped when the frame is larger than a page.
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lfrom_page_walk
.Lfrom_page_walk_done:

movq %r9,%r10
negq %r9

movq %r8,32(%rsp)                   # save n0 where the reduction expects it
movq %rax,40(%rsp)                  # save original %rsp
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
movq %r9,%r11
leaq 48(%rsp),%rax
pxor %xmm0,%xmm0
jmp .Lmul_by_1

.align 32
.Lmul_by_1:
# Copy 64 bytes of the input into the low half of the scratch vector while
# zeroing the corresponding 64 bytes of the high half.
movdqu (%rsi),%xmm1
movdqu 16(%rsi),%xmm2
movdqu 32(%rsi),%xmm3
movdqa %xmm0,(%rax,%r9,1)           # zero upper half
movdqu 48(%rsi),%xmm4
movdqa %xmm0,16(%rax,%r9,1)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00    # leaq 64(%rsi),%rsi (long encoding)
movdqa %xmm1,(%rax)
movdqa %xmm0,32(%rax,%r9,1)
movdqa %xmm2,16(%rax)
movdqa %xmm0,48(%rax,%r9,1)
movdqa %xmm3,32(%rax)
movdqa %xmm4,48(%rax)
leaq 64(%rax),%rax
subq $64,%r11
jnz .Lmul_by_1

.byte 102,72,15,110,207             # movq %rdi,%xmm1 (stash rp)
.byte 102,72,15,110,209             # movq %rcx,%xmm2 (stash np)
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218             # movq %r10,%xmm3 (stash num)
# Dispatch on CPU features (mask 0x80108 over OPENSSL_ia32cap_P word 2 —
# presumably the BMI1/BMI2/ADX bits; confirm against OPENSSL_ia32cap docs).
leaq OPENSSL_ia32cap_P(%rip),%r11
movl 8(%r11),%r11d
andl $0x80108,%r11d
cmpl $0x80108,%r11d
jne .Lfrom_mont_nox

leaq (%rax,%r9,1),%rdi
call __bn_sqrx8x_reduction          # MULX/ADX reduction path
call __bn_postx4x_internal

pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_nox:
call __bn_sqr8x_reduction           # classic mulq reduction path
call __bn_post4x_internal

pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_zero:
# Zeroize the scratch frame before returning (avoids leaving secret
# intermediates on the stack).
movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
movdqa %xmm0,0(%rax)
movdqa %xmm0,16(%rax)
movdqa %xmm0,32(%rax)
movdqa %xmm0,48(%rax)
leaq 64(%rax),%rax
subq $32,%r9
jnz .Lfrom_mont_zero

movq $1,%rax                        # return 1 (success)
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
.byte 0xf3,0xc3                     # rep ret
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
# bn_mulx4x_mont_gather5: MULX/ADX Montgomery multiplication with 5-bit
# windowed gather.  This wrapper saves callee-saved registers, carves out a
# page-probed, position-adjusted stack frame, then delegates the real work
# to mulx4x_internal.  Arguments follow the bn_mul_mont convention:
# %rdi=rp, %rsi=ap, %rdx=table, %rcx=np, %r8=n0, %r9=num.
.type bn_mulx4x_mont_gather5,@function
.align 32
bn_mulx4x_mont_gather5:
.cfi_startproc
movq %rsp,%rax                      # keep original %rsp for the epilogue
.cfi_def_cfa_register %rax
.Lmulx4x_enter:
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lmulx4x_prologue:

shll $3,%r9d                        # num in bytes
leaq (%r9,%r9,2),%r10               # 3*num
negq %r9
movq (%r8),%r8                      # dereference n0

# Position the frame relative to rp (same cache-conflict-avoidance dance as
# in bn_from_mont8x above).

leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmulx4xsp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp .Lmulx4xsp_done

.Lmulx4xsp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lmulx4xsp_done:
andq $-64,%rbp                      # 64-byte align the frame
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10                    # probe
cmpq %rbp,%rsp
ja .Lmulx4x_page_walk
jmp .Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
# Touch each intervening page so the guard page cannot be skipped.
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

movq %r8,32(%rsp)                   # save n0 where mulx4x_internal expects it
movq %rax,40(%rsp)                  # save original %rsp
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
call mulx4x_internal

movq 40(%rsp),%rsi                  # recover original %rsp
.cfi_def_cfa %rsi,8
movq $1,%rax                        # return 1 (success)

movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
.byte 0xf3,0xc3                     # rep ret
.cfi_endproc
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2361 | |
# mulx4x_internal: core of the MULX/ADCX/ADOX Montgomery multiply with
# constant-time table gather.  Each b[i] is selected from a 32-entry power
# table by building 16 SSE compare masks against the requested index
# (in %xmm5) and AND/OR-ing all table lines — every entry is touched, so no
# secret-dependent memory access occurs.  Multiplication and reduction are
# interleaved four limbs at a time using the carry (adcx) and overflow
# (adox) chains independently.
# On entry: %rdi=rp, %rsi=ap, %rdx=table base, %rcx=np, %r9=num (bytes),
# n0 at 32+8(%rsp).  Ends by tail-jumping to .Lsqrx4x_sub_entry (the
# conditional-subtract routine, defined elsewhere in this file).
.type mulx4x_internal,@function
.align 32
mulx4x_internal:
.cfi_startproc
movq %r9,8(%rsp)                    # save num
movq %r9,%r10
negq %r9
shlq $5,%r9
negq %r10
leaq 128(%rdx,%r9,1),%r13           # end of the power table
shrq $5+5,%r9
movd 8(%rax),%xmm5                  # requested power index
subq $1,%r9
leaq .Linc(%rip),%rax
movq %r13,16+8(%rsp)
movq %r9,24+8(%rsp)                 # outer-loop trip count
movq %rdi,56+8(%rsp)                # save rp
movdqa 0(%rax),%xmm0                # .Linc: {0,1} then step {2,2}
movdqa 16(%rax),%xmm1
leaq 88-112(%rsp,%r10,1),%r10       # mask scratch area
leaq 128(%rdx),%rdi                 # %rdi = table cursor

# Build sixteen 128-bit equality masks (one per pair of table lines) by
# comparing a running counter against the index in %xmm5.
pshufd $0,%xmm5,%xmm5
movdqa %xmm1,%xmm4
.byte 0x67
movdqa %xmm1,%xmm2
.byte 0x67
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,112(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,128(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,144(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,160(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,176(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,192(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,208(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,224(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,240(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,256(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,272(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,288(%r10)
movdqa %xmm4,%xmm3
.byte 0x67
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,304(%r10)

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,320(%r10)

pcmpeqd %xmm5,%xmm3
movdqa %xmm2,336(%r10)

# Gather b[0]: AND every table line with its mask and OR the results.
pand 64(%rdi),%xmm0
pand 80(%rdi),%xmm1
pand 96(%rdi),%xmm2
movdqa %xmm3,352(%r10)
pand 112(%rdi),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -128(%rdi),%xmm4
movdqa -112(%rdi),%xmm5
movdqa -96(%rdi),%xmm2
pand 112(%r10),%xmm4
movdqa -80(%rdi),%xmm3
pand 128(%r10),%xmm5
por %xmm4,%xmm0
pand 144(%r10),%xmm2
por %xmm5,%xmm1
pand 160(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -64(%rdi),%xmm4
movdqa -48(%rdi),%xmm5
movdqa -32(%rdi),%xmm2
pand 176(%r10),%xmm4
movdqa -16(%rdi),%xmm3
pand 192(%r10),%xmm5
por %xmm4,%xmm0
pand 208(%r10),%xmm2
por %xmm5,%xmm1
pand 224(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa 0(%rdi),%xmm4
movdqa 16(%rdi),%xmm5
movdqa 32(%rdi),%xmm2
pand 240(%r10),%xmm4
movdqa 48(%rdi),%xmm3
pand 256(%r10),%xmm5
por %xmm4,%xmm0
pand 272(%r10),%xmm2
por %xmm5,%xmm1
pand 288(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
pxor %xmm1,%xmm0
pshufd $0x4e,%xmm0,%xmm1            # fold the two 64-bit halves together
por %xmm1,%xmm0
leaq 256(%rdi),%rdi
.byte 102,72,15,126,194             # movq %xmm0,%rdx  (%rdx = gathered b[0])
leaq 64+32+8(%rsp),%rbx

# First multiplication pass: a[0..3] * b[0], interleaved with the first
# reduction step (m = t[0]*n0, then m * n[0..3]).
movq %rdx,%r9                       # keep b[0] across the mulx that clobbers %rdx
mulxq 0(%rsi),%r8,%rax
mulxq 8(%rsi),%r11,%r12
addq %rax,%r11
mulxq 16(%rsi),%rax,%r13
adcq %rax,%r12
adcq $0,%r13
mulxq 24(%rsi),%rax,%r14

movq %r8,%r15
imulq 32+8(%rsp),%r8                # m = t[0] * n0
xorq %rbp,%rbp                      # clear CF/OF; %rbp stays zero throughout
movq %r8,%rdx

movq %rdi,8+8(%rsp)                 # save table cursor

leaq 32(%rsi),%rsi
adcxq %rax,%r13
adcxq %rbp,%r14

mulxq 0(%rcx),%rax,%r10
adcxq %rax,%r15                     # annihilates t[0]
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 16(%rcx),%rax,%r12
movq 24+8(%rsp),%rdi                # inner-loop trip count
movq %r10,-32(%rbx)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                       # restore b[0]
movq %r11,-24(%rbx)
adcxq %rax,%r12
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r12,-16(%rbx)
jmp .Lmulx4x_1st

.align 32
.Lmulx4x_1st:
# Continue the first pass four limbs at a time, alternating a*b[0]
# (carry chain) with m*n (overflow chain).
adcxq %rbp,%r15
mulxq 0(%rsi),%r10,%rax
adcxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67
movq %r8,%rdx                       # switch multiplier to m
adcxq %rax,%r13
adcxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx

adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
movq %r11,-32(%rbx)
adoxq %r15,%r13
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                       # back to b[0]
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r13,-16(%rbx)

decq %rdi
jnz .Lmulx4x_1st

movq 8(%rsp),%rax                   # num
adcq %rbp,%r15
leaq (%rsi,%rax,1),%rsi             # rewind ap
addq %r15,%r14
movq 8+8(%rsp),%rdi                 # restore table cursor
adcq %rbp,%rbp                      # %rbp = top carry
movq %r14,-8(%rbx)
jmp .Lmulx4x_outer

.align 32
.Lmulx4x_outer:
# Gather the next b[i] with the same mask set (masks are re-addressed
# relative to %rbx so the offsets shift with the output cursor).
leaq 16-256(%rbx),%r10
pxor %xmm4,%xmm4
.byte 0x67,0x67
pxor %xmm5,%xmm5
movdqa -128(%rdi),%xmm0
movdqa -112(%rdi),%xmm1
movdqa -96(%rdi),%xmm2
pand 256(%r10),%xmm0
movdqa -80(%rdi),%xmm3
pand 272(%r10),%xmm1
por %xmm0,%xmm4
pand 288(%r10),%xmm2
por %xmm1,%xmm5
pand 304(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%rdi),%xmm0
movdqa -48(%rdi),%xmm1
movdqa -32(%rdi),%xmm2
pand 320(%r10),%xmm0
movdqa -16(%rdi),%xmm3
pand 336(%r10),%xmm1
por %xmm0,%xmm4
pand 352(%r10),%xmm2
por %xmm1,%xmm5
pand 368(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%rdi),%xmm0
movdqa 16(%rdi),%xmm1
movdqa 32(%rdi),%xmm2
pand 384(%r10),%xmm0
movdqa 48(%rdi),%xmm3
pand 400(%r10),%xmm1
por %xmm0,%xmm4
pand 416(%r10),%xmm2
por %xmm1,%xmm5
pand 432(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%rdi),%xmm0
movdqa 80(%rdi),%xmm1
movdqa 96(%rdi),%xmm2
pand 448(%r10),%xmm0
movdqa 112(%rdi),%xmm3
pand 464(%r10),%xmm1
por %xmm0,%xmm4
pand 480(%r10),%xmm2
por %xmm1,%xmm5
pand 496(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
pshufd $0x4e,%xmm4,%xmm0            # fold the two 64-bit halves together
por %xmm4,%xmm0
leaq 256(%rdi),%rdi
.byte 102,72,15,126,194             # movq %xmm0,%rdx  (%rdx = gathered b[i])

movq %rbp,(%rbx)                    # store previous top carry
leaq 32(%rbx,%rax,1),%rbx           # rewind output cursor
mulxq 0(%rsi),%r8,%r11
xorq %rbp,%rbp                      # clear CF/OF again
movq %rdx,%r9                       # keep b[i]
mulxq 8(%rsi),%r14,%r12
adoxq -32(%rbx),%r8                 # accumulate into previous pass's limbs
adcxq %r14,%r11
mulxq 16(%rsi),%r15,%r13
adoxq -24(%rbx),%r11
adcxq %r15,%r12
mulxq 24(%rsi),%rdx,%r14
adoxq -16(%rbx),%r12
adcxq %rdx,%r13
leaq (%rcx,%rax,1),%rcx             # rewind np
leaq 32(%rsi),%rsi
adoxq -8(%rbx),%r13
adcxq %rbp,%r14
adoxq %rbp,%r14

movq %r8,%r15
imulq 32+8(%rsp),%r8                # m = t[0] * n0

movq %r8,%rdx
xorq %rbp,%rbp
movq %rdi,8+8(%rsp)                 # save table cursor

mulxq 0(%rcx),%rax,%r10
adcxq %rax,%r15                     # annihilates t[0]
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 16(%rcx),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                       # restore b[i]
movq 24+8(%rsp),%rdi                # inner-loop trip count
movq %r10,-32(%rbx)
adcxq %rax,%r12
movq %r11,-24(%rbx)
adoxq %rbp,%r15
movq %r12,-16(%rbx)
leaq 32(%rcx),%rcx
jmp .Lmulx4x_inner

.align 32
.Lmulx4x_inner:
# Same dual-chain pattern as .Lmulx4x_1st, but additionally accumulating
# the previous pass's partial sums from (%rbx).
mulxq 0(%rsi),%r10,%rax
adcxq %rbp,%r15
adoxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq 0(%rbx),%r10
adoxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq 8(%rbx),%r11
adoxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
movq %r8,%rdx                       # switch multiplier to m
adcxq 16(%rbx),%r12
adoxq %rax,%r13
adcxq 24(%rbx),%r13
adoxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adcxq %rbp,%r14

adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
adoxq %r15,%r13
movq %r11,-32(%rbx)
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                       # back to b[i]
leaq 32(%rcx),%rcx
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
movq %r13,-16(%rbx)

decq %rdi
jnz .Lmulx4x_inner

movq 0+8(%rsp),%rax                 # num
adcq %rbp,%r15
subq 0(%rbx),%rdi                   # %rdi is zero here; this only sets CF from t[top]
movq 8+8(%rsp),%rdi                 # restore table cursor
movq 16+8(%rsp),%r10                # end of the power table
adcq %r15,%r14
leaq (%rsi,%rax,1),%rsi             # rewind ap
adcq %rbp,%rbp                      # new top carry
movq %r14,-8(%rbx)

cmpq %r10,%rdi                      # more table entries to process?
jb .Lmulx4x_outer

# Set up the final constant-time conditional subtraction and tail-jump to
# .Lsqrx4x_sub_entry (defined elsewhere in this file).
movq -8(%rcx),%r10                  # top modulus limb
movq %rbp,%r8
movq (%rcx,%rax,1),%r12
leaq (%rcx,%rax,1),%rbp             # rewind np
movq %rax,%rcx
leaq (%rbx,%rax,1),%rdi
xorl %eax,%eax
xorq %r15,%r15
subq %r14,%r10                      # compare top limbs ...
adcq %r15,%r15
orq %r15,%r8
sarq $3+2,%rcx                      # loop counter for the subtract
subq %r8,%rax                       # %rax = 0 or all-ones select mask
movq 56+8(%rsp),%rdx                # rp
decq %r12                           # borrow trick, as in __bn_post4x_internal
movq 8(%rbp),%r13
xorq %r8,%r8
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqrx4x_sub_entry
.cfi_endproc
.size mulx4x_internal,.-mulx4x_internal
.type bn_powerx5,@function
.align 32
# bn_powerx5 — MULX/ADX (BMI2+ADX) variant of bn_power5: performs five
# Montgomery squarings of the input followed by one Montgomery
# multiplication (mulx4x_internal), i.e. one window step of fixed-window
# modular exponentiation.
# NOTE(review): SysV args appear to be rp=%rdi, ap=%rsi, table=%rdx,
# np=%rcx, n0=%r8, num=%r9d (plus power on the stack) — inferred from how
# the registers are used below; confirm against the generating perlasm
# script.
bn_powerx5:
.cfi_startproc
movq %rsp,%rax                          # keep original %rsp for the epilogue
.cfi_def_cfa_register %rax
.Lpowerx5_enter:
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lpowerx5_prologue:

shll $3,%r9d                            # num *= 8: byte length of a vector
leaq (%r9,%r9,2),%r10                   # r10 = 3*num*8
negq %r9                                # r9 = -num*8
movq (%r8),%r8                          # load the n0 word itself








# Choose a stack frame (about 2*num*8+320 bytes) whose low 12 bits differ
# from those of the output pointer, to avoid 4KB cache-bank aliasing
# between the scratch area and rp.
leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwrx_sp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp .Lpwrx_sp_done

.align 32
.Lpwrx_sp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lpwrx_sp_done:
andq $-64,%rbp                          # 64-byte align the new stack base
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lpwrx_page_walk
jmp .Lpwrx_page_walk_done

# Probe the stack one page at a time so the guard page is grown safely.
.Lpwrx_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lpwrx_page_walk
.Lpwrx_page_walk_done:

movq %r9,%r10                           # r10 = -num*8 (saved for later)
negq %r9                                # r9 = +num*8











# Stash pointers/sizes in xmm registers across the internal calls
# (REX-prefixed movq, emitted as raw bytes by the generator).
pxor %xmm0,%xmm0
.byte 102,72,15,110,207                 # encoded: movq %rdi,%xmm1
.byte 102,72,15,110,209                 # encoded: movq %rcx,%xmm2
.byte 102,73,15,110,218                 # encoded: movq %r10,%xmm3
.byte 102,72,15,110,226                 # encoded: movq %rdx,%xmm4
movq %r8,32(%rsp)                       # save n0 for the reduction code
movq %rax,40(%rsp)                      # save original %rsp
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
# DWARF expression above: CFA = *(%rsp+40) + 8
.Lpowerx5_body:

# Five Montgomery squarings: each square (sqrx8x) is followed by the final
# conditional subtraction / copy-back step (postx4x).
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal

movq %r10,%r9                           # restore -num*8
movq %rsi,%rdi
.byte 102,72,15,126,209                 # encoded: movq %xmm2,%rcx
.byte 102,72,15,126,226                 # encoded: movq %xmm4,%rdx
movq 40(%rsp),%rax                      # original %rsp for mulx4x_internal

call mulx4x_internal

movq 40(%rsp),%rsi                      # unwind to the saved stack pointer
.cfi_def_cfa %rsi,8
movq $1,%rax                            # return 1 (success)

movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lpowerx5_epilogue:
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size bn_powerx5,.-bn_powerx5
2920 | |
.globl bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.type bn_sqrx8x_internal,@function
.align 32
# bn_sqrx8x_internal — MULX/ADX Montgomery squaring kernel, 8 limbs per
# pass.  Phase 1 computes the off-diagonal products a[i]*a[j] (i<j);
# phase 2 (.Lsqrx4x_shift_n_add) doubles that sum and adds the diagonal
# squares a[i]^2; phase 3 (__bn_sqrx8x_reduction) performs the word-by-word
# Montgomery reduction against the modulus.
# NOTE(review): register contract inferred from the callers in this file —
# %rsi = ap, %r9 = num*8, modulus pointer stashed in %xmm2, n0 saved at
# 32+8(%rsp), scratch tp at 48+8(%rsp); confirm against the generator.
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
.cfi_startproc








































# tp = 48+8(%rsp); %rbp = one past the last input limb.
leaq 48+8(%rsp),%rdi
leaq (%rsi,%r9,1),%rbp
movq %r9,0+8(%rsp)
movq %rbp,8+8(%rsp)
jmp .Lsqr8x_zero_start

.align 32
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
# line above: multi-byte nop, alignment padding
.Lsqrx8x_zero:
.byte 0x3e                              # DS prefix — benign padding byte
# Zero the scratch result area, 128 bytes per full iteration.
movdqa %xmm0,0(%rdi)
movdqa %xmm0,16(%rdi)
movdqa %xmm0,32(%rdi)
movdqa %xmm0,48(%rdi)
.Lsqr8x_zero_start:
movdqa %xmm0,64(%rdi)
movdqa %xmm0,80(%rdi)
movdqa %xmm0,96(%rdi)
movdqa %xmm0,112(%rdi)
leaq 128(%rdi),%rdi
subq $64,%r9
jnz .Lsqrx8x_zero

movq 0(%rsi),%rdx                       # rdx = a[0], implicit mulx operand

# Clear the eight column accumulators.
xorq %r10,%r10
xorq %r11,%r11
xorq %r12,%r12
xorq %r13,%r13
xorq %r14,%r14
xorq %r15,%r15
leaq 48+8(%rsp),%rdi
xorq %rbp,%rbp                          # rbp doubles as a zero register here
jmp .Lsqrx8x_outer_loop

.align 32
# Off-diagonal products of the next 8 source limbs against the rest of the
# vector, using the dual adcx (CF) / adox (OF) carry chains.
.Lsqrx8x_outer_loop:
mulxq 8(%rsi),%r8,%rax
adcxq %r9,%r8
adoxq %rax,%r10
mulxq 16(%rsi),%r9,%rax
adcxq %r10,%r9
adoxq %rax,%r11
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # encoded: mulxq 24(%rsi),%r10,%rax
adcxq %r11,%r10
adoxq %rax,%r12
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # encoded: mulxq 32(%rsi),%r11,%rax
adcxq %r12,%r11
adoxq %rax,%r13
mulxq 40(%rsi),%r12,%rax
adcxq %r13,%r12
adoxq %rax,%r14
mulxq 48(%rsi),%r13,%rax
adcxq %r14,%r13
adoxq %r15,%rax
mulxq 56(%rsi),%r14,%r15
movq 8(%rsi),%rdx                       # next multiplier limb a[1]
adcxq %rax,%r14
adoxq %rbp,%r15
adcq 64(%rdi),%r15
movq %r8,8(%rdi)
movq %r9,16(%rdi)
sbbq %rcx,%rcx                          # capture carry as 0/-1
xorq %rbp,%rbp                          # reset both carry flags


mulxq 16(%rsi),%r8,%rbx
mulxq 24(%rsi),%r9,%rax
adcxq %r10,%r8
adoxq %rbx,%r9
mulxq 32(%rsi),%r10,%rbx
adcxq %r11,%r9
adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # encoded: mulxq 40(%rsi),%r11,%rax
adcxq %r12,%r10
adoxq %rbx,%r11
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # encoded: mulxq 48(%rsi),%r12,%rbx
adcxq %r13,%r11
adoxq %r14,%r12
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # encoded: mulxq 56(%rsi),%r13,%r14
movq 16(%rsi),%rdx                      # next multiplier limb a[2]
adcxq %rax,%r12
adoxq %rbx,%r13
adcxq %r15,%r13
adoxq %rbp,%r14
adcxq %rbp,%r14

movq %r8,24(%rdi)
movq %r9,32(%rdi)

mulxq 24(%rsi),%r8,%rbx
mulxq 32(%rsi),%r9,%rax
adcxq %r10,%r8
adoxq %rbx,%r9
mulxq 40(%rsi),%r10,%rbx
adcxq %r11,%r9
adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # encoded: mulxq 48(%rsi),%r11,%rax
adcxq %r12,%r10
adoxq %r13,%r11
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # encoded: mulxq 56(%rsi),%r12,%r13
.byte 0x3e                              # DS prefix — benign padding byte
movq 24(%rsi),%rdx                      # next multiplier limb a[3]
adcxq %rbx,%r11
adoxq %rax,%r12
adcxq %r14,%r12
movq %r8,40(%rdi)
movq %r9,48(%rdi)
mulxq 32(%rsi),%r8,%rax
adoxq %rbp,%r13
adcxq %rbp,%r13

mulxq 40(%rsi),%r9,%rbx
adcxq %r10,%r8
adoxq %rax,%r9
mulxq 48(%rsi),%r10,%rax
adcxq %r11,%r9
adoxq %r12,%r10
mulxq 56(%rsi),%r11,%r12
movq 32(%rsi),%rdx                      # a[4]
movq 40(%rsi),%r14                      # preload a[5]..a[6] for the tail
adcxq %rbx,%r10
adoxq %rax,%r11
movq 48(%rsi),%r15
adcxq %r13,%r11
adoxq %rbp,%r12
adcxq %rbp,%r12

movq %r8,56(%rdi)
movq %r9,64(%rdi)

# Remaining products among the last four limbs of this group.
mulxq %r14,%r9,%rax
movq 56(%rsi),%r8                       # a[7]
adcxq %r10,%r9
mulxq %r15,%r10,%rbx
adoxq %rax,%r10
adcxq %r11,%r10
mulxq %r8,%r11,%rax
movq %r14,%rdx
adoxq %rbx,%r11
adcxq %r12,%r11

adcxq %rbp,%rax

mulxq %r15,%r14,%rbx
mulxq %r8,%r12,%r13
movq %r15,%rdx
leaq 64(%rsi),%rsi                      # advance source by 8 limbs
adcxq %r14,%r11
adoxq %rbx,%r12
adcxq %rax,%r12
adoxq %rbp,%r13

.byte 0x67,0x67                         # address-size prefixes — padding
mulxq %r8,%r8,%r14
adcxq %r8,%r13
adcxq %rbp,%r14

cmpq 8+8(%rsp),%rsi                     # consumed all source limbs?
je .Lsqrx8x_outer_break

negq %rcx                               # note: immediately overwritten below
movq $-8,%rcx
movq %rbp,%r15
movq 64(%rdi),%r8
adcxq 72(%rdi),%r9                      # fold previously stored partials in
adcxq 80(%rdi),%r10
adcxq 88(%rdi),%r11
adcq 96(%rdi),%r12
adcq 104(%rdi),%r13
adcq 112(%rdi),%r14
adcq 120(%rdi),%r15
leaq (%rsi),%rbp                        # rbp = current source position
leaq 128(%rdi),%rdi
sbbq %rax,%rax                          # save carry as 0/-1

movq -64(%rsi),%rdx
movq %rax,16+8(%rsp)
movq %rdi,24+8(%rsp)


xorl %eax,%eax
jmp .Lsqrx8x_loop

.align 32
# Inner loop: multiply the current limb (rdx) by 8 limbs at rbp and
# accumulate into the tp window addressed by rdi/rcx.
.Lsqrx8x_loop:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8

mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9

mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10

mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # encoded: mulxq 32(%rbp),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12

mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13

mulxq 48(%rbp),%rax,%r14
movq %rbx,(%rdi,%rcx,8)
movl $0,%ebx
adcxq %rax,%r13
adoxq %r15,%r14

.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # encoded: mulxq 56(%rbp),%rax,%r15
movq 8(%rsi,%rcx,8),%rdx                # next multiplier limb
adcxq %rax,%r14
adoxq %rbx,%r15
adcxq %rbx,%r15

.byte 0x67                              # address-size prefix — padding
incq %rcx
jnz .Lsqrx8x_loop

leaq 64(%rbp),%rbp
movq $-8,%rcx
cmpq 8+8(%rsp),%rbp                     # reached the end of the vector?
je .Lsqrx8x_break

subq 16+8(%rsp),%rbx                    # reload saved carry into CF
.byte 0x66                              # operand-size prefix — padding
movq -64(%rsi),%rdx
adcxq 0(%rdi),%r8
adcxq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
.byte 0x67                              # address-size prefix — padding
sbbq %rax,%rax
xorl %ebx,%ebx
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_loop

.align 32
# End of one multiplier limb group: propagate the saved carry through the
# eight accumulators and either continue or rotate the tp window.
.Lsqrx8x_break:
xorq %rbp,%rbp
subq 16+8(%rsp),%rbx
adcxq %rbp,%r8
movq 24+8(%rsp),%rcx
adcxq %rbp,%r9
movq 0(%rsi),%rdx
adcq $0,%r10
movq %r8,0(%rdi)
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
cmpq %rcx,%rdi
je .Lsqrx8x_outer_loop

# Spill the accumulators and reload the partials for the next window.
movq %r9,8(%rdi)
movq 8(%rcx),%r9
movq %r10,16(%rdi)
movq 16(%rcx),%r10
movq %r11,24(%rdi)
movq 24(%rcx),%r11
movq %r12,32(%rdi)
movq 32(%rcx),%r12
movq %r13,40(%rdi)
movq 40(%rcx),%r13
movq %r14,48(%rdi)
movq 48(%rcx),%r14
movq %r15,56(%rdi)
movq 56(%rcx),%r15
movq %rcx,%rdi
jmp .Lsqrx8x_outer_loop

.align 32
# All off-diagonal products done: store the top limbs, then double the sum
# and add the diagonal squares a[i]^2.
.Lsqrx8x_outer_break:
movq %r9,72(%rdi)
.byte 102,72,15,126,217                 # encoded: movq %xmm3,%rcx
movq %r10,80(%rdi)
movq %r11,88(%rdi)
movq %r12,96(%rdi)
movq %r13,104(%rdi)
movq %r14,112(%rdi)
leaq 48+8(%rsp),%rdi
movq (%rsi,%rcx,1),%rdx

movq 8(%rdi),%r11
xorq %r10,%r10
movq 0+8(%rsp),%r9
adoxq %r11,%r11                         # start the doubling carry chain (OF)
movq 16(%rdi),%r12
movq 24(%rdi),%r13


.align 32
# Per 4 limbs: adox reg,reg doubles the off-diagonal words (shifting the
# top bit into OF) while mulx %rdx,%rdx computes the diagonal square that
# is folded in via the adcx chain.
.Lsqrx4x_shift_n_add:
mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # encoded: movq 8(%rsi,%rcx,1),%rdx
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # encoded: movq 32(%rdi),%r10
adoxq %r13,%r13
adcxq %r11,%rbx
movq 40(%rdi),%r11
movq %rax,0(%rdi)
movq %rbx,8(%rdi)

mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
movq 16(%rsi,%rcx,1),%rdx
movq 48(%rdi),%r12
adoxq %r11,%r11
adcxq %r13,%rbx
movq 56(%rdi),%r13
movq %rax,16(%rdi)
movq %rbx,24(%rdi)

mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
movq 24(%rsi,%rcx,1),%rdx
leaq 32(%rcx),%rcx
movq 64(%rdi),%r10
adoxq %r13,%r13
adcxq %r11,%rbx
movq 72(%rdi),%r11
movq %rax,32(%rdi)
movq %rbx,40(%rdi)

mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
jrcxz .Lsqrx4x_shift_n_add_break        # done when the byte counter hits 0
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # encoded: movq 0(%rsi,%rcx,1),%rdx
adoxq %r11,%r11
adcxq %r13,%rbx
movq 80(%rdi),%r12
movq 88(%rdi),%r13
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
nop
jmp .Lsqrx4x_shift_n_add

.align 32
.Lsqrx4x_shift_n_add_break:
adcxq %r13,%rbx
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
.byte 102,72,15,126,213                 # encoded: movq %xmm2,%rbp (modulus)
# Montgomery reduction of the 2*num-limb square held in tp.
# rbx = n0 (saved at 32+8(%rsp)), rdx = tp[0], rbp = modulus pointer.
__bn_sqrx8x_reduction:
xorl %eax,%eax
movq 32+8(%rsp),%rbx
movq 48+8(%rsp),%rdx
leaq -64(%rbp,%r9,1),%rcx               # sentinel: end of the modulus walk

movq %rcx,0+8(%rsp)
movq %rdi,8+8(%rsp)

leaq 48+8(%rsp),%rdi
jmp .Lsqrx8x_reduction_loop

.align 32
.Lsqrx8x_reduction_loop:
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq %rdx,%r8
imulq %rbx,%rdx                         # m = tp[i] * n0 mod 2^64
movq 40(%rdi),%r13
movq 48(%rdi),%r14
movq 56(%rdi),%r15
movq %rax,24+8(%rsp)

leaq 64(%rdi),%rdi
xorq %rsi,%rsi                          # rsi = constant zero here
movq $-8,%rcx
jmp .Lsqrx8x_reduce

.align 32
# One reduction step: add m * modulus so the bottom limb cancels; the next
# m is computed in-flight via mulx against n0.
.Lsqrx8x_reduce:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax
adoxq %r9,%r8

mulxq 8(%rbp),%rbx,%r9
adcxq %rbx,%r8
adoxq %r10,%r9

mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10

mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11

.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # encoded: mulxq 32(%rbp),%rbx,%r12
movq %rdx,%rax
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12

mulxq 32+8(%rsp),%rbx,%rdx              # next m = r8 * n0
movq %rax,%rdx
movq %rax,64+48+8(%rsp,%rcx,8)          # stash m for the tail pass

mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13

mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14

mulxq 56(%rbp),%rax,%r15
movq %rbx,%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
adcxq %rsi,%r15

.byte 0x67,0x67,0x67                    # address-size prefixes — padding
incq %rcx
jnz .Lsqrx8x_reduce

movq %rsi,%rax
cmpq 0+8(%rsp),%rbp                     # whole modulus consumed?
jae .Lsqrx8x_no_tail

# Fold the next 8 stored tp limbs in and continue reducing the tail.
movq 48+8(%rsp),%rdx
addq 0(%rdi),%r8
leaq 64(%rbp),%rbp
movq $-8,%rcx
adcxq 8(%rdi),%r9
adcxq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
sbbq %rax,%rax

xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail

.align 32
# Tail pass: same m*modulus accumulation, replaying the m values that were
# stashed at 64+48+8(%rsp) during the reduce pass.
.Lsqrx8x_tail:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8

mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9

mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10

mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # encoded: mulxq 32(%rbp),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12

mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13

mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14

mulxq 56(%rbp),%rax,%r15
movq 72+48+8(%rsp,%rcx,8),%rdx          # next stashed m value
adcxq %rax,%r14
adoxq %rsi,%r15
movq %rbx,(%rdi,%rcx,8)
movq %r8,%rbx
adcxq %rsi,%r15

incq %rcx
jnz .Lsqrx8x_tail

cmpq 0+8(%rsp),%rbp
jae .Lsqrx8x_tail_done

subq 16+8(%rsp),%rsi                    # reload saved carry into CF
movq 48+8(%rsp),%rdx
leaq 64(%rbp),%rbp
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
sbbq %rax,%rax
subq $8,%rcx

xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail

.align 32
.Lsqrx8x_tail_done:
xorq %rax,%rax
addq 24+8(%rsp),%r8                     # top-most carry from this round
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
adcq $0,%rax

subq 16+8(%rsp),%rsi                    # reload saved carry into CF
.Lsqrx8x_no_tail:
adcq 0(%rdi),%r8
.byte 102,72,15,126,217                 # encoded: movq %xmm3,%rcx
adcq 8(%rdi),%r9
movq 56(%rbp),%rsi                      # top modulus limb (flags untouched)
.byte 102,72,15,126,213                 # encoded: movq %xmm2,%rbp (modulus)
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax                            # rax = final carry out

movq 32+8(%rsp),%rbx                    # n0 for the next round
movq 64(%rdi,%rcx,1),%rdx

# Store this round's 8 reduced limbs.
movq %r8,0(%rdi)
leaq 64(%rdi),%r8
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

leaq 64(%rdi,%rcx,1),%rdi
cmpq 8+8(%rsp),%r8
jb .Lsqrx8x_reduction_loop
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align 32
.type __bn_postx4x_internal,@function
# __bn_postx4x_internal — post-reduction step for the MULX path: performs a
# constant-time conditional subtraction of the modulus from the reduced
# value and writes the result out.  Instead of branching on the carry, it
# adds the masked one's-complement of the modulus, which is equivalent to
# subtracting it when the mask is all-ones and a no-op when it is zero.
# NOTE(review): on entry %rbp = modulus, %rcx = num*8, %rax = 0/1 carry from
# the reduction, destination pointer stashed in %xmm1 — inferred from the
# callers in this file; confirm against the generator script.
__bn_postx4x_internal:
.cfi_startproc
movq 0(%rbp),%r12
movq %rcx,%r10                          # preserve byte count
movq %rcx,%r9
negq %rax                               # 0/1 carry -> 0/-1 select mask
sarq $3+2,%rcx                          # rcx = -(num/4): groups of 4 limbs

.byte 102,72,15,126,202                 # encoded: movq %xmm1,%rdx (dest)
.byte 102,72,15,126,206                 # encoded: movq %xmm1,%rsi
decq %r12                               # n[0]-1 so ~(n[0]-1) == -n[0] below
movq 8(%rbp),%r13
xorq %r8,%r8                            # running borrow = 0
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqrx4x_sub_entry

.align 16
.Lsqrx4x_sub:
movq 0(%rbp),%r12
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
.Lsqrx4x_sub_entry:
andnq %rax,%r12,%r12                    # r12 = ~n[i] & mask (BMI1 andn)
leaq 32(%rbp),%rbp
andnq %rax,%r13,%r13
andnq %rax,%r14,%r14
andnq %rax,%r15,%r15

negq %r8                                # reload saved borrow into CF
adcq 0(%rdi),%r12                       # tp[i] + (~n[i] & mask) + carry:
adcq 8(%rdi),%r13                       # subtracts n when mask is all-ones,
adcq 16(%rdi),%r14                      # copies tp unchanged otherwise
adcq 24(%rdi),%r15
movq %r12,0(%rdx)
leaq 32(%rdi),%rdi
movq %r13,8(%rdx)
sbbq %r8,%r8                            # save borrow for the next group
movq %r14,16(%rdx)
movq %r15,24(%rdx)
leaq 32(%rdx),%rdx

incq %rcx
jnz .Lsqrx4x_sub

negq %r9                                # restore the negated byte count

.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size __bn_postx4x_internal,.-__bn_postx4x_internal
.globl bn_scatter5
.hidden bn_scatter5
.type bn_scatter5,@function
.align 16
# bn_scatter5 — store a multi-word value into the power table with a
# 256-byte stride, interleaving entries so the matching gather touches
# every cache line regardless of the index.
# NOTE(review): SysV args presumably inp=%rdi, num=%esi, table=%rdx,
# power=%rcx — confirm against the C prototype.
bn_scatter5:
.cfi_startproc
cmpl $0,%esi                            # nothing to do for zero words
jz .Lscatter_epilogue
leaq (%rdx,%rcx,8),%rdx                 # column start = table + power*8
.Lscatter:
movq (%rdi),%rax
leaq 8(%rdi),%rdi
movq %rax,(%rdx)
leaq 256(%rdx),%rdx                     # next word: one 256-byte row down
subl $1,%esi
jnz .Lscatter
.Lscatter_epilogue:
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size bn_scatter5,.-bn_scatter5
3615 | |
.globl bn_gather5
.hidden bn_gather5
.type bn_gather5,@function
.align 32
# bn_gather5 — constant-time table lookup: reads EVERY entry of the power
# table and masks out all but the requested one, so the memory access
# pattern is independent of the secret index.
# NOTE(review): SysV args presumably out=%rdi, num=%esi, table=%rdx,
# power=%ecx — confirm against the C prototype.  The first instructions
# are emitted as raw bytes so the SEH-delimited prologue has a fixed size.
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:

.byte 0x4c,0x8d,0x14,0x24               # encoded: leaq (%rsp),%r10 (save sp)
.cfi_def_cfa_register %r10
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # encoded: subq $0x108,%rsp
leaq .Linc(%rip),%rax
andq $-16,%rsp                          # 16-byte align for movdqa stores

# Build 16 xmm selector masks on the stack: repeatedly pcmpeqd the running
# counter (stepped via .Linc) against the broadcast index, so exactly one
# dword pair is all-ones across the whole set.
movd %ecx,%xmm5
movdqa 0(%rax),%xmm0
movdqa 16(%rax),%xmm1
leaq 128(%rdx),%r11
leaq 128(%rsp),%rax

pshufd $0,%xmm5,%xmm5                   # broadcast the index to all lanes
movdqa %xmm1,%xmm4                      # xmm4 = step constant {2,2,2,2}
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-128(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-112(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-96(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-80(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-64(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-48(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-16(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,0(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,16(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,48(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,64(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,80(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,96(%rax)
movdqa %xmm4,%xmm2
movdqa %xmm3,112(%rax)
jmp .Lgather

.align 32
# Per output word: AND each 16-byte slice of the 256-byte row with its
# selector mask and OR everything together; only the selected entry
# survives.  pshufd/por then folds the two qword halves into one.
.Lgather:
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r11),%xmm0
movdqa -112(%r11),%xmm1
movdqa -96(%r11),%xmm2
pand -128(%rax),%xmm0
movdqa -80(%r11),%xmm3
pand -112(%rax),%xmm1
por %xmm0,%xmm4
pand -96(%rax),%xmm2
por %xmm1,%xmm5
pand -80(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r11),%xmm0
movdqa -48(%r11),%xmm1
movdqa -32(%r11),%xmm2
pand -64(%rax),%xmm0
movdqa -16(%r11),%xmm3
pand -48(%rax),%xmm1
por %xmm0,%xmm4
pand -32(%rax),%xmm2
por %xmm1,%xmm5
pand -16(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r11),%xmm0
movdqa 16(%r11),%xmm1
movdqa 32(%r11),%xmm2
pand 0(%rax),%xmm0
movdqa 48(%r11),%xmm3
pand 16(%rax),%xmm1
por %xmm0,%xmm4
pand 32(%rax),%xmm2
por %xmm1,%xmm5
pand 48(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r11),%xmm0
movdqa 80(%r11),%xmm1
movdqa 96(%r11),%xmm2
pand 64(%rax),%xmm0
movdqa 112(%r11),%xmm3
pand 80(%rax),%xmm1
por %xmm0,%xmm4
pand 96(%rax),%xmm2
por %xmm1,%xmm5
pand 112(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
leaq 256(%r11),%r11                     # advance to the next table row
pshufd $0x4e,%xmm4,%xmm0                # swap qword halves
por %xmm4,%xmm0
movq %xmm0,(%rdi)                       # emit the selected word
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather

leaq (%r10),%rsp                        # restore the saved stack pointer
.cfi_def_cfa_register %rsp
.byte 0xf3,0xc3                         # rep ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size bn_gather5,.-bn_gather5
.align 64
# .Linc — dword constants used by the gather code to initialize and step
# the per-entry comparison counters: {0,0,1,1} seeds two lanes, {2,2,2,2}
# is the per-iteration increment.
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
# ASCII banner: "Montgomery Multiplication with scatter/gather for
# x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3789 | #endif |
3790 | |