# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif
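
# The guard above compiles the assembly out under MemorySanitizer: MSan
# cannot see initialization performed by hand-written assembly, so the C
# fallback is used instead when memory_sanitizer is enabled.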

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
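
# Notes on the constant pool above: .Lsigma spells the ASCII string
# "expand 32-byte k", the ChaCha20 constant. .Lrot16 and .Lrot24 are
# pshufb masks that rotate each 32-bit lane left by 16 and 24 bits.
# .Lone/.Linc/.Lfour/.Lincy/.Leight step the per-block counters for the
# 1x, 4x and 8x code paths. The final .byte string is the CRYPTOGAMS
# identification string
# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
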
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
.cfi_startproc
 cmpq $0,%rdx
 je .Lno_data
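
# Feature dispatch: the qword at OPENSSL_ia32cap_P+4 holds CPUID.1:ECX in
# its low half; bit 9 ($512) indicates SSSE3 and steers control to the
# vectorized paths below.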
 movq OPENSSL_ia32cap_P+4(%rip),%r10
 testl $512,%r10d
 jnz .LChaCha20_ssse3

 pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
 pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
 pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
 pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
 pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
 pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
 subq $64+24,%rsp
.cfi_adjust_cfa_offset 88
.Lctr32_body:


 movdqu (%rcx),%xmm1
 movdqu 16(%rcx),%xmm2
 movdqu (%r8),%xmm3
 movdqa .Lone(%rip),%xmm4


 movdqa %xmm1,16(%rsp)
 movdqa %xmm2,32(%rsp)
 movdqa %xmm3,48(%rsp)
 movq %rdx,%rbp
 jmp .Loop_outer

.align 32
.Loop_outer:
 movl $0x61707865,%eax
 movl $0x3320646e,%ebx
 movl $0x79622d32,%ecx
 movl $0x6b206574,%edx
 movl 16(%rsp),%r8d
 movl 20(%rsp),%r9d
 movl 24(%rsp),%r10d
 movl 28(%rsp),%r11d
 movd %xmm3,%r12d
 movl 52(%rsp),%r13d
 movl 56(%rsp),%r14d
 movl 60(%rsp),%r15d

 movq %rbp,64+0(%rsp)
 movl $10,%ebp
 movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214
 movq %rdi,64+16(%rsp)
 movq %rsi,%rdi
 shrq $32,%rdi
 jmp .Loop

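# The scalar loop holds the 16-word state in eight GPRs plus stack slots
# and runs %ebp=10 double rounds (one column round plus one diagonal round
# per iteration, 20 ChaCha20 rounds in all). For reference, a
# quarter-round on state words (a,b,c,d) is, in C, with ROTL32 a 32-bit
# rotate-left:
#
#   a += b; d ^= a; d = ROTL32(d, 16);
#   c += d; b ^= c; b = ROTL32(b, 12);
#   a += b; d ^= a; d = ROTL32(d,  8);
#   c += d; b ^= c; b = ROTL32(b,  7);
#
# The movl constants above are "expa","nd 3","2-by","te k" as
# little-endian words, and ".byte 102,72,15,126,214" appears to be a
# hand-encoded "movq %xmm2,%rsi", loading state words 8 and 9 into
# %esi/%edi via the shift that follows.
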
.align 32
.Loop:
 addl %r8d,%eax
 xorl %eax,%r12d
 roll $16,%r12d
 addl %r9d,%ebx
 xorl %ebx,%r13d
 roll $16,%r13d
 addl %r12d,%esi
 xorl %esi,%r8d
 roll $12,%r8d
 addl %r13d,%edi
 xorl %edi,%r9d
 roll $12,%r9d
 addl %r8d,%eax
 xorl %eax,%r12d
 roll $8,%r12d
 addl %r9d,%ebx
 xorl %ebx,%r13d
 roll $8,%r13d
 addl %r12d,%esi
 xorl %esi,%r8d
 roll $7,%r8d
 addl %r13d,%edi
 xorl %edi,%r9d
 roll $7,%r9d
 movl %esi,32(%rsp)
 movl %edi,36(%rsp)
 movl 40(%rsp),%esi
 movl 44(%rsp),%edi
 addl %r10d,%ecx
 xorl %ecx,%r14d
 roll $16,%r14d
 addl %r11d,%edx
 xorl %edx,%r15d
 roll $16,%r15d
 addl %r14d,%esi
 xorl %esi,%r10d
 roll $12,%r10d
 addl %r15d,%edi
 xorl %edi,%r11d
 roll $12,%r11d
 addl %r10d,%ecx
 xorl %ecx,%r14d
 roll $8,%r14d
 addl %r11d,%edx
 xorl %edx,%r15d
 roll $8,%r15d
 addl %r14d,%esi
 xorl %esi,%r10d
 roll $7,%r10d
 addl %r15d,%edi
 xorl %edi,%r11d
 roll $7,%r11d
 addl %r9d,%eax
 xorl %eax,%r15d
 roll $16,%r15d
 addl %r10d,%ebx
 xorl %ebx,%r12d
 roll $16,%r12d
 addl %r15d,%esi
 xorl %esi,%r9d
 roll $12,%r9d
 addl %r12d,%edi
 xorl %edi,%r10d
 roll $12,%r10d
 addl %r9d,%eax
 xorl %eax,%r15d
 roll $8,%r15d
 addl %r10d,%ebx
 xorl %ebx,%r12d
 roll $8,%r12d
 addl %r15d,%esi
 xorl %esi,%r9d
 roll $7,%r9d
 addl %r12d,%edi
 xorl %edi,%r10d
 roll $7,%r10d
 movl %esi,40(%rsp)
 movl %edi,44(%rsp)
 movl 32(%rsp),%esi
 movl 36(%rsp),%edi
 addl %r11d,%ecx
 xorl %ecx,%r13d
 roll $16,%r13d
 addl %r8d,%edx
 xorl %edx,%r14d
 roll $16,%r14d
 addl %r13d,%esi
 xorl %esi,%r11d
 roll $12,%r11d
 addl %r14d,%edi
 xorl %edi,%r8d
 roll $12,%r8d
 addl %r11d,%ecx
 xorl %ecx,%r13d
 roll $8,%r13d
 addl %r8d,%edx
 xorl %edx,%r14d
 roll $8,%r14d
 addl %r13d,%esi
 xorl %esi,%r11d
 roll $7,%r11d
 addl %r14d,%edi
 xorl %edi,%r8d
 roll $7,%r8d
 decl %ebp
 jnz .Loop
 movl %edi,36(%rsp)
 movl %esi,32(%rsp)
 movq 64(%rsp),%rbp
 movdqa %xmm2,%xmm1
 movq 64+8(%rsp),%rsi
 paddd %xmm4,%xmm3
 movq 64+16(%rsp),%rdi

 addl $0x61707865,%eax
 addl $0x3320646e,%ebx
 addl $0x79622d32,%ecx
 addl $0x6b206574,%edx
 addl 16(%rsp),%r8d
 addl 20(%rsp),%r9d
 addl 24(%rsp),%r10d
 addl 28(%rsp),%r11d
 addl 48(%rsp),%r12d
 addl 52(%rsp),%r13d
 addl 56(%rsp),%r14d
 addl 60(%rsp),%r15d
 paddd 32(%rsp),%xmm1

 cmpq $64,%rbp
 jb .Ltail

 xorl 0(%rsi),%eax
 xorl 4(%rsi),%ebx
 xorl 8(%rsi),%ecx
 xorl 12(%rsi),%edx
 xorl 16(%rsi),%r8d
 xorl 20(%rsi),%r9d
 xorl 24(%rsi),%r10d
 xorl 28(%rsi),%r11d
 movdqu 32(%rsi),%xmm0
 xorl 48(%rsi),%r12d
 xorl 52(%rsi),%r13d
 xorl 56(%rsi),%r14d
 xorl 60(%rsi),%r15d
 leaq 64(%rsi),%rsi
 pxor %xmm1,%xmm0

 movdqa %xmm2,32(%rsp)
 movd %xmm3,48(%rsp)

 movl %eax,0(%rdi)
 movl %ebx,4(%rdi)
 movl %ecx,8(%rdi)
 movl %edx,12(%rdi)
 movl %r8d,16(%rdi)
 movl %r9d,20(%rdi)
 movl %r10d,24(%rdi)
 movl %r11d,28(%rdi)
 movdqu %xmm0,32(%rdi)
 movl %r12d,48(%rdi)
 movl %r13d,52(%rdi)
 movl %r14d,56(%rdi)
 movl %r15d,60(%rdi)
 leaq 64(%rdi),%rdi

 subq $64,%rbp
 jnz .Loop_outer

 jmp .Ldone

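# Tail path: fewer than 64 bytes remain, so the last keystream block is
# spilled to the stack and consumed one byte at a time in .Loop_tail.
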
.align 16
.Ltail:
 movl %eax,0(%rsp)
 movl %ebx,4(%rsp)
 xorq %rbx,%rbx
 movl %ecx,8(%rsp)
 movl %edx,12(%rsp)
 movl %r8d,16(%rsp)
 movl %r9d,20(%rsp)
 movl %r10d,24(%rsp)
 movl %r11d,28(%rsp)
 movdqa %xmm1,32(%rsp)
 movl %r12d,48(%rsp)
 movl %r13d,52(%rsp)
 movl %r14d,56(%rsp)
 movl %r15d,60(%rsp)

.Loop_tail:
 movzbl (%rsi,%rbx,1),%eax
 movzbl (%rsp,%rbx,1),%edx
 leaq 1(%rbx),%rbx
 xorl %edx,%eax
 movb %al,-1(%rdi,%rbx,1)
 decq %rbp
 jnz .Loop_tail

.Ldone:
 leaq 64+24+48(%rsp),%rsi
 movq -48(%rsi),%r15
.cfi_restore r15
 movq -40(%rsi),%r14
.cfi_restore r14
 movq -32(%rsi),%r13
.cfi_restore r13
 movq -24(%rsi),%r12
.cfi_restore r12
 movq -16(%rsi),%rbp
.cfi_restore rbp
 movq -8(%rsi),%rbx
.cfi_restore rbx
 leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
 .byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32
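
# ChaCha20_ssse3 processes one 64-byte block per iteration with the state
# held as four rows in %xmm0..%xmm3. Rotations by 16 and 24 bits use the
# .Lrot16/.Lrot24 pshufb masks (the ".byte 102,15,56,0,222" and "...,223"
# sequences appear to be hand-encoded "pshufb %xmm6,%xmm3" and
# "pshufb %xmm7,%xmm3"); rotations by 12 and 7 use the shift/shift/or
# idiom visible below, e.g. a rotate-left by 12 of %xmm1:
#
#   movdqa %xmm1,%xmm4
#   psrld $20,%xmm1
#   pslld $12,%xmm4
#   por %xmm4,%xmm1
#
# The ".byte 0xf3,0xc3" at each epilogue is a two-byte "rep ret".
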
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
 movq %rsp,%r9
.cfi_def_cfa_register r9
 cmpq $128,%rdx
 ja .LChaCha20_4x

.Ldo_sse3_after_all:
 subq $64+8,%rsp
 movdqa .Lsigma(%rip),%xmm0
 movdqu (%rcx),%xmm1
 movdqu 16(%rcx),%xmm2
 movdqu (%r8),%xmm3
 movdqa .Lrot16(%rip),%xmm6
 movdqa .Lrot24(%rip),%xmm7

 movdqa %xmm0,0(%rsp)
 movdqa %xmm1,16(%rsp)
 movdqa %xmm2,32(%rsp)
 movdqa %xmm3,48(%rsp)
 movq $10,%r8
 jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
 movdqa .Lone(%rip),%xmm3
 movdqa 0(%rsp),%xmm0
 movdqa 16(%rsp),%xmm1
 movdqa 32(%rsp),%xmm2
 paddd 48(%rsp),%xmm3
 movq $10,%r8
 movdqa %xmm3,48(%rsp)
 jmp .Loop_ssse3

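# Between the two half-rounds the rows are re-diagonalized with pshufd:
# immediate 57 picks lanes 1,2,3,0, immediate 78 picks lanes 2,3,0,1 and
# immediate 147 picks lanes 3,0,1,2, so the diagonal round can reuse the
# column-round instruction sequence unchanged.
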
.align 32
.Loop_ssse3:
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,222
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $20,%xmm1
 pslld $12,%xmm4
 por %xmm4,%xmm1
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,223
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $25,%xmm1
 pslld $7,%xmm4
 por %xmm4,%xmm1
 pshufd $78,%xmm2,%xmm2
 pshufd $57,%xmm1,%xmm1
 pshufd $147,%xmm3,%xmm3
 nop
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,222
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $20,%xmm1
 pslld $12,%xmm4
 por %xmm4,%xmm1
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,223
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $25,%xmm1
 pslld $7,%xmm4
 por %xmm4,%xmm1
 pshufd $78,%xmm2,%xmm2
 pshufd $147,%xmm1,%xmm1
 pshufd $57,%xmm3,%xmm3
 decq %r8
 jnz .Loop_ssse3
 paddd 0(%rsp),%xmm0
 paddd 16(%rsp),%xmm1
 paddd 32(%rsp),%xmm2
 paddd 48(%rsp),%xmm3

 cmpq $64,%rdx
 jb .Ltail_ssse3

 movdqu 0(%rsi),%xmm4
 movdqu 16(%rsi),%xmm5
 pxor %xmm4,%xmm0
 movdqu 32(%rsi),%xmm4
 pxor %xmm5,%xmm1
 movdqu 48(%rsi),%xmm5
 leaq 64(%rsi),%rsi
 pxor %xmm4,%xmm2
 pxor %xmm5,%xmm3

 movdqu %xmm0,0(%rdi)
 movdqu %xmm1,16(%rdi)
 movdqu %xmm2,32(%rdi)
 movdqu %xmm3,48(%rdi)
 leaq 64(%rdi),%rdi

 subq $64,%rdx
 jnz .Loop_outer_ssse3

 jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
 movdqa %xmm0,0(%rsp)
 movdqa %xmm1,16(%rsp)
 movdqa %xmm2,32(%rsp)
 movdqa %xmm3,48(%rsp)
 xorq %r8,%r8

.Loop_tail_ssse3:
 movzbl (%rsi,%r8,1),%eax
 movzbl (%rsp,%r8,1),%ecx
 leaq 1(%r8),%r8
 xorl %ecx,%eax
 movb %al,-1(%rdi,%r8,1)
 decq %rdx
 jnz .Loop_tail_ssse3

.Ldone_ssse3:
 leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
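
# ChaCha20_4x processes four blocks in parallel in word-sliced form: each
# XMM register holds one of the 16 state words across all four blocks,
# built with pshufd broadcasts, with .Linc supplying the four counter
# values. It is entered for inputs over 128 bytes when SSSE3 is
# available; the MOVBE-without-XSAVE test below (masks 0x4400000 and
# 0x400000 against CPUID.1:ECX) appears to detect Atom-class cores and
# sends inputs of at most 192 bytes back to the 1x path, while bit 5 of
# CPUID.7:EBX ($32 after the shift) escalates to the AVX2 8x path.
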
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
 movq %rsp,%r9
.cfi_def_cfa_register r9
 movq %r10,%r11
 shrq $32,%r10
 testq $32,%r10
 jnz .LChaCha20_8x
 cmpq $192,%rdx
 ja .Lproceed4x

 andq $71303168,%r11
 cmpq $4194304,%r11
 je .Ldo_sse3_after_all

.Lproceed4x:
 subq $0x140+8,%rsp
 movdqa .Lsigma(%rip),%xmm11
 movdqu (%rcx),%xmm15
 movdqu 16(%rcx),%xmm7
 movdqu (%r8),%xmm3
 leaq 256(%rsp),%rcx
 leaq .Lrot16(%rip),%r10
 leaq .Lrot24(%rip),%r11

 pshufd $0x00,%xmm11,%xmm8
 pshufd $0x55,%xmm11,%xmm9
 movdqa %xmm8,64(%rsp)
 pshufd $0xaa,%xmm11,%xmm10
 movdqa %xmm9,80(%rsp)
 pshufd $0xff,%xmm11,%xmm11
 movdqa %xmm10,96(%rsp)
 movdqa %xmm11,112(%rsp)

 pshufd $0x00,%xmm15,%xmm12
 pshufd $0x55,%xmm15,%xmm13
 movdqa %xmm12,128-256(%rcx)
 pshufd $0xaa,%xmm15,%xmm14
 movdqa %xmm13,144-256(%rcx)
 pshufd $0xff,%xmm15,%xmm15
 movdqa %xmm14,160-256(%rcx)
 movdqa %xmm15,176-256(%rcx)

 pshufd $0x00,%xmm7,%xmm4
 pshufd $0x55,%xmm7,%xmm5
 movdqa %xmm4,192-256(%rcx)
 pshufd $0xaa,%xmm7,%xmm6
 movdqa %xmm5,208-256(%rcx)
 pshufd $0xff,%xmm7,%xmm7
 movdqa %xmm6,224-256(%rcx)
 movdqa %xmm7,240-256(%rcx)

 pshufd $0x00,%xmm3,%xmm0
 pshufd $0x55,%xmm3,%xmm1
 paddd .Linc(%rip),%xmm0
 pshufd $0xaa,%xmm3,%xmm2
 movdqa %xmm1,272-256(%rcx)
 pshufd $0xff,%xmm3,%xmm3
 movdqa %xmm2,288-256(%rcx)
 movdqa %xmm3,304-256(%rcx)

 jmp .Loop_enter4x

.align 32
.Loop_outer4x:
 movdqa 64(%rsp),%xmm8
 movdqa 80(%rsp),%xmm9
 movdqa 96(%rsp),%xmm10
 movdqa 112(%rsp),%xmm11
 movdqa 128-256(%rcx),%xmm12
 movdqa 144-256(%rcx),%xmm13
 movdqa 160-256(%rcx),%xmm14
 movdqa 176-256(%rcx),%xmm15
 movdqa 192-256(%rcx),%xmm4
 movdqa 208-256(%rcx),%xmm5
 movdqa 224-256(%rcx),%xmm6
 movdqa 240-256(%rcx),%xmm7
 movdqa 256-256(%rcx),%xmm0
 movdqa 272-256(%rcx),%xmm1
 movdqa 288-256(%rcx),%xmm2
 movdqa 304-256(%rcx),%xmm3
 paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
 movdqa %xmm6,32(%rsp)
 movdqa %xmm7,48(%rsp)
 movdqa (%r10),%xmm7
 movl $10,%eax
 movdqa %xmm0,256-256(%rcx)
 jmp .Loop4x

.align 32
.Loop4x:
 paddd %xmm12,%xmm8
 paddd %xmm13,%xmm9
 pxor %xmm8,%xmm0
 pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
 paddd %xmm0,%xmm4
 paddd %xmm1,%xmm5
 pxor %xmm4,%xmm12
 pxor %xmm5,%xmm13
 movdqa %xmm12,%xmm6
 pslld $12,%xmm12
 psrld $20,%xmm6
 movdqa %xmm13,%xmm7
 pslld $12,%xmm13
 por %xmm6,%xmm12
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm13
 paddd %xmm12,%xmm8
 paddd %xmm13,%xmm9
 pxor %xmm8,%xmm0
 pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
 paddd %xmm0,%xmm4
 paddd %xmm1,%xmm5
 pxor %xmm4,%xmm12
 pxor %xmm5,%xmm13
 movdqa %xmm12,%xmm7
 pslld $7,%xmm12
 psrld $25,%xmm7
 movdqa %xmm13,%xmm6
 pslld $7,%xmm13
 por %xmm7,%xmm12
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm13
 movdqa %xmm4,0(%rsp)
 movdqa %xmm5,16(%rsp)
 movdqa 32(%rsp),%xmm4
 movdqa 48(%rsp),%xmm5
 paddd %xmm14,%xmm10
 paddd %xmm15,%xmm11
 pxor %xmm10,%xmm2
 pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
 paddd %xmm2,%xmm4
 paddd %xmm3,%xmm5
 pxor %xmm4,%xmm14
 pxor %xmm5,%xmm15
 movdqa %xmm14,%xmm6
 pslld $12,%xmm14
 psrld $20,%xmm6
 movdqa %xmm15,%xmm7
 pslld $12,%xmm15
 por %xmm6,%xmm14
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm15
 paddd %xmm14,%xmm10
 paddd %xmm15,%xmm11
 pxor %xmm10,%xmm2
 pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
 paddd %xmm2,%xmm4
 paddd %xmm3,%xmm5
 pxor %xmm4,%xmm14
 pxor %xmm5,%xmm15
 movdqa %xmm14,%xmm7
 pslld $7,%xmm14
 psrld $25,%xmm7
 movdqa %xmm15,%xmm6
 pslld $7,%xmm15
 por %xmm7,%xmm14
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm15
 paddd %xmm13,%xmm8
 paddd %xmm14,%xmm9
 pxor %xmm8,%xmm3
 pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
 paddd %xmm3,%xmm4
 paddd %xmm0,%xmm5
 pxor %xmm4,%xmm13
 pxor %xmm5,%xmm14
 movdqa %xmm13,%xmm6
 pslld $12,%xmm13
 psrld $20,%xmm6
 movdqa %xmm14,%xmm7
 pslld $12,%xmm14
 por %xmm6,%xmm13
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm14
 paddd %xmm13,%xmm8
 paddd %xmm14,%xmm9
 pxor %xmm8,%xmm3
 pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
 paddd %xmm3,%xmm4
 paddd %xmm0,%xmm5
 pxor %xmm4,%xmm13
 pxor %xmm5,%xmm14
 movdqa %xmm13,%xmm7
 pslld $7,%xmm13
 psrld $25,%xmm7
 movdqa %xmm14,%xmm6
 pslld $7,%xmm14
 por %xmm7,%xmm13
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm14
 movdqa %xmm4,32(%rsp)
 movdqa %xmm5,48(%rsp)
 movdqa 0(%rsp),%xmm4
 movdqa 16(%rsp),%xmm5
 paddd %xmm15,%xmm10
 paddd %xmm12,%xmm11
 pxor %xmm10,%xmm1
 pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
 paddd %xmm1,%xmm4
 paddd %xmm2,%xmm5
 pxor %xmm4,%xmm15
 pxor %xmm5,%xmm12
 movdqa %xmm15,%xmm6
 pslld $12,%xmm15
 psrld $20,%xmm6
 movdqa %xmm12,%xmm7
 pslld $12,%xmm12
 por %xmm6,%xmm15
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm12
 paddd %xmm15,%xmm10
 paddd %xmm12,%xmm11
 pxor %xmm10,%xmm1
 pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
 paddd %xmm1,%xmm4
 paddd %xmm2,%xmm5
 pxor %xmm4,%xmm15
 pxor %xmm5,%xmm12
 movdqa %xmm15,%xmm7
 pslld $7,%xmm15
 psrld $25,%xmm7
 movdqa %xmm12,%xmm6
 pslld $7,%xmm12
 por %xmm7,%xmm15
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm12
 decl %eax
 jnz .Loop4x

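# The accumulated results are word-sliced, so each group of four registers
# is now run through a standard 4x4 dword transpose: punpckldq/punpckhdq
# interleave 32-bit lanes and punpcklqdq/punpckhqdq interleave 64-bit
# halves, leaving one register per block row in natural order.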
 paddd 64(%rsp),%xmm8
 paddd 80(%rsp),%xmm9
 paddd 96(%rsp),%xmm10
 paddd 112(%rsp),%xmm11

 movdqa %xmm8,%xmm6
 punpckldq %xmm9,%xmm8
 movdqa %xmm10,%xmm7
 punpckldq %xmm11,%xmm10
 punpckhdq %xmm9,%xmm6
 punpckhdq %xmm11,%xmm7
 movdqa %xmm8,%xmm9
 punpcklqdq %xmm10,%xmm8
 movdqa %xmm6,%xmm11
 punpcklqdq %xmm7,%xmm6
 punpckhqdq %xmm10,%xmm9
 punpckhqdq %xmm7,%xmm11
 paddd 128-256(%rcx),%xmm12
 paddd 144-256(%rcx),%xmm13
 paddd 160-256(%rcx),%xmm14
 paddd 176-256(%rcx),%xmm15

 movdqa %xmm8,0(%rsp)
 movdqa %xmm9,16(%rsp)
 movdqa 32(%rsp),%xmm8
 movdqa 48(%rsp),%xmm9

 movdqa %xmm12,%xmm10
 punpckldq %xmm13,%xmm12
 movdqa %xmm14,%xmm7
 punpckldq %xmm15,%xmm14
 punpckhdq %xmm13,%xmm10
 punpckhdq %xmm15,%xmm7
 movdqa %xmm12,%xmm13
 punpcklqdq %xmm14,%xmm12
 movdqa %xmm10,%xmm15
 punpcklqdq %xmm7,%xmm10
 punpckhqdq %xmm14,%xmm13
 punpckhqdq %xmm7,%xmm15
 paddd 192-256(%rcx),%xmm4
 paddd 208-256(%rcx),%xmm5
 paddd 224-256(%rcx),%xmm8
 paddd 240-256(%rcx),%xmm9

 movdqa %xmm6,32(%rsp)
 movdqa %xmm11,48(%rsp)

 movdqa %xmm4,%xmm14
 punpckldq %xmm5,%xmm4
 movdqa %xmm8,%xmm7
 punpckldq %xmm9,%xmm8
 punpckhdq %xmm5,%xmm14
 punpckhdq %xmm9,%xmm7
 movdqa %xmm4,%xmm5
 punpcklqdq %xmm8,%xmm4
 movdqa %xmm14,%xmm9
 punpcklqdq %xmm7,%xmm14
 punpckhqdq %xmm8,%xmm5
 punpckhqdq %xmm7,%xmm9
 paddd 256-256(%rcx),%xmm0
 paddd 272-256(%rcx),%xmm1
 paddd 288-256(%rcx),%xmm2
 paddd 304-256(%rcx),%xmm3

 movdqa %xmm0,%xmm8
 punpckldq %xmm1,%xmm0
 movdqa %xmm2,%xmm7
 punpckldq %xmm3,%xmm2
 punpckhdq %xmm1,%xmm8
 punpckhdq %xmm3,%xmm7
 movdqa %xmm0,%xmm1
 punpcklqdq %xmm2,%xmm0
 movdqa %xmm8,%xmm3
 punpcklqdq %xmm7,%xmm8
 punpckhqdq %xmm2,%xmm1
 punpckhqdq %xmm7,%xmm3
 cmpq $256,%rdx
 jb .Ltail4x

 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 leaq 128(%rsi),%rsi
 pxor 16(%rsp),%xmm6
 pxor %xmm13,%xmm11
 pxor %xmm5,%xmm2
 pxor %xmm1,%xmm7

 movdqu %xmm6,64(%rdi)
 movdqu 0(%rsi),%xmm6
 movdqu %xmm11,80(%rdi)
 movdqu 16(%rsi),%xmm11
 movdqu %xmm2,96(%rdi)
 movdqu 32(%rsi),%xmm2
 movdqu %xmm7,112(%rdi)
 leaq 128(%rdi),%rdi
 movdqu 48(%rsi),%xmm7
 pxor 32(%rsp),%xmm6
 pxor %xmm10,%xmm11
 pxor %xmm14,%xmm2
 pxor %xmm8,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 leaq 128(%rsi),%rsi
 pxor 48(%rsp),%xmm6
 pxor %xmm15,%xmm11
 pxor %xmm9,%xmm2
 pxor %xmm3,%xmm7
 movdqu %xmm6,64(%rdi)
 movdqu %xmm11,80(%rdi)
 movdqu %xmm2,96(%rdi)
 movdqu %xmm7,112(%rdi)
 leaq 128(%rdi),%rdi

 subq $256,%rdx
 jnz .Loop_outer4x

 jmp .Ldone4x

.Ltail4x:
 cmpq $192,%rdx
 jae .L192_or_more4x
 cmpq $128,%rdx
 jae .L128_or_more4x
 cmpq $64,%rdx
 jae .L64_or_more4x


 xorq %r10,%r10

 movdqa %xmm12,16(%rsp)
 movdqa %xmm4,32(%rsp)
 movdqa %xmm0,48(%rsp)
 jmp .Loop_tail4x

.align 32
.L64_or_more4x:
 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7
 movdqu %xmm6,0(%rdi)
 movdqu %xmm11,16(%rdi)
 movdqu %xmm2,32(%rdi)
 movdqu %xmm7,48(%rdi)
 je .Ldone4x

 movdqa 16(%rsp),%xmm6
 leaq 64(%rsi),%rsi
 xorq %r10,%r10
 movdqa %xmm6,0(%rsp)
 movdqa %xmm13,16(%rsp)
 leaq 64(%rdi),%rdi
 movdqa %xmm5,32(%rsp)
 subq $64,%rdx
 movdqa %xmm1,48(%rsp)
 jmp .Loop_tail4x

.align 32
.L128_or_more4x:
 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 pxor 16(%rsp),%xmm6
 pxor %xmm13,%xmm11
 pxor %xmm5,%xmm2
 pxor %xmm1,%xmm7
 movdqu %xmm6,64(%rdi)
 movdqu %xmm11,80(%rdi)
 movdqu %xmm2,96(%rdi)
 movdqu %xmm7,112(%rdi)
 je .Ldone4x

 movdqa 32(%rsp),%xmm6
 leaq 128(%rsi),%rsi
 xorq %r10,%r10
 movdqa %xmm6,0(%rsp)
 movdqa %xmm10,16(%rsp)
 leaq 128(%rdi),%rdi
 movdqa %xmm14,32(%rsp)
 subq $128,%rdx
 movdqa %xmm8,48(%rsp)
 jmp .Loop_tail4x

.align 32
.L192_or_more4x:
 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 leaq 128(%rsi),%rsi
 pxor 16(%rsp),%xmm6
 pxor %xmm13,%xmm11
 pxor %xmm5,%xmm2
 pxor %xmm1,%xmm7

 movdqu %xmm6,64(%rdi)
 movdqu 0(%rsi),%xmm6
 movdqu %xmm11,80(%rdi)
 movdqu 16(%rsi),%xmm11
 movdqu %xmm2,96(%rdi)
 movdqu 32(%rsi),%xmm2
 movdqu %xmm7,112(%rdi)
 leaq 128(%rdi),%rdi
 movdqu 48(%rsi),%xmm7
 pxor 32(%rsp),%xmm6
 pxor %xmm10,%xmm11
 pxor %xmm14,%xmm2
 pxor %xmm8,%xmm7
 movdqu %xmm6,0(%rdi)
 movdqu %xmm11,16(%rdi)
 movdqu %xmm2,32(%rdi)
 movdqu %xmm7,48(%rdi)
 je .Ldone4x

 movdqa 48(%rsp),%xmm6
 leaq 64(%rsi),%rsi
 xorq %r10,%r10
 movdqa %xmm6,0(%rsp)
 movdqa %xmm15,16(%rsp)
 leaq 64(%rdi),%rdi
 movdqa %xmm9,32(%rsp)
 subq $192,%rdx
 movdqa %xmm3,48(%rsp)

.Loop_tail4x:
 movzbl (%rsi,%r10,1),%eax
 movzbl (%rsp,%r10,1),%ecx
 leaq 1(%r10),%r10
 xorl %ecx,%eax
 movb %al,-1(%rdi,%r10,1)
 decq %rdx
 jnz .Loop_tail4x

.Ldone4x:
 leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L4x_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
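
# ChaCha20_8x is the AVX2 path: eight blocks in parallel, word-sliced in
# YMM registers. vbroadcasti128 replicates the 128-bit sigma/key/counter
# rows into both lanes, .Lincy spreads counters 0,2,4,6 and 1,3,5,7 across
# the two lanes, and the stack is realigned to 32 bytes (andq $-32) for
# the aligned vmovdqa spill areas at 256(%rsp) and 512(%rsp).
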
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
 movq %rsp,%r9
.cfi_def_cfa_register r9
 subq $0x280+8,%rsp
 andq $-32,%rsp
 vzeroupper

 vbroadcasti128 .Lsigma(%rip),%ymm11
 vbroadcasti128 (%rcx),%ymm3
 vbroadcasti128 16(%rcx),%ymm15
 vbroadcasti128 (%r8),%ymm7
 leaq 256(%rsp),%rcx
 leaq 512(%rsp),%rax
 leaq .Lrot16(%rip),%r10
 leaq .Lrot24(%rip),%r11

 vpshufd $0x00,%ymm11,%ymm8
 vpshufd $0x55,%ymm11,%ymm9
 vmovdqa %ymm8,128-256(%rcx)
 vpshufd $0xaa,%ymm11,%ymm10
 vmovdqa %ymm9,160-256(%rcx)
 vpshufd $0xff,%ymm11,%ymm11
 vmovdqa %ymm10,192-256(%rcx)
 vmovdqa %ymm11,224-256(%rcx)

 vpshufd $0x00,%ymm3,%ymm0
 vpshufd $0x55,%ymm3,%ymm1
 vmovdqa %ymm0,256-256(%rcx)
 vpshufd $0xaa,%ymm3,%ymm2
 vmovdqa %ymm1,288-256(%rcx)
 vpshufd $0xff,%ymm3,%ymm3
 vmovdqa %ymm2,320-256(%rcx)
 vmovdqa %ymm3,352-256(%rcx)

 vpshufd $0x00,%ymm15,%ymm12
 vpshufd $0x55,%ymm15,%ymm13
 vmovdqa %ymm12,384-512(%rax)
 vpshufd $0xaa,%ymm15,%ymm14
 vmovdqa %ymm13,416-512(%rax)
 vpshufd $0xff,%ymm15,%ymm15
 vmovdqa %ymm14,448-512(%rax)
 vmovdqa %ymm15,480-512(%rax)

 vpshufd $0x00,%ymm7,%ymm4
 vpshufd $0x55,%ymm7,%ymm5
 vpaddd .Lincy(%rip),%ymm4,%ymm4
 vpshufd $0xaa,%ymm7,%ymm6
 vmovdqa %ymm5,544-512(%rax)
 vpshufd $0xff,%ymm7,%ymm7
 vmovdqa %ymm6,576-512(%rax)
 vmovdqa %ymm7,608-512(%rax)

 jmp .Loop_enter8x

.align 32
.Loop_outer8x:
 vmovdqa 128-256(%rcx),%ymm8
 vmovdqa 160-256(%rcx),%ymm9
 vmovdqa 192-256(%rcx),%ymm10
 vmovdqa 224-256(%rcx),%ymm11
 vmovdqa 256-256(%rcx),%ymm0
 vmovdqa 288-256(%rcx),%ymm1
 vmovdqa 320-256(%rcx),%ymm2
 vmovdqa 352-256(%rcx),%ymm3
 vmovdqa 384-512(%rax),%ymm12
 vmovdqa 416-512(%rax),%ymm13
 vmovdqa 448-512(%rax),%ymm14
 vmovdqa 480-512(%rax),%ymm15
 vmovdqa 512-512(%rax),%ymm4
 vmovdqa 544-512(%rax),%ymm5
 vmovdqa 576-512(%rax),%ymm6
 vmovdqa 608-512(%rax),%ymm7
 vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
 vmovdqa %ymm14,64(%rsp)
 vmovdqa %ymm15,96(%rsp)
 vbroadcasti128 (%r10),%ymm15
 vmovdqa %ymm4,512-512(%rax)
 movl $10,%eax
 jmp .Loop8x

.align 32
.Loop8x:
 vpaddd %ymm0,%ymm8,%ymm8
 vpxor %ymm4,%ymm8,%ymm4
 vpshufb %ymm15,%ymm4,%ymm4
 vpaddd %ymm1,%ymm9,%ymm9
 vpxor %ymm5,%ymm9,%ymm5
 vpshufb %ymm15,%ymm5,%ymm5
 vpaddd %ymm4,%ymm12,%ymm12
 vpxor %ymm0,%ymm12,%ymm0
 vpslld $12,%ymm0,%ymm14
 vpsrld $20,%ymm0,%ymm0
 vpor %ymm0,%ymm14,%ymm0
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm5,%ymm13,%ymm13
 vpxor %ymm1,%ymm13,%ymm1
 vpslld $12,%ymm1,%ymm15
 vpsrld $20,%ymm1,%ymm1
 vpor %ymm1,%ymm15,%ymm1
 vpaddd %ymm0,%ymm8,%ymm8
 vpxor %ymm4,%ymm8,%ymm4
 vpshufb %ymm14,%ymm4,%ymm4
 vpaddd %ymm1,%ymm9,%ymm9
 vpxor %ymm5,%ymm9,%ymm5
 vpshufb %ymm14,%ymm5,%ymm5
 vpaddd %ymm4,%ymm12,%ymm12
 vpxor %ymm0,%ymm12,%ymm0
 vpslld $7,%ymm0,%ymm15
 vpsrld $25,%ymm0,%ymm0
 vpor %ymm0,%ymm15,%ymm0
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm5,%ymm13,%ymm13
 vpxor %ymm1,%ymm13,%ymm1
 vpslld $7,%ymm1,%ymm14
 vpsrld $25,%ymm1,%ymm1
 vpor %ymm1,%ymm14,%ymm1
 vmovdqa %ymm12,0(%rsp)
 vmovdqa %ymm13,32(%rsp)
 vmovdqa 64(%rsp),%ymm12
 vmovdqa 96(%rsp),%ymm13
 vpaddd %ymm2,%ymm10,%ymm10
 vpxor %ymm6,%ymm10,%ymm6
 vpshufb %ymm15,%ymm6,%ymm6
 vpaddd %ymm3,%ymm11,%ymm11
 vpxor %ymm7,%ymm11,%ymm7
 vpshufb %ymm15,%ymm7,%ymm7
 vpaddd %ymm6,%ymm12,%ymm12
 vpxor %ymm2,%ymm12,%ymm2
 vpslld $12,%ymm2,%ymm14
 vpsrld $20,%ymm2,%ymm2
 vpor %ymm2,%ymm14,%ymm2
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm7,%ymm13,%ymm13
 vpxor %ymm3,%ymm13,%ymm3
 vpslld $12,%ymm3,%ymm15
 vpsrld $20,%ymm3,%ymm3
 vpor %ymm3,%ymm15,%ymm3
 vpaddd %ymm2,%ymm10,%ymm10
 vpxor %ymm6,%ymm10,%ymm6
 vpshufb %ymm14,%ymm6,%ymm6
 vpaddd %ymm3,%ymm11,%ymm11
 vpxor %ymm7,%ymm11,%ymm7
 vpshufb %ymm14,%ymm7,%ymm7
 vpaddd %ymm6,%ymm12,%ymm12
 vpxor %ymm2,%ymm12,%ymm2
 vpslld $7,%ymm2,%ymm15
 vpsrld $25,%ymm2,%ymm2
 vpor %ymm2,%ymm15,%ymm2
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm7,%ymm13,%ymm13
 vpxor %ymm3,%ymm13,%ymm3
 vpslld $7,%ymm3,%ymm14
 vpsrld $25,%ymm3,%ymm3
 vpor %ymm3,%ymm14,%ymm3
 vpaddd %ymm1,%ymm8,%ymm8
 vpxor %ymm7,%ymm8,%ymm7
 vpshufb %ymm15,%ymm7,%ymm7
 vpaddd %ymm2,%ymm9,%ymm9
 vpxor %ymm4,%ymm9,%ymm4
 vpshufb %ymm15,%ymm4,%ymm4
 vpaddd %ymm7,%ymm12,%ymm12
 vpxor %ymm1,%ymm12,%ymm1
 vpslld $12,%ymm1,%ymm14
 vpsrld $20,%ymm1,%ymm1
 vpor %ymm1,%ymm14,%ymm1
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm4,%ymm13,%ymm13
 vpxor %ymm2,%ymm13,%ymm2
 vpslld $12,%ymm2,%ymm15
 vpsrld $20,%ymm2,%ymm2
 vpor %ymm2,%ymm15,%ymm2
 vpaddd %ymm1,%ymm8,%ymm8
 vpxor %ymm7,%ymm8,%ymm7
 vpshufb %ymm14,%ymm7,%ymm7
 vpaddd %ymm2,%ymm9,%ymm9
 vpxor %ymm4,%ymm9,%ymm4
 vpshufb %ymm14,%ymm4,%ymm4
 vpaddd %ymm7,%ymm12,%ymm12
 vpxor %ymm1,%ymm12,%ymm1
 vpslld $7,%ymm1,%ymm15
 vpsrld $25,%ymm1,%ymm1
 vpor %ymm1,%ymm15,%ymm1
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm4,%ymm13,%ymm13
 vpxor %ymm2,%ymm13,%ymm2
 vpslld $7,%ymm2,%ymm14
 vpsrld $25,%ymm2,%ymm2
 vpor %ymm2,%ymm14,%ymm2
 vmovdqa %ymm12,64(%rsp)
 vmovdqa %ymm13,96(%rsp)
 vmovdqa 0(%rsp),%ymm12
 vmovdqa 32(%rsp),%ymm13
 vpaddd %ymm3,%ymm10,%ymm10
 vpxor %ymm5,%ymm10,%ymm5
 vpshufb %ymm15,%ymm5,%ymm5
 vpaddd %ymm0,%ymm11,%ymm11
 vpxor %ymm6,%ymm11,%ymm6
 vpshufb %ymm15,%ymm6,%ymm6
 vpaddd %ymm5,%ymm12,%ymm12
 vpxor %ymm3,%ymm12,%ymm3
 vpslld $12,%ymm3,%ymm14
 vpsrld $20,%ymm3,%ymm3
 vpor %ymm3,%ymm14,%ymm3
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm6,%ymm13,%ymm13
 vpxor %ymm0,%ymm13,%ymm0
 vpslld $12,%ymm0,%ymm15
 vpsrld $20,%ymm0,%ymm0
 vpor %ymm0,%ymm15,%ymm0
 vpaddd %ymm3,%ymm10,%ymm10
 vpxor %ymm5,%ymm10,%ymm5
 vpshufb %ymm14,%ymm5,%ymm5
 vpaddd %ymm0,%ymm11,%ymm11
 vpxor %ymm6,%ymm11,%ymm6
 vpshufb %ymm14,%ymm6,%ymm6
 vpaddd %ymm5,%ymm12,%ymm12
 vpxor %ymm3,%ymm12,%ymm3
 vpslld $7,%ymm3,%ymm15
 vpsrld $25,%ymm3,%ymm3
 vpor %ymm3,%ymm15,%ymm3
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm6,%ymm13,%ymm13
 vpxor %ymm0,%ymm13,%ymm0
 vpslld $7,%ymm0,%ymm14
 vpsrld $25,%ymm0,%ymm0
 vpor %ymm0,%ymm14,%ymm0
 decl %eax
 jnz .Loop8x

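# As in the 4x path, vpunpck{l,h}dq and vpunpck{l,h}qdq transpose each
# group of four registers within 128-bit lanes; the vperm2i128 $0x20/$0x31
# pairs then gather the low and high lanes respectively, so every output
# register holds 32 contiguous keystream bytes in block order.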
 leaq 512(%rsp),%rax
 vpaddd 128-256(%rcx),%ymm8,%ymm8
 vpaddd 160-256(%rcx),%ymm9,%ymm9
 vpaddd 192-256(%rcx),%ymm10,%ymm10
 vpaddd 224-256(%rcx),%ymm11,%ymm11

 vpunpckldq %ymm9,%ymm8,%ymm14
 vpunpckldq %ymm11,%ymm10,%ymm15
 vpunpckhdq %ymm9,%ymm8,%ymm8
 vpunpckhdq %ymm11,%ymm10,%ymm10
 vpunpcklqdq %ymm15,%ymm14,%ymm9
 vpunpckhqdq %ymm15,%ymm14,%ymm14
 vpunpcklqdq %ymm10,%ymm8,%ymm11
 vpunpckhqdq %ymm10,%ymm8,%ymm8
 vpaddd 256-256(%rcx),%ymm0,%ymm0
 vpaddd 288-256(%rcx),%ymm1,%ymm1
 vpaddd 320-256(%rcx),%ymm2,%ymm2
 vpaddd 352-256(%rcx),%ymm3,%ymm3

 vpunpckldq %ymm1,%ymm0,%ymm10
 vpunpckldq %ymm3,%ymm2,%ymm15
 vpunpckhdq %ymm1,%ymm0,%ymm0
 vpunpckhdq %ymm3,%ymm2,%ymm2
 vpunpcklqdq %ymm15,%ymm10,%ymm1
 vpunpckhqdq %ymm15,%ymm10,%ymm10
 vpunpcklqdq %ymm2,%ymm0,%ymm3
 vpunpckhqdq %ymm2,%ymm0,%ymm0
 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
 vmovdqa %ymm15,0(%rsp)
 vmovdqa %ymm9,32(%rsp)
 vmovdqa 64(%rsp),%ymm15
 vmovdqa 96(%rsp),%ymm9

 vpaddd 384-512(%rax),%ymm12,%ymm12
 vpaddd 416-512(%rax),%ymm13,%ymm13
 vpaddd 448-512(%rax),%ymm15,%ymm15
 vpaddd 480-512(%rax),%ymm9,%ymm9

 vpunpckldq %ymm13,%ymm12,%ymm2
 vpunpckldq %ymm9,%ymm15,%ymm8
 vpunpckhdq %ymm13,%ymm12,%ymm12
 vpunpckhdq %ymm9,%ymm15,%ymm15
 vpunpcklqdq %ymm8,%ymm2,%ymm13
 vpunpckhqdq %ymm8,%ymm2,%ymm2
 vpunpcklqdq %ymm15,%ymm12,%ymm9
 vpunpckhqdq %ymm15,%ymm12,%ymm12
 vpaddd 512-512(%rax),%ymm4,%ymm4
 vpaddd 544-512(%rax),%ymm5,%ymm5
 vpaddd 576-512(%rax),%ymm6,%ymm6
 vpaddd 608-512(%rax),%ymm7,%ymm7

 vpunpckldq %ymm5,%ymm4,%ymm15
 vpunpckldq %ymm7,%ymm6,%ymm8
 vpunpckhdq %ymm5,%ymm4,%ymm4
 vpunpckhdq %ymm7,%ymm6,%ymm6
 vpunpcklqdq %ymm8,%ymm15,%ymm5
 vpunpckhqdq %ymm8,%ymm15,%ymm15
 vpunpcklqdq %ymm6,%ymm4,%ymm7
 vpunpckhqdq %ymm6,%ymm4,%ymm4
 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
 vmovdqa 0(%rsp),%ymm6
 vmovdqa 32(%rsp),%ymm12

 cmpq $512,%rdx
 jb .Ltail8x

 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 leaq 128(%rsi),%rsi
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 leaq 128(%rdi),%rdi

 vpxor 0(%rsi),%ymm12,%ymm12
 vpxor 32(%rsi),%ymm13,%ymm13
 vpxor 64(%rsi),%ymm10,%ymm10
 vpxor 96(%rsi),%ymm15,%ymm15
 leaq 128(%rsi),%rsi
 vmovdqu %ymm12,0(%rdi)
 vmovdqu %ymm13,32(%rdi)
 vmovdqu %ymm10,64(%rdi)
 vmovdqu %ymm15,96(%rdi)
 leaq 128(%rdi),%rdi

 vpxor 0(%rsi),%ymm14,%ymm14
 vpxor 32(%rsi),%ymm2,%ymm2
 vpxor 64(%rsi),%ymm3,%ymm3
 vpxor 96(%rsi),%ymm7,%ymm7
 leaq 128(%rsi),%rsi
 vmovdqu %ymm14,0(%rdi)
 vmovdqu %ymm2,32(%rdi)
 vmovdqu %ymm3,64(%rdi)
 vmovdqu %ymm7,96(%rdi)
 leaq 128(%rdi),%rdi

 vpxor 0(%rsi),%ymm11,%ymm11
 vpxor 32(%rsi),%ymm9,%ymm9
 vpxor 64(%rsi),%ymm0,%ymm0
 vpxor 96(%rsi),%ymm4,%ymm4
 leaq 128(%rsi),%rsi
 vmovdqu %ymm11,0(%rdi)
 vmovdqu %ymm9,32(%rdi)
 vmovdqu %ymm0,64(%rdi)
 vmovdqu %ymm4,96(%rdi)
 leaq 128(%rdi),%rdi

 subq $512,%rdx
 jnz .Loop_outer8x

 jmp .Ldone8x

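# Tail: the compare ladder below emits as many whole 64-byte blocks as
# remain, then stages the next two keystream registers at 0/32(%rsp) so
# .Loop_tail8x can finish the final 1..63 bytes with a byte-wise xor.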
.Ltail8x:
 cmpq $448,%rdx
 jae .L448_or_more8x
 cmpq $384,%rdx
 jae .L384_or_more8x
 cmpq $320,%rdx
 jae .L320_or_more8x
 cmpq $256,%rdx
 jae .L256_or_more8x
 cmpq $192,%rdx
 jae .L192_or_more8x
 cmpq $128,%rdx
 jae .L128_or_more8x
 cmpq $64,%rdx
 jae .L64_or_more8x

 xorq %r10,%r10
 vmovdqa %ymm6,0(%rsp)
 vmovdqa %ymm8,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L64_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 je .Ldone8x

 leaq 64(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm1,0(%rsp)
 leaq 64(%rdi),%rdi
 subq $64,%rdx
 vmovdqa %ymm5,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L128_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 je .Ldone8x

 leaq 128(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm12,0(%rsp)
 leaq 128(%rdi),%rdi
 subq $128,%rdx
 vmovdqa %ymm13,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L192_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 je .Ldone8x

 leaq 192(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm10,0(%rsp)
 leaq 192(%rdi),%rdi
 subq $192,%rdx
 vmovdqa %ymm15,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L256_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 je .Ldone8x

 leaq 256(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm14,0(%rsp)
 leaq 256(%rdi),%rdi
 subq $256,%rdx
 vmovdqa %ymm2,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L320_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vpxor 256(%rsi),%ymm14,%ymm14
 vpxor 288(%rsi),%ymm2,%ymm2
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 vmovdqu %ymm14,256(%rdi)
 vmovdqu %ymm2,288(%rdi)
 je .Ldone8x

 leaq 320(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm3,0(%rsp)
 leaq 320(%rdi),%rdi
 subq $320,%rdx
 vmovdqa %ymm7,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L384_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vpxor 256(%rsi),%ymm14,%ymm14
 vpxor 288(%rsi),%ymm2,%ymm2
 vpxor 320(%rsi),%ymm3,%ymm3
 vpxor 352(%rsi),%ymm7,%ymm7
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 vmovdqu %ymm14,256(%rdi)
 vmovdqu %ymm2,288(%rdi)
 vmovdqu %ymm3,320(%rdi)
 vmovdqu %ymm7,352(%rdi)
 je .Ldone8x

 leaq 384(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm11,0(%rsp)
 leaq 384(%rdi),%rdi
 subq $384,%rdx
 vmovdqa %ymm9,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L448_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vpxor 256(%rsi),%ymm14,%ymm14
 vpxor 288(%rsi),%ymm2,%ymm2
 vpxor 320(%rsi),%ymm3,%ymm3
 vpxor 352(%rsi),%ymm7,%ymm7
 vpxor 384(%rsi),%ymm11,%ymm11
 vpxor 416(%rsi),%ymm9,%ymm9
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 vmovdqu %ymm14,256(%rdi)
 vmovdqu %ymm2,288(%rdi)
 vmovdqu %ymm3,320(%rdi)
 vmovdqu %ymm7,352(%rdi)
 vmovdqu %ymm11,384(%rdi)
 vmovdqu %ymm9,416(%rdi)
 je .Ldone8x

 leaq 448(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm0,0(%rsp)
 leaq 448(%rdi),%rdi
 subq $448,%rdx
 vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
 movzbl (%rsi,%r10,1),%eax
 movzbl (%rsp,%r10,1),%ecx
 leaq 1(%r10),%r10
 xorl %ecx,%eax
 movb %al,-1(%rdi,%r10,1)
 decq %rdx
 jnz .Loop_tail8x

.Ldone8x:
 vzeroall
 leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L8x_epilogue:
 .byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
#endif