# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

.align 64
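# Constant pool: .Lone/.Linc/.Lfour/.Lincy/.Leight supply block-counter
# increments for the 1-, 4- and 8-block code paths, .Lrot16/.Lrot24 are the
# pshufb masks used for the 16- and 8-bit rotations of the quarter round, and
# .Lsigma is the "expand 32-byte k" ChaCha constant.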
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
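# The .byte string above is the CRYPTOGAMS attribution ("ChaCha20 for x86_64").
#
# ChaCha20_ctr32(out=%rdi, in=%rsi, len=%rdx, key=%rcx, counter=%r8), roughly
# void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
#                     const uint32_t key[8], const uint32_t counter[4]).
# The scalar code below handles the generic case; CPUs with SSSE3 (and, via
# ChaCha20_4x, AVX2) are dispatched to the vector routines further down.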
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
.cfi_startproc
cmpq $0,%rdx
je .Lno_data
movq OPENSSL_ia32cap_P+4(%rip),%r10
testl $512,%r10d
jnz .LChaCha20_ssse3

pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
subq $64+24,%rsp
.cfi_adjust_cfa_offset 88
.Lctr32_body:
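# Load the 256-bit key into %xmm1/%xmm2 and the counter/nonce block into
# %xmm3, and stash them at 16..63(%rsp) so every outer iteration can rebuild
# the 16-word state; %xmm4 (.Lone) is the per-block counter increment and
# %rbp carries the remaining length.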


movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
movdqu (%r8),%xmm3
movdqa .Lone(%rip),%xmm4


movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
movq %rdx,%rbp
jmp .Loop_outer

.align 32
.Loop_outer:
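# Per-block setup: words 0-3 are the "expand 32-byte k" constants, words 4-7
# and 13-15 are reloaded from the stash at 16..63(%rsp), word 12 (the block
# counter) is taken from %xmm3, and words 8-11 arrive via %xmm2, %rsi/%rdi
# and 32..47(%rsp).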
movl $0x61707865,%eax
movl $0x3320646e,%ebx
movl $0x79622d32,%ecx
movl $0x6b206574,%edx
movl 16(%rsp),%r8d
movl 20(%rsp),%r9d
movl 24(%rsp),%r10d
movl 28(%rsp),%r11d
movd %xmm3,%r12d
movl 52(%rsp),%r13d
movl 56(%rsp),%r14d
movl 60(%rsp),%r15d

movq %rbp,64+0(%rsp)
movl $10,%ebp
movq %rsi,64+8(%rsp)
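# The .byte sequence below encodes movq %xmm2,%rsi (raw bytes for the benefit
# of old assemblers); with the following shift it places state words 8 and 9
# in %esi and %edi.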
.byte 102,72,15,126,214
movq %rdi,64+16(%rsp)
movq %rsi,%rdi
shrq $32,%rdi
jmp .Loop

.align 32
.Loop:
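# One column round plus one diagonal round per iteration; ten iterations give
# the 20 ChaCha rounds.  Each quarter round is add/xor/rotate with rotate
# counts 16, 12, 8 and 7; words 8-11 take turns being parked at 32..47(%rsp)
# because the 16-word state does not fit in the available GPRs.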
addl %r8d,%eax
xorl %eax,%r12d
roll $16,%r12d
addl %r9d,%ebx
xorl %ebx,%r13d
roll $16,%r13d
addl %r12d,%esi
xorl %esi,%r8d
roll $12,%r8d
addl %r13d,%edi
xorl %edi,%r9d
roll $12,%r9d
addl %r8d,%eax
xorl %eax,%r12d
roll $8,%r12d
addl %r9d,%ebx
xorl %ebx,%r13d
roll $8,%r13d
addl %r12d,%esi
xorl %esi,%r8d
roll $7,%r8d
addl %r13d,%edi
xorl %edi,%r9d
roll $7,%r9d
movl %esi,32(%rsp)
movl %edi,36(%rsp)
movl 40(%rsp),%esi
movl 44(%rsp),%edi
addl %r10d,%ecx
xorl %ecx,%r14d
roll $16,%r14d
addl %r11d,%edx
xorl %edx,%r15d
roll $16,%r15d
addl %r14d,%esi
xorl %esi,%r10d
roll $12,%r10d
addl %r15d,%edi
xorl %edi,%r11d
roll $12,%r11d
addl %r10d,%ecx
xorl %ecx,%r14d
roll $8,%r14d
addl %r11d,%edx
xorl %edx,%r15d
roll $8,%r15d
addl %r14d,%esi
xorl %esi,%r10d
roll $7,%r10d
addl %r15d,%edi
xorl %edi,%r11d
roll $7,%r11d
addl %r9d,%eax
xorl %eax,%r15d
roll $16,%r15d
addl %r10d,%ebx
xorl %ebx,%r12d
roll $16,%r12d
addl %r15d,%esi
xorl %esi,%r9d
roll $12,%r9d
addl %r12d,%edi
xorl %edi,%r10d
roll $12,%r10d
addl %r9d,%eax
xorl %eax,%r15d
roll $8,%r15d
addl %r10d,%ebx
xorl %ebx,%r12d
roll $8,%r12d
addl %r15d,%esi
xorl %esi,%r9d
roll $7,%r9d
addl %r12d,%edi
xorl %edi,%r10d
roll $7,%r10d
movl %esi,40(%rsp)
movl %edi,44(%rsp)
movl 32(%rsp),%esi
movl 36(%rsp),%edi
addl %r11d,%ecx
xorl %ecx,%r13d
roll $16,%r13d
addl %r8d,%edx
xorl %edx,%r14d
roll $16,%r14d
addl %r13d,%esi
xorl %esi,%r11d
roll $12,%r11d
addl %r14d,%edi
xorl %edi,%r8d
roll $12,%r8d
addl %r11d,%ecx
xorl %ecx,%r13d
roll $8,%r13d
addl %r8d,%edx
xorl %edx,%r14d
roll $8,%r14d
addl %r13d,%esi
xorl %esi,%r11d
roll $7,%r11d
addl %r14d,%edi
xorl %edi,%r8d
roll $7,%r8d
decl %ebp
jnz .Loop
movl %edi,36(%rsp)
movl %esi,32(%rsp)
movq 64(%rsp),%rbp
movdqa %xmm2,%xmm1
movq 64+8(%rsp),%rsi
paddd %xmm4,%xmm3
movq 64+16(%rsp),%rdi
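# Feed-forward: add the original key, counter and constant words back into
# the working state to produce the keystream block.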

addl $0x61707865,%eax
addl $0x3320646e,%ebx
addl $0x79622d32,%ecx
addl $0x6b206574,%edx
addl 16(%rsp),%r8d
addl 20(%rsp),%r9d
addl 24(%rsp),%r10d
addl 28(%rsp),%r11d
addl 48(%rsp),%r12d
addl 52(%rsp),%r13d
addl 56(%rsp),%r14d
addl 60(%rsp),%r15d
paddd 32(%rsp),%xmm1

cmpq $64,%rbp
jb .Ltail

xorl 0(%rsi),%eax
xorl 4(%rsi),%ebx
xorl 8(%rsi),%ecx
xorl 12(%rsi),%edx
xorl 16(%rsi),%r8d
xorl 20(%rsi),%r9d
xorl 24(%rsi),%r10d
xorl 28(%rsi),%r11d
movdqu 32(%rsi),%xmm0
xorl 48(%rsi),%r12d
xorl 52(%rsi),%r13d
xorl 56(%rsi),%r14d
xorl 60(%rsi),%r15d
leaq 64(%rsi),%rsi
pxor %xmm1,%xmm0

movdqa %xmm2,32(%rsp)
movd %xmm3,48(%rsp)

movl %eax,0(%rdi)
movl %ebx,4(%rdi)
movl %ecx,8(%rdi)
movl %edx,12(%rdi)
movl %r8d,16(%rdi)
movl %r9d,20(%rdi)
movl %r10d,24(%rdi)
movl %r11d,28(%rdi)
movdqu %xmm0,32(%rdi)
movl %r12d,48(%rdi)
movl %r13d,52(%rdi)
movl %r14d,56(%rdi)
movl %r15d,60(%rdi)
leaq 64(%rdi),%rdi

subq $64,%rbp
jnz .Loop_outer

jmp .Ldone

.align 16
.Ltail:
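# Fewer than 64 bytes remain: write the keystream block to 0..63(%rsp) and
# XOR it into the output one byte at a time.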
movl %eax,0(%rsp)
movl %ebx,4(%rsp)
xorq %rbx,%rbx
movl %ecx,8(%rsp)
movl %edx,12(%rsp)
movl %r8d,16(%rsp)
movl %r9d,20(%rsp)
movl %r10d,24(%rsp)
movl %r11d,28(%rsp)
movdqa %xmm1,32(%rsp)
movl %r12d,48(%rsp)
movl %r13d,52(%rsp)
movl %r14d,56(%rsp)
movl %r15d,60(%rsp)

.Loop_tail:
movzbl (%rsi,%rbx,1),%eax
movzbl (%rsp,%rbx,1),%edx
leaq 1(%rbx),%rbx
xorl %edx,%eax
movb %al,-1(%rdi,%rbx,1)
decq %rbp
jnz .Loop_tail

.Ldone:
leaq 64+24+48(%rsp),%rsi
movq -48(%rsi),%r15
.cfi_restore r15
movq -40(%rsi),%r14
.cfi_restore r14
movq -32(%rsi),%r13
.cfi_restore r13
movq -24(%rsi),%r12
.cfi_restore r12
movq -16(%rsi),%rbp
.cfi_restore rbp
movq -8(%rsi),%rbx
.cfi_restore rbx
leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
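# 0xf3,0xc3 is "repz ret", i.e. a plain return (the rep prefix is a
# historical branch-predictor workaround); the other epilogues below use the
# same encoding.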
.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32
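# ChaCha20_ssse3: one block at a time with the four state rows in %xmm0-%xmm3.
# Reached for inputs of at most 128 bytes (or at most 192 bytes via the Atom
# special case in ChaCha20_4x).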
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
movq %rsp,%r9
.cfi_def_cfa_register r9
cmpq $128,%rdx
ja .LChaCha20_4x

.Ldo_sse3_after_all:
subq $64+8,%rsp
movdqa .Lsigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
movdqu (%r8),%xmm3
movdqa .Lrot16(%rip),%xmm6
movdqa .Lrot24(%rip),%xmm7

movdqa %xmm0,0(%rsp)
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
movq $10,%r8
jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
movdqa .Lone(%rip),%xmm3
movdqa 0(%rsp),%xmm0
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
movq $10,%r8
movdqa %xmm3,48(%rsp)
jmp .Loop_ssse3

.align 32
.Loop_ssse3:
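# Column round then diagonal round: the .byte sequences are pshufb with the
# .Lrot16/.Lrot24 masks (rotates by 16 and 8), the rotates by 12 and 7 use
# shift-and-or, and pshufd re-aligns the rows between the two half rounds.
# %r8 counts ten iterations, i.e. 20 rounds.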
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,222
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $20,%xmm1
pslld $12,%xmm4
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,223
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $25,%xmm1
pslld $7,%xmm4
por %xmm4,%xmm1
pshufd $78,%xmm2,%xmm2
pshufd $57,%xmm1,%xmm1
pshufd $147,%xmm3,%xmm3
nop
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,222
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $20,%xmm1
pslld $12,%xmm4
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,223
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $25,%xmm1
pslld $7,%xmm4
por %xmm4,%xmm1
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
decq %r8
jnz .Loop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
paddd 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3

cmpq $64,%rdx
jb .Ltail_ssse3

movdqu 0(%rsi),%xmm4
movdqu 16(%rsi),%xmm5
pxor %xmm4,%xmm0
movdqu 32(%rsi),%xmm4
pxor %xmm5,%xmm1
movdqu 48(%rsi),%xmm5
leaq 64(%rsi),%rsi
pxor %xmm4,%xmm2
pxor %xmm5,%xmm3

movdqu %xmm0,0(%rdi)
movdqu %xmm1,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
leaq 64(%rdi),%rdi

subq $64,%rdx
jnz .Loop_outer_ssse3

jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
movdqa %xmm0,0(%rsp)
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
xorq %r8,%r8

.Loop_tail_ssse3:
movzbl (%rsi,%r8,1),%eax
movzbl (%rsp,%r8,1),%ecx
leaq 1(%r8),%r8
xorl %ecx,%eax
movb %al,-1(%rdi,%r8,1)
decq %rdx
jnz .Loop_tail_ssse3

.Ldone_ssse3:
leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
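# ChaCha20_4x: four blocks per iteration (256 bytes) using SSSE3.  The state
# is kept transposed, each %xmm register holding the same state word from all
# four blocks; per-lane block counters are seeded from .Linc and stepped by
# .Lfour.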
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
movq %rsp,%r9
.cfi_def_cfa_register r9
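# Dispatch: %r10 still holds the OPENSSL_ia32cap_P words loaded at the entry
# point; bit 5 of its upper half is AVX2 and selects ChaCha20_8x.  For inputs
# of at most 192 bytes on parts reporting MOVBE but not XSAVE (Atom class),
# the single-block SSSE3 loop is preferred instead.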
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
jnz .LChaCha20_8x
cmpq $192,%rdx
ja .Lproceed4x

andq $71303168,%r11
cmpq $4194304,%r11
je .Ldo_sse3_after_all

.Lproceed4x:
subq $0x140+8,%rsp
movdqa .Lsigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
movdqu (%r8),%xmm3
leaq 256(%rsp),%rcx
leaq .Lrot16(%rip),%r10
leaq .Lrot24(%rip),%r11

pshufd $0x00,%xmm11,%xmm8
pshufd $0x55,%xmm11,%xmm9
movdqa %xmm8,64(%rsp)
pshufd $0xaa,%xmm11,%xmm10
movdqa %xmm9,80(%rsp)
pshufd $0xff,%xmm11,%xmm11
movdqa %xmm10,96(%rsp)
movdqa %xmm11,112(%rsp)

pshufd $0x00,%xmm15,%xmm12
pshufd $0x55,%xmm15,%xmm13
movdqa %xmm12,128-256(%rcx)
pshufd $0xaa,%xmm15,%xmm14
movdqa %xmm13,144-256(%rcx)
pshufd $0xff,%xmm15,%xmm15
movdqa %xmm14,160-256(%rcx)
movdqa %xmm15,176-256(%rcx)

pshufd $0x00,%xmm7,%xmm4
pshufd $0x55,%xmm7,%xmm5
movdqa %xmm4,192-256(%rcx)
pshufd $0xaa,%xmm7,%xmm6
movdqa %xmm5,208-256(%rcx)
pshufd $0xff,%xmm7,%xmm7
movdqa %xmm6,224-256(%rcx)
movdqa %xmm7,240-256(%rcx)

pshufd $0x00,%xmm3,%xmm0
pshufd $0x55,%xmm3,%xmm1
paddd .Linc(%rip),%xmm0
pshufd $0xaa,%xmm3,%xmm2
movdqa %xmm1,272-256(%rcx)
pshufd $0xff,%xmm3,%xmm3
movdqa %xmm2,288-256(%rcx)
movdqa %xmm3,304-256(%rcx)

jmp .Loop_enter4x

.align 32
.Loop_outer4x:
movdqa 64(%rsp),%xmm8
movdqa 80(%rsp),%xmm9
movdqa 96(%rsp),%xmm10
movdqa 112(%rsp),%xmm11
movdqa 128-256(%rcx),%xmm12
movdqa 144-256(%rcx),%xmm13
movdqa 160-256(%rcx),%xmm14
movdqa 176-256(%rcx),%xmm15
movdqa 192-256(%rcx),%xmm4
movdqa 208-256(%rcx),%xmm5
movdqa 224-256(%rcx),%xmm6
movdqa 240-256(%rcx),%xmm7
movdqa 256-256(%rcx),%xmm0
movdqa 272-256(%rcx),%xmm1
movdqa 288-256(%rcx),%xmm2
movdqa 304-256(%rcx),%xmm3
paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
movdqa %xmm6,32(%rsp)
movdqa %xmm7,48(%rsp)
movdqa (%r10),%xmm7
movl $10,%eax
movdqa %xmm0,256-256(%rcx)
jmp .Loop4x

.align 32
.Loop4x:
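# Row assignment during the 4x rounds: %xmm8-11 constants, %xmm12-15 first
# key half, %xmm4-7 second key half (two of the four parked at 0..63(%rsp) so
# %xmm6/%xmm7 can double as rotate masks and temporaries), %xmm0-3 counters
# and nonce.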
paddd %xmm12,%xmm8
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
pxor %xmm5,%xmm13
movdqa %xmm12,%xmm6
pslld $12,%xmm12
psrld $20,%xmm6
movdqa %xmm13,%xmm7
pslld $12,%xmm13
por %xmm6,%xmm12
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm13
paddd %xmm12,%xmm8
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
pxor %xmm5,%xmm13
movdqa %xmm12,%xmm7
pslld $7,%xmm12
psrld $25,%xmm7
movdqa %xmm13,%xmm6
pslld $7,%xmm13
por %xmm7,%xmm12
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm13
movdqa %xmm4,0(%rsp)
movdqa %xmm5,16(%rsp)
movdqa 32(%rsp),%xmm4
movdqa 48(%rsp),%xmm5
paddd %xmm14,%xmm10
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
pxor %xmm5,%xmm15
movdqa %xmm14,%xmm6
pslld $12,%xmm14
psrld $20,%xmm6
movdqa %xmm15,%xmm7
pslld $12,%xmm15
por %xmm6,%xmm14
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm15
paddd %xmm14,%xmm10
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
pxor %xmm5,%xmm15
movdqa %xmm14,%xmm7
pslld $7,%xmm14
psrld $25,%xmm7
movdqa %xmm15,%xmm6
pslld $7,%xmm15
por %xmm7,%xmm14
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm15
paddd %xmm13,%xmm8
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
pxor %xmm5,%xmm14
movdqa %xmm13,%xmm6
pslld $12,%xmm13
psrld $20,%xmm6
movdqa %xmm14,%xmm7
pslld $12,%xmm14
por %xmm6,%xmm13
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm14
paddd %xmm13,%xmm8
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
pxor %xmm5,%xmm14
movdqa %xmm13,%xmm7
pslld $7,%xmm13
psrld $25,%xmm7
movdqa %xmm14,%xmm6
pslld $7,%xmm14
por %xmm7,%xmm13
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm14
movdqa %xmm4,32(%rsp)
movdqa %xmm5,48(%rsp)
movdqa 0(%rsp),%xmm4
movdqa 16(%rsp),%xmm5
paddd %xmm15,%xmm10
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
pxor %xmm5,%xmm12
movdqa %xmm15,%xmm6
pslld $12,%xmm15
psrld $20,%xmm6
movdqa %xmm12,%xmm7
pslld $12,%xmm12
por %xmm6,%xmm15
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm12
paddd %xmm15,%xmm10
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
pxor %xmm5,%xmm12
movdqa %xmm15,%xmm7
pslld $7,%xmm15
psrld $25,%xmm7
movdqa %xmm12,%xmm6
pslld $7,%xmm12
por %xmm7,%xmm15
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm12
decl %eax
jnz .Loop4x
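# Feed-forward, then transpose: the punpck{l,h}{dq,qdq} sequences convert the
# word-sliced layout back into four consecutive 64-byte keystream blocks
# before they are XORed with the input (full blocks) or spilled for the tail.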

paddd 64(%rsp),%xmm8
paddd 80(%rsp),%xmm9
paddd 96(%rsp),%xmm10
paddd 112(%rsp),%xmm11

movdqa %xmm8,%xmm6
punpckldq %xmm9,%xmm8
movdqa %xmm10,%xmm7
punpckldq %xmm11,%xmm10
punpckhdq %xmm9,%xmm6
punpckhdq %xmm11,%xmm7
movdqa %xmm8,%xmm9
punpcklqdq %xmm10,%xmm8
movdqa %xmm6,%xmm11
punpcklqdq %xmm7,%xmm6
punpckhqdq %xmm10,%xmm9
punpckhqdq %xmm7,%xmm11
paddd 128-256(%rcx),%xmm12
paddd 144-256(%rcx),%xmm13
paddd 160-256(%rcx),%xmm14
paddd 176-256(%rcx),%xmm15

movdqa %xmm8,0(%rsp)
movdqa %xmm9,16(%rsp)
movdqa 32(%rsp),%xmm8
movdqa 48(%rsp),%xmm9

movdqa %xmm12,%xmm10
punpckldq %xmm13,%xmm12
movdqa %xmm14,%xmm7
punpckldq %xmm15,%xmm14
punpckhdq %xmm13,%xmm10
punpckhdq %xmm15,%xmm7
movdqa %xmm12,%xmm13
punpcklqdq %xmm14,%xmm12
movdqa %xmm10,%xmm15
punpcklqdq %xmm7,%xmm10
punpckhqdq %xmm14,%xmm13
punpckhqdq %xmm7,%xmm15
paddd 192-256(%rcx),%xmm4
paddd 208-256(%rcx),%xmm5
paddd 224-256(%rcx),%xmm8
paddd 240-256(%rcx),%xmm9

movdqa %xmm6,32(%rsp)
movdqa %xmm11,48(%rsp)

movdqa %xmm4,%xmm14
punpckldq %xmm5,%xmm4
movdqa %xmm8,%xmm7
punpckldq %xmm9,%xmm8
punpckhdq %xmm5,%xmm14
punpckhdq %xmm9,%xmm7
movdqa %xmm4,%xmm5
punpcklqdq %xmm8,%xmm4
movdqa %xmm14,%xmm9
punpcklqdq %xmm7,%xmm14
punpckhqdq %xmm8,%xmm5
punpckhqdq %xmm7,%xmm9
paddd 256-256(%rcx),%xmm0
paddd 272-256(%rcx),%xmm1
paddd 288-256(%rcx),%xmm2
paddd 304-256(%rcx),%xmm3

movdqa %xmm0,%xmm8
punpckldq %xmm1,%xmm0
movdqa %xmm2,%xmm7
punpckldq %xmm3,%xmm2
punpckhdq %xmm1,%xmm8
punpckhdq %xmm3,%xmm7
movdqa %xmm0,%xmm1
punpcklqdq %xmm2,%xmm0
movdqa %xmm8,%xmm3
punpcklqdq %xmm7,%xmm8
punpckhqdq %xmm2,%xmm1
punpckhqdq %xmm7,%xmm3
cmpq $256,%rdx
jb .Ltail4x

movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7

movdqu %xmm6,64(%rdi)
movdqu 0(%rsi),%xmm6
movdqu %xmm11,80(%rdi)
movdqu 16(%rsi),%xmm11
movdqu %xmm2,96(%rdi)
movdqu 32(%rsi),%xmm2
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
movdqu 48(%rsi),%xmm7
pxor 32(%rsp),%xmm6
pxor %xmm10,%xmm11
pxor %xmm14,%xmm2
pxor %xmm8,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 48(%rsp),%xmm6
pxor %xmm15,%xmm11
pxor %xmm9,%xmm2
pxor %xmm3,%xmm7
movdqu %xmm6,64(%rdi)
movdqu %xmm11,80(%rdi)
movdqu %xmm2,96(%rdi)
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi

subq $256,%rdx
jnz .Loop_outer4x

jmp .Ldone4x

.Ltail4x:
cmpq $192,%rdx
jae .L192_or_more4x
cmpq $128,%rdx
jae .L128_or_more4x
cmpq $64,%rdx
jae .L64_or_more4x


xorq %r10,%r10

movdqa %xmm12,16(%rsp)
movdqa %xmm4,32(%rsp)
movdqa %xmm0,48(%rsp)
jmp .Loop_tail4x

.align 32
.L64_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7
movdqu %xmm6,0(%rdi)
movdqu %xmm11,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm7,48(%rdi)
je .Ldone4x

movdqa 16(%rsp),%xmm6
leaq 64(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm13,16(%rsp)
leaq 64(%rdi),%rdi
movdqa %xmm5,32(%rsp)
subq $64,%rdx
movdqa %xmm1,48(%rsp)
jmp .Loop_tail4x

.align 32
.L128_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7
movdqu %xmm6,64(%rdi)
movdqu %xmm11,80(%rdi)
movdqu %xmm2,96(%rdi)
movdqu %xmm7,112(%rdi)
je .Ldone4x

movdqa 32(%rsp),%xmm6
leaq 128(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm10,16(%rsp)
leaq 128(%rdi),%rdi
movdqa %xmm14,32(%rsp)
subq $128,%rdx
movdqa %xmm8,48(%rsp)
jmp .Loop_tail4x

.align 32
.L192_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7

movdqu %xmm6,64(%rdi)
movdqu 0(%rsi),%xmm6
movdqu %xmm11,80(%rdi)
movdqu 16(%rsi),%xmm11
movdqu %xmm2,96(%rdi)
movdqu 32(%rsi),%xmm2
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
movdqu 48(%rsi),%xmm7
pxor 32(%rsp),%xmm6
pxor %xmm10,%xmm11
pxor %xmm14,%xmm2
pxor %xmm8,%xmm7
movdqu %xmm6,0(%rdi)
movdqu %xmm11,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm7,48(%rdi)
je .Ldone4x

movdqa 48(%rsp),%xmm6
leaq 64(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm15,16(%rsp)
leaq 64(%rdi),%rdi
movdqa %xmm9,32(%rsp)
subq $192,%rdx
movdqa %xmm3,48(%rsp)

.Loop_tail4x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)
decq %rdx
jnz .Loop_tail4x

.Ldone4x:
leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L4x_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
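# ChaCha20_8x: AVX2 path processing eight blocks per iteration (512 bytes).
# Each %ymm register holds one state word from eight blocks; per-lane block
# counters are seeded from .Lincy and stepped by .Leight.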
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
movq %rsp,%r9
.cfi_def_cfa_register r9
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
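# Stack frame (0x280 bytes, 32-byte aligned): 0..127(%rsp) is scratch for rows
# spilled during the rounds, the broadcast constants and first key half sit at
# 128..383(%rsp) (addressed via %rcx = %rsp+256), and the second key half plus
# the per-lane counters sit at 384..639(%rsp) (addressed via %rax = %rsp+512).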










vbroadcasti128 .Lsigma(%rip),%ymm11
vbroadcasti128 (%rcx),%ymm3
vbroadcasti128 16(%rcx),%ymm15
vbroadcasti128 (%r8),%ymm7
leaq 256(%rsp),%rcx
leaq 512(%rsp),%rax
leaq .Lrot16(%rip),%r10
leaq .Lrot24(%rip),%r11

vpshufd $0x00,%ymm11,%ymm8
vpshufd $0x55,%ymm11,%ymm9
vmovdqa %ymm8,128-256(%rcx)
vpshufd $0xaa,%ymm11,%ymm10
vmovdqa %ymm9,160-256(%rcx)
vpshufd $0xff,%ymm11,%ymm11
vmovdqa %ymm10,192-256(%rcx)
vmovdqa %ymm11,224-256(%rcx)

vpshufd $0x00,%ymm3,%ymm0
vpshufd $0x55,%ymm3,%ymm1
vmovdqa %ymm0,256-256(%rcx)
vpshufd $0xaa,%ymm3,%ymm2
vmovdqa %ymm1,288-256(%rcx)
vpshufd $0xff,%ymm3,%ymm3
vmovdqa %ymm2,320-256(%rcx)
vmovdqa %ymm3,352-256(%rcx)

vpshufd $0x00,%ymm15,%ymm12
vpshufd $0x55,%ymm15,%ymm13
vmovdqa %ymm12,384-512(%rax)
vpshufd $0xaa,%ymm15,%ymm14
vmovdqa %ymm13,416-512(%rax)
vpshufd $0xff,%ymm15,%ymm15
vmovdqa %ymm14,448-512(%rax)
vmovdqa %ymm15,480-512(%rax)

vpshufd $0x00,%ymm7,%ymm4
vpshufd $0x55,%ymm7,%ymm5
vpaddd .Lincy(%rip),%ymm4,%ymm4
vpshufd $0xaa,%ymm7,%ymm6
vmovdqa %ymm5,544-512(%rax)
vpshufd $0xff,%ymm7,%ymm7
vmovdqa %ymm6,576-512(%rax)
vmovdqa %ymm7,608-512(%rax)

jmp .Loop_enter8x

.align 32
.Loop_outer8x:
vmovdqa 128-256(%rcx),%ymm8
vmovdqa 160-256(%rcx),%ymm9
vmovdqa 192-256(%rcx),%ymm10
vmovdqa 224-256(%rcx),%ymm11
vmovdqa 256-256(%rcx),%ymm0
vmovdqa 288-256(%rcx),%ymm1
vmovdqa 320-256(%rcx),%ymm2
vmovdqa 352-256(%rcx),%ymm3
vmovdqa 384-512(%rax),%ymm12
vmovdqa 416-512(%rax),%ymm13
vmovdqa 448-512(%rax),%ymm14
vmovdqa 480-512(%rax),%ymm15
vmovdqa 512-512(%rax),%ymm4
vmovdqa 544-512(%rax),%ymm5
vmovdqa 576-512(%rax),%ymm6
vmovdqa 608-512(%rax),%ymm7
vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
vmovdqa %ymm14,64(%rsp)
vmovdqa %ymm15,96(%rsp)
vbroadcasti128 (%r10),%ymm15
vmovdqa %ymm4,512-512(%rax)
movl $10,%eax
jmp .Loop8x

.align 32
.Loop8x:
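# 8x double round.  Rows: %ymm8-11 constants, %ymm0-3 first key half,
# %ymm12-15 second key half (two of the four parked at 0..127(%rsp) at any
# time), %ymm4-7 counters/nonce.  %ymm15/%ymm14 double as shift temporaries
# and as the rot16/rot24 masks re-broadcast from (%r10)/(%r11).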
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $12,%ymm0,%ymm14
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $12,%ymm1,%ymm15
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $7,%ymm0,%ymm15
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $7,%ymm1,%ymm14
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vmovdqa %ymm12,0(%rsp)
vmovdqa %ymm13,32(%rsp)
vmovdqa 64(%rsp),%ymm12
vmovdqa 96(%rsp),%ymm13
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $12,%ymm2,%ymm14
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $12,%ymm3,%ymm15
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $7,%ymm2,%ymm15
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $7,%ymm3,%ymm14
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $12,%ymm1,%ymm14
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $12,%ymm2,%ymm15
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $7,%ymm1,%ymm15
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $7,%ymm2,%ymm14
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vmovdqa %ymm12,64(%rsp)
vmovdqa %ymm13,96(%rsp)
vmovdqa 0(%rsp),%ymm12
vmovdqa 32(%rsp),%ymm13
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $12,%ymm3,%ymm14
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $12,%ymm0,%ymm15
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $7,%ymm3,%ymm15
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $7,%ymm0,%ymm14
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
decl %eax
jnz .Loop8x
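# Feed-forward, then a two-level transpose: vpunpck* interleaves words within
# each 128-bit lane and vperm2i128 recombines the lanes, yielding eight
# consecutive 64-byte keystream blocks.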

leaq 512(%rsp),%rax
vpaddd 128-256(%rcx),%ymm8,%ymm8
vpaddd 160-256(%rcx),%ymm9,%ymm9
vpaddd 192-256(%rcx),%ymm10,%ymm10
vpaddd 224-256(%rcx),%ymm11,%ymm11

vpunpckldq %ymm9,%ymm8,%ymm14
vpunpckldq %ymm11,%ymm10,%ymm15
vpunpckhdq %ymm9,%ymm8,%ymm8
vpunpckhdq %ymm11,%ymm10,%ymm10
vpunpcklqdq %ymm15,%ymm14,%ymm9
vpunpckhqdq %ymm15,%ymm14,%ymm14
vpunpcklqdq %ymm10,%ymm8,%ymm11
vpunpckhqdq %ymm10,%ymm8,%ymm8
vpaddd 256-256(%rcx),%ymm0,%ymm0
vpaddd 288-256(%rcx),%ymm1,%ymm1
vpaddd 320-256(%rcx),%ymm2,%ymm2
vpaddd 352-256(%rcx),%ymm3,%ymm3

vpunpckldq %ymm1,%ymm0,%ymm10
vpunpckldq %ymm3,%ymm2,%ymm15
vpunpckhdq %ymm1,%ymm0,%ymm0
vpunpckhdq %ymm3,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm10,%ymm1
vpunpckhqdq %ymm15,%ymm10,%ymm10
vpunpcklqdq %ymm2,%ymm0,%ymm3
vpunpckhqdq %ymm2,%ymm0,%ymm0
vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
vmovdqa %ymm15,0(%rsp)
vmovdqa %ymm9,32(%rsp)
vmovdqa 64(%rsp),%ymm15
vmovdqa 96(%rsp),%ymm9

vpaddd 384-512(%rax),%ymm12,%ymm12
vpaddd 416-512(%rax),%ymm13,%ymm13
vpaddd 448-512(%rax),%ymm15,%ymm15
vpaddd 480-512(%rax),%ymm9,%ymm9

vpunpckldq %ymm13,%ymm12,%ymm2
vpunpckldq %ymm9,%ymm15,%ymm8
vpunpckhdq %ymm13,%ymm12,%ymm12
vpunpckhdq %ymm9,%ymm15,%ymm15
vpunpcklqdq %ymm8,%ymm2,%ymm13
vpunpckhqdq %ymm8,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm12,%ymm9
vpunpckhqdq %ymm15,%ymm12,%ymm12
vpaddd 512-512(%rax),%ymm4,%ymm4
vpaddd 544-512(%rax),%ymm5,%ymm5
vpaddd 576-512(%rax),%ymm6,%ymm6
vpaddd 608-512(%rax),%ymm7,%ymm7

vpunpckldq %ymm5,%ymm4,%ymm15
vpunpckldq %ymm7,%ymm6,%ymm8
vpunpckhdq %ymm5,%ymm4,%ymm4
vpunpckhdq %ymm7,%ymm6,%ymm6
vpunpcklqdq %ymm8,%ymm15,%ymm5
vpunpckhqdq %ymm8,%ymm15,%ymm15
vpunpcklqdq %ymm6,%ymm4,%ymm7
vpunpckhqdq %ymm6,%ymm4,%ymm4
vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
vmovdqa 0(%rsp),%ymm6
vmovdqa 32(%rsp),%ymm12

cmpq $512,%rdx
jb .Ltail8x

vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
leaq 128(%rsi),%rsi
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
leaq 128(%rdi),%rdi

vpxor 0(%rsi),%ymm12,%ymm12
vpxor 32(%rsi),%ymm13,%ymm13
vpxor 64(%rsi),%ymm10,%ymm10
vpxor 96(%rsi),%ymm15,%ymm15
leaq 128(%rsi),%rsi
vmovdqu %ymm12,0(%rdi)
vmovdqu %ymm13,32(%rdi)
vmovdqu %ymm10,64(%rdi)
vmovdqu %ymm15,96(%rdi)
leaq 128(%rdi),%rdi

vpxor 0(%rsi),%ymm14,%ymm14
vpxor 32(%rsi),%ymm2,%ymm2
vpxor 64(%rsi),%ymm3,%ymm3
vpxor 96(%rsi),%ymm7,%ymm7
leaq 128(%rsi),%rsi
vmovdqu %ymm14,0(%rdi)
vmovdqu %ymm2,32(%rdi)
vmovdqu %ymm3,64(%rdi)
vmovdqu %ymm7,96(%rdi)
leaq 128(%rdi),%rdi

vpxor 0(%rsi),%ymm11,%ymm11
vpxor 32(%rsi),%ymm9,%ymm9
vpxor 64(%rsi),%ymm0,%ymm0
vpxor 96(%rsi),%ymm4,%ymm4
leaq 128(%rsi),%rsi
vmovdqu %ymm11,0(%rdi)
vmovdqu %ymm9,32(%rdi)
vmovdqu %ymm0,64(%rdi)
vmovdqu %ymm4,96(%rdi)
leaq 128(%rdi),%rdi

subq $512,%rdx
jnz .Loop_outer8x

jmp .Ldone8x

.Ltail8x:
cmpq $448,%rdx
jae .L448_or_more8x
cmpq $384,%rdx
jae .L384_or_more8x
cmpq $320,%rdx
jae .L320_or_more8x
cmpq $256,%rdx
jae .L256_or_more8x
cmpq $192,%rdx
jae .L192_or_more8x
cmpq $128,%rdx
jae .L128_or_more8x
cmpq $64,%rdx
jae .L64_or_more8x

xorq %r10,%r10
vmovdqa %ymm6,0(%rsp)
vmovdqa %ymm8,32(%rsp)
jmp .Loop_tail8x

.align 32
.L64_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
je .Ldone8x

leaq 64(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm1,0(%rsp)
leaq 64(%rdi),%rdi
subq $64,%rdx
vmovdqa %ymm5,32(%rsp)
jmp .Loop_tail8x

.align 32
.L128_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
je .Ldone8x

leaq 128(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm12,0(%rsp)
leaq 128(%rdi),%rdi
subq $128,%rdx
vmovdqa %ymm13,32(%rsp)
jmp .Loop_tail8x

.align 32
.L192_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
je .Ldone8x

leaq 192(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm10,0(%rsp)
leaq 192(%rdi),%rdi
subq $192,%rdx
vmovdqa %ymm15,32(%rsp)
jmp .Loop_tail8x

.align 32
.L256_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
je .Ldone8x

leaq 256(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm14,0(%rsp)
leaq 256(%rdi),%rdi
subq $256,%rdx
vmovdqa %ymm2,32(%rsp)
jmp .Loop_tail8x

.align 32
.L320_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
je .Ldone8x

leaq 320(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm3,0(%rsp)
leaq 320(%rdi),%rdi
subq $320,%rdx
vmovdqa %ymm7,32(%rsp)
jmp .Loop_tail8x

.align 32
.L384_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
je .Ldone8x

leaq 384(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm11,0(%rsp)
leaq 384(%rdi),%rdi
subq $384,%rdx
vmovdqa %ymm9,32(%rsp)
jmp .Loop_tail8x

.align 32
.L448_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vpxor 384(%rsi),%ymm11,%ymm11
vpxor 416(%rsi),%ymm9,%ymm9
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
vmovdqu %ymm11,384(%rdi)
vmovdqu %ymm9,416(%rdi)
je .Ldone8x

leaq 448(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm0,0(%rsp)
leaq 448(%rdi),%rdi
subq $448,%rdx
vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)
decq %rdx
jnz .Loop_tail8x

.Ldone8x:
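# vzeroall clears all %ymm registers, wiping the expanded key material before
# returning.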
vzeroall
leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L8x_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
#endif