1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #include "../common/portability_macros.h" |
12 | |
13 | /* Stack marking |
14 | * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart |
15 | */ |
16 | #if defined(__ELF__) && defined(__GNUC__) |
17 | .section .note.GNU-stack,"" ,%progbits |
18 | #endif |
19 | |
20 | #if ZSTD_ENABLE_ASM_X86_64_BMI2 |
21 | |
22 | /* Calling convention: |
23 | * |
24 | * %rdi contains the first argument: HUF_DecompressAsmArgs*. |
25 | * %rbp isn't maintained (no frame pointer). |
26 | * %rsp contains the stack pointer that grows down. |
27 | * No red-zone is assumed, only addresses >= %rsp are used. |
28 | * All register contents are preserved. |
29 | * |
30 | * TODO: Support Windows calling convention. |
31 | */ |
32 | |
33 | ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) |
34 | ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) |
35 | ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) |
36 | ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) |
37 | .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop |
38 | .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop |
39 | .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop |
40 | .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop |
41 | .text |
42 | |
43 | /* Sets up register mappings for clarity. |
44 | * op[], bits[], dtable & ip[0] each get their own register. |
45 | * ip[1,2,3] & olimit alias var[]. |
46 | * %rax is a scratch register. |
47 | */ |
48 | |
49 | #define op0 rsi |
50 | #define op1 rbx |
51 | #define op2 rcx |
52 | #define op3 rdi |
53 | |
54 | #define ip0 r8 |
55 | #define ip1 r9 |
56 | #define ip2 r10 |
57 | #define ip3 r11 |
58 | |
59 | #define bits0 rbp |
60 | #define bits1 rdx |
61 | #define bits2 r12 |
62 | #define bits3 r13 |
63 | #define dtable r14 |
64 | #define olimit r15 |
65 | |
66 | /* var[] aliases ip[1,2,3] & olimit |
67 | * ip[1,2,3] are saved every iteration. |
68 | * olimit is only used in compute_olimit. |
69 | */ |
70 | #define var0 r15 |
71 | #define var1 r9 |
72 | #define var2 r10 |
73 | #define var3 r11 |
74 | |
75 | /* 32-bit var registers */ |
76 | #define vard0 r15d |
77 | #define vard1 r9d |
78 | #define vard2 r10d |
79 | #define vard3 r11d |
80 | |
81 | /* Calls X(N) for each stream 0, 1, 2, 3. */ |
82 | #define FOR_EACH_STREAM(X) \ |
83 | X(0); \ |
84 | X(1); \ |
85 | X(2); \ |
86 | X(3) |
87 | |
88 | /* Calls X(N, idx) for each stream 0, 1, 2, 3. */ |
89 | #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ |
90 | X(0, idx); \ |
91 | X(1, idx); \ |
92 | X(2, idx); \ |
93 | X(3, idx) |
94 | |
95 | /* Define both _HUF_* & HUF_* symbols because MacOS |
96 | * C symbols are prefixed with '_' & Linux symbols aren't. |
97 | */ |
98 | _HUF_decompress4X1_usingDTable_internal_fast_asm_loop: |
99 | HUF_decompress4X1_usingDTable_internal_fast_asm_loop: |
100 | ZSTD_CET_ENDBRANCH |
101 | /* Save all registers - even if they are callee saved for simplicity. */ |
102 | push %rax |
103 | push %rbx |
104 | push %rcx |
105 | push %rdx |
106 | push %rbp |
107 | push %rsi |
108 | push %rdi |
109 | push %r8 |
110 | push %r9 |
111 | push %r10 |
112 | push %r11 |
113 | push %r12 |
114 | push %r13 |
115 | push %r14 |
116 | push %r15 |
117 | |
118 | /* Read HUF_DecompressAsmArgs* args from %rax */ |
119 | movq %rdi, %rax |
120 | movq 0(%rax), %ip0 |
121 | movq 8(%rax), %ip1 |
122 | movq 16(%rax), %ip2 |
123 | movq 24(%rax), %ip3 |
124 | movq 32(%rax), %op0 |
125 | movq 40(%rax), %op1 |
126 | movq 48(%rax), %op2 |
127 | movq 56(%rax), %op3 |
128 | movq 64(%rax), %bits0 |
129 | movq 72(%rax), %bits1 |
130 | movq 80(%rax), %bits2 |
131 | movq 88(%rax), %bits3 |
132 | movq 96(%rax), %dtable |
133 | push %rax /* argument */ |
134 | push 104(%rax) /* ilimit */ |
135 | push 112(%rax) /* oend */ |
136 | push %olimit /* olimit space */ |
137 | |
138 | subq $24, %rsp |
139 | |
140 | .L_4X1_compute_olimit: |
141 | /* Computes how many iterations we can do safely |
142 | * %r15, %rax may be clobbered |
143 | * rbx, rdx must be saved |
144 | * op3 & ip0 mustn't be clobbered |
145 | */ |
146 | movq %rbx, 0(%rsp) |
147 | movq %rdx, 8(%rsp) |
148 | |
149 | movq 32(%rsp), %rax /* rax = oend */ |
150 | subq %op3, %rax /* rax = oend - op3 */ |
151 | |
152 | /* r15 = (oend - op3) / 5 */ |
153 | movabsq $-3689348814741910323, %rdx |
154 | mulq %rdx |
155 | movq %rdx, %r15 |
156 | shrq $2, %r15 |
157 | |
158 | movq %ip0, %rax /* rax = ip0 */ |
159 | movq 40(%rsp), %rdx /* rdx = ilimit */ |
160 | subq %rdx, %rax /* rax = ip0 - ilimit */ |
161 | movq %rax, %rbx /* rbx = ip0 - ilimit */ |
162 | |
163 | /* rdx = (ip0 - ilimit) / 7 */ |
164 | movabsq $2635249153387078803, %rdx |
165 | mulq %rdx |
166 | subq %rdx, %rbx |
167 | shrq %rbx |
168 | addq %rbx, %rdx |
169 | shrq $2, %rdx |
170 | |
171 | /* r15 = min(%rdx, %r15) */ |
172 | cmpq %rdx, %r15 |
173 | cmova %rdx, %r15 |
174 | |
175 | /* r15 = r15 * 5 */ |
176 | leaq (%r15, %r15, 4), %r15 |
177 | |
178 | /* olimit = op3 + r15 */ |
179 | addq %op3, %olimit |
180 | |
181 | movq 8(%rsp), %rdx |
182 | movq 0(%rsp), %rbx |
183 | |
184 | /* If (op3 + 20 > olimit) */ |
185 | movq %op3, %rax /* rax = op3 */ |
186 | addq $20, %rax /* rax = op3 + 20 */ |
187 | cmpq %rax, %olimit /* op3 + 20 > olimit */ |
188 | jb .L_4X1_exit |
189 | |
190 | /* If (ip1 < ip0) go to exit */ |
191 | cmpq %ip0, %ip1 |
192 | jb .L_4X1_exit |
193 | |
194 | /* If (ip2 < ip1) go to exit */ |
195 | cmpq %ip1, %ip2 |
196 | jb .L_4X1_exit |
197 | |
198 | /* If (ip3 < ip2) go to exit */ |
199 | cmpq %ip2, %ip3 |
200 | jb .L_4X1_exit |
201 | |
202 | /* Reads top 11 bits from bits[n] |
203 | * Loads dt[bits[n]] into var[n] |
204 | */ |
205 | #define GET_NEXT_DELT(n) \ |
206 | movq $53, %var##n; \ |
207 | shrxq %var##n, %bits##n, %var##n; \ |
208 | movzwl (%dtable,%var##n,2),%vard##n |
209 | |
210 | /* var[n] must contain the DTable entry computed with GET_NEXT_DELT |
211 | * Moves var[n] to %rax |
212 | * bits[n] <<= var[n] & 63 |
213 | * op[n][idx] = %rax >> 8 |
214 | * %ah is a way to access bits [8, 16) of %rax |
215 | */ |
216 | #define DECODE_FROM_DELT(n, idx) \ |
217 | movq %var##n, %rax; \ |
218 | shlxq %var##n, %bits##n, %bits##n; \ |
219 | movb %ah, idx(%op##n) |
220 | |
221 | /* Assumes GET_NEXT_DELT has been called. |
222 | * Calls DECODE_FROM_DELT then GET_NEXT_DELT |
223 | */ |
224 | #define DECODE_AND_GET_NEXT(n, idx) \ |
225 | DECODE_FROM_DELT(n, idx); \ |
226 | GET_NEXT_DELT(n) \ |
227 | |
228 | /* // ctz & nbBytes is stored in bits[n] |
229 | * // nbBits is stored in %rax |
230 | * ctz = CTZ[bits[n]] |
231 | * nbBits = ctz & 7 |
232 | * nbBytes = ctz >> 3 |
233 | * op[n] += 5 |
234 | * ip[n] -= nbBytes |
235 | * // Note: x86-64 is little-endian ==> no bswap |
236 | * bits[n] = MEM_readST(ip[n]) | 1 |
237 | * bits[n] <<= nbBits |
238 | */ |
239 | #define RELOAD_BITS(n) \ |
240 | bsfq %bits##n, %bits##n; \ |
241 | movq %bits##n, %rax; \ |
242 | andq $7, %rax; \ |
243 | shrq $3, %bits##n; \ |
244 | leaq 5(%op##n), %op##n; \ |
245 | subq %bits##n, %ip##n; \ |
246 | movq (%ip##n), %bits##n; \ |
247 | orq $1, %bits##n; \ |
248 | shlx %rax, %bits##n, %bits##n |
249 | |
250 | /* Store clobbered variables on the stack */ |
251 | movq %olimit, 24(%rsp) |
252 | movq %ip1, 0(%rsp) |
253 | movq %ip2, 8(%rsp) |
254 | movq %ip3, 16(%rsp) |
255 | |
256 | /* Call GET_NEXT_DELT for each stream */ |
257 | FOR_EACH_STREAM(GET_NEXT_DELT) |
258 | |
259 | .p2align 6 |
260 | |
261 | .L_4X1_loop_body: |
262 | /* Decode 5 symbols in each of the 4 streams (20 total) |
263 | * Must have called GET_NEXT_DELT for each stream |
264 | */ |
265 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) |
266 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) |
267 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) |
268 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) |
269 | FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) |
270 | |
271 | /* Load ip[1,2,3] from stack (var[] aliases them) |
272 | * ip[] is needed for RELOAD_BITS |
273 | * Each will be stored back to the stack after RELOAD |
274 | */ |
275 | movq 0(%rsp), %ip1 |
276 | movq 8(%rsp), %ip2 |
277 | movq 16(%rsp), %ip3 |
278 | |
279 | /* Reload each stream & fetch the next table entry |
280 | * to prepare for the next iteration |
281 | */ |
282 | RELOAD_BITS(0) |
283 | GET_NEXT_DELT(0) |
284 | |
285 | RELOAD_BITS(1) |
286 | movq %ip1, 0(%rsp) |
287 | GET_NEXT_DELT(1) |
288 | |
289 | RELOAD_BITS(2) |
290 | movq %ip2, 8(%rsp) |
291 | GET_NEXT_DELT(2) |
292 | |
293 | RELOAD_BITS(3) |
294 | movq %ip3, 16(%rsp) |
295 | GET_NEXT_DELT(3) |
296 | |
297 | /* If op3 < olimit: continue the loop */ |
298 | cmp %op3, 24(%rsp) |
299 | ja .L_4X1_loop_body |
300 | |
301 | /* Reload ip[1,2,3] from stack */ |
302 | movq 0(%rsp), %ip1 |
303 | movq 8(%rsp), %ip2 |
304 | movq 16(%rsp), %ip3 |
305 | |
306 | /* Re-compute olimit */ |
307 | jmp .L_4X1_compute_olimit |
308 | |
309 | #undef GET_NEXT_DELT |
310 | #undef DECODE_FROM_DELT |
311 | #undef DECODE |
312 | #undef RELOAD_BITS |
313 | .L_4X1_exit: |
314 | addq $24, %rsp |
315 | |
316 | /* Restore stack (oend & olimit) */ |
317 | pop %rax /* olimit */ |
318 | pop %rax /* oend */ |
319 | pop %rax /* ilimit */ |
320 | pop %rax /* arg */ |
321 | |
322 | /* Save ip / op / bits */ |
323 | movq %ip0, 0(%rax) |
324 | movq %ip1, 8(%rax) |
325 | movq %ip2, 16(%rax) |
326 | movq %ip3, 24(%rax) |
327 | movq %op0, 32(%rax) |
328 | movq %op1, 40(%rax) |
329 | movq %op2, 48(%rax) |
330 | movq %op3, 56(%rax) |
331 | movq %bits0, 64(%rax) |
332 | movq %bits1, 72(%rax) |
333 | movq %bits2, 80(%rax) |
334 | movq %bits3, 88(%rax) |
335 | |
336 | /* Restore registers */ |
337 | pop %r15 |
338 | pop %r14 |
339 | pop %r13 |
340 | pop %r12 |
341 | pop %r11 |
342 | pop %r10 |
343 | pop %r9 |
344 | pop %r8 |
345 | pop %rdi |
346 | pop %rsi |
347 | pop %rbp |
348 | pop %rdx |
349 | pop %rcx |
350 | pop %rbx |
351 | pop %rax |
352 | ret |
353 | |
354 | _HUF_decompress4X2_usingDTable_internal_fast_asm_loop: |
355 | HUF_decompress4X2_usingDTable_internal_fast_asm_loop: |
356 | ZSTD_CET_ENDBRANCH |
357 | /* Save all registers - even if they are callee saved for simplicity. */ |
358 | push %rax |
359 | push %rbx |
360 | push %rcx |
361 | push %rdx |
362 | push %rbp |
363 | push %rsi |
364 | push %rdi |
365 | push %r8 |
366 | push %r9 |
367 | push %r10 |
368 | push %r11 |
369 | push %r12 |
370 | push %r13 |
371 | push %r14 |
372 | push %r15 |
373 | |
374 | movq %rdi, %rax |
375 | movq 0(%rax), %ip0 |
376 | movq 8(%rax), %ip1 |
377 | movq 16(%rax), %ip2 |
378 | movq 24(%rax), %ip3 |
379 | movq 32(%rax), %op0 |
380 | movq 40(%rax), %op1 |
381 | movq 48(%rax), %op2 |
382 | movq 56(%rax), %op3 |
383 | movq 64(%rax), %bits0 |
384 | movq 72(%rax), %bits1 |
385 | movq 80(%rax), %bits2 |
386 | movq 88(%rax), %bits3 |
387 | movq 96(%rax), %dtable |
388 | push %rax /* argument */ |
389 | push %rax /* olimit */ |
390 | push 104(%rax) /* ilimit */ |
391 | |
392 | movq 112(%rax), %rax |
393 | push %rax /* oend3 */ |
394 | |
395 | movq %op3, %rax |
396 | push %rax /* oend2 */ |
397 | |
398 | movq %op2, %rax |
399 | push %rax /* oend1 */ |
400 | |
401 | movq %op1, %rax |
402 | push %rax /* oend0 */ |
403 | |
404 | /* Scratch space */ |
405 | subq $8, %rsp |
406 | |
407 | .L_4X2_compute_olimit: |
408 | /* Computes how many iterations we can do safely |
409 | * %r15, %rax may be clobbered |
410 | * rdx must be saved |
411 | * op[1,2,3,4] & ip0 mustn't be clobbered |
412 | */ |
413 | movq %rdx, 0(%rsp) |
414 | |
415 | /* We can consume up to 7 input bytes each iteration. */ |
416 | movq %ip0, %rax /* rax = ip0 */ |
417 | movq 40(%rsp), %rdx /* rdx = ilimit */ |
418 | subq %rdx, %rax /* rax = ip0 - ilimit */ |
419 | movq %rax, %r15 /* r15 = ip0 - ilimit */ |
420 | |
421 | /* rdx = rax / 7 */ |
422 | movabsq $2635249153387078803, %rdx |
423 | mulq %rdx |
424 | subq %rdx, %r15 |
425 | shrq %r15 |
426 | addq %r15, %rdx |
427 | shrq $2, %rdx |
428 | |
429 | /* r15 = (ip0 - ilimit) / 7 */ |
430 | movq %rdx, %r15 |
431 | |
432 | /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ |
433 | movq 8(%rsp), %rax /* rax = oend0 */ |
434 | subq %op0, %rax /* rax = oend0 - op0 */ |
435 | movq 16(%rsp), %rdx /* rdx = oend1 */ |
436 | subq %op1, %rdx /* rdx = oend1 - op1 */ |
437 | |
438 | cmpq %rax, %rdx |
439 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
440 | |
441 | movq 24(%rsp), %rax /* rax = oend2 */ |
442 | subq %op2, %rax /* rax = oend2 - op2 */ |
443 | |
444 | cmpq %rax, %rdx |
445 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
446 | |
447 | movq 32(%rsp), %rax /* rax = oend3 */ |
448 | subq %op3, %rax /* rax = oend3 - op3 */ |
449 | |
450 | cmpq %rax, %rdx |
451 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
452 | |
453 | movabsq $-3689348814741910323, %rax |
454 | mulq %rdx |
455 | shrq $3, %rdx /* rdx = rdx / 10 */ |
456 | |
457 | /* r15 = min(%rdx, %r15) */ |
458 | cmpq %rdx, %r15 |
459 | cmova %rdx, %r15 |
460 | |
461 | /* olimit = op3 + 5 * r15 */ |
462 | movq %r15, %rax |
463 | leaq (%op3, %rax, 4), %olimit |
464 | addq %rax, %olimit |
465 | |
466 | movq 0(%rsp), %rdx |
467 | |
468 | /* If (op3 + 10 > olimit) */ |
469 | movq %op3, %rax /* rax = op3 */ |
470 | addq $10, %rax /* rax = op3 + 10 */ |
471 | cmpq %rax, %olimit /* op3 + 10 > olimit */ |
472 | jb .L_4X2_exit |
473 | |
474 | /* If (ip1 < ip0) go to exit */ |
475 | cmpq %ip0, %ip1 |
476 | jb .L_4X2_exit |
477 | |
478 | /* If (ip2 < ip1) go to exit */ |
479 | cmpq %ip1, %ip2 |
480 | jb .L_4X2_exit |
481 | |
482 | /* If (ip3 < ip2) go to exit */ |
483 | cmpq %ip2, %ip3 |
484 | jb .L_4X2_exit |
485 | |
486 | #define DECODE(n, idx) \ |
487 | movq %bits##n, %rax; \ |
488 | shrq $53, %rax; \ |
489 | movzwl 0(%dtable,%rax,4),%r8d; \ |
490 | movzbl 2(%dtable,%rax,4),%r15d; \ |
491 | movzbl 3(%dtable,%rax,4),%eax; \ |
492 | movw %r8w, (%op##n); \ |
493 | shlxq %r15, %bits##n, %bits##n; \ |
494 | addq %rax, %op##n |
495 | |
496 | #define RELOAD_BITS(n) \ |
497 | bsfq %bits##n, %bits##n; \ |
498 | movq %bits##n, %rax; \ |
499 | shrq $3, %bits##n; \ |
500 | andq $7, %rax; \ |
501 | subq %bits##n, %ip##n; \ |
502 | movq (%ip##n), %bits##n; \ |
503 | orq $1, %bits##n; \ |
504 | shlxq %rax, %bits##n, %bits##n |
505 | |
506 | |
507 | movq %olimit, 48(%rsp) |
508 | |
509 | .p2align 6 |
510 | |
511 | .L_4X2_loop_body: |
512 | /* We clobber r8, so store it on the stack */ |
513 | movq %r8, 0(%rsp) |
514 | |
515 | /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ |
516 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) |
517 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) |
518 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) |
519 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) |
520 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) |
521 | |
522 | /* Reload r8 */ |
523 | movq 0(%rsp), %r8 |
524 | |
525 | FOR_EACH_STREAM(RELOAD_BITS) |
526 | |
527 | cmp %op3, 48(%rsp) |
528 | ja .L_4X2_loop_body |
529 | jmp .L_4X2_compute_olimit |
530 | |
531 | #undef DECODE |
532 | #undef RELOAD_BITS |
533 | .L_4X2_exit: |
534 | addq $8, %rsp |
535 | /* Restore stack (oend & olimit) */ |
536 | pop %rax /* oend0 */ |
537 | pop %rax /* oend1 */ |
538 | pop %rax /* oend2 */ |
539 | pop %rax /* oend3 */ |
540 | pop %rax /* ilimit */ |
541 | pop %rax /* olimit */ |
542 | pop %rax /* arg */ |
543 | |
544 | /* Save ip / op / bits */ |
545 | movq %ip0, 0(%rax) |
546 | movq %ip1, 8(%rax) |
547 | movq %ip2, 16(%rax) |
548 | movq %ip3, 24(%rax) |
549 | movq %op0, 32(%rax) |
550 | movq %op1, 40(%rax) |
551 | movq %op2, 48(%rax) |
552 | movq %op3, 56(%rax) |
553 | movq %bits0, 64(%rax) |
554 | movq %bits1, 72(%rax) |
555 | movq %bits2, 80(%rax) |
556 | movq %bits3, 88(%rax) |
557 | |
558 | /* Restore registers */ |
559 | pop %r15 |
560 | pop %r14 |
561 | pop %r13 |
562 | pop %r12 |
563 | pop %r11 |
564 | pop %r10 |
565 | pop %r9 |
566 | pop %r8 |
567 | pop %rdi |
568 | pop %rsi |
569 | pop %rbp |
570 | pop %rdx |
571 | pop %rcx |
572 | pop %rbx |
573 | pop %rax |
574 | ret |
575 | |
576 | #endif |
577 | |