1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11#include "../common/portability_macros.h"
12
13/* Stack marking
14 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15 */
16#if defined(__ELF__) && defined(__GNUC__)
17.section .note.GNU-stack,"",%progbits
18#endif
19
20#if ZSTD_ENABLE_ASM_X86_64_BMI2
21
22/* Calling convention:
23 *
24 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25 * %rbp isn't maintained (no frame pointer).
26 * %rsp contains the stack pointer that grows down.
27 * No red-zone is assumed, only addresses >= %rsp are used.
28 * All register contents are preserved.
29 *
30 * TODO: Support Windows calling convention.
31 */
32
33ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
34ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
35ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
36ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
37.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
38.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
39.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
40.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
41.text
42
43/* Sets up register mappings for clarity.
44 * op[], bits[], dtable & ip[0] each get their own register.
45 * ip[1,2,3] & olimit alias var[].
46 * %rax is a scratch register.
47 */
48
49#define op0 rsi
50#define op1 rbx
51#define op2 rcx
52#define op3 rdi
53
54#define ip0 r8
55#define ip1 r9
56#define ip2 r10
57#define ip3 r11
58
59#define bits0 rbp
60#define bits1 rdx
61#define bits2 r12
62#define bits3 r13
63#define dtable r14
64#define olimit r15
65
66/* var[] aliases ip[1,2,3] & olimit
67 * ip[1,2,3] are saved every iteration.
68 * olimit is only used in compute_olimit.
69 */
70#define var0 r15
71#define var1 r9
72#define var2 r10
73#define var3 r11
74
75/* 32-bit var registers */
76#define vard0 r15d
77#define vard1 r9d
78#define vard2 r10d
79#define vard3 r11d
80
81/* Calls X(N) for each stream 0, 1, 2, 3. */
82#define FOR_EACH_STREAM(X) \
83 X(0); \
84 X(1); \
85 X(2); \
86 X(3)
87
88/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90 X(0, idx); \
91 X(1, idx); \
92 X(2, idx); \
93 X(3, idx)
94
95/* Define both _HUF_* & HUF_* symbols because MacOS
96 * C symbols are prefixed with '_' & Linux symbols aren't.
97 */
98_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
99HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
100 ZSTD_CET_ENDBRANCH
101 /* Save all registers - even if they are callee saved for simplicity. */
102 push %rax
103 push %rbx
104 push %rcx
105 push %rdx
106 push %rbp
107 push %rsi
108 push %rdi
109 push %r8
110 push %r9
111 push %r10
112 push %r11
113 push %r12
114 push %r13
115 push %r14
116 push %r15
117
118 /* Read HUF_DecompressAsmArgs* args from %rax */
119 movq %rdi, %rax
120 movq 0(%rax), %ip0
121 movq 8(%rax), %ip1
122 movq 16(%rax), %ip2
123 movq 24(%rax), %ip3
124 movq 32(%rax), %op0
125 movq 40(%rax), %op1
126 movq 48(%rax), %op2
127 movq 56(%rax), %op3
128 movq 64(%rax), %bits0
129 movq 72(%rax), %bits1
130 movq 80(%rax), %bits2
131 movq 88(%rax), %bits3
132 movq 96(%rax), %dtable
133 push %rax /* argument */
134 push 104(%rax) /* ilimit */
135 push 112(%rax) /* oend */
136 push %olimit /* olimit space */
137
138 subq $24, %rsp
139
140.L_4X1_compute_olimit:
141 /* Computes how many iterations we can do safely
142 * %r15, %rax may be clobbered
143 * rbx, rdx must be saved
144 * op3 & ip0 mustn't be clobbered
145 */
146 movq %rbx, 0(%rsp)
147 movq %rdx, 8(%rsp)
148
149 movq 32(%rsp), %rax /* rax = oend */
150 subq %op3, %rax /* rax = oend - op3 */
151
152 /* r15 = (oend - op3) / 5 */
153 movabsq $-3689348814741910323, %rdx
154 mulq %rdx
155 movq %rdx, %r15
156 shrq $2, %r15
157
158 movq %ip0, %rax /* rax = ip0 */
159 movq 40(%rsp), %rdx /* rdx = ilimit */
160 subq %rdx, %rax /* rax = ip0 - ilimit */
161 movq %rax, %rbx /* rbx = ip0 - ilimit */
162
163 /* rdx = (ip0 - ilimit) / 7 */
164 movabsq $2635249153387078803, %rdx
165 mulq %rdx
166 subq %rdx, %rbx
167 shrq %rbx
168 addq %rbx, %rdx
169 shrq $2, %rdx
170
171 /* r15 = min(%rdx, %r15) */
172 cmpq %rdx, %r15
173 cmova %rdx, %r15
174
175 /* r15 = r15 * 5 */
176 leaq (%r15, %r15, 4), %r15
177
178 /* olimit = op3 + r15 */
179 addq %op3, %olimit
180
181 movq 8(%rsp), %rdx
182 movq 0(%rsp), %rbx
183
184 /* If (op3 + 20 > olimit) */
185 movq %op3, %rax /* rax = op3 */
186 addq $20, %rax /* rax = op3 + 20 */
187 cmpq %rax, %olimit /* op3 + 20 > olimit */
188 jb .L_4X1_exit
189
190 /* If (ip1 < ip0) go to exit */
191 cmpq %ip0, %ip1
192 jb .L_4X1_exit
193
194 /* If (ip2 < ip1) go to exit */
195 cmpq %ip1, %ip2
196 jb .L_4X1_exit
197
198 /* If (ip3 < ip2) go to exit */
199 cmpq %ip2, %ip3
200 jb .L_4X1_exit
201
202/* Reads top 11 bits from bits[n]
203 * Loads dt[bits[n]] into var[n]
204 */
205#define GET_NEXT_DELT(n) \
206 movq $53, %var##n; \
207 shrxq %var##n, %bits##n, %var##n; \
208 movzwl (%dtable,%var##n,2),%vard##n
209
210/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
211 * Moves var[n] to %rax
212 * bits[n] <<= var[n] & 63
213 * op[n][idx] = %rax >> 8
214 * %ah is a way to access bits [8, 16) of %rax
215 */
216#define DECODE_FROM_DELT(n, idx) \
217 movq %var##n, %rax; \
218 shlxq %var##n, %bits##n, %bits##n; \
219 movb %ah, idx(%op##n)
220
221/* Assumes GET_NEXT_DELT has been called.
222 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
223 */
224#define DECODE_AND_GET_NEXT(n, idx) \
225 DECODE_FROM_DELT(n, idx); \
226 GET_NEXT_DELT(n) \
227
228/* // ctz & nbBytes is stored in bits[n]
229 * // nbBits is stored in %rax
230 * ctz = CTZ[bits[n]]
231 * nbBits = ctz & 7
232 * nbBytes = ctz >> 3
233 * op[n] += 5
234 * ip[n] -= nbBytes
235 * // Note: x86-64 is little-endian ==> no bswap
236 * bits[n] = MEM_readST(ip[n]) | 1
237 * bits[n] <<= nbBits
238 */
239#define RELOAD_BITS(n) \
240 bsfq %bits##n, %bits##n; \
241 movq %bits##n, %rax; \
242 andq $7, %rax; \
243 shrq $3, %bits##n; \
244 leaq 5(%op##n), %op##n; \
245 subq %bits##n, %ip##n; \
246 movq (%ip##n), %bits##n; \
247 orq $1, %bits##n; \
248 shlx %rax, %bits##n, %bits##n
249
250 /* Store clobbered variables on the stack */
251 movq %olimit, 24(%rsp)
252 movq %ip1, 0(%rsp)
253 movq %ip2, 8(%rsp)
254 movq %ip3, 16(%rsp)
255
256 /* Call GET_NEXT_DELT for each stream */
257 FOR_EACH_STREAM(GET_NEXT_DELT)
258
259 .p2align 6
260
261.L_4X1_loop_body:
262 /* Decode 5 symbols in each of the 4 streams (20 total)
263 * Must have called GET_NEXT_DELT for each stream
264 */
265 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
266 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
267 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
268 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
269 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
270
271 /* Load ip[1,2,3] from stack (var[] aliases them)
272 * ip[] is needed for RELOAD_BITS
273 * Each will be stored back to the stack after RELOAD
274 */
275 movq 0(%rsp), %ip1
276 movq 8(%rsp), %ip2
277 movq 16(%rsp), %ip3
278
279 /* Reload each stream & fetch the next table entry
280 * to prepare for the next iteration
281 */
282 RELOAD_BITS(0)
283 GET_NEXT_DELT(0)
284
285 RELOAD_BITS(1)
286 movq %ip1, 0(%rsp)
287 GET_NEXT_DELT(1)
288
289 RELOAD_BITS(2)
290 movq %ip2, 8(%rsp)
291 GET_NEXT_DELT(2)
292
293 RELOAD_BITS(3)
294 movq %ip3, 16(%rsp)
295 GET_NEXT_DELT(3)
296
297 /* If op3 < olimit: continue the loop */
298 cmp %op3, 24(%rsp)
299 ja .L_4X1_loop_body
300
301 /* Reload ip[1,2,3] from stack */
302 movq 0(%rsp), %ip1
303 movq 8(%rsp), %ip2
304 movq 16(%rsp), %ip3
305
306 /* Re-compute olimit */
307 jmp .L_4X1_compute_olimit
308
309#undef GET_NEXT_DELT
310#undef DECODE_FROM_DELT
311#undef DECODE
312#undef RELOAD_BITS
313.L_4X1_exit:
314 addq $24, %rsp
315
316 /* Restore stack (oend & olimit) */
317 pop %rax /* olimit */
318 pop %rax /* oend */
319 pop %rax /* ilimit */
320 pop %rax /* arg */
321
322 /* Save ip / op / bits */
323 movq %ip0, 0(%rax)
324 movq %ip1, 8(%rax)
325 movq %ip2, 16(%rax)
326 movq %ip3, 24(%rax)
327 movq %op0, 32(%rax)
328 movq %op1, 40(%rax)
329 movq %op2, 48(%rax)
330 movq %op3, 56(%rax)
331 movq %bits0, 64(%rax)
332 movq %bits1, 72(%rax)
333 movq %bits2, 80(%rax)
334 movq %bits3, 88(%rax)
335
336 /* Restore registers */
337 pop %r15
338 pop %r14
339 pop %r13
340 pop %r12
341 pop %r11
342 pop %r10
343 pop %r9
344 pop %r8
345 pop %rdi
346 pop %rsi
347 pop %rbp
348 pop %rdx
349 pop %rcx
350 pop %rbx
351 pop %rax
352 ret
353
354_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
355HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
356 ZSTD_CET_ENDBRANCH
357 /* Save all registers - even if they are callee saved for simplicity. */
358 push %rax
359 push %rbx
360 push %rcx
361 push %rdx
362 push %rbp
363 push %rsi
364 push %rdi
365 push %r8
366 push %r9
367 push %r10
368 push %r11
369 push %r12
370 push %r13
371 push %r14
372 push %r15
373
374 movq %rdi, %rax
375 movq 0(%rax), %ip0
376 movq 8(%rax), %ip1
377 movq 16(%rax), %ip2
378 movq 24(%rax), %ip3
379 movq 32(%rax), %op0
380 movq 40(%rax), %op1
381 movq 48(%rax), %op2
382 movq 56(%rax), %op3
383 movq 64(%rax), %bits0
384 movq 72(%rax), %bits1
385 movq 80(%rax), %bits2
386 movq 88(%rax), %bits3
387 movq 96(%rax), %dtable
388 push %rax /* argument */
389 push %rax /* olimit */
390 push 104(%rax) /* ilimit */
391
392 movq 112(%rax), %rax
393 push %rax /* oend3 */
394
395 movq %op3, %rax
396 push %rax /* oend2 */
397
398 movq %op2, %rax
399 push %rax /* oend1 */
400
401 movq %op1, %rax
402 push %rax /* oend0 */
403
404 /* Scratch space */
405 subq $8, %rsp
406
407.L_4X2_compute_olimit:
408 /* Computes how many iterations we can do safely
409 * %r15, %rax may be clobbered
410 * rdx must be saved
411 * op[1,2,3,4] & ip0 mustn't be clobbered
412 */
413 movq %rdx, 0(%rsp)
414
415 /* We can consume up to 7 input bytes each iteration. */
416 movq %ip0, %rax /* rax = ip0 */
417 movq 40(%rsp), %rdx /* rdx = ilimit */
418 subq %rdx, %rax /* rax = ip0 - ilimit */
419 movq %rax, %r15 /* r15 = ip0 - ilimit */
420
421 /* rdx = rax / 7 */
422 movabsq $2635249153387078803, %rdx
423 mulq %rdx
424 subq %rdx, %r15
425 shrq %r15
426 addq %r15, %rdx
427 shrq $2, %rdx
428
429 /* r15 = (ip0 - ilimit) / 7 */
430 movq %rdx, %r15
431
432 /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
433 movq 8(%rsp), %rax /* rax = oend0 */
434 subq %op0, %rax /* rax = oend0 - op0 */
435 movq 16(%rsp), %rdx /* rdx = oend1 */
436 subq %op1, %rdx /* rdx = oend1 - op1 */
437
438 cmpq %rax, %rdx
439 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
440
441 movq 24(%rsp), %rax /* rax = oend2 */
442 subq %op2, %rax /* rax = oend2 - op2 */
443
444 cmpq %rax, %rdx
445 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
446
447 movq 32(%rsp), %rax /* rax = oend3 */
448 subq %op3, %rax /* rax = oend3 - op3 */
449
450 cmpq %rax, %rdx
451 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
452
453 movabsq $-3689348814741910323, %rax
454 mulq %rdx
455 shrq $3, %rdx /* rdx = rdx / 10 */
456
457 /* r15 = min(%rdx, %r15) */
458 cmpq %rdx, %r15
459 cmova %rdx, %r15
460
461 /* olimit = op3 + 5 * r15 */
462 movq %r15, %rax
463 leaq (%op3, %rax, 4), %olimit
464 addq %rax, %olimit
465
466 movq 0(%rsp), %rdx
467
468 /* If (op3 + 10 > olimit) */
469 movq %op3, %rax /* rax = op3 */
470 addq $10, %rax /* rax = op3 + 10 */
471 cmpq %rax, %olimit /* op3 + 10 > olimit */
472 jb .L_4X2_exit
473
474 /* If (ip1 < ip0) go to exit */
475 cmpq %ip0, %ip1
476 jb .L_4X2_exit
477
478 /* If (ip2 < ip1) go to exit */
479 cmpq %ip1, %ip2
480 jb .L_4X2_exit
481
482 /* If (ip3 < ip2) go to exit */
483 cmpq %ip2, %ip3
484 jb .L_4X2_exit
485
486#define DECODE(n, idx) \
487 movq %bits##n, %rax; \
488 shrq $53, %rax; \
489 movzwl 0(%dtable,%rax,4),%r8d; \
490 movzbl 2(%dtable,%rax,4),%r15d; \
491 movzbl 3(%dtable,%rax,4),%eax; \
492 movw %r8w, (%op##n); \
493 shlxq %r15, %bits##n, %bits##n; \
494 addq %rax, %op##n
495
496#define RELOAD_BITS(n) \
497 bsfq %bits##n, %bits##n; \
498 movq %bits##n, %rax; \
499 shrq $3, %bits##n; \
500 andq $7, %rax; \
501 subq %bits##n, %ip##n; \
502 movq (%ip##n), %bits##n; \
503 orq $1, %bits##n; \
504 shlxq %rax, %bits##n, %bits##n
505
506
507 movq %olimit, 48(%rsp)
508
509 .p2align 6
510
511.L_4X2_loop_body:
512 /* We clobber r8, so store it on the stack */
513 movq %r8, 0(%rsp)
514
515 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
516 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
517 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
518 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
519 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
520 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
521
522 /* Reload r8 */
523 movq 0(%rsp), %r8
524
525 FOR_EACH_STREAM(RELOAD_BITS)
526
527 cmp %op3, 48(%rsp)
528 ja .L_4X2_loop_body
529 jmp .L_4X2_compute_olimit
530
531#undef DECODE
532#undef RELOAD_BITS
533.L_4X2_exit:
534 addq $8, %rsp
535 /* Restore stack (oend & olimit) */
536 pop %rax /* oend0 */
537 pop %rax /* oend1 */
538 pop %rax /* oend2 */
539 pop %rax /* oend3 */
540 pop %rax /* ilimit */
541 pop %rax /* olimit */
542 pop %rax /* arg */
543
544 /* Save ip / op / bits */
545 movq %ip0, 0(%rax)
546 movq %ip1, 8(%rax)
547 movq %ip2, 16(%rax)
548 movq %ip3, 24(%rax)
549 movq %op0, 32(%rax)
550 movq %op1, 40(%rax)
551 movq %op2, 48(%rax)
552 movq %op3, 56(%rax)
553 movq %bits0, 64(%rax)
554 movq %bits1, 72(%rax)
555 movq %bits2, 80(%rax)
556 movq %bits3, 88(%rax)
557
558 /* Restore registers */
559 pop %r15
560 pop %r14
561 pop %r13
562 pop %r12
563 pop %r11
564 pop %r10
565 pop %r9
566 pop %r8
567 pop %rdi
568 pop %rsi
569 pop %rbp
570 pop %rdx
571 pop %rcx
572 pop %rbx
573 pop %rax
574 ret
575
576#endif
577