1 | /* x86_64 BIGNUM accelerator version 0.1, December 2002. |
2 | * |
3 | * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
4 | * project. |
5 | * |
6 | * Rights for redistribution and usage in source and binary forms are |
7 | * granted according to the OpenSSL license. Warranty of any kind is |
8 | * disclaimed. |
9 | * |
10 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
11 | * versions, like 1.0... |
12 | * A. Well, that's because this code is basically a quick-n-dirty |
13 | * proof-of-concept hack. As you can see it's implemented with |
14 | * inline assembler, which means that you're bound to GCC and that |
15 | * there might be enough room for further improvement. |
16 | * |
17 | * Q. Why inline assembler? |
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
22 | * |
23 | * Q. How much faster does it get? |
24 | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
25 | * |
26 | * sign verify sign/s verify/s |
27 | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
28 | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
29 | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
30 | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
31 | * sign verify sign/s verify/s |
32 | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
33 | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
34 | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
35 | * |
36 | * 'apps/openssl speed rsa dsa' output with this module: |
37 | * |
38 | * sign verify sign/s verify/s |
39 | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
40 | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
41 | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
42 | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
43 | * sign verify sign/s verify/s |
44 | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
45 | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
46 | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
47 | * |
 * For reference, the IA-32 assembler implementation performs
 * very much like 64-bit code compiled with no-asm on the same
 * machine.
51 | */ |
52 | |
53 | #include <openssl/bn.h> |
54 | |
55 | // TODO(davidben): Get this file working on MSVC x64. |
56 | #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ |
57 | (defined(__GNUC__) || defined(__clang__)) |
58 | |
59 | #include "../internal.h" |
60 | |
61 | |
#undef mul
#undef mul_add

// mul_add(r, a, word, carry) computes the 128-bit value
// a * word + r + carry, writes its low 64 bits back to |r| and leaves its
// high 64 bits in |carry|. |a| and |r| are used as memory operands and
// |carry| must be a modifiable lvalue.
//
// "m"(a), "+m"(r) is the way to favor DirectPath micro-code;
// "g"(0) lets the compiler decide where it wants to keep the value of zero.
#define mul_add(r, a, word, carry)                                         \
  do {                                                                     \
    register BN_ULONG high, low;                                           \
    /* high:low = word * a (unsigned 64x64 -> 128-bit multiply). */        \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    /* carry += low; the carry-out is folded into high. */                 \
    __asm__("addq %2,%0; adcq %3,%1"                                       \
            : "+r"(carry), "+d"(high)                                      \
            : "a"(low), "g"(0)                                             \
            : "cc");                                                       \
    /* r += carry; the carry-out is folded into high. */                   \
    __asm__("addq %2,%0; adcq %3,%1"                                       \
            : "+m"(r), "+d"(high)                                          \
            : "r"(carry), "g"(0)                                           \
            : "cc");                                                       \
    /* The high word becomes the carry for the next limb. */               \
    (carry) = high;                                                        \
  } while (0)
82 | |
// mul(r, a, word, carry) computes the 128-bit value a * word + carry, stores
// its low 64 bits in |r| and leaves its high 64 bits in |carry|. |r| and
// |carry| must be modifiable lvalues.
//
// The multiplicand is constrained with "rm" rather than "g": "g" also admits
// an immediate operand, and MUL has no immediate form, so a multiplicand the
// compiler could fold to a constant would otherwise be emitted as an
// instruction the assembler rejects.
#define mul(r, a, word, carry)                                              \
  do {                                                                      \
    register BN_ULONG high, low;                                            \
    /* high:low = word * a (unsigned 64x64 -> 128-bit multiply). */         \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "rm"(a) : "cc"); \
    /* carry += low; the carry-out is folded into high. */                  \
    __asm__("addq %2,%0; adcq %3,%1"                                        \
            : "+r"(carry), "+d"(high)                                       \
            : "a"(low), "g"(0)                                              \
            : "cc");                                                        \
    (r) = (carry);                                                          \
    /* The high word becomes the carry for the next limb. */                \
    (carry) = high;                                                         \
  } while (0)
#undef sqr
// sqr(r0, r1, a) computes the 128-bit square of |a|, storing the low 64 bits
// in |r0| and the high 64 bits in |r1|.
//
// The expansion carries no trailing semicolon; the caller supplies it, so an
// invocation is exactly one statement and is safe in an unbraced if/else.
#define sqr(r0, r1, a) __asm__("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc")
96 | |
97 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, |
98 | BN_ULONG w) { |
99 | BN_ULONG c1 = 0; |
100 | |
101 | if (num == 0) { |
102 | return (c1); |
103 | } |
104 | |
105 | while (num & ~3) { |
106 | mul_add(rp[0], ap[0], w, c1); |
107 | mul_add(rp[1], ap[1], w, c1); |
108 | mul_add(rp[2], ap[2], w, c1); |
109 | mul_add(rp[3], ap[3], w, c1); |
110 | ap += 4; |
111 | rp += 4; |
112 | num -= 4; |
113 | } |
114 | if (num) { |
115 | mul_add(rp[0], ap[0], w, c1); |
116 | if (--num == 0) { |
117 | return c1; |
118 | } |
119 | mul_add(rp[1], ap[1], w, c1); |
120 | if (--num == 0) { |
121 | return c1; |
122 | } |
123 | mul_add(rp[2], ap[2], w, c1); |
124 | return c1; |
125 | } |
126 | |
127 | return c1; |
128 | } |
129 | |
130 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, |
131 | BN_ULONG w) { |
132 | BN_ULONG c1 = 0; |
133 | |
134 | if (num == 0) { |
135 | return c1; |
136 | } |
137 | |
138 | while (num & ~3) { |
139 | mul(rp[0], ap[0], w, c1); |
140 | mul(rp[1], ap[1], w, c1); |
141 | mul(rp[2], ap[2], w, c1); |
142 | mul(rp[3], ap[3], w, c1); |
143 | ap += 4; |
144 | rp += 4; |
145 | num -= 4; |
146 | } |
147 | if (num) { |
148 | mul(rp[0], ap[0], w, c1); |
149 | if (--num == 0) { |
150 | return c1; |
151 | } |
152 | mul(rp[1], ap[1], w, c1); |
153 | if (--num == 0) { |
154 | return c1; |
155 | } |
156 | mul(rp[2], ap[2], w, c1); |
157 | } |
158 | return c1; |
159 | } |
160 | |
161 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) { |
162 | if (n == 0) { |
163 | return; |
164 | } |
165 | |
166 | while (n & ~3) { |
167 | sqr(r[0], r[1], a[0]); |
168 | sqr(r[2], r[3], a[1]); |
169 | sqr(r[4], r[5], a[2]); |
170 | sqr(r[6], r[7], a[3]); |
171 | a += 4; |
172 | r += 8; |
173 | n -= 4; |
174 | } |
175 | if (n) { |
176 | sqr(r[0], r[1], a[0]); |
177 | if (--n == 0) { |
178 | return; |
179 | } |
180 | sqr(r[2], r[3], a[1]); |
181 | if (--n == 0) { |
182 | return; |
183 | } |
184 | sqr(r[4], r[5], a[2]); |
185 | } |
186 | } |
187 | |
// bn_add_words sets rp[i] = ap[i] + bp[i] (plus the carry from the previous
// word) for each of the |n| words and returns the final carry, 0 or 1. It
// returns 0 without touching memory when |n| is zero.
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;

  // The loop below tests its counter only after the first iteration, so an
  // empty input must be rejected up front.
  if (n == 0) {
    return 0;
  }

  // Operands: %0 = ret (scratch word, then carry mask), %1 = n (countdown),
  // %2 = i (word index), %3 = rp, %4 = ap, %5 = bp.
  __asm__ volatile (
      " subq %0,%0 \n"  // clear carry
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"  // ret = ap[i]
      " adcq (%5,%2,8),%0 \n"  // ret += bp[i] + carry
      " movq %0,(%3,%2,8) \n"  // rp[i] = ret
      " lea 1(%2),%2 \n"       // i++ via lea, which leaves the flags alone
      " dec %1 \n"             // n--; dec does not modify the carry flag
      " jnz 1b \n"
      " sbbq %0,%0 \n"         // ret = all-ones if carry is set, else zero
      : "=&r" (ret), "+c" (n), "+r" (i)
      : "r" (rp), "r" (ap), "r" (bp)
      : "cc" , "memory" );

  // Reduce the all-ones/zero mask to a 1/0 carry value.
  return ret & 1;
}
215 | |
// bn_sub_words sets rp[i] = ap[i] - bp[i] (minus the borrow from the previous
// word) for each of the |n| words and returns the final borrow, 0 or 1. It
// returns 0 without touching memory when |n| is zero.
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;

  // The loop below tests its counter only after the first iteration, so an
  // empty input must be rejected up front.
  if (n == 0) {
    return 0;
  }

  // Operands: %0 = ret (scratch word, then borrow mask), %1 = n (countdown),
  // %2 = i (word index), %3 = rp, %4 = ap, %5 = bp.
  __asm__ volatile (
      " subq %0,%0 \n"  // clear borrow
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"  // ret = ap[i]
      " sbbq (%5,%2,8),%0 \n"  // ret -= bp[i] + borrow
      " movq %0,(%3,%2,8) \n"  // rp[i] = ret
      " lea 1(%2),%2 \n"       // i++ via lea, which leaves the flags alone
      " dec %1 \n"             // n--; dec does not modify the carry flag
      " jnz 1b \n"
      " sbbq %0,%0 \n"         // ret = all-ones if borrow is set, else zero
      : "=&r" (ret), "+c" (n), "+r" (i)
      : "r" (rp), "r" (ap), "r" (bp)
      : "cc" , "memory" );

  // Reduce the all-ones/zero mask to a 1/0 borrow value.
  return ret & 1;
}
243 | |
244 | // mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) |
245 | // mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) |
246 | // sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) |
// sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
248 | |
249 | // Keep in mind that carrying into high part of multiplication result can not |
250 | // overflow, because it cannot be all-ones. |
// mul_add_c(a, b, c0, c1, c2) adds the 128-bit product a * b to the
// three-word accumulator (c2, c1, c0). |b| is passed to the multiply as a
// memory operand; c0, c1 and c2 must be modifiable lvalues.
#define mul_add_c(a, b, c0, c1, c2)                                  \
  do {                                                               \
    BN_ULONG t1, t2;                                                 \
    /* t2:t1 = a * b (unsigned 64x64 -> 128-bit multiply). */        \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    /* (c2, c1, c0) += (0, t2, t1), rippling the carry upward. */    \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
            : "+r"(c0), "+r"(c1), "+r"(c2)                           \
            : "r"(t1), "r"(t2), "g"(0)                               \
            : "cc");                                                 \
  } while (0)
260 | |
// sqr_add_c(a, i, c0, c1, c2) adds the 128-bit square of a[i] to the
// three-word accumulator (c2, c1, c0). c0, c1 and c2 must be modifiable
// lvalues.
#define sqr_add_c(a, i, c0, c1, c2)                               \
  do {                                                            \
    BN_ULONG t1, t2;                                              \
    /* t2:t1 = a[i] * a[i]; the single input is multiplied by */  \
    /* itself because it is already in the implicit register. */  \
    __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
    /* (c2, c1, c0) += (0, t2, t1), rippling the carry upward. */ \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                  \
            : "+r"(c0), "+r"(c1), "+r"(c2)                        \
            : "r"(t1), "r"(t2), "g"(0)                            \
            : "cc");                                              \
  } while (0)
270 | |
// mul_add_c2(a, b, c0, c1, c2) adds twice the 128-bit product a * b to the
// three-word accumulator (c2, c1, c0). The product is computed once and then
// accumulated twice, rather than being doubled first. |b| is passed to the
// multiply as a memory operand; c0, c1 and c2 must be modifiable lvalues.
#define mul_add_c2(a, b, c0, c1, c2)                                 \
  do {                                                               \
    BN_ULONG t1, t2;                                                 \
    /* t2:t1 = a * b (unsigned 64x64 -> 128-bit multiply). */        \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    /* First accumulation: (c2, c1, c0) += (0, t2, t1). */           \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
            : "+r"(c0), "+r"(c1), "+r"(c2)                           \
            : "r"(t1), "r"(t2), "g"(0)                               \
            : "cc");                                                 \
    /* Second accumulation of the same product. */                   \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
            : "+r"(c0), "+r"(c1), "+r"(c2)                           \
            : "r"(t1), "r"(t2), "g"(0)                               \
            : "cc");                                                 \
  } while (0)
284 | |
// sqr_add_c2(a, i, j, c0, c1, c2) adds 2 * a[i] * a[j] to the three-word
// accumulator (c2, c1, c0) by forwarding to mul_add_c2.
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
286 | |
// bn_mul_comba8 sets the 16 words of |r| to the full product of the 8-word
// numbers |a| and |b|.
//
// The product is built one output word ("column") at a time: column k
// accumulates every a[i] * b[j] with i + j == k into a three-word
// accumulator, and the accumulator's lowest word then becomes r[k]. The
// roles of c1, c2 and c3 rotate from column to column, so the two upper
// words carry over into the next column without being copied; the word just
// written out is zeroed and reused as the new top word.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7 (the widest: all eight cross products).
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14; the accumulator's middle word is the final output word r[15].
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
388 | |
// bn_mul_comba4 sets the 8 words of |r| to the full product of the 4-word
// numbers |a| and |b|.
//
// Column k accumulates every a[i] * b[j] with i + j == k into a three-word
// accumulator whose lowest word then becomes r[k]. The roles of c1, c2 and
// c3 rotate from column to column so the upper words carry over into the
// next column without being copied.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3 (the widest: all four cross products).
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6; the accumulator's middle word is the final output word r[7].
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
426 | |
// bn_sqr_comba8 sets the 16 words of |r| to the square of the 8-word number
// |a|.
//
// Column k accumulates the terms a[i] * a[j] with i + j == k into a
// three-word accumulator whose lowest word then becomes r[k]. Each
// off-diagonal pair (i != j) is added once, doubled, via sqr_add_c2; the
// diagonal term a[i]^2 is added via sqr_add_c and appears only in even
// columns. The roles of c1, c2 and c3 rotate from column to column so the
// upper words carry over into the next column without being copied.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14; the accumulator's middle word is the final output word r[15].
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
500 | |
// bn_sqr_comba4 sets the 8 words of |r| to the square of the 4-word number
// |a|.
//
// Column k accumulates the terms a[i] * a[j] with i + j == k into a
// three-word accumulator whose lowest word then becomes r[k]. Each
// off-diagonal pair (i != j) is added once, doubled, via sqr_add_c2; the
// diagonal term a[i]^2 is added via sqr_add_c and appears only in even
// columns. The roles of c1, c2 and c3 rotate from column to column so the
// upper words carry over into the next column without being copied.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6; the accumulator's middle word is the final output word r[7].
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
532 | |
533 | #undef mul_add |
534 | #undef mul |
535 | #undef sqr |
536 | #undef mul_add_c |
537 | #undef sqr_add_c |
538 | #undef mul_add_c2 |
539 | #undef sqr_add_c2 |
540 | |
541 | #endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__) |
542 | |