1 | /* |
2 | * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved. |
3 | * |
4 | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | * this file except in compliance with the License. You can obtain a copy |
6 | * in the file LICENSE in the source distribution or at |
7 | * https://www.openssl.org/source/license.html |
8 | */ |
9 | |
10 | #include "../bn_local.h" |
11 | #if !(defined(__GNUC__) && __GNUC__>=2) |
12 | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
13 | #else |
14 | /*- |
15 | * x86_64 BIGNUM accelerator version 0.1, December 2002. |
16 | * |
17 | * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL |
18 | * project. |
19 | * |
20 | * Rights for redistribution and usage in source and binary forms are |
21 | * granted according to the License. Warranty of any kind is disclaimed. |
22 | * |
23 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
24 | * versions, like 1.0... |
25 | * A. Well, that's because this code is basically a quick-n-dirty |
26 | * proof-of-concept hack. As you can see it's implemented with |
27 | * inline assembler, which means that you're bound to GCC and that |
28 | * there might be enough room for further improvement. |
29 | * |
30 | * Q. Why inline assembler? |
31 | * A. x86_64 features own ABI which I'm not familiar with. This is |
32 | * why I decided to let the compiler take care of subroutine |
33 | * prologue/epilogue as well as register allocation. For reference. |
34 | * Win64 implements different ABI for AMD64, different from Linux. |
35 | * |
36 | * Q. How much faster does it get? |
37 | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
38 | * |
39 | * sign verify sign/s verify/s |
40 | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
41 | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
42 | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
43 | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
44 | * sign verify sign/s verify/s |
45 | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
46 | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
47 | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
48 | * |
49 | * 'apps/openssl speed rsa dsa' output with this module: |
50 | * |
51 | * sign verify sign/s verify/s |
52 | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
53 | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
54 | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
55 | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
56 | * sign verify sign/s verify/s |
57 | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
58 | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
59 | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
60 | * |
61 | * For the reference. IA-32 assembler implementation performs |
62 | * very much like 64-bit code compiled with no-asm on the same |
63 | * machine. |
64 | */ |
65 | |
66 | # undef mul |
67 | # undef mul_add |
68 | |
/*-
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero;
 */
/*
 * mul_add(r, a, word, carry): r += a * word + carry, with the high word of
 * the result left in carry.
 *   1st asm: mulq leaves the double-word product a * word in high:low.
 *   2nd asm: carry += low; the carry flag is folded into high.
 *   3rd asm: r += carry (r is updated in memory); carry flag folded into high.
 * r must be a memory lvalue ("+m"), a a memory operand ("m") and carry a
 * modifiable lvalue.  The block-local names high and low must not clash
 * with the macro arguments.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)
90 | |
/*
 * mul(r, a, word, carry): r = low word of (a * word + carry), with the high
 * word of that sum left in carry.
 *   1st asm: mulq leaves the double-word product a * word in high:low.
 *   2nd asm: carry += low; the carry flag is folded into high.
 * Unlike mul_add, the previous value of r is not read; r is simply assigned.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
/*
 * sqr(r0, r1, a): (r1:r0) = a * a.
 * The single input is loaded into rax and "mulq %2" multiplies rax by
 * itself; the low word of the product is written to r0 and the high word
 * to r1.
 *
 * Wrapped in do { } while (0) without a trailing semicolon, like mul and
 * mul_add, so that "sqr(...);" is exactly one statement and can be used as
 * the body of an unbraced if/else.
 */
# define sqr(r0,r1,a) do {              \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");                \
        } while (0)
109 | |
110 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
111 | BN_ULONG w) |
112 | { |
113 | BN_ULONG c1 = 0; |
114 | |
115 | if (num <= 0) |
116 | return c1; |
117 | |
118 | while (num & ~3) { |
119 | mul_add(rp[0], ap[0], w, c1); |
120 | mul_add(rp[1], ap[1], w, c1); |
121 | mul_add(rp[2], ap[2], w, c1); |
122 | mul_add(rp[3], ap[3], w, c1); |
123 | ap += 4; |
124 | rp += 4; |
125 | num -= 4; |
126 | } |
127 | if (num) { |
128 | mul_add(rp[0], ap[0], w, c1); |
129 | if (--num == 0) |
130 | return c1; |
131 | mul_add(rp[1], ap[1], w, c1); |
132 | if (--num == 0) |
133 | return c1; |
134 | mul_add(rp[2], ap[2], w, c1); |
135 | return c1; |
136 | } |
137 | |
138 | return c1; |
139 | } |
140 | |
141 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
142 | { |
143 | BN_ULONG c1 = 0; |
144 | |
145 | if (num <= 0) |
146 | return c1; |
147 | |
148 | while (num & ~3) { |
149 | mul(rp[0], ap[0], w, c1); |
150 | mul(rp[1], ap[1], w, c1); |
151 | mul(rp[2], ap[2], w, c1); |
152 | mul(rp[3], ap[3], w, c1); |
153 | ap += 4; |
154 | rp += 4; |
155 | num -= 4; |
156 | } |
157 | if (num) { |
158 | mul(rp[0], ap[0], w, c1); |
159 | if (--num == 0) |
160 | return c1; |
161 | mul(rp[1], ap[1], w, c1); |
162 | if (--num == 0) |
163 | return c1; |
164 | mul(rp[2], ap[2], w, c1); |
165 | } |
166 | return c1; |
167 | } |
168 | |
169 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
170 | { |
171 | if (n <= 0) |
172 | return; |
173 | |
174 | while (n & ~3) { |
175 | sqr(r[0], r[1], a[0]); |
176 | sqr(r[2], r[3], a[1]); |
177 | sqr(r[4], r[5], a[2]); |
178 | sqr(r[6], r[7], a[3]); |
179 | a += 4; |
180 | r += 8; |
181 | n -= 4; |
182 | } |
183 | if (n) { |
184 | sqr(r[0], r[1], a[0]); |
185 | if (--n == 0) |
186 | return; |
187 | sqr(r[2], r[3], a[1]); |
188 | if (--n == 0) |
189 | return; |
190 | sqr(r[4], r[5], a[2]); |
191 | } |
192 | } |
193 | |
/*
 * Divide the double-word value (h:l) by d with a single divq and return
 * the quotient.  The remainder that divq leaves in rdx is captured in
 * "waste" only to tell the compiler that rdx is overwritten; it is not used.
 *
 * NOTE(review): divq raises a divide-error exception when d is zero or when
 * the quotient does not fit in one word (h >= d).  Nothing here guards
 * against either case, so callers are presumably expected to guarantee
 * h < d -- confirm against the call sites.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

 asm("divq      %4":"=a"(ret), "=d"(waste)
 :     "a"(l), "d"(h), "r"(d)
 :     "cc");

    return ret;
}
204 | |
/*
 * Multi-word addition: rp[i] = ap[i] + bp[i] + carry for i in [0, n).
 * Returns the carry out of the last word (0 or 1), or 0 if n <= 0.
 *
 * Asm operands: %0 = ret (scratch word, then carry result), %1 = n (loop
 * counter, pinned to rcx/ecx by "c"), %2 = i (word index), %3 = rp,
 * %4 = ap, %5 = bp.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    /* The asm loop is a do-while: it must not be entered with n <= 0. */
    if (n <= 0)
        return 0;

    /*
     * The carry flag has to survive from one adcq to the next, so the loop
     * bookkeeping uses only instructions that leave CF alone: lea to bump
     * the index and dec (which updates ZF but not CF) for the counter.
     * After the loop, sbbq %0,%0 turns CF into 0 or all-ones.
     */
    asm volatile ("       subq    %0,%0           \n" /* clear carry */
                  "       jmp     1f              \n" /* skip alignment padding */
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n" /* %0 = ap[i] */
                  "       adcq    (%5,%2,8),%0    \n" /* %0 += bp[i] + CF */
                  "       movq    %0,(%3,%2,8)    \n" /* rp[i] = %0 */
                  "       lea     1(%2),%2        \n" /* i++ without touching flags */
                  "       dec     %1              \n" /* n--, CF preserved */
                  "       jnz     1b              \n"
                  "       sbbq    %0,%0           \n" /* %0 = CF ? ~0 : 0 */
                  :"=&r" (ret), "+c"(n), "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    /* Reduce the all-ones/zero mask to a 1/0 carry. */
    return ret & 1;
}
230 | |
231 | # ifndef SIMICS |
/*
 * Multi-word subtraction: rp[i] = ap[i] - bp[i] - borrow for i in [0, n).
 * Returns the borrow out of the last word (0 or 1), or 0 if n <= 0.
 *
 * Asm operands: %0 = ret (scratch word, then borrow result), %1 = n (loop
 * counter, pinned to rcx/ecx by "c"), %2 = i (word index), %3 = rp,
 * %4 = ap, %5 = bp.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    /* The asm loop is a do-while: it must not be entered with n <= 0. */
    if (n <= 0)
        return 0;

    /*
     * The borrow lives in the carry flag between iterations, so the loop
     * bookkeeping uses only instructions that leave CF alone: lea to bump
     * the index and dec (which updates ZF but not CF) for the counter.
     * After the loop, sbbq %0,%0 turns CF into 0 or all-ones.
     */
    asm volatile ("       subq    %0,%0           \n" /* clear borrow */
                  "       jmp     1f              \n" /* skip alignment padding */
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n" /* %0 = ap[i] */
                  "       sbbq    (%5,%2,8),%0    \n" /* %0 -= bp[i] + CF */
                  "       movq    %0,(%3,%2,8)    \n" /* rp[i] = %0 */
                  "       lea     1(%2),%2        \n" /* i++ without touching flags */
                  "       dec     %1              \n" /* n--, CF preserved */
                  "       jnz     1b              \n"
                  "       sbbq    %0,%0           \n" /* %0 = CF ? ~0 : 0 */
                  :"=&r" (ret), "+c"(n), "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    /* Reduce the all-ones/zero mask to a 1/0 borrow. */
    return ret & 1;
}
257 | # else |
258 | /* Simics 1.4<7 has buggy sbbq:-( */ |
259 | # define BN_MASK2 0xffffffffffffffffL |
260 | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
261 | { |
262 | BN_ULONG t1, t2; |
263 | int c = 0; |
264 | |
265 | if (n <= 0) |
266 | return (BN_ULONG)0; |
267 | |
268 | for (;;) { |
269 | t1 = a[0]; |
270 | t2 = b[0]; |
271 | r[0] = (t1 - t2 - c) & BN_MASK2; |
272 | if (t1 != t2) |
273 | c = (t1 < t2); |
274 | if (--n <= 0) |
275 | break; |
276 | |
277 | t1 = a[1]; |
278 | t2 = b[1]; |
279 | r[1] = (t1 - t2 - c) & BN_MASK2; |
280 | if (t1 != t2) |
281 | c = (t1 < t2); |
282 | if (--n <= 0) |
283 | break; |
284 | |
285 | t1 = a[2]; |
286 | t2 = b[2]; |
287 | r[2] = (t1 - t2 - c) & BN_MASK2; |
288 | if (t1 != t2) |
289 | c = (t1 < t2); |
290 | if (--n <= 0) |
291 | break; |
292 | |
293 | t1 = a[3]; |
294 | t2 = b[3]; |
295 | r[3] = (t1 - t2 - c) & BN_MASK2; |
296 | if (t1 != t2) |
297 | c = (t1 < t2); |
298 | if (--n <= 0) |
299 | break; |
300 | |
301 | a += 4; |
302 | b += 4; |
303 | r += 4; |
304 | } |
305 | return c; |
306 | } |
307 | # endif |
308 | |
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
316 | |
317 | /* |
318 | * Keep in mind that carrying into high part of multiplication result |
319 | * can not overflow, because it cannot be all-ones. |
320 | */ |
321 | # if 0 |
/*
 * Original portable C macros, kept for reference purposes only: this branch
 * is compiled out by the enclosing "#if 0".  Each one forms the double-word
 * product with BN_UMULT_LOHI and adds it into (c2,c1,c0), detecting carries
 * with unsigned "sum < addend" comparisons.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* Adds the product twice; tt holds the carry-adjusted high word of the first add. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
348 | # else |
/*
 * mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a * b.
 *   1st asm: mulq leaves the double-word product in t2:t1 (a is loaded into
 *            rax, b is taken as a memory operand, so b must be an lvalue).
 *   2nd asm: add/adc/adc chain adds t1 into c0, t2 into c1 and the final
 *            carry into c2.
 * c0, c1 and c2 must be modifiable lvalues; the block-local names t1 and t2
 * must not clash with the macro arguments.
 */
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
360 | |
/*
 * sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i] * a[i].
 *   1st asm: a[i] is loaded into rax and "mulq %2" multiplies rax by
 *            itself, leaving the double-word square in t2:t1.
 *   2nd asm: add/adc/adc chain adds t1 into c0, t2 into c1 and the final
 *            carry into c2.
 * The array argument is parenthesised, as in sqr_add_c2, so that a pointer
 * expression such as "p + 1" is indexed as a whole.  c0, c1 and c2 must be
 * modifiable lvalues; the block-local names t1 and t2 must not clash with
 * the macro arguments.
 */
#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"((a)[i])           \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
372 | |
/*
 * mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2 * a * b.
 *   1st asm: mulq leaves the double-word product a * b in t2:t1 (a is
 *            loaded into rax, b is a memory operand, so b must be an lvalue).
 *   2nd and 3rd asm: the same add/adc/adc chain is run twice, i.e. the
 *            product is added twice rather than being doubled first.
 * c0, c1 and c2 must be modifiable lvalues; the block-local names t1 and t2
 * must not clash with the macro arguments.
 */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
388 | # endif |
389 | |
/*
 * sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2 * a[i] * a[j].
 * Thin wrapper that forwards the two array elements to mul_add_c2.
 */
# define sqr_add_c2(a,i,j,c0,c1,c2)     \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
392 | |
/*
 * 8x8-word multiplication, computed one result column at a time:
 * r[0..15] = a[0..7] * b[0..7].
 *
 * For each column k every product a[i] * b[j] with i + j == k is added
 * into a three-word accumulator.  The lowest accumulator word is then
 * stored to r[k] and cleared, and the three variables rotate roles:
 * (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) -> ...
 * The statement order is significant; do not reorder.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the word above it becomes r[15] */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
495 | |
/*
 * 4x4-word multiplication, computed one result column at a time:
 * r[0..7] = a[0..3] * b[0..3].
 *
 * For each column k every product a[i] * b[j] with i + j == k is added
 * into a three-word accumulator.  The lowest accumulator word is then
 * stored to r[k] and cleared, and the three variables rotate roles:
 * (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) -> ...
 * The statement order is significant; do not reorder.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the word above it becomes r[7] */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
534 | |
/*
 * 8-word squaring, computed one result column at a time:
 * r[0..15] = a[0..7] * a[0..7].
 *
 * For each column k the diagonal term a[k/2]^2 (even k only) is added with
 * sqr_add_c, and each off-diagonal pair a[i] * a[j], i > j, i + j == k, is
 * added twice with sqr_add_c2.  The lowest accumulator word is then stored
 * to r[k] and cleared, and the three variables rotate roles:
 * (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) -> ...
 * The statement order is significant; do not reorder.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the word above it becomes r[15] */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
609 | |
/*
 * 4-word squaring, computed one result column at a time:
 * r[0..7] = a[0..3] * a[0..3].
 *
 * For each column k the diagonal term a[k/2]^2 (even k only) is added with
 * sqr_add_c, and each off-diagonal pair a[i] * a[j], i > j, i + j == k, is
 * added twice with sqr_add_c2.  The lowest accumulator word is then stored
 * to r[k] and cleared, and the three variables rotate roles:
 * (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) -> ...
 * The statement order is significant; do not reorder.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the word above it becomes r[7] */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
642 | #endif |
643 | |