/* crypto/bn/bn_asm.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */

#ifndef BN_DEBUG
# undef NDEBUG                  /* avoid conflicting definitions */
# define NDEBUG
#endif

#include <stdio.h>
#include <assert.h>
#include "../bn/bn_lcl.h"

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

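/*
 * rp[i] += ap[i] * w for 0 <= i < num; the final carry word is returned.
 */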
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    assert(num >= 0);
    if (num <= 0)
        return (c1);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }

    return (c1);
}

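/*
 * rp[i] = ap[i] * w plus the carry from the previous word; the final carry
 * word is returned.
 */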
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    assert(num >= 0);
    if (num <= 0)
        return (c1);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }
    return (c1);
}

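/*
 * r[2*i], r[2*i+1] = a[i]^2 for 0 <= i < n; r must have room for 2*n words.
 */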
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#else                           /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c = 0;
    BN_ULONG bl, bh;

    assert(num >= 0);
    if (num <= 0)
        return ((BN_ULONG)0);

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul_add(rp[0], ap[0], bl, bh, c);
        mul_add(rp[1], ap[1], bl, bh, c);
        mul_add(rp[2], ap[2], bl, bh, c);
        mul_add(rp[3], ap[3], bl, bh, c);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], bl, bh, c);
        ap++;
        rp++;
        num--;
    }
    return (c);
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    BN_ULONG bl, bh;

    assert(num >= 0);
    if (num <= 0)
        return ((BN_ULONG)0);

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], bl, bh, carry);
        mul(rp[1], ap[1], bl, bh, carry);
        mul(rp[2], ap[2], bl, bh, carry);
        mul(rp[3], ap[3], bl, bh, carry);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], bl, bh, carry);
        ap++;
        rp++;
        num--;
    }
    return (carry);
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr64(r[0], r[1], a[0]);
        sqr64(r[2], r[3], a[1]);
        sqr64(r[4], r[5], a[2]);
        sqr64(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr64(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#endif                          /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */

#if defined(BN_LLONG) && defined(BN_DIV2W)

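/*
 * Divide the two-word quantity (h,l) by d and return the (truncated)
 * one-word quotient.
 */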
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
}

#else

/* Divide h,l by d and return the result. */
/* I need to test this some more :-( */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;

    if (d == 0)
        return (BN_MASK2);

    i = BN_num_bits_word(d);
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    i = BN_BITS2 - i;
    if (h >= d)
        h -= d;

    if (i) {
        d <<= i;
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    dh = (d & BN_MASK2h) >> BN_BITS4;
    dl = (d & BN_MASK2l);
    for (;;) {
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        th = q * dh;
        tl = dl * q;
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        if (l < tl)
            th++;
        l -= tl;
        if (h < th) {
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        ret = q << BN_BITS4;
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    ret |= q;
    return (ret);
}
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */

#ifdef BN_LLONG
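/*
 * r[i] = a[i] + b[i] with carry propagation; the final carry (0 or 1) is
 * returned.
 */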
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULLONG ll = 0;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[1] + b[1];
        r[1] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[2] + b[2];
        r[2] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[3] + b[3];
        r[3] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a++;
        b++;
        r++;
        n--;
    }
    return ((BN_ULONG)ll);
}
#else                           /* !BN_LLONG */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG c, l, t;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

    c = 0;
# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        t = a[1];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[1]) & BN_MASK2;
        c += (l < t);
        r[1] = l;
        t = a[2];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[2]) & BN_MASK2;
        c += (l < t);
        r[2] = l;
        t = a[3];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[3]) & BN_MASK2;
        c += (l < t);
        r[3] = l;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        a++;
        b++;
        r++;
        n--;
    }
    return ((BN_ULONG)c);
}
#endif                          /* !BN_LLONG */

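/*
 * r[i] = a[i] - b[i] with borrow propagation; the final borrow (0 or 1) is
 * returned.
 */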
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

#ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
#endif
    while (n) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        a++;
        b++;
        r++;
        n--;
    }
    return (c);
}

#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)

# undef bn_mul_comba8
# undef bn_mul_comba4
# undef bn_sqr_comba8
# undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

# ifdef BN_LLONG
/*
 * Keep in mind that additions to the multiplication result can not
 * overflow, because its high half cannot be all-ones: the largest product
 * of two BN_BITS2-bit words is (2^BN_BITS2 - 1)^2 =
 * 2^(2*BN_BITS2) - 2^(BN_BITS2+1) + 1, whose high half is 2^BN_BITS2 - 2.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        BN_ULLONG tt = t+c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(tt); \
        hi = (BN_ULONG)Hw(tt); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi, tt; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; tt = hi+((c0<lo)?1:0); \
        c1 += tt; c2 += (c1<tt)?1:0; \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,ta); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b), tt; \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; tt = hi + ((c0<lo)?1:0); \
        c1 += tt; c2 += (c1<tt)?1:0; \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo = ta * ta; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else                          /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG tt; \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        tt = hi; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG lo, hi; \
        sqr64(lo,hi,(a)[i]); \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* !BN_LLONG */
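
/*
 * Illustration only (kept out of the build): the comba routines below
 * accumulate each result column r[k] = sum of a[i]*b[j] over i+j == k in a
 * trio of carry words that rotates from column to column.  A hypothetical
 * 2x2 variant written with the macros above would follow the same pattern
 * as bn_mul_comba4()/bn_mul_comba8():
 */
# if 0
static void bn_mul_comba2_example(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1 = 0, c2 = 0, c3 = 0;

    mul_add_c(a[0], b[0], c1, c2, c3); /* column 0: a0*b0 */
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1); /* column 1: a0*b1 + a1*b0 */
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[1], b[1], c3, c1, c2); /* column 2: a1*b1 */
    r[2] = c3;
    r[3] = c1;
}
# endif

/*
 * bn_mul_comba8/bn_mul_comba4 compute the full 16/8-word product r = a * b;
 * bn_sqr_comba8/bn_sqr_comba4 compute the corresponding squares r = a^2.
 */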

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * This is essentially a reference implementation, which may or may not
 * result in a performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as a starting point for
 * platform-specific assembler. The numbers mentioned apply to compiler-
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even between compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
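/*
 * This is the usual word-by-word Montgomery multiplication: each outer
 * iteration adds bp[i] * ap to the accumulator tp, then adds
 * (tp[0] * n0 mod 2^BN_BITS2) * np so that the lowest word becomes zero,
 * and finally shifts tp down by one word.  After num iterations tp holds
 * a * b / 2^(num*BN_BITS2) mod np, possibly plus one extra np, which the
 * final conditional subtraction removes.
 */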
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
#   ifdef mul64
    BN_ULONG mh;
#   endif
    volatile BN_ULONG *vp;
    int i = 0, j;

#   if 0                        /* template for platform-specific
                                 * implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#   endif
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    c0 = 0;
    ml = bp[0];
#   ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
#   else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
#   endif

    tp[num] = c0;
    tp[num + 1] = 0;
    /*
     * The multiplication for the i == 0 pass of the loop below has already
     * been done above, so enter the loop at the reduction step.
     */
    goto enter;

    for (i = 0; i < num; i++) {
        c0 = 0;
        ml = bp[i];
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
#   else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
#   endif
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;
        c0 = 0;
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
#   else
        mul_add(c1, ml, np[0], c0);
#   endif
        for (j = 1; j < num; j++) {
            c1 = tp[j];
#   ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
#   else
            mul_add(c1, ml, np[j], c0);
#   endif
            tp[j - 1] = c1 & BN_MASK2;
        }
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
#  else
/*
 * A return value of 0 indicates that the multiplication/convolution was not
 * performed, signalling the caller to fall back to the alternative/original
 * code path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif                        /* OPENSSL_BN_ASM_MONT */
# endif

#else                           /* !BN_MUL_COMBA */

/* hmm... is it faster just to do a multiply? */
# undef bn_sqr_comba4
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[8];
    bn_sqr_normal(r, a, 4, t);
}

# undef bn_sqr_comba8
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[16];
    bn_sqr_normal(r, a, 8, t);
}

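/*
 * Fallback: compute the product row by row, using bn_mul_words() for the
 * first row and bn_mul_add_words() for the remaining ones.
 */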
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
}

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
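/*
 * Same reference Montgomery multiplication as above, but expressed with the
 * word-array primitives: add bp[i] * ap, then add (tp[0] * n0) * np so the
 * low word becomes zero, then drop that word by shifting tp down one
 * position.
 */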
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, *tp, n0 = *n0p;
    volatile BN_ULONG *vp;
    int i = 0, j;

    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);

        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] += (c1 < c0 ? 1 : 0);
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
#  else
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif                        /* OPENSSL_BN_ASM_MONT */
# endif

#endif                          /* !BN_MUL_COMBA */