bn_asm.c source code [Aerospike/modules/s2-geometry-library/geometry/util/math/exactfloat/bn/bn_asm.c]

/* crypto/bn/bn_asm.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3	* All rights reserved.
4	*
5	* This package is an SSL implementation written
6	* by Eric Young (eay@cryptsoft.com).
7	* The implementation was written so as to conform with Netscapes SSL.
8	*
9	* This library is free for commercial and non-commercial use as long as
10	* the following conditions are aheared to. The following conditions
11	* apply to all code found in this distribution, be it the RC4, RSA,
12	* lhash, DES, etc., code; not just the SSL code. The SSL documentation
13	* included with this distribution is covered by the same copyright terms
14	* except that the holder is Tim Hudson (tjh@cryptsoft.com).
15	*
16	* Copyright remains Eric Young's, and as such any Copyright notices in
17	* the code are not to be removed.
18	* If this package is used in a product, Eric Young should be given attribution
19	* as the author of the parts of the library used.
20	* This can be in the form of a textual message at program startup or
21	* in documentation (online or textual) provided with the package.
22	*
23	* Redistribution and use in source and binary forms, with or without
24	* modification, are permitted provided that the following conditions
25	* are met:
26	* 1. Redistributions of source code must retain the copyright
27	* notice, this list of conditions and the following disclaimer.
28	* 2. Redistributions in binary form must reproduce the above copyright
29	* notice, this list of conditions and the following disclaimer in the
30	* documentation and/or other materials provided with the distribution.
31	* 3. All advertising materials mentioning features or use of this software
32	* must display the following acknowledgement:
33	* "This product includes cryptographic software written by
34	* Eric Young (eay@cryptsoft.com)"
35	* The word 'cryptographic' can be left out if the rouines from the library
36	* being used are not cryptographic related :-).
37	* 4. If you include any Windows specific code (or a derivative thereof) from
38	* the apps directory (application code) you must include an acknowledgement:
39	* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40	*
41	* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51	* SUCH DAMAGE.
52	*
53	* The licence and distribution terms for any publically available version or
54	* derivative of this code cannot be changed. i.e. this code cannot simply be
55	* copied and put under another distribution licence
56	* [including the GNU Public Licence.]
57	*/
58
59	#ifndef BN_DEBUG
60	# undef NDEBUG /* avoid conflicting definitions */
61	# define NDEBUG
62	#endif
63
64	#include <stdio.h>
65	#include <assert.h>
66	#include "../bn/bn_lcl.h"
67
68	#if defined(BN_LLONG) \|\| defined(BN_UMULT_HIGH)
69
70	BN_ULONG bn_mul_add_words(BN_ULONG rp, const* BN_ULONG ap, int* num,
71	BN_ULONG w)
72	{
73	BN_ULONG c1 = `0`;
74
75	assert(num >= `0`);
76	if (num <= `0`)
77	return (c1);
78
79	# ifndef OPENSSL_SMALL_FOOTPRINT
80	while (num & ~`3`) {
81	mul_add(rp[`0`], ap[`0`], w, c1);
82	mul_add(rp[`1`], ap[`1`], w, c1);
83	mul_add(rp[`2`], ap[`2`], w, c1);
84	mul_add(rp[`3`], ap[`3`], w, c1);
85	ap += `4`;
86	rp += `4`;
87	num -= `4`;
88	}
89	# endif
90	while (num) {
91	mul_add(rp[`0`], ap[`0`], w, c1);
92	ap++;
93	rp++;
94	num--;
95	}
96
97	return (c1);
98	}
99
100	BN_ULONG bn_mul_words(BN_ULONG rp, const* BN_ULONG ap, int* num, BN_ULONG w)
101	{
102	BN_ULONG c1 = `0`;
103
104	assert(num >= `0`);
105	if (num <= `0`)
106	return (c1);
107
108	# ifndef OPENSSL_SMALL_FOOTPRINT
109	while (num & ~`3`) {
110	mul(rp[`0`], ap[`0`], w, c1);
111	mul(rp[`1`], ap[`1`], w, c1);
112	mul(rp[`2`], ap[`2`], w, c1);
113	mul(rp[`3`], ap[`3`], w, c1);
114	ap += `4`;
115	rp += `4`;
116	num -= `4`;
117	}
118	# endif
119	while (num) {
120	mul(rp[`0`], ap[`0`], w, c1);
121	ap++;
122	rp++;
123	num--;
124	}
125	return (c1);
126	}
127
128	void bn_sqr_words(BN_ULONG r, const* BN_ULONG a, int* n)
129	{
130	assert(n >= `0`);
131	if (n <= `0`)
132	return;
133
134	# ifndef OPENSSL_SMALL_FOOTPRINT
135	while (n & ~`3`) {
136	sqr(r[`0`], r[`1`], a[`0`]);
137	sqr(r[`2`], r[`3`], a[`1`]);
138	sqr(r[`4`], r[`5`], a[`2`]);
139	sqr(r[`6`], r[`7`], a[`3`]);
140	a += `4`;
141	r += `8`;
142	n -= `4`;
143	}
144	# endif
145	while (n) {
146	sqr(r[`0`], r[`1`], a[`0`]);
147	a++;
148	r += `2`;
149	n--;
150	}
151	}
152
153	#else /* !(defined(BN_LLONG) \|\|
154	* defined(BN_UMULT_HIGH)) */
155
156	BN_ULONG bn_mul_add_words(BN_ULONG rp, const* BN_ULONG ap, int* num,
157	BN_ULONG w)
158	{
159	BN_ULONG c = `0`;
160	BN_ULONG bl, bh;
161
162	assert(num >= `0`);
163	if (num <= `0`)
164	return ((BN_ULONG)`0`);
165
166	bl = LBITS(w);
167	bh = HBITS(w);
168
169	# ifndef OPENSSL_SMALL_FOOTPRINT
170	while (num & ~`3`) {
171	mul_add(rp[`0`], ap[`0`], bl, bh, c);
172	mul_add(rp[`1`], ap[`1`], bl, bh, c);
173	mul_add(rp[`2`], ap[`2`], bl, bh, c);
174	mul_add(rp[`3`], ap[`3`], bl, bh, c);
175	ap += `4`;
176	rp += `4`;
177	num -= `4`;
178	}
179	# endif
180	while (num) {
181	mul_add(rp[`0`], ap[`0`], bl, bh, c);
182	ap++;
183	rp++;
184	num--;
185	}
186	return (c);
187	}
188
189	BN_ULONG bn_mul_words(BN_ULONG rp, const* BN_ULONG ap, int* num, BN_ULONG w)
190	{
191	BN_ULONG carry = `0`;
192	BN_ULONG bl, bh;
193
194	assert(num >= `0`);
195	if (num <= `0`)
196	return ((BN_ULONG)`0`);
197
198	bl = LBITS(w);
199	bh = HBITS(w);
200
201	# ifndef OPENSSL_SMALL_FOOTPRINT
202	while (num & ~`3`) {
203	mul(rp[`0`], ap[`0`], bl, bh, carry);
204	mul(rp[`1`], ap[`1`], bl, bh, carry);
205	mul(rp[`2`], ap[`2`], bl, bh, carry);
206	mul(rp[`3`], ap[`3`], bl, bh, carry);
207	ap += `4`;
208	rp += `4`;
209	num -= `4`;
210	}
211	# endif
212	while (num) {
213	mul(rp[`0`], ap[`0`], bl, bh, carry);
214	ap++;
215	rp++;
216	num--;
217	}
218	return (carry);
219	}
220
221	void bn_sqr_words(BN_ULONG r, const* BN_ULONG a, int* n)
222	{
223	assert(n >= `0`);
224	if (n <= `0`)
225	return;
226
227	# ifndef OPENSSL_SMALL_FOOTPRINT
228	while (n & ~`3`) {
229	sqr64(r[`0`], r[`1`], a[`0`]);
230	sqr64(r[`2`], r[`3`], a[`1`]);
231	sqr64(r[`4`], r[`5`], a[`2`]);
232	sqr64(r[`6`], r[`7`], a[`3`]);
233	a += `4`;
234	r += `8`;
235	n -= `4`;
236	}
237	# endif
238	while (n) {
239	sqr64(r[`0`], r[`1`], a[`0`]);
240	a++;
241	r += `2`;
242	n--;
243	}
244	}
245
246	#endif /* !(defined(BN_LLONG) \|\|
247	* defined(BN_UMULT_HIGH)) */
248
249	#if defined(BN_LLONG) && defined(BN_DIV2W)
250
251	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
252	{
253	return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) \| l) / (BN_ULLONG) d));
254	}
255
256	#else
257
258	/ Divide h,l by d and return the result. /
259	/ I need to test this some more :-( /
260	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
261	{
262	BN_ULONG dh, dl, q, ret = `0`, th, tl, t;
263	int i, count = `2`;
264
265	if (d == `0`)
266	return (BN_MASK2);
267
268	i = BN_num_bits_word(d);
269	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)`1` << i));
270
271	i = BN_BITS2 - i;
272	if (h >= d)
273	h -= d;
274
275	if (i) {
276	d <<= i;
277	h = (h << i) \| (l >> (BN_BITS2 - i));
278	l <<= i;
279	}
280	dh = (d & BN_MASK2h) >> BN_BITS4;
281	dl = (d & BN_MASK2l);
282	for (;;) {
283	if ((h >> BN_BITS4) == dh)
284	q = BN_MASK2l;
285	else
286	q = h / dh;
287
288	th = q * dh;
289	tl = dl * q;
290	for (;;) {
291	t = h - th;
292	if ((t & BN_MASK2h) \|\|
293	((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
294	break;
295	q--;
296	th -= dh;
297	tl -= dl;
298	}
299	t = (tl >> BN_BITS4);
300	tl = (tl << BN_BITS4) & BN_MASK2h;
301	th += t;
302
303	if (l < tl)
304	th++;
305	l -= tl;
306	if (h < th) {
307	h += d;
308	q--;
309	}
310	h -= th;
311
312	if (--count == `0`)
313	break;
314
315	ret = q << BN_BITS4;
316	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
317	l = (l & BN_MASK2l) << BN_BITS4;
318	}
319	ret \|= q;
320	return (ret);
321	}
322	#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
323
324	#ifdef BN_LLONG
325	BN_ULONG bn_add_words(BN_ULONG r, const* BN_ULONG a, const* BN_ULONG *b,
326	int n)
327	{
328	BN_ULLONG ll = `0`;
329
330	assert(n >= `0`);
331	if (n <= `0`)
332	return ((BN_ULONG)`0`);
333
334	# ifndef OPENSSL_SMALL_FOOTPRINT
335	while (n & ~`3`) {
336	ll += (BN_ULLONG) a[`0`] + b[`0`];
337	r[`0`] = (BN_ULONG)ll & BN_MASK2;
338	ll >>= BN_BITS2;
339	ll += (BN_ULLONG) a[`1`] + b[`1`];
340	r[`1`] = (BN_ULONG)ll & BN_MASK2;
341	ll >>= BN_BITS2;
342	ll += (BN_ULLONG) a[`2`] + b[`2`];
343	r[`2`] = (BN_ULONG)ll & BN_MASK2;
344	ll >>= BN_BITS2;
345	ll += (BN_ULLONG) a[`3`] + b[`3`];
346	r[`3`] = (BN_ULONG)ll & BN_MASK2;
347	ll >>= BN_BITS2;
348	a += `4`;
349	b += `4`;
350	r += `4`;
351	n -= `4`;
352	}
353	# endif
354	while (n) {
355	ll += (BN_ULLONG) a[`0`] + b[`0`];
356	r[`0`] = (BN_ULONG)ll & BN_MASK2;
357	ll >>= BN_BITS2;
358	a++;
359	b++;
360	r++;
361	n--;
362	}
363	return ((BN_ULONG)ll);
364	}
365	#else /* !BN_LLONG */
366	BN_ULONG bn_add_words(BN_ULONG r, const* BN_ULONG a, const* BN_ULONG *b,
367	int n)
368	{
369	BN_ULONG c, l, t;
370
371	assert(n >= `0`);
372	if (n <= `0`)
373	return ((BN_ULONG)`0`);
374
375	c = `0`;
376	# ifndef OPENSSL_SMALL_FOOTPRINT
377	while (n & ~`3`) {
378	t = a[`0`];
379	t = (t + c) & BN_MASK2;
380	c = (t < c);
381	l = (t + b[`0`]) & BN_MASK2;
382	c += (l < t);
383	r[`0`] = l;
384	t = a[`1`];
385	t = (t + c) & BN_MASK2;
386	c = (t < c);
387	l = (t + b[`1`]) & BN_MASK2;
388	c += (l < t);
389	r[`1`] = l;
390	t = a[`2`];
391	t = (t + c) & BN_MASK2;
392	c = (t < c);
393	l = (t + b[`2`]) & BN_MASK2;
394	c += (l < t);
395	r[`2`] = l;
396	t = a[`3`];
397	t = (t + c) & BN_MASK2;
398	c = (t < c);
399	l = (t + b[`3`]) & BN_MASK2;
400	c += (l < t);
401	r[`3`] = l;
402	a += `4`;
403	b += `4`;
404	r += `4`;
405	n -= `4`;
406	}
407	# endif
408	while (n) {
409	t = a[`0`];
410	t = (t + c) & BN_MASK2;
411	c = (t < c);
412	l = (t + b[`0`]) & BN_MASK2;
413	c += (l < t);
414	r[`0`] = l;
415	a++;
416	b++;
417	r++;
418	n--;
419	}
420	return ((BN_ULONG)c);
421	}
422	#endif /* !BN_LLONG */
423
424	BN_ULONG bn_sub_words(BN_ULONG r, const* BN_ULONG a, const* BN_ULONG *b,
425	int n)
426	{
427	BN_ULONG t1, t2;
428	int c = `0`;
429
430	assert(n >= `0`);
431	if (n <= `0`)
432	return ((BN_ULONG)`0`);
433
434	#ifndef OPENSSL_SMALL_FOOTPRINT
435	while (n & ~`3`) {
436	t1 = a[`0`];
437	t2 = b[`0`];
438	r[`0`] = (t1 - t2 - c) & BN_MASK2;
439	if (t1 != t2)
440	c = (t1 < t2);
441	t1 = a[`1`];
442	t2 = b[`1`];
443	r[`1`] = (t1 - t2 - c) & BN_MASK2;
444	if (t1 != t2)
445	c = (t1 < t2);
446	t1 = a[`2`];
447	t2 = b[`2`];
448	r[`2`] = (t1 - t2 - c) & BN_MASK2;
449	if (t1 != t2)
450	c = (t1 < t2);
451	t1 = a[`3`];
452	t2 = b[`3`];
453	r[`3`] = (t1 - t2 - c) & BN_MASK2;
454	if (t1 != t2)
455	c = (t1 < t2);
456	a += `4`;
457	b += `4`;
458	r += `4`;
459	n -= `4`;
460	}
461	#endif
462	while (n) {
463	t1 = a[`0`];
464	t2 = b[`0`];
465	r[`0`] = (t1 - t2 - c) & BN_MASK2;
466	if (t1 != t2)
467	c = (t1 < t2);
468	a++;
469	b++;
470	r++;
471	n--;
472	}
473	return (c);
474	}
475
476	#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
477
478	# undef bn_mul_comba8
479	# undef bn_mul_comba4
480	# undef bn_sqr_comba8
481	# undef bn_sqr_comba4
482
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
490
491	# ifdef BN_LLONG
/*-
 * BN_LLONG variants of the comba accumulator macros. Each forms the
 * double-word product in a BN_ULLONG and folds it into the three-word
 * accumulator (c2,c1,c0); c0, c1 and c2 must be modifiable lvalues.
 *
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of a * b */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i]; `a` is parenthesised as in the other variants */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
529
530	# elif defined(BN_UMULT_LOHI)
/*-
 * BN_UMULT_LOHI variants of the comba accumulator macros. The product is
 * obtained as a (lo,hi) word pair from BN_UMULT_LOHI and folded into the
 * three-word accumulator (c2,c1,c0); c0, c1 and c2 must be modifiable
 * lvalues.
 *
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of (hi,lo) */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
563
564	# elif defined(BN_UMULT_HIGH)
/*-
 * BN_UMULT_HIGH variants of the comba accumulator macros. The low product
 * word comes from a plain multiply and the high word from BN_UMULT_HIGH;
 * both are folded into the three-word accumulator (c2,c1,c0). c0, c1 and c2
 * must be modifiable lvalues.
 *
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of (hi,lo) */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b), tt;        \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; tt = hi + ((c0<lo)?1:0);      \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo = ta * ta;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
597
598	# else /* !BN_LLONG */
/*-
 * Half-word fallback variants of the comba accumulator macros. Operands are
 * split with LBITS/HBITS and multiplied via mul64()/sqr64() from bn_lcl.h,
 * which are expected to leave the product in (lo,hi); the result is folded
 * into the three-word accumulator (c2,c1,c0). c0, c1 and c2 must be
 * modifiable lvalues. Note that `a` and `b` are each expanded twice.
 *
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of (hi,lo) */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG tt;                            \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        tt = hi;                                \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG lo, hi;                        \
        sqr64(lo,hi,(a)[i]);                    \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
632	# endif /* !BN_LLONG */
633
634	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
635	{
636	BN_ULONG c1, c2, c3;
637
638	c1 = `0`;
639	c2 = `0`;
640	c3 = `0`;
641	mul_add_c(a[`0`], b[`0`], c1, c2, c3);
642	r[`0`] = c1;
643	c1 = `0`;
644	mul_add_c(a[`0`], b[`1`], c2, c3, c1);
645	mul_add_c(a[`1`], b[`0`], c2, c3, c1);
646	r[`1`] = c2;
647	c2 = `0`;
648	mul_add_c(a[`2`], b[`0`], c3, c1, c2);
649	mul_add_c(a[`1`], b[`1`], c3, c1, c2);
650	mul_add_c(a[`0`], b[`2`], c3, c1, c2);
651	r[`2`] = c3;
652	c3 = `0`;
653	mul_add_c(a[`0`], b[`3`], c1, c2, c3);
654	mul_add_c(a[`1`], b[`2`], c1, c2, c3);
655	mul_add_c(a[`2`], b[`1`], c1, c2, c3);
656	mul_add_c(a[`3`], b[`0`], c1, c2, c3);
657	r[`3`] = c1;
658	c1 = `0`;
659	mul_add_c(a[`4`], b[`0`], c2, c3, c1);
660	mul_add_c(a[`3`], b[`1`], c2, c3, c1);
661	mul_add_c(a[`2`], b[`2`], c2, c3, c1);
662	mul_add_c(a[`1`], b[`3`], c2, c3, c1);
663	mul_add_c(a[`0`], b[`4`], c2, c3, c1);
664	r[`4`] = c2;
665	c2 = `0`;
666	mul_add_c(a[`0`], b[`5`], c3, c1, c2);
667	mul_add_c(a[`1`], b[`4`], c3, c1, c2);
668	mul_add_c(a[`2`], b[`3`], c3, c1, c2);
669	mul_add_c(a[`3`], b[`2`], c3, c1, c2);
670	mul_add_c(a[`4`], b[`1`], c3, c1, c2);
671	mul_add_c(a[`5`], b[`0`], c3, c1, c2);
672	r[`5`] = c3;
673	c3 = `0`;
674	mul_add_c(a[`6`], b[`0`], c1, c2, c3);
675	mul_add_c(a[`5`], b[`1`], c1, c2, c3);
676	mul_add_c(a[`4`], b[`2`], c1, c2, c3);
677	mul_add_c(a[`3`], b[`3`], c1, c2, c3);
678	mul_add_c(a[`2`], b[`4`], c1, c2, c3);
679	mul_add_c(a[`1`], b[`5`], c1, c2, c3);
680	mul_add_c(a[`0`], b[`6`], c1, c2, c3);
681	r[`6`] = c1;
682	c1 = `0`;
683	mul_add_c(a[`0`], b[`7`], c2, c3, c1);
684	mul_add_c(a[`1`], b[`6`], c2, c3, c1);
685	mul_add_c(a[`2`], b[`5`], c2, c3, c1);
686	mul_add_c(a[`3`], b[`4`], c2, c3, c1);
687	mul_add_c(a[`4`], b[`3`], c2, c3, c1);
688	mul_add_c(a[`5`], b[`2`], c2, c3, c1);
689	mul_add_c(a[`6`], b[`1`], c2, c3, c1);
690	mul_add_c(a[`7`], b[`0`], c2, c3, c1);
691	r[`7`] = c2;
692	c2 = `0`;
693	mul_add_c(a[`7`], b[`1`], c3, c1, c2);
694	mul_add_c(a[`6`], b[`2`], c3, c1, c2);
695	mul_add_c(a[`5`], b[`3`], c3, c1, c2);
696	mul_add_c(a[`4`], b[`4`], c3, c1, c2);
697	mul_add_c(a[`3`], b[`5`], c3, c1, c2);
698	mul_add_c(a[`2`], b[`6`], c3, c1, c2);
699	mul_add_c(a[`1`], b[`7`], c3, c1, c2);
700	r[`8`] = c3;
701	c3 = `0`;
702	mul_add_c(a[`2`], b[`7`], c1, c2, c3);
703	mul_add_c(a[`3`], b[`6`], c1, c2, c3);
704	mul_add_c(a[`4`], b[`5`], c1, c2, c3);
705	mul_add_c(a[`5`], b[`4`], c1, c2, c3);
706	mul_add_c(a[`6`], b[`3`], c1, c2, c3);
707	mul_add_c(a[`7`], b[`2`], c1, c2, c3);
708	r[`9`] = c1;
709	c1 = `0`;
710	mul_add_c(a[`7`], b[`3`], c2, c3, c1);
711	mul_add_c(a[`6`], b[`4`], c2, c3, c1);
712	mul_add_c(a[`5`], b[`5`], c2, c3, c1);
713	mul_add_c(a[`4`], b[`6`], c2, c3, c1);
714	mul_add_c(a[`3`], b[`7`], c2, c3, c1);
715	r[`10`] = c2;
716	c2 = `0`;
717	mul_add_c(a[`4`], b[`7`], c3, c1, c2);
718	mul_add_c(a[`5`], b[`6`], c3, c1, c2);
719	mul_add_c(a[`6`], b[`5`], c3, c1, c2);
720	mul_add_c(a[`7`], b[`4`], c3, c1, c2);
721	r[`11`] = c3;
722	c3 = `0`;
723	mul_add_c(a[`7`], b[`5`], c1, c2, c3);
724	mul_add_c(a[`6`], b[`6`], c1, c2, c3);
725	mul_add_c(a[`5`], b[`7`], c1, c2, c3);
726	r[`12`] = c1;
727	c1 = `0`;
728	mul_add_c(a[`6`], b[`7`], c2, c3, c1);
729	mul_add_c(a[`7`], b[`6`], c2, c3, c1);
730	r[`13`] = c2;
731	c2 = `0`;
732	mul_add_c(a[`7`], b[`7`], c3, c1, c2);
733	r[`14`] = c3;
734	r[`15`] = c1;
735	}
736
737	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
738	{
739	BN_ULONG c1, c2, c3;
740
741	c1 = `0`;
742	c2 = `0`;
743	c3 = `0`;
744	mul_add_c(a[`0`], b[`0`], c1, c2, c3);
745	r[`0`] = c1;
746	c1 = `0`;
747	mul_add_c(a[`0`], b[`1`], c2, c3, c1);
748	mul_add_c(a[`1`], b[`0`], c2, c3, c1);
749	r[`1`] = c2;
750	c2 = `0`;
751	mul_add_c(a[`2`], b[`0`], c3, c1, c2);
752	mul_add_c(a[`1`], b[`1`], c3, c1, c2);
753	mul_add_c(a[`0`], b[`2`], c3, c1, c2);
754	r[`2`] = c3;
755	c3 = `0`;
756	mul_add_c(a[`0`], b[`3`], c1, c2, c3);
757	mul_add_c(a[`1`], b[`2`], c1, c2, c3);
758	mul_add_c(a[`2`], b[`1`], c1, c2, c3);
759	mul_add_c(a[`3`], b[`0`], c1, c2, c3);
760	r[`3`] = c1;
761	c1 = `0`;
762	mul_add_c(a[`3`], b[`1`], c2, c3, c1);
763	mul_add_c(a[`2`], b[`2`], c2, c3, c1);
764	mul_add_c(a[`1`], b[`3`], c2, c3, c1);
765	r[`4`] = c2;
766	c2 = `0`;
767	mul_add_c(a[`2`], b[`3`], c3, c1, c2);
768	mul_add_c(a[`3`], b[`2`], c3, c1, c2);
769	r[`5`] = c3;
770	c3 = `0`;
771	mul_add_c(a[`3`], b[`3`], c1, c2, c3);
772	r[`6`] = c1;
773	r[`7`] = c2;
774	}
775
776	void bn_sqr_comba8(BN_ULONG r, const* BN_ULONG *a)
777	{
778	BN_ULONG c1, c2, c3;
779
780	c1 = `0`;
781	c2 = `0`;
782	c3 = `0`;
783	sqr_add_c(a, `0`, c1, c2, c3);
784	r[`0`] = c1;
785	c1 = `0`;
786	sqr_add_c2(a, `1`, `0`, c2, c3, c1);
787	r[`1`] = c2;
788	c2 = `0`;
789	sqr_add_c(a, `1`, c3, c1, c2);
790	sqr_add_c2(a, `2`, `0`, c3, c1, c2);
791	r[`2`] = c3;
792	c3 = `0`;
793	sqr_add_c2(a, `3`, `0`, c1, c2, c3);
794	sqr_add_c2(a, `2`, `1`, c1, c2, c3);
795	r[`3`] = c1;
796	c1 = `0`;
797	sqr_add_c(a, `2`, c2, c3, c1);
798	sqr_add_c2(a, `3`, `1`, c2, c3, c1);
799	sqr_add_c2(a, `4`, `0`, c2, c3, c1);
800	r[`4`] = c2;
801	c2 = `0`;
802	sqr_add_c2(a, `5`, `0`, c3, c1, c2);
803	sqr_add_c2(a, `4`, `1`, c3, c1, c2);
804	sqr_add_c2(a, `3`, `2`, c3, c1, c2);
805	r[`5`] = c3;
806	c3 = `0`;
807	sqr_add_c(a, `3`, c1, c2, c3);
808	sqr_add_c2(a, `4`, `2`, c1, c2, c3);
809	sqr_add_c2(a, `5`, `1`, c1, c2, c3);
810	sqr_add_c2(a, `6`, `0`, c1, c2, c3);
811	r[`6`] = c1;
812	c1 = `0`;
813	sqr_add_c2(a, `7`, `0`, c2, c3, c1);
814	sqr_add_c2(a, `6`, `1`, c2, c3, c1);
815	sqr_add_c2(a, `5`, `2`, c2, c3, c1);
816	sqr_add_c2(a, `4`, `3`, c2, c3, c1);
817	r[`7`] = c2;
818	c2 = `0`;
819	sqr_add_c(a, `4`, c3, c1, c2);
820	sqr_add_c2(a, `5`, `3`, c3, c1, c2);
821	sqr_add_c2(a, `6`, `2`, c3, c1, c2);
822	sqr_add_c2(a, `7`, `1`, c3, c1, c2);
823	r[`8`] = c3;
824	c3 = `0`;
825	sqr_add_c2(a, `7`, `2`, c1, c2, c3);
826	sqr_add_c2(a, `6`, `3`, c1, c2, c3);
827	sqr_add_c2(a, `5`, `4`, c1, c2, c3);
828	r[`9`] = c1;
829	c1 = `0`;
830	sqr_add_c(a, `5`, c2, c3, c1);
831	sqr_add_c2(a, `6`, `4`, c2, c3, c1);
832	sqr_add_c2(a, `7`, `3`, c2, c3, c1);
833	r[`10`] = c2;
834	c2 = `0`;
835	sqr_add_c2(a, `7`, `4`, c3, c1, c2);
836	sqr_add_c2(a, `6`, `5`, c3, c1, c2);
837	r[`11`] = c3;
838	c3 = `0`;
839	sqr_add_c(a, `6`, c1, c2, c3);
840	sqr_add_c2(a, `7`, `5`, c1, c2, c3);
841	r[`12`] = c1;
842	c1 = `0`;
843	sqr_add_c2(a, `7`, `6`, c2, c3, c1);
844	r[`13`] = c2;
845	c2 = `0`;
846	sqr_add_c(a, `7`, c3, c1, c2);
847	r[`14`] = c3;
848	r[`15`] = c1;
849	}
850
851	void bn_sqr_comba4(BN_ULONG r, const* BN_ULONG *a)
852	{
853	BN_ULONG c1, c2, c3;
854
855	c1 = `0`;
856	c2 = `0`;
857	c3 = `0`;
858	sqr_add_c(a, `0`, c1, c2, c3);
859	r[`0`] = c1;
860	c1 = `0`;
861	sqr_add_c2(a, `1`, `0`, c2, c3, c1);
862	r[`1`] = c2;
863	c2 = `0`;
864	sqr_add_c(a, `1`, c3, c1, c2);
865	sqr_add_c2(a, `2`, `0`, c3, c1, c2);
866	r[`2`] = c3;
867	c3 = `0`;
868	sqr_add_c2(a, `3`, `0`, c1, c2, c3);
869	sqr_add_c2(a, `2`, `1`, c1, c2, c3);
870	r[`3`] = c1;
871	c1 = `0`;
872	sqr_add_c(a, `2`, c2, c3, c1);
873	sqr_add_c2(a, `3`, `1`, c2, c3, c1);
874	r[`4`] = c2;
875	c2 = `0`;
876	sqr_add_c2(a, `3`, `2`, c3, c1, c2);
877	r[`5`] = c3;
878	c3 = `0`;
879	sqr_add_c(a, `3`, c1, c2, c3);
880	r[`6`] = c1;
881	r[`7`] = c2;
882	}
883
884	# ifdef OPENSSL_NO_ASM
885	# ifdef OPENSSL_BN_ASM_MONT
886	# include <alloca.h>
887	/*
888	* This is essentially reference implementation, which may or may not
889	* result in performance improvement. E.g. on IA-32 this routine was
890	* observed to give 40% faster rsa1024 private key operations and 10%
891	* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
892	* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
893	* reference implementation, one to be used as starting point for
894	* platform-specific assembler. Mentioned numbers apply to compiler
895	* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
896	* can vary not only from platform to platform, but even for compiler
897	* versions. Assembler vs. assembler improvement coefficients can
898	* [and are known to] differ and are to be documented elsewhere.
899	*/
900	int bn_mul_mont(BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG *bp,
901	const BN_ULONG np, const* BN_ULONG n0p, int* num)
902	{
903	BN_ULONG c0, c1, ml, *tp, n0;
904	# ifdef mul64
905	BN_ULONG mh;
906	# endif
907	volatile BN_ULONG *vp;
908	int i = `0`, j;
909
910	# if 0 /* template for platform-specific
911	* implementation */
912	if (ap == bp)
913	return bn_sqr_mont(rp, ap, np, n0p, num);
914	# endif
915	vp = tp = alloca((num + `2`) * sizeof(BN_ULONG));
916
917	n0 = *n0p;
918
919	c0 = `0`;
920	ml = bp[`0`];
921	# ifdef mul64
922	mh = HBITS(ml);
923	ml = LBITS(ml);
924	for (j = `0`; j < num; ++j)
925	mul(tp[j], ap[j], ml, mh, c0);
926	# else
927	for (j = `0`; j < num; ++j)
928	mul(tp[j], ap[j], ml, c0);
929	# endif
930
931	tp[num] = c0;
932	tp[num + `1`] = `0`;
933	goto enter;
934
935	for (i = `0`; i < num; i++) {
936	c0 = `0`;
937	ml = bp[i];
938	# ifdef mul64
939	mh = HBITS(ml);
940	ml = LBITS(ml);
941	for (j = `0`; j < num; ++j)
942	mul_add(tp[j], ap[j], ml, mh, c0);
943	# else
944	for (j = `0`; j < num; ++j)
945	mul_add(tp[j], ap[j], ml, c0);
946	# endif
947	c1 = (tp[num] + c0) & BN_MASK2;
948	tp[num] = c1;
949	tp[num + `1`] = (c1 < c0 ? `1` : `0`);
950	enter:
951	c1 = tp[`0`];
952	ml = (c1 * n0) & BN_MASK2;
953	c0 = `0`;
954	# ifdef mul64
955	mh = HBITS(ml);
956	ml = LBITS(ml);
957	mul_add(c1, np[`0`], ml, mh, c0);
958	# else
959	mul_add(c1, ml, np[`0`], c0);
960	# endif
961	for (j = `1`; j < num; j++) {
962	c1 = tp[j];
963	# ifdef mul64
964	mul_add(c1, np[j], ml, mh, c0);
965	# else
966	mul_add(c1, ml, np[j], c0);
967	# endif
968	tp[j - `1`] = c1 & BN_MASK2;
969	}
970	c1 = (tp[num] + c0) & BN_MASK2;
971	tp[num - `1`] = c1;
972	tp[num] = tp[num + `1`] + (c1 < c0 ? `1` : `0`);
973	}
974
975	if (tp[num] != `0` \|\| tp[num - `1`] >= np[num - `1`]) {
976	c0 = bn_sub_words(rp, tp, np, num);
977	if (tp[num] != `0` \|\| c0 == `0`) {
978	for (i = `0`; i < num + `2`; i++)
979	vp[i] = `0`;
980	return `1`;
981	}
982	}
983	for (i = `0`; i < num; i++)
984	rp[i] = tp[i], vp[i] = `0`;
985	vp[num] = `0`;
986	vp[num + `1`] = `0`;
987	return `1`;
988	}
989	# else
990	/*
991	* Return value of 0 indicates that multiplication/convolution was not
992	* performed to signal the caller to fall down to alternative/original
993	* code-path.
994	*/
995	int bn_mul_mont(BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG *bp,
996	const BN_ULONG np, const* BN_ULONG n0, int* num)
997	{
998	return `0`;
999	}
1000	# endif /* OPENSSL_BN_ASM_MONT */
1001	# endif
1002
1003	#else /* !BN_MUL_COMBA */
1004
1005	/ hmm... is it faster just to do a multiply? /
1006	# undef bn_sqr_comba4
1007	void bn_sqr_comba4(BN_ULONG r, const* BN_ULONG *a)
1008	{
1009	BN_ULONG t[`8`];
1010	bn_sqr_normal(r, a, `4`, t);
1011	}
1012
1013	# undef bn_sqr_comba8
1014	void bn_sqr_comba8(BN_ULONG r, const* BN_ULONG *a)
1015	{
1016	BN_ULONG t[`16`];
1017	bn_sqr_normal(r, a, `8`, t);
1018	}
1019
1020	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
1021	{
1022	r[`4`] = bn_mul_words(&(r[`0`]), a, `4`, b[`0`]);
1023	r[`5`] = bn_mul_add_words(&(r[`1`]), a, `4`, b[`1`]);
1024	r[`6`] = bn_mul_add_words(&(r[`2`]), a, `4`, b[`2`]);
1025	r[`7`] = bn_mul_add_words(&(r[`3`]), a, `4`, b[`3`]);
1026	}
1027
1028	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
1029	{
1030	r[`8`] = bn_mul_words(&(r[`0`]), a, `8`, b[`0`]);
1031	r[`9`] = bn_mul_add_words(&(r[`1`]), a, `8`, b[`1`]);
1032	r[`10`] = bn_mul_add_words(&(r[`2`]), a, `8`, b[`2`]);
1033	r[`11`] = bn_mul_add_words(&(r[`3`]), a, `8`, b[`3`]);
1034	r[`12`] = bn_mul_add_words(&(r[`4`]), a, `8`, b[`4`]);
1035	r[`13`] = bn_mul_add_words(&(r[`5`]), a, `8`, b[`5`]);
1036	r[`14`] = bn_mul_add_words(&(r[`6`]), a, `8`, b[`6`]);
1037	r[`15`] = bn_mul_add_words(&(r[`7`]), a, `8`, b[`7`]);
1038	}
1039
1040	# ifdef OPENSSL_NO_ASM
1041	# ifdef OPENSSL_BN_ASM_MONT
1042	# include <alloca.h>
1043	int bn_mul_mont(BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG *bp,
1044	const BN_ULONG np, const* BN_ULONG n0p, int* num)
1045	{
1046	BN_ULONG c0, c1, tp, n0 = n0p;
1047	volatile BN_ULONG *vp;
1048	int i = `0`, j;
1049
1050	vp = tp = alloca((num + `2`) * sizeof(BN_ULONG));
1051
1052	for (i = `0`; i <= num; i++)
1053	tp[i] = `0`;
1054
1055	for (i = `0`; i < num; i++) {
1056	c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1057	c1 = (tp[num] + c0) & BN_MASK2;
1058	tp[num] = c1;
1059	tp[num + `1`] = (c1 < c0 ? `1` : `0`);
1060
1061	c0 = bn_mul_add_words(tp, np, num, tp[`0`] * n0);
1062	c1 = (tp[num] + c0) & BN_MASK2;
1063	tp[num] = c1;
1064	tp[num + `1`] += (c1 < c0 ? `1` : `0`);
1065	for (j = `0`; j <= num; j++)
1066	tp[j] = tp[j + `1`];
1067	}
1068
1069	if (tp[num] != `0` \|\| tp[num - `1`] >= np[num - `1`]) {
1070	c0 = bn_sub_words(rp, tp, np, num);
1071	if (tp[num] != `0` \|\| c0 == `0`) {
1072	for (i = `0`; i < num + `2`; i++)
1073	vp[i] = `0`;
1074	return `1`;
1075	}
1076	}
1077	for (i = `0`; i < num; i++)
1078	rp[i] = tp[i], vp[i] = `0`;
1079	vp[num] = `0`;
1080	vp[num + `1`] = `0`;
1081	return `1`;
1082	}
1083	# else
1084	int bn_mul_mont(BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG *bp,
1085	const BN_ULONG np, const* BN_ULONG n0, int* num)
1086	{
1087	return `0`;
1088	}
1089	# endif /* OPENSSL_BN_ASM_MONT */
1090	# endif
1091
1092	#endif /* !BN_MUL_COMBA */
1093

Browse the source code of Aerospike/modules/s2-geometry-library/geometry/util/math/exactfloat/bn/bn_asm.c