x86_64-gcc.c source code [ClickHouse/contrib/openssl/crypto/bn/asm/x86_64-gcc.c]

/*
 * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
9
10	#include "../bn_local.h"
11	#if !(defined(__GNUC__) && __GNUC__>=2)
12	# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
13	#else
/*-
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the License. Warranty of any kind is disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference.
 *    Win64 implements different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For the reference. IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */
65
# undef mul
# undef mul_add

/*-
 * mul_add(r,a,word,carry): r += a * word + carry.
 *
 * The low word of the sum is stored back into r and the high word is left
 * in carry, ready for the next limb. r must be a memory lvalue, a a memory
 * operand and carry a modifiable BN_ULONG lvalue.
 *
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
 * "g"(0)               let the compiler to decide where does it
 *                      want to keep the value of zero;
 *
 * __asm__ is spelled with underscores so the macro also compiles in strict
 * ISO modes (-std=c99/-std=c11), where plain 'asm' is not a keyword.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        __asm__ ("mulq %3"              \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)
90
/*
 * mul(r,a,word,carry): r = low word of (a * word + carry); carry receives
 * the high word. Unlike mul_add, the previous contents of r are ignored.
 *
 * __asm__ is spelled with underscores so the macro also compiles in strict
 * ISO modes (-std=c99/-std=c11), where plain 'asm' is not a keyword.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        __asm__ ("mulq %3"              \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
/*
 * sqr(r0,r1,a): (r1:r0) = a * a, i.e. r0 receives the low word and r1 the
 * high word of the square.
 *
 * Wrapped in do { } while (0) so that "sqr(...);" is exactly one statement
 * and is safe as the body of an unbraced if/else.
 */
# define sqr(r0,r1,a) do {              \
        __asm__ ("mulq %2"              \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");                \
        } while (0)
109
110	BN_ULONG bn_mul_add_words(BN_ULONG rp, const* BN_ULONG ap, int* num,
111	BN_ULONG w)
112	{
113	BN_ULONG c1 = `0`;
114
115	if (num <= `0`)
116	return c1;
117
118	while (num & ~`3`) {
119	mul_add(rp[`0`], ap[`0`], w, c1);
120	mul_add(rp[`1`], ap[`1`], w, c1);
121	mul_add(rp[`2`], ap[`2`], w, c1);
122	mul_add(rp[`3`], ap[`3`], w, c1);
123	ap += `4`;
124	rp += `4`;
125	num -= `4`;
126	}
127	if (num) {
128	mul_add(rp[`0`], ap[`0`], w, c1);
129	if (--num == `0`)
130	return c1;
131	mul_add(rp[`1`], ap[`1`], w, c1);
132	if (--num == `0`)
133	return c1;
134	mul_add(rp[`2`], ap[`2`], w, c1);
135	return c1;
136	}
137
138	return c1;
139	}
140
141	BN_ULONG bn_mul_words(BN_ULONG rp, const* BN_ULONG ap, int* num, BN_ULONG w)
142	{
143	BN_ULONG c1 = `0`;
144
145	if (num <= `0`)
146	return c1;
147
148	while (num & ~`3`) {
149	mul(rp[`0`], ap[`0`], w, c1);
150	mul(rp[`1`], ap[`1`], w, c1);
151	mul(rp[`2`], ap[`2`], w, c1);
152	mul(rp[`3`], ap[`3`], w, c1);
153	ap += `4`;
154	rp += `4`;
155	num -= `4`;
156	}
157	if (num) {
158	mul(rp[`0`], ap[`0`], w, c1);
159	if (--num == `0`)
160	return c1;
161	mul(rp[`1`], ap[`1`], w, c1);
162	if (--num == `0`)
163	return c1;
164	mul(rp[`2`], ap[`2`], w, c1);
165	}
166	return c1;
167	}
168
169	void bn_sqr_words(BN_ULONG r, const* BN_ULONG a, int* n)
170	{
171	if (n <= `0`)
172	return;
173
174	while (n & ~`3`) {
175	sqr(r[`0`], r[`1`], a[`0`]);
176	sqr(r[`2`], r[`3`], a[`1`]);
177	sqr(r[`4`], r[`5`], a[`2`]);
178	sqr(r[`6`], r[`7`], a[`3`]);
179	a += `4`;
180	r += `8`;
181	n -= `4`;
182	}
183	if (n) {
184	sqr(r[`0`], r[`1`], a[`0`]);
185	if (--n == `0`)
186	return;
187	sqr(r[`2`], r[`3`], a[`1`]);
188	if (--n == `0`)
189	return;
190	sqr(r[`4`], r[`5`], a[`2`]);
191	}
192	}
193
194	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
195	{
196	BN_ULONG ret, waste;
197
198	asm("divq %4":"=a"(ret), "=d"(waste)
199	: "a"(l), "d"(h), "r"(d)
200	: "cc");
201
202	return ret;
203	}
204
205	BN_ULONG bn_add_words(BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG *bp,
206	int n)
207	{
208	BN_ULONG ret;
209	size_t i = `0`;
210
211	if (n <= `0`)
212	return `0`;
213
214	asm volatile (" subq %0,%0 \n" / clear carry /
215	" jmp 1f \n"
216	".p2align 4 \n"
217	"1: movq (%4,%2,8),%0 \n"
218	" adcq (%5,%2,8),%0 \n"
219	" movq %0,(%3,%2,8) \n"
220	" lea 1(%2),%2 \n"
221	" dec %1 \n"
222	" jnz 1b \n"
223	" sbbq %0,%0 \n"
224	:"=&r" (ret), "+c"(n), "+r"(i)
225	:"r"(rp), "r"(ap), "r"(bp)
226	:"cc", "memory");
227
228	return ret & `1`;
229	}
230
231	# ifndef SIMICS
232	BN_ULONG bn_sub_words(BN_ULONG rp, const* BN_ULONG ap, const* BN_ULONG *bp,
233	int n)
234	{
235	BN_ULONG ret;
236	size_t i = `0`;
237
238	if (n <= `0`)
239	return `0`;
240
241	asm volatile (" subq %0,%0 \n" / clear borrow /
242	" jmp 1f \n"
243	".p2align 4 \n"
244	"1: movq (%4,%2,8),%0 \n"
245	" sbbq (%5,%2,8),%0 \n"
246	" movq %0,(%3,%2,8) \n"
247	" lea 1(%2),%2 \n"
248	" dec %1 \n"
249	" jnz 1b \n"
250	" sbbq %0,%0 \n"
251	:"=&r" (ret), "+c"(n), "+r"(i)
252	:"r"(rp), "r"(ap), "r"(bp)
253	:"cc", "memory");
254
255	return ret & `1`;
256	}
257	# else
258	/ Simics 1.4<7 has buggy sbbq:-( /
259	# define BN_MASK2 0xffffffffffffffffL
260	BN_ULONG bn_sub_words(BN_ULONG r, BN_ULONG a, BN_ULONG b, int* n)
261	{
262	BN_ULONG t1, t2;
263	int c = `0`;
264
265	if (n <= `0`)
266	return (BN_ULONG)`0`;
267
268	for (;;) {
269	t1 = a[`0`];
270	t2 = b[`0`];
271	r[`0`] = (t1 - t2 - c) & BN_MASK2;
272	if (t1 != t2)
273	c = (t1 < t2);
274	if (--n <= `0`)
275	break;
276
277	t1 = a[`1`];
278	t2 = b[`1`];
279	r[`1`] = (t1 - t2 - c) & BN_MASK2;
280	if (t1 != t2)
281	c = (t1 < t2);
282	if (--n <= `0`)
283	break;
284
285	t1 = a[`2`];
286	t2 = b[`2`];
287	r[`2`] = (t1 - t2 - c) & BN_MASK2;
288	if (t1 != t2)
289	c = (t1 < t2);
290	if (--n <= `0`)
291	break;
292
293	t1 = a[`3`];
294	t2 = b[`3`];
295	r[`3`] = (t1 - t2 - c) & BN_MASK2;
296	if (t1 != t2)
297	c = (t1 < t2);
298	if (--n <= `0`)
299	break;
300
301	a += `4`;
302	b += `4`;
303	r += `4`;
304	}
305	return c;
306	}
307	# endif
308
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# if 0
/* original macros are kept for reference purposes */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
# else
/*
 * Inline assembly versions. Each one forms a double-word product with a
 * single MULQ and then adds it into the three-word accumulator (c2,c1,c0)
 * with an ADDQ/ADCQ/ADCQ chain. c0, c1 and c2 must be modifiable lvalues;
 * b must be a memory operand.
 *
 * __asm__ is spelled with underscores so the macros also compile in strict
 * ISO modes (-std=c99/-std=c11), where plain 'asm' is not a keyword.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %3"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        } while (0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %2"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        } while (0)

/* (c2,c1,c0) += 2 * a * b: one multiplication, product added in twice */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %3"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        } while (0)
# endif
389
/* sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2 * a[i] * a[j] */
# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
392
393	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
394	{
395	BN_ULONG c1, c2, c3;
396
397	c1 = `0`;
398	c2 = `0`;
399	c3 = `0`;
400	mul_add_c(a[`0`], b[`0`], c1, c2, c3);
401	r[`0`] = c1;
402	c1 = `0`;
403	mul_add_c(a[`0`], b[`1`], c2, c3, c1);
404	mul_add_c(a[`1`], b[`0`], c2, c3, c1);
405	r[`1`] = c2;
406	c2 = `0`;
407	mul_add_c(a[`2`], b[`0`], c3, c1, c2);
408	mul_add_c(a[`1`], b[`1`], c3, c1, c2);
409	mul_add_c(a[`0`], b[`2`], c3, c1, c2);
410	r[`2`] = c3;
411	c3 = `0`;
412	mul_add_c(a[`0`], b[`3`], c1, c2, c3);
413	mul_add_c(a[`1`], b[`2`], c1, c2, c3);
414	mul_add_c(a[`2`], b[`1`], c1, c2, c3);
415	mul_add_c(a[`3`], b[`0`], c1, c2, c3);
416	r[`3`] = c1;
417	c1 = `0`;
418	mul_add_c(a[`4`], b[`0`], c2, c3, c1);
419	mul_add_c(a[`3`], b[`1`], c2, c3, c1);
420	mul_add_c(a[`2`], b[`2`], c2, c3, c1);
421	mul_add_c(a[`1`], b[`3`], c2, c3, c1);
422	mul_add_c(a[`0`], b[`4`], c2, c3, c1);
423	r[`4`] = c2;
424	c2 = `0`;
425	mul_add_c(a[`0`], b[`5`], c3, c1, c2);
426	mul_add_c(a[`1`], b[`4`], c3, c1, c2);
427	mul_add_c(a[`2`], b[`3`], c3, c1, c2);
428	mul_add_c(a[`3`], b[`2`], c3, c1, c2);
429	mul_add_c(a[`4`], b[`1`], c3, c1, c2);
430	mul_add_c(a[`5`], b[`0`], c3, c1, c2);
431	r[`5`] = c3;
432	c3 = `0`;
433	mul_add_c(a[`6`], b[`0`], c1, c2, c3);
434	mul_add_c(a[`5`], b[`1`], c1, c2, c3);
435	mul_add_c(a[`4`], b[`2`], c1, c2, c3);
436	mul_add_c(a[`3`], b[`3`], c1, c2, c3);
437	mul_add_c(a[`2`], b[`4`], c1, c2, c3);
438	mul_add_c(a[`1`], b[`5`], c1, c2, c3);
439	mul_add_c(a[`0`], b[`6`], c1, c2, c3);
440	r[`6`] = c1;
441	c1 = `0`;
442	mul_add_c(a[`0`], b[`7`], c2, c3, c1);
443	mul_add_c(a[`1`], b[`6`], c2, c3, c1);
444	mul_add_c(a[`2`], b[`5`], c2, c3, c1);
445	mul_add_c(a[`3`], b[`4`], c2, c3, c1);
446	mul_add_c(a[`4`], b[`3`], c2, c3, c1);
447	mul_add_c(a[`5`], b[`2`], c2, c3, c1);
448	mul_add_c(a[`6`], b[`1`], c2, c3, c1);
449	mul_add_c(a[`7`], b[`0`], c2, c3, c1);
450	r[`7`] = c2;
451	c2 = `0`;
452	mul_add_c(a[`7`], b[`1`], c3, c1, c2);
453	mul_add_c(a[`6`], b[`2`], c3, c1, c2);
454	mul_add_c(a[`5`], b[`3`], c3, c1, c2);
455	mul_add_c(a[`4`], b[`4`], c3, c1, c2);
456	mul_add_c(a[`3`], b[`5`], c3, c1, c2);
457	mul_add_c(a[`2`], b[`6`], c3, c1, c2);
458	mul_add_c(a[`1`], b[`7`], c3, c1, c2);
459	r[`8`] = c3;
460	c3 = `0`;
461	mul_add_c(a[`2`], b[`7`], c1, c2, c3);
462	mul_add_c(a[`3`], b[`6`], c1, c2, c3);
463	mul_add_c(a[`4`], b[`5`], c1, c2, c3);
464	mul_add_c(a[`5`], b[`4`], c1, c2, c3);
465	mul_add_c(a[`6`], b[`3`], c1, c2, c3);
466	mul_add_c(a[`7`], b[`2`], c1, c2, c3);
467	r[`9`] = c1;
468	c1 = `0`;
469	mul_add_c(a[`7`], b[`3`], c2, c3, c1);
470	mul_add_c(a[`6`], b[`4`], c2, c3, c1);
471	mul_add_c(a[`5`], b[`5`], c2, c3, c1);
472	mul_add_c(a[`4`], b[`6`], c2, c3, c1);
473	mul_add_c(a[`3`], b[`7`], c2, c3, c1);
474	r[`10`] = c2;
475	c2 = `0`;
476	mul_add_c(a[`4`], b[`7`], c3, c1, c2);
477	mul_add_c(a[`5`], b[`6`], c3, c1, c2);
478	mul_add_c(a[`6`], b[`5`], c3, c1, c2);
479	mul_add_c(a[`7`], b[`4`], c3, c1, c2);
480	r[`11`] = c3;
481	c3 = `0`;
482	mul_add_c(a[`7`], b[`5`], c1, c2, c3);
483	mul_add_c(a[`6`], b[`6`], c1, c2, c3);
484	mul_add_c(a[`5`], b[`7`], c1, c2, c3);
485	r[`12`] = c1;
486	c1 = `0`;
487	mul_add_c(a[`6`], b[`7`], c2, c3, c1);
488	mul_add_c(a[`7`], b[`6`], c2, c3, c1);
489	r[`13`] = c2;
490	c2 = `0`;
491	mul_add_c(a[`7`], b[`7`], c3, c1, c2);
492	r[`14`] = c3;
493	r[`15`] = c1;
494	}
495
496	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
497	{
498	BN_ULONG c1, c2, c3;
499
500	c1 = `0`;
501	c2 = `0`;
502	c3 = `0`;
503	mul_add_c(a[`0`], b[`0`], c1, c2, c3);
504	r[`0`] = c1;
505	c1 = `0`;
506	mul_add_c(a[`0`], b[`1`], c2, c3, c1);
507	mul_add_c(a[`1`], b[`0`], c2, c3, c1);
508	r[`1`] = c2;
509	c2 = `0`;
510	mul_add_c(a[`2`], b[`0`], c3, c1, c2);
511	mul_add_c(a[`1`], b[`1`], c3, c1, c2);
512	mul_add_c(a[`0`], b[`2`], c3, c1, c2);
513	r[`2`] = c3;
514	c3 = `0`;
515	mul_add_c(a[`0`], b[`3`], c1, c2, c3);
516	mul_add_c(a[`1`], b[`2`], c1, c2, c3);
517	mul_add_c(a[`2`], b[`1`], c1, c2, c3);
518	mul_add_c(a[`3`], b[`0`], c1, c2, c3);
519	r[`3`] = c1;
520	c1 = `0`;
521	mul_add_c(a[`3`], b[`1`], c2, c3, c1);
522	mul_add_c(a[`2`], b[`2`], c2, c3, c1);
523	mul_add_c(a[`1`], b[`3`], c2, c3, c1);
524	r[`4`] = c2;
525	c2 = `0`;
526	mul_add_c(a[`2`], b[`3`], c3, c1, c2);
527	mul_add_c(a[`3`], b[`2`], c3, c1, c2);
528	r[`5`] = c3;
529	c3 = `0`;
530	mul_add_c(a[`3`], b[`3`], c1, c2, c3);
531	r[`6`] = c1;
532	r[`7`] = c2;
533	}
534
535	void bn_sqr_comba8(BN_ULONG r, const* BN_ULONG *a)
536	{
537	BN_ULONG c1, c2, c3;
538
539	c1 = `0`;
540	c2 = `0`;
541	c3 = `0`;
542	sqr_add_c(a, `0`, c1, c2, c3);
543	r[`0`] = c1;
544	c1 = `0`;
545	sqr_add_c2(a, `1`, `0`, c2, c3, c1);
546	r[`1`] = c2;
547	c2 = `0`;
548	sqr_add_c(a, `1`, c3, c1, c2);
549	sqr_add_c2(a, `2`, `0`, c3, c1, c2);
550	r[`2`] = c3;
551	c3 = `0`;
552	sqr_add_c2(a, `3`, `0`, c1, c2, c3);
553	sqr_add_c2(a, `2`, `1`, c1, c2, c3);
554	r[`3`] = c1;
555	c1 = `0`;
556	sqr_add_c(a, `2`, c2, c3, c1);
557	sqr_add_c2(a, `3`, `1`, c2, c3, c1);
558	sqr_add_c2(a, `4`, `0`, c2, c3, c1);
559	r[`4`] = c2;
560	c2 = `0`;
561	sqr_add_c2(a, `5`, `0`, c3, c1, c2);
562	sqr_add_c2(a, `4`, `1`, c3, c1, c2);
563	sqr_add_c2(a, `3`, `2`, c3, c1, c2);
564	r[`5`] = c3;
565	c3 = `0`;
566	sqr_add_c(a, `3`, c1, c2, c3);
567	sqr_add_c2(a, `4`, `2`, c1, c2, c3);
568	sqr_add_c2(a, `5`, `1`, c1, c2, c3);
569	sqr_add_c2(a, `6`, `0`, c1, c2, c3);
570	r[`6`] = c1;
571	c1 = `0`;
572	sqr_add_c2(a, `7`, `0`, c2, c3, c1);
573	sqr_add_c2(a, `6`, `1`, c2, c3, c1);
574	sqr_add_c2(a, `5`, `2`, c2, c3, c1);
575	sqr_add_c2(a, `4`, `3`, c2, c3, c1);
576	r[`7`] = c2;
577	c2 = `0`;
578	sqr_add_c(a, `4`, c3, c1, c2);
579	sqr_add_c2(a, `5`, `3`, c3, c1, c2);
580	sqr_add_c2(a, `6`, `2`, c3, c1, c2);
581	sqr_add_c2(a, `7`, `1`, c3, c1, c2);
582	r[`8`] = c3;
583	c3 = `0`;
584	sqr_add_c2(a, `7`, `2`, c1, c2, c3);
585	sqr_add_c2(a, `6`, `3`, c1, c2, c3);
586	sqr_add_c2(a, `5`, `4`, c1, c2, c3);
587	r[`9`] = c1;
588	c1 = `0`;
589	sqr_add_c(a, `5`, c2, c3, c1);
590	sqr_add_c2(a, `6`, `4`, c2, c3, c1);
591	sqr_add_c2(a, `7`, `3`, c2, c3, c1);
592	r[`10`] = c2;
593	c2 = `0`;
594	sqr_add_c2(a, `7`, `4`, c3, c1, c2);
595	sqr_add_c2(a, `6`, `5`, c3, c1, c2);
596	r[`11`] = c3;
597	c3 = `0`;
598	sqr_add_c(a, `6`, c1, c2, c3);
599	sqr_add_c2(a, `7`, `5`, c1, c2, c3);
600	r[`12`] = c1;
601	c1 = `0`;
602	sqr_add_c2(a, `7`, `6`, c2, c3, c1);
603	r[`13`] = c2;
604	c2 = `0`;
605	sqr_add_c(a, `7`, c3, c1, c2);
606	r[`14`] = c3;
607	r[`15`] = c1;
608	}
609
610	void bn_sqr_comba4(BN_ULONG r, const* BN_ULONG *a)
611	{
612	BN_ULONG c1, c2, c3;
613
614	c1 = `0`;
615	c2 = `0`;
616	c3 = `0`;
617	sqr_add_c(a, `0`, c1, c2, c3);
618	r[`0`] = c1;
619	c1 = `0`;
620	sqr_add_c2(a, `1`, `0`, c2, c3, c1);
621	r[`1`] = c2;
622	c2 = `0`;
623	sqr_add_c(a, `1`, c3, c1, c2);
624	sqr_add_c2(a, `2`, `0`, c3, c1, c2);
625	r[`2`] = c3;
626	c3 = `0`;
627	sqr_add_c2(a, `3`, `0`, c1, c2, c3);
628	sqr_add_c2(a, `2`, `1`, c1, c2, c3);
629	r[`3`] = c1;
630	c1 = `0`;
631	sqr_add_c(a, `2`, c2, c3, c1);
632	sqr_add_c2(a, `3`, `1`, c2, c3, c1);
633	r[`4`] = c2;
634	c2 = `0`;
635	sqr_add_c2(a, `3`, `2`, c3, c1, c2);
636	r[`5`] = c3;
637	c3 = `0`;
638	sqr_add_c(a, `3`, c1, c2, c3);
639	r[`6`] = c1;
640	r[`7`] = c2;
641	}
642	#endif
643

Browse the source code of ClickHouse/contrib/openssl/crypto/bn/asm/x86_64-gcc.c