ecp_nistp521.c source code [ClickHouse/contrib/openssl/crypto/ec/ecp_nistp521.c]

1	/*
2	* Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the Apache License 2.0 (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	/* Copyright 2011 Google Inc.
11	*
12	* Licensed under the Apache License, Version 2.0 (the "License");
13	*
14	* you may not use this file except in compliance with the License.
15	* You may obtain a copy of the License at
16	*
17	* http://www.apache.org/licenses/LICENSE-2.0
18	*
19	* Unless required by applicable law or agreed to in writing, software
20	* distributed under the License is distributed on an "AS IS" BASIS,
21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22	* See the License for the specific language governing permissions and
23	* limitations under the License.
24	*/
25
26	/*
27	* A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
28	*
29	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31	* work which got its smarts from Daniel J. Bernstein's work on the same.
32	*/
33
34	#include <openssl/e_os2.h>
35	#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36	NON_EMPTY_TRANSLATION_UNIT
37	#else
38
39	# include <string.h>
40	# include <openssl/err.h>
41	# include "ec_local.h"
42
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  /* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Your compiler doesn't appear to support 128-bit integer types"
# endif
50
typedef uint8_t u8;
typedef uint64_t u64;

/*
 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
 * element of this field into 66 bytes where the most significant byte
 * contains only a single bit. We call this an felem_bytearray.
 */

typedef u8 felem_bytearray[66];

/*
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian. The rows are, in order: p, a, b, and the
 * affine x and y coordinates of the base point.
 */
static const felem_bytearray nistp521_curve_params[5] = {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
};
113
114	/-*
115	* The representation of field elements.
116	* ------------------------------------
117	*
118	* We represent field elements with nine values. These values are either 64 or
119	* 128 bits and the field element represented is:
120	* v[0]2^0 + v[1]2^58 + v[2]2^116 + ... + v[8]2^464 (mod p)
121	* Each of the nine values is called a 'limb'. Since the limbs are spaced only
122	* 58 bits apart, but are greater than 58 bits in length, the most significant
123	* bits of each limb overlap with the least significant bits of the next.
124	*
125	* A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126	* 'largefelem' */
127
128	# define NLIMBS 9
129
130	typedef uint64_t limb;
131	typedef limb felem[NLIMBS];
132	typedef uint128_t largefelem[NLIMBS];
133
134	static const limb bottom57bits = `0x1ffffffffffffff`;
135	static const limb bottom58bits = `0x3ffffffffffffff`;
136
137	/*
138	* bin66_to_felem takes a little-endian byte array and converts it into felem
139	* form. This assumes that the CPU is little-endian.
140	*/
141	static void bin66_to_felem(felem out, const u8 in[`66`])
142	{
143	out[`0`] = (((limb ) & in[`0`])) & bottom58bits;
144	out[`1`] = (((limb ) & in[`7`]) >> `2`) & bottom58bits;
145	out[`2`] = (((limb ) & in[`14`]) >> `4`) & bottom58bits;
146	out[`3`] = (((limb ) & in[`21`]) >> `6`) & bottom58bits;
147	out[`4`] = (((limb ) & in[`29`])) & bottom58bits;
148	out[`5`] = (((limb ) & in[`36`]) >> `2`) & bottom58bits;
149	out[`6`] = (((limb ) & in[`43`]) >> `4`) & bottom58bits;
150	out[`7`] = (((limb ) & in[`50`]) >> `6`) & bottom58bits;
151	out[`8`] = (((limb ) & in[`58`])) & bottom57bits;
152	}
153
154	/*
155	* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156	* array. This assumes that the CPU is little-endian.
157	*/
158	static void felem_to_bin66(u8 out[`66`], const felem in)
159	{
160	memset(out, `0`, `66`);
161	(((limb ) & out[`0`])) = in[`0`];
162	(((limb ) & out[`7`])) \|= in[`1`] << `2`;
163	(((limb ) & out[`14`])) \|= in[`2`] << `4`;
164	(((limb ) & out[`21`])) \|= in[`3`] << `6`;
165	(((limb ) & out[`29`])) = in[`4`];
166	(((limb ) & out[`36`])) \|= in[`5`] << `2`;
167	(((limb ) & out[`43`])) \|= in[`6`] << `4`;
168	(((limb ) & out[`50`])) \|= in[`7`] << `6`;
169	(((limb ) & out[`58`])) = in[`8`];
170	}
171
172	/ BN_to_felem converts an OpenSSL BIGNUM into an felem /
173	static int BN_to_felem(felem out, const BIGNUM *bn)
174	{
175	felem_bytearray b_out;
176	int num_bytes;
177
178	if (BN_is_negative(bn)) {
179	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180	return `0`;
181	}
182	num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
183	if (num_bytes < `0`) {
184	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185	return `0`;
186	}
187	bin66_to_felem(out, b_out);
188	return `1`;
189	}
190
191	/ felem_to_BN converts an felem into an OpenSSL BIGNUM /
192	static BIGNUM felem_to_BN(BIGNUM out, const felem in)
193	{
194	felem_bytearray b_out;
195	felem_to_bin66(b_out, in);
196	return BN_lebin2bn(b_out, sizeof(b_out), out);
197	}
198
199	/-*
200	* Field operations
201	* ----------------
202	*/
203
204	static void felem_one(felem out)
205	{
206	out[`0`] = `1`;
207	out[`1`] = `0`;
208	out[`2`] = `0`;
209	out[`3`] = `0`;
210	out[`4`] = `0`;
211	out[`5`] = `0`;
212	out[`6`] = `0`;
213	out[`7`] = `0`;
214	out[`8`] = `0`;
215	}
216
217	static void felem_assign(felem out, const felem in)
218	{
219	out[`0`] = in[`0`];
220	out[`1`] = in[`1`];
221	out[`2`] = in[`2`];
222	out[`3`] = in[`3`];
223	out[`4`] = in[`4`];
224	out[`5`] = in[`5`];
225	out[`6`] = in[`6`];
226	out[`7`] = in[`7`];
227	out[`8`] = in[`8`];
228	}
229
230	/ felem_sum64 sets out = out + in. /
231	static void felem_sum64(felem out, const felem in)
232	{
233	out[`0`] += in[`0`];
234	out[`1`] += in[`1`];
235	out[`2`] += in[`2`];
236	out[`3`] += in[`3`];
237	out[`4`] += in[`4`];
238	out[`5`] += in[`5`];
239	out[`6`] += in[`6`];
240	out[`7`] += in[`7`];
241	out[`8`] += in[`8`];
242	}
243
244	/ felem_scalar sets out = in * scalar /
245	static void felem_scalar(felem out, const felem in, limb scalar)
246	{
247	out[`0`] = in[`0`] * scalar;
248	out[`1`] = in[`1`] * scalar;
249	out[`2`] = in[`2`] * scalar;
250	out[`3`] = in[`3`] * scalar;
251	out[`4`] = in[`4`] * scalar;
252	out[`5`] = in[`5`] * scalar;
253	out[`6`] = in[`6`] * scalar;
254	out[`7`] = in[`7`] * scalar;
255	out[`8`] = in[`8`] * scalar;
256	}
257
258	/ felem_scalar64 sets out = out * scalar /
259	static void felem_scalar64(felem out, limb scalar)
260	{
261	out[`0`] *= scalar;
262	out[`1`] *= scalar;
263	out[`2`] *= scalar;
264	out[`3`] *= scalar;
265	out[`4`] *= scalar;
266	out[`5`] *= scalar;
267	out[`6`] *= scalar;
268	out[`7`] *= scalar;
269	out[`8`] *= scalar;
270	}
271
272	/ felem_scalar128 sets out = out * scalar /
273	static void felem_scalar128(largefelem out, limb scalar)
274	{
275	out[`0`] *= scalar;
276	out[`1`] *= scalar;
277	out[`2`] *= scalar;
278	out[`3`] *= scalar;
279	out[`4`] *= scalar;
280	out[`5`] *= scalar;
281	out[`6`] *= scalar;
282	out[`7`] *= scalar;
283	out[`8`] *= scalar;
284	}
285
286	/-*
287	* felem_neg sets \|out\| to \|-in\|
288	* On entry:
289	* in[i] < 2^59 + 2^14
290	* On exit:
291	* out[i] < 2^62
292	*/
293	static void felem_neg(felem out, const felem in)
294	{
295	/ In order to prevent underflow, we subtract from 0 mod p. /
296	static const limb two62m3 = (((limb) `1`) << `62`) - (((limb) `1`) << `5`);
297	static const limb two62m2 = (((limb) `1`) << `62`) - (((limb) `1`) << `4`);
298
299	out[`0`] = two62m3 - in[`0`];
300	out[`1`] = two62m2 - in[`1`];
301	out[`2`] = two62m2 - in[`2`];
302	out[`3`] = two62m2 - in[`3`];
303	out[`4`] = two62m2 - in[`4`];
304	out[`5`] = two62m2 - in[`5`];
305	out[`6`] = two62m2 - in[`6`];
306	out[`7`] = two62m2 - in[`7`];
307	out[`8`] = two62m2 - in[`8`];
308	}
309
310	/-*
311	* felem_diff64 subtracts \|in\| from \|out\|
312	* On entry:
313	* in[i] < 2^59 + 2^14
314	* On exit:
315	* out[i] < out[i] + 2^62
316	*/
317	static void felem_diff64(felem out, const felem in)
318	{
319	/*
320	* In order to prevent underflow, we add 0 mod p before subtracting.
321	*/
322	static const limb two62m3 = (((limb) `1`) << `62`) - (((limb) `1`) << `5`);
323	static const limb two62m2 = (((limb) `1`) << `62`) - (((limb) `1`) << `4`);
324
325	out[`0`] += two62m3 - in[`0`];
326	out[`1`] += two62m2 - in[`1`];
327	out[`2`] += two62m2 - in[`2`];
328	out[`3`] += two62m2 - in[`3`];
329	out[`4`] += two62m2 - in[`4`];
330	out[`5`] += two62m2 - in[`5`];
331	out[`6`] += two62m2 - in[`6`];
332	out[`7`] += two62m2 - in[`7`];
333	out[`8`] += two62m2 - in[`8`];
334	}
335
336	/-*
337	* felem_diff_128_64 subtracts \|in\| from \|out\|
338	* On entry:
339	* in[i] < 2^62 + 2^17
340	* On exit:
341	* out[i] < out[i] + 2^63
342	*/
343	static void felem_diff_128_64(largefelem out, const felem in)
344	{
345	/*
346	* In order to prevent underflow, we add 64p mod p (which is equivalent
347	* to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348	* digit number with all bits set to 1. See "The representation of field
349	* elements" comment above for a description of how limbs are used to
350	* represent a number. 64p is represented with 8 limbs containing a number
351	* with 58 bits set and one limb with a number with 57 bits set.
352	*/
353	static const limb two63m6 = (((limb) `1`) << `63`) - (((limb) `1`) << `6`);
354	static const limb two63m5 = (((limb) `1`) << `63`) - (((limb) `1`) << `5`);
355
356	out[`0`] += two63m6 - in[`0`];
357	out[`1`] += two63m5 - in[`1`];
358	out[`2`] += two63m5 - in[`2`];
359	out[`3`] += two63m5 - in[`3`];
360	out[`4`] += two63m5 - in[`4`];
361	out[`5`] += two63m5 - in[`5`];
362	out[`6`] += two63m5 - in[`6`];
363	out[`7`] += two63m5 - in[`7`];
364	out[`8`] += two63m5 - in[`8`];
365	}
366
367	/-*
368	* felem_diff_128_64 subtracts \|in\| from \|out\|
369	* On entry:
370	* in[i] < 2^126
371	* On exit:
372	* out[i] < out[i] + 2^127 - 2^69
373	*/
374	static void felem_diff128(largefelem out, const largefelem in)
375	{
376	/*
377	* In order to prevent underflow, we add 0 mod p before subtracting.
378	*/
379	static const uint128_t two127m70 =
380	(((uint128_t) `1`) << `127`) - (((uint128_t) `1`) << `70`);
381	static const uint128_t two127m69 =
382	(((uint128_t) `1`) << `127`) - (((uint128_t) `1`) << `69`);
383
384	out[`0`] += (two127m70 - in[`0`]);
385	out[`1`] += (two127m69 - in[`1`]);
386	out[`2`] += (two127m69 - in[`2`]);
387	out[`3`] += (two127m69 - in[`3`]);
388	out[`4`] += (two127m69 - in[`4`]);
389	out[`5`] += (two127m69 - in[`5`]);
390	out[`6`] += (two127m69 - in[`6`]);
391	out[`7`] += (two127m69 - in[`7`]);
392	out[`8`] += (two127m69 - in[`8`]);
393	}
394
395	/-*
396	* felem_square sets \|out\| = \|in\|^2
397	* On entry:
398	* in[i] < 2^62
399	* On exit:
400	* out[i] < 17 * max(in[i]) * max(in[i])
401	*/
402	static void felem_square(largefelem out, const felem in)
403	{
404	felem inx2, inx4;
405	felem_scalar(inx2, in, `2`);
406	felem_scalar(inx4, in, `4`);
407
408	/-*
409	* We have many cases were we want to do
410	* in[x] * in[y] +
411	* in[y] * in[x]
412	* This is obviously just
413	* 2 * in[x] * in[y]
414	* However, rather than do the doubling on the 128 bit result, we
415	* double one of the inputs to the multiplication by reading from
416	* \|inx2\|
417	*/
418
419	out[`0`] = ((uint128_t) in[`0`]) * in[`0`];
420	out[`1`] = ((uint128_t) in[`0`]) * inx2[`1`];
421	out[`2`] = ((uint128_t) in[`0`]) * inx2[`2`] + ((uint128_t) in[`1`]) * in[`1`];
422	out[`3`] = ((uint128_t) in[`0`]) * inx2[`3`] + ((uint128_t) in[`1`]) * inx2[`2`];
423	out[`4`] = ((uint128_t) in[`0`]) * inx2[`4`] +
424	((uint128_t) in[`1`]) * inx2[`3`] + ((uint128_t) in[`2`]) * in[`2`];
425	out[`5`] = ((uint128_t) in[`0`]) * inx2[`5`] +
426	((uint128_t) in[`1`]) * inx2[`4`] + ((uint128_t) in[`2`]) * inx2[`3`];
427	out[`6`] = ((uint128_t) in[`0`]) * inx2[`6`] +
428	((uint128_t) in[`1`]) * inx2[`5`] +
429	((uint128_t) in[`2`]) * inx2[`4`] + ((uint128_t) in[`3`]) * in[`3`];
430	out[`7`] = ((uint128_t) in[`0`]) * inx2[`7`] +
431	((uint128_t) in[`1`]) * inx2[`6`] +
432	((uint128_t) in[`2`]) * inx2[`5`] + ((uint128_t) in[`3`]) * inx2[`4`];
433	out[`8`] = ((uint128_t) in[`0`]) * inx2[`8`] +
434	((uint128_t) in[`1`]) * inx2[`7`] +
435	((uint128_t) in[`2`]) * inx2[`6`] +
436	((uint128_t) in[`3`]) * inx2[`5`] + ((uint128_t) in[`4`]) * in[`4`];
437
438	/*
439	* The remaining limbs fall above 2^521, with the first falling at 2^522.
440	* They correspond to locations one bit up from the limbs produced above
441	* so we would have to multiply by two to align them. Again, rather than
442	* operate on the 128-bit result, we double one of the inputs to the
443	* multiplication. If we want to double for both this reason, and the
444	* reason above, then we end up multiplying by four.
445	*/
446
447	/ 9 /
448	out[`0`] += ((uint128_t) in[`1`]) * inx4[`8`] +
449	((uint128_t) in[`2`]) * inx4[`7`] +
450	((uint128_t) in[`3`]) * inx4[`6`] + ((uint128_t) in[`4`]) * inx4[`5`];
451
452	/ 10 /
453	out[`1`] += ((uint128_t) in[`2`]) * inx4[`8`] +
454	((uint128_t) in[`3`]) * inx4[`7`] +
455	((uint128_t) in[`4`]) * inx4[`6`] + ((uint128_t) in[`5`]) * inx2[`5`];
456
457	/ 11 /
458	out[`2`] += ((uint128_t) in[`3`]) * inx4[`8`] +
459	((uint128_t) in[`4`]) * inx4[`7`] + ((uint128_t) in[`5`]) * inx4[`6`];
460
461	/ 12 /
462	out[`3`] += ((uint128_t) in[`4`]) * inx4[`8`] +
463	((uint128_t) in[`5`]) * inx4[`7`] + ((uint128_t) in[`6`]) * inx2[`6`];
464
465	/ 13 /
466	out[`4`] += ((uint128_t) in[`5`]) * inx4[`8`] + ((uint128_t) in[`6`]) * inx4[`7`];
467
468	/ 14 /
469	out[`5`] += ((uint128_t) in[`6`]) * inx4[`8`] + ((uint128_t) in[`7`]) * inx2[`7`];
470
471	/ 15 /
472	out[`6`] += ((uint128_t) in[`7`]) * inx4[`8`];
473
474	/ 16 /
475	out[`7`] += ((uint128_t) in[`8`]) * inx2[`8`];
476	}
477
478	/-*
479	* felem_mul sets \|out\| = \|in1\| * \|in2\|
480	* On entry:
481	* in1[i] < 2^64
482	* in2[i] < 2^63
483	* On exit:
484	* out[i] < 17 * max(in1[i]) * max(in2[i])
485	*/
486	static void felem_mul(largefelem out, const felem in1, const felem in2)
487	{
488	felem in2x2;
489	felem_scalar(in2x2, in2, `2`);
490
491	out[`0`] = ((uint128_t) in1[`0`]) * in2[`0`];
492
493	out[`1`] = ((uint128_t) in1[`0`]) * in2[`1`] +
494	((uint128_t) in1[`1`]) * in2[`0`];
495
496	out[`2`] = ((uint128_t) in1[`0`]) * in2[`2`] +
497	((uint128_t) in1[`1`]) * in2[`1`] +
498	((uint128_t) in1[`2`]) * in2[`0`];
499
500	out[`3`] = ((uint128_t) in1[`0`]) * in2[`3`] +
501	((uint128_t) in1[`1`]) * in2[`2`] +
502	((uint128_t) in1[`2`]) * in2[`1`] +
503	((uint128_t) in1[`3`]) * in2[`0`];
504
505	out[`4`] = ((uint128_t) in1[`0`]) * in2[`4`] +
506	((uint128_t) in1[`1`]) * in2[`3`] +
507	((uint128_t) in1[`2`]) * in2[`2`] +
508	((uint128_t) in1[`3`]) * in2[`1`] +
509	((uint128_t) in1[`4`]) * in2[`0`];
510
511	out[`5`] = ((uint128_t) in1[`0`]) * in2[`5`] +
512	((uint128_t) in1[`1`]) * in2[`4`] +
513	((uint128_t) in1[`2`]) * in2[`3`] +
514	((uint128_t) in1[`3`]) * in2[`2`] +
515	((uint128_t) in1[`4`]) * in2[`1`] +
516	((uint128_t) in1[`5`]) * in2[`0`];
517
518	out[`6`] = ((uint128_t) in1[`0`]) * in2[`6`] +
519	((uint128_t) in1[`1`]) * in2[`5`] +
520	((uint128_t) in1[`2`]) * in2[`4`] +
521	((uint128_t) in1[`3`]) * in2[`3`] +
522	((uint128_t) in1[`4`]) * in2[`2`] +
523	((uint128_t) in1[`5`]) * in2[`1`] +
524	((uint128_t) in1[`6`]) * in2[`0`];
525
526	out[`7`] = ((uint128_t) in1[`0`]) * in2[`7`] +
527	((uint128_t) in1[`1`]) * in2[`6`] +
528	((uint128_t) in1[`2`]) * in2[`5`] +
529	((uint128_t) in1[`3`]) * in2[`4`] +
530	((uint128_t) in1[`4`]) * in2[`3`] +
531	((uint128_t) in1[`5`]) * in2[`2`] +
532	((uint128_t) in1[`6`]) * in2[`1`] +
533	((uint128_t) in1[`7`]) * in2[`0`];
534
535	out[`8`] = ((uint128_t) in1[`0`]) * in2[`8`] +
536	((uint128_t) in1[`1`]) * in2[`7`] +
537	((uint128_t) in1[`2`]) * in2[`6`] +
538	((uint128_t) in1[`3`]) * in2[`5`] +
539	((uint128_t) in1[`4`]) * in2[`4`] +
540	((uint128_t) in1[`5`]) * in2[`3`] +
541	((uint128_t) in1[`6`]) * in2[`2`] +
542	((uint128_t) in1[`7`]) * in2[`1`] +
543	((uint128_t) in1[`8`]) * in2[`0`];
544
545	/ See comment in felem_square about the use of in2x2 here /
546
547	out[`0`] += ((uint128_t) in1[`1`]) * in2x2[`8`] +
548	((uint128_t) in1[`2`]) * in2x2[`7`] +
549	((uint128_t) in1[`3`]) * in2x2[`6`] +
550	((uint128_t) in1[`4`]) * in2x2[`5`] +
551	((uint128_t) in1[`5`]) * in2x2[`4`] +
552	((uint128_t) in1[`6`]) * in2x2[`3`] +
553	((uint128_t) in1[`7`]) * in2x2[`2`] +
554	((uint128_t) in1[`8`]) * in2x2[`1`];
555
556	out[`1`] += ((uint128_t) in1[`2`]) * in2x2[`8`] +
557	((uint128_t) in1[`3`]) * in2x2[`7`] +
558	((uint128_t) in1[`4`]) * in2x2[`6`] +
559	((uint128_t) in1[`5`]) * in2x2[`5`] +
560	((uint128_t) in1[`6`]) * in2x2[`4`] +
561	((uint128_t) in1[`7`]) * in2x2[`3`] +
562	((uint128_t) in1[`8`]) * in2x2[`2`];
563
564	out[`2`] += ((uint128_t) in1[`3`]) * in2x2[`8`] +
565	((uint128_t) in1[`4`]) * in2x2[`7`] +
566	((uint128_t) in1[`5`]) * in2x2[`6`] +
567	((uint128_t) in1[`6`]) * in2x2[`5`] +
568	((uint128_t) in1[`7`]) * in2x2[`4`] +
569	((uint128_t) in1[`8`]) * in2x2[`3`];
570
571	out[`3`] += ((uint128_t) in1[`4`]) * in2x2[`8`] +
572	((uint128_t) in1[`5`]) * in2x2[`7`] +
573	((uint128_t) in1[`6`]) * in2x2[`6`] +
574	((uint128_t) in1[`7`]) * in2x2[`5`] +
575	((uint128_t) in1[`8`]) * in2x2[`4`];
576
577	out[`4`] += ((uint128_t) in1[`5`]) * in2x2[`8`] +
578	((uint128_t) in1[`6`]) * in2x2[`7`] +
579	((uint128_t) in1[`7`]) * in2x2[`6`] +
580	((uint128_t) in1[`8`]) * in2x2[`5`];
581
582	out[`5`] += ((uint128_t) in1[`6`]) * in2x2[`8`] +
583	((uint128_t) in1[`7`]) * in2x2[`7`] +
584	((uint128_t) in1[`8`]) * in2x2[`6`];
585
586	out[`6`] += ((uint128_t) in1[`7`]) * in2x2[`8`] +
587	((uint128_t) in1[`8`]) * in2x2[`7`];
588
589	out[`7`] += ((uint128_t) in1[`8`]) * in2x2[`8`];
590	}
591
592	static const limb bottom52bits = `0xfffffffffffff`;
593
594	/-*
595	* felem_reduce converts a largefelem to an felem.
596	* On entry:
597	* in[i] < 2^128
598	* On exit:
599	* out[i] < 2^59 + 2^14
600	*/
601	static void felem_reduce(felem out, const largefelem in)
602	{
603	u64 overflow1, overflow2;
604
605	out[`0`] = ((limb) in[`0`]) & bottom58bits;
606	out[`1`] = ((limb) in[`1`]) & bottom58bits;
607	out[`2`] = ((limb) in[`2`]) & bottom58bits;
608	out[`3`] = ((limb) in[`3`]) & bottom58bits;
609	out[`4`] = ((limb) in[`4`]) & bottom58bits;
610	out[`5`] = ((limb) in[`5`]) & bottom58bits;
611	out[`6`] = ((limb) in[`6`]) & bottom58bits;
612	out[`7`] = ((limb) in[`7`]) & bottom58bits;
613	out[`8`] = ((limb) in[`8`]) & bottom58bits;
614
615	/ out[i] < 2^58 /
616
617	out[`1`] += ((limb) in[`0`]) >> `58`;
618	out[`1`] += (((limb) (in[`0`] >> `64`)) & bottom52bits) << `6`;
619	/-*
620	* out[1] < 2^58 + 2^6 + 2^58
621	* = 2^59 + 2^6
622	*/
623	out[`2`] += ((limb) (in[`0`] >> `64`)) >> `52`;
624
625	out[`2`] += ((limb) in[`1`]) >> `58`;
626	out[`2`] += (((limb) (in[`1`] >> `64`)) & bottom52bits) << `6`;
627	out[`3`] += ((limb) (in[`1`] >> `64`)) >> `52`;
628
629	out[`3`] += ((limb) in[`2`]) >> `58`;
630	out[`3`] += (((limb) (in[`2`] >> `64`)) & bottom52bits) << `6`;
631	out[`4`] += ((limb) (in[`2`] >> `64`)) >> `52`;
632
633	out[`4`] += ((limb) in[`3`]) >> `58`;
634	out[`4`] += (((limb) (in[`3`] >> `64`)) & bottom52bits) << `6`;
635	out[`5`] += ((limb) (in[`3`] >> `64`)) >> `52`;
636
637	out[`5`] += ((limb) in[`4`]) >> `58`;
638	out[`5`] += (((limb) (in[`4`] >> `64`)) & bottom52bits) << `6`;
639	out[`6`] += ((limb) (in[`4`] >> `64`)) >> `52`;
640
641	out[`6`] += ((limb) in[`5`]) >> `58`;
642	out[`6`] += (((limb) (in[`5`] >> `64`)) & bottom52bits) << `6`;
643	out[`7`] += ((limb) (in[`5`] >> `64`)) >> `52`;
644
645	out[`7`] += ((limb) in[`6`]) >> `58`;
646	out[`7`] += (((limb) (in[`6`] >> `64`)) & bottom52bits) << `6`;
647	out[`8`] += ((limb) (in[`6`] >> `64`)) >> `52`;
648
649	out[`8`] += ((limb) in[`7`]) >> `58`;
650	out[`8`] += (((limb) (in[`7`] >> `64`)) & bottom52bits) << `6`;
651	/-*
652	* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
653	* < 2^59 + 2^13
654	*/
655	overflow1 = ((limb) (in[`7`] >> `64`)) >> `52`;
656
657	overflow1 += ((limb) in[`8`]) >> `58`;
658	overflow1 += (((limb) (in[`8`] >> `64`)) & bottom52bits) << `6`;
659	overflow2 = ((limb) (in[`8`] >> `64`)) >> `52`;
660
661	overflow1 <<= `1`; / overflow1 < 2^13 + 2^7 + 2^59 /
662	overflow2 <<= `1`; / overflow2 < 2^13 /
663
664	out[`0`] += overflow1; / out[0] < 2^60 /
665	out[`1`] += overflow2; / out[1] < 2^59 + 2^6 + 2^13 /
666
667	out[`1`] += out[`0`] >> `58`;
668	out[`0`] &= bottom58bits;
669	/-*
670	* out[0] < 2^58
671	* out[1] < 2^59 + 2^6 + 2^13 + 2^2
672	* < 2^59 + 2^14
673	*/
674	}
675
676	static void felem_square_reduce(felem out, const felem in)
677	{
678	largefelem tmp;
679	felem_square(tmp, in);
680	felem_reduce(out, tmp);
681	}
682
683	static void felem_mul_reduce(felem out, const felem in1, const felem in2)
684	{
685	largefelem tmp;
686	felem_mul(tmp, in1, in2);
687	felem_reduce(out, tmp);
688	}
689
690	/-*
691	* felem_inv calculates \|out\| = \|in\|^{-1}
692	*
693	* Based on Fermat's Little Theorem:
694	* a^p = a (mod p)
695	* a^{p-1} = 1 (mod p)
696	* a^{p-2} = a^{-1} (mod p)
697	*/
698	static void felem_inv(felem out, const felem in)
699	{
700	felem ftmp, ftmp2, ftmp3, ftmp4;
701	largefelem tmp;
702	unsigned i;
703
704	felem_square(tmp, in);
705	felem_reduce(ftmp, tmp); / 2^1 /
706	felem_mul(tmp, in, ftmp);
707	felem_reduce(ftmp, tmp); / 2^2 - 2^0 /
708	felem_assign(ftmp2, ftmp);
709	felem_square(tmp, ftmp);
710	felem_reduce(ftmp, tmp); / 2^3 - 2^1 /
711	felem_mul(tmp, in, ftmp);
712	felem_reduce(ftmp, tmp); / 2^3 - 2^0 /
713	felem_square(tmp, ftmp);
714	felem_reduce(ftmp, tmp); / 2^4 - 2^1 /
715
716	felem_square(tmp, ftmp2);
717	felem_reduce(ftmp3, tmp); / 2^3 - 2^1 /
718	felem_square(tmp, ftmp3);
719	felem_reduce(ftmp3, tmp); / 2^4 - 2^2 /
720	felem_mul(tmp, ftmp3, ftmp2);
721	felem_reduce(ftmp3, tmp); / 2^4 - 2^0 /
722
723	felem_assign(ftmp2, ftmp3);
724	felem_square(tmp, ftmp3);
725	felem_reduce(ftmp3, tmp); / 2^5 - 2^1 /
726	felem_square(tmp, ftmp3);
727	felem_reduce(ftmp3, tmp); / 2^6 - 2^2 /
728	felem_square(tmp, ftmp3);
729	felem_reduce(ftmp3, tmp); / 2^7 - 2^3 /
730	felem_square(tmp, ftmp3);
731	felem_reduce(ftmp3, tmp); / 2^8 - 2^4 /
732	felem_assign(ftmp4, ftmp3);
733	felem_mul(tmp, ftmp3, ftmp);
734	felem_reduce(ftmp4, tmp); / 2^8 - 2^1 /
735	felem_square(tmp, ftmp4);
736	felem_reduce(ftmp4, tmp); / 2^9 - 2^2 /
737	felem_mul(tmp, ftmp3, ftmp2);
738	felem_reduce(ftmp3, tmp); / 2^8 - 2^0 /
739	felem_assign(ftmp2, ftmp3);
740
741	for (i = `0`; i < `8`; i++) {
742	felem_square(tmp, ftmp3);
743	felem_reduce(ftmp3, tmp); / 2^16 - 2^8 /
744	}
745	felem_mul(tmp, ftmp3, ftmp2);
746	felem_reduce(ftmp3, tmp); / 2^16 - 2^0 /
747	felem_assign(ftmp2, ftmp3);
748
749	for (i = `0`; i < `16`; i++) {
750	felem_square(tmp, ftmp3);
751	felem_reduce(ftmp3, tmp); / 2^32 - 2^16 /
752	}
753	felem_mul(tmp, ftmp3, ftmp2);
754	felem_reduce(ftmp3, tmp); / 2^32 - 2^0 /
755	felem_assign(ftmp2, ftmp3);
756
757	for (i = `0`; i < `32`; i++) {
758	felem_square(tmp, ftmp3);
759	felem_reduce(ftmp3, tmp); / 2^64 - 2^32 /
760	}
761	felem_mul(tmp, ftmp3, ftmp2);
762	felem_reduce(ftmp3, tmp); / 2^64 - 2^0 /
763	felem_assign(ftmp2, ftmp3);
764
765	for (i = `0`; i < `64`; i++) {
766	felem_square(tmp, ftmp3);
767	felem_reduce(ftmp3, tmp); / 2^128 - 2^64 /
768	}
769	felem_mul(tmp, ftmp3, ftmp2);
770	felem_reduce(ftmp3, tmp); / 2^128 - 2^0 /
771	felem_assign(ftmp2, ftmp3);
772
773	for (i = `0`; i < `128`; i++) {
774	felem_square(tmp, ftmp3);
775	felem_reduce(ftmp3, tmp); / 2^256 - 2^128 /
776	}
777	felem_mul(tmp, ftmp3, ftmp2);
778	felem_reduce(ftmp3, tmp); / 2^256 - 2^0 /
779	felem_assign(ftmp2, ftmp3);
780
781	for (i = `0`; i < `256`; i++) {
782	felem_square(tmp, ftmp3);
783	felem_reduce(ftmp3, tmp); / 2^512 - 2^256 /
784	}
785	felem_mul(tmp, ftmp3, ftmp2);
786	felem_reduce(ftmp3, tmp); / 2^512 - 2^0 /
787
788	for (i = `0`; i < `9`; i++) {
789	felem_square(tmp, ftmp3);
790	felem_reduce(ftmp3, tmp); / 2^521 - 2^9 /
791	}
792	felem_mul(tmp, ftmp3, ftmp4);
793	felem_reduce(ftmp3, tmp); / 2^512 - 2^2 /
794	felem_mul(tmp, ftmp3, in);
795	felem_reduce(out, tmp); / 2^512 - 3 /
796	}
797
798	/ This is 2^521-1, expressed as an felem /
799	static const felem kPrime = {
800	`0x03ffffffffffffff`, `0x03ffffffffffffff`, `0x03ffffffffffffff`,
801	`0x03ffffffffffffff`, `0x03ffffffffffffff`, `0x03ffffffffffffff`,
802	`0x03ffffffffffffff`, `0x03ffffffffffffff`, `0x01ffffffffffffff`
803	};
804
805	/-*
806	* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
807	* otherwise.
808	* On entry:
809	* in[i] < 2^59 + 2^14
810	*/
811	static limb felem_is_zero(const felem in)
812	{
813	felem ftmp;
814	limb is_zero, is_p;
815	felem_assign(ftmp, in);
816
817	ftmp[`0`] += ftmp[`8`] >> `57`;
818	ftmp[`8`] &= bottom57bits;
819	/ ftmp[8] < 2^57 /
820	ftmp[`1`] += ftmp[`0`] >> `58`;
821	ftmp[`0`] &= bottom58bits;
822	ftmp[`2`] += ftmp[`1`] >> `58`;
823	ftmp[`1`] &= bottom58bits;
824	ftmp[`3`] += ftmp[`2`] >> `58`;
825	ftmp[`2`] &= bottom58bits;
826	ftmp[`4`] += ftmp[`3`] >> `58`;
827	ftmp[`3`] &= bottom58bits;
828	ftmp[`5`] += ftmp[`4`] >> `58`;
829	ftmp[`4`] &= bottom58bits;
830	ftmp[`6`] += ftmp[`5`] >> `58`;
831	ftmp[`5`] &= bottom58bits;
832	ftmp[`7`] += ftmp[`6`] >> `58`;
833	ftmp[`6`] &= bottom58bits;
834	ftmp[`8`] += ftmp[`7`] >> `58`;
835	ftmp[`7`] &= bottom58bits;
836	/ ftmp[8] < 2^57 + 4 /
837
838	/*
839	* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
840	* than our bound for ftmp[8]. Therefore we only have to check if the
841	* zero is zero or 2^521-1.
842	*/
843
844	is_zero = `0`;
845	is_zero \|= ftmp[`0`];
846	is_zero \|= ftmp[`1`];
847	is_zero \|= ftmp[`2`];
848	is_zero \|= ftmp[`3`];
849	is_zero \|= ftmp[`4`];
850	is_zero \|= ftmp[`5`];
851	is_zero \|= ftmp[`6`];
852	is_zero \|= ftmp[`7`];
853	is_zero \|= ftmp[`8`];
854
855	is_zero--;
856	/*
857	* We know that ftmp[i] < 2^63, therefore the only way that the top bit
858	* can be set is if is_zero was 0 before the decrement.
859	*/
860	is_zero = `0` - (is_zero >> `63`);
861
862	is_p = ftmp[`0`] ^ kPrime[`0`];
863	is_p \|= ftmp[`1`] ^ kPrime[`1`];
864	is_p \|= ftmp[`2`] ^ kPrime[`2`];
865	is_p \|= ftmp[`3`] ^ kPrime[`3`];
866	is_p \|= ftmp[`4`] ^ kPrime[`4`];
867	is_p \|= ftmp[`5`] ^ kPrime[`5`];
868	is_p \|= ftmp[`6`] ^ kPrime[`6`];
869	is_p \|= ftmp[`7`] ^ kPrime[`7`];
870	is_p \|= ftmp[`8`] ^ kPrime[`8`];
871
872	is_p--;
873	is_p = `0` - (is_p >> `63`);
874
875	is_zero \|= is_p;
876	return is_zero;
877	}
878
879	static int felem_is_zero_int(const void *in)
880	{
881	return (int)(felem_is_zero(in) & ((limb) `1`));
882	}
883
884	/-*
885	* felem_contract converts \|in\| to its unique, minimal representation.
886	* On entry:
887	* in[i] < 2^59 + 2^14
888	*/
889	static void felem_contract(felem out, const felem in)
890	{
891	limb is_p, is_greater, sign;
892	static const limb two58 = ((limb) `1`) << `58`;
893
894	felem_assign(out, in);
895
896	out[`0`] += out[`8`] >> `57`;
897	out[`8`] &= bottom57bits;
898	/ out[8] < 2^57 /
899	out[`1`] += out[`0`] >> `58`;
900	out[`0`] &= bottom58bits;
901	out[`2`] += out[`1`] >> `58`;
902	out[`1`] &= bottom58bits;
903	out[`3`] += out[`2`] >> `58`;
904	out[`2`] &= bottom58bits;
905	out[`4`] += out[`3`] >> `58`;
906	out[`3`] &= bottom58bits;
907	out[`5`] += out[`4`] >> `58`;
908	out[`4`] &= bottom58bits;
909	out[`6`] += out[`5`] >> `58`;
910	out[`5`] &= bottom58bits;
911	out[`7`] += out[`6`] >> `58`;
912	out[`6`] &= bottom58bits;
913	out[`8`] += out[`7`] >> `58`;
914	out[`7`] &= bottom58bits;
915	/ out[8] < 2^57 + 4 /
916
917	/*
918	* If the value is greater than 2^521-1 then we have to subtract 2^521-1
919	* out. See the comments in felem_is_zero regarding why we don't test for
920	* other multiples of the prime.
921	*/
922
923	/*
924	* First, if \|out\| is equal to 2^521-1, we subtract it out to get zero.
925	*/
926
927	is_p = out[`0`] ^ kPrime[`0`];
928	is_p \|= out[`1`] ^ kPrime[`1`];
929	is_p \|= out[`2`] ^ kPrime[`2`];
930	is_p \|= out[`3`] ^ kPrime[`3`];
931	is_p \|= out[`4`] ^ kPrime[`4`];
932	is_p \|= out[`5`] ^ kPrime[`5`];
933	is_p \|= out[`6`] ^ kPrime[`6`];
934	is_p \|= out[`7`] ^ kPrime[`7`];
935	is_p \|= out[`8`] ^ kPrime[`8`];
936
937	is_p--;
938	is_p &= is_p << `32`;
939	is_p &= is_p << `16`;
940	is_p &= is_p << `8`;
941	is_p &= is_p << `4`;
942	is_p &= is_p << `2`;
943	is_p &= is_p << `1`;
944	is_p = `0` - (is_p >> `63`);
945	is_p = ~is_p;
946
947	/ is_p is 0 iff \|out\| == 2^521-1 and all ones otherwise /
948
949	out[`0`] &= is_p;
950	out[`1`] &= is_p;
951	out[`2`] &= is_p;
952	out[`3`] &= is_p;
953	out[`4`] &= is_p;
954	out[`5`] &= is_p;
955	out[`6`] &= is_p;
956	out[`7`] &= is_p;
957	out[`8`] &= is_p;
958
959	/*
960	* In order to test that \|out\| >= 2^521-1 we need only test if out[8] >>
961	* 57 is greater than zero as (2^521-1) + x >= 2^522
962	*/
963	is_greater = out[`8`] >> `57`;
964	is_greater \|= is_greater << `32`;
965	is_greater \|= is_greater << `16`;
966	is_greater \|= is_greater << `8`;
967	is_greater \|= is_greater << `4`;
968	is_greater \|= is_greater << `2`;
969	is_greater \|= is_greater << `1`;
970	is_greater = `0` - (is_greater >> `63`);
971
972	out[`0`] -= kPrime[`0`] & is_greater;
973	out[`1`] -= kPrime[`1`] & is_greater;
974	out[`2`] -= kPrime[`2`] & is_greater;
975	out[`3`] -= kPrime[`3`] & is_greater;
976	out[`4`] -= kPrime[`4`] & is_greater;
977	out[`5`] -= kPrime[`5`] & is_greater;
978	out[`6`] -= kPrime[`6`] & is_greater;
979	out[`7`] -= kPrime[`7`] & is_greater;
980	out[`8`] -= kPrime[`8`] & is_greater;
981
982	/ Eliminate negative coefficients /
983	sign = -(out[`0`] >> `63`);
984	out[`0`] += (two58 & sign);
985	out[`1`] -= (`1` & sign);
986	sign = -(out[`1`] >> `63`);
987	out[`1`] += (two58 & sign);
988	out[`2`] -= (`1` & sign);
989	sign = -(out[`2`] >> `63`);
990	out[`2`] += (two58 & sign);
991	out[`3`] -= (`1` & sign);
992	sign = -(out[`3`] >> `63`);
993	out[`3`] += (two58 & sign);
994	out[`4`] -= (`1` & sign);
995	sign = -(out[`4`] >> `63`);
996	out[`4`] += (two58 & sign);
997	out[`5`] -= (`1` & sign);
998	sign = -(out[`0`] >> `63`);
999	out[`5`] += (two58 & sign);
1000	out[`6`] -= (`1` & sign);
1001	sign = -(out[`6`] >> `63`);
1002	out[`6`] += (two58 & sign);
1003	out[`7`] -= (`1` & sign);
1004	sign = -(out[`7`] >> `63`);
1005	out[`7`] += (two58 & sign);
1006	out[`8`] -= (`1` & sign);
1007	sign = -(out[`5`] >> `63`);
1008	out[`5`] += (two58 & sign);
1009	out[`6`] -= (`1` & sign);
1010	sign = -(out[`6`] >> `63`);
1011	out[`6`] += (two58 & sign);
1012	out[`7`] -= (`1` & sign);
1013	sign = -(out[`7`] >> `63`);
1014	out[`7`] += (two58 & sign);
1015	out[`8`] -= (`1` & sign);
1016	}
1017
/*-
 * Group operations
 * ----------------
 *
 * Building on top of the field operations we have the operations on the
 * elliptic curve group itself. Points on the curve are represented in Jacobian
 * coordinates.
 */
1025
1026	/-*
1027	* point_double calculates 2*(x_in, y_in, z_in)
1028	*
1029	* The method is taken from:
1030	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1031	*
1032	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1033	* while x_out == y_in is not (maybe this works, but it's not tested). */
1034	static void
1035	point_double(felem x_out, felem y_out, felem z_out,
1036	const felem x_in, const felem y_in, const felem z_in)
1037	{
1038	largefelem tmp, tmp2;
1039	felem delta, gamma, beta, alpha, ftmp, ftmp2;
1040
1041	felem_assign(ftmp, x_in);
1042	felem_assign(ftmp2, x_in);
1043
1044	/ delta = z^2 /
1045	felem_square(tmp, z_in);
1046	felem_reduce(delta, tmp); / delta[i] < 2^59 + 2^14 /
1047
1048	/ gamma = y^2 /
1049	felem_square(tmp, y_in);
1050	felem_reduce(gamma, tmp); / gamma[i] < 2^59 + 2^14 /
1051
1052	/ beta = xgamma /*
1053	felem_mul(tmp, x_in, gamma);
1054	felem_reduce(beta, tmp); / beta[i] < 2^59 + 2^14 /
1055
1056	/ alpha = 3(x-delta)(x+delta) /
1057	felem_diff64(ftmp, delta);
1058	/ ftmp[i] < 2^61 /
1059	felem_sum64(ftmp2, delta);
1060	/ ftmp2[i] < 2^60 + 2^15 /
1061	felem_scalar64(ftmp2, `3`);
1062	/ ftmp2[i] < 32^60 + 32^15 /
1063	felem_mul(tmp, ftmp, ftmp2);
1064	/-*
1065	* tmp[i] < 17(32^121 + 32^76)
1066	* = 612^121 + 612^76
1067	* < 642^121 + 642^76
1068	* = 2^127 + 2^82
1069	* < 2^128
1070	*/
1071	felem_reduce(alpha, tmp);
1072
1073	/ x' = alpha^2 - 8beta /*
1074	felem_square(tmp, alpha);
1075	/*
1076	* tmp[i] < 17*2^120 < 2^125
1077	*/
1078	felem_assign(ftmp, beta);
1079	felem_scalar64(ftmp, `8`);
1080	/ ftmp[i] < 2^62 + 2^17 /
1081	felem_diff_128_64(tmp, ftmp);
1082	/ tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 /
1083	felem_reduce(x_out, tmp);
1084
1085	/ z' = (y + z)^2 - gamma - delta /
1086	felem_sum64(delta, gamma);
1087	/ delta[i] < 2^60 + 2^15 /
1088	felem_assign(ftmp, y_in);
1089	felem_sum64(ftmp, z_in);
1090	/ ftmp[i] < 2^60 + 2^15 /
1091	felem_square(tmp, ftmp);
1092	/*
1093	* tmp[i] < 17(2^122) < 2^127
1094	*/
1095	felem_diff_128_64(tmp, delta);
1096	/ tmp[i] < 2^127 + 2^63 /
1097	felem_reduce(z_out, tmp);
1098
1099	/ y' = alpha(4beta - x') - 8gamma^2 /*
1100	felem_scalar64(beta, `4`);
1101	/ beta[i] < 2^61 + 2^16 /
1102	felem_diff64(beta, x_out);
1103	/ beta[i] < 2^61 + 2^60 + 2^16 /
1104	felem_mul(tmp, alpha, beta);
1105	/-*
1106	* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1107	* = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1108	* = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1109	* < 2^128
1110	*/
1111	felem_square(tmp2, gamma);
1112	/-*
1113	* tmp2[i] < 17*(2^59 + 2^14)^2
1114	* = 17*(2^118 + 2^74 + 2^28)
1115	*/
1116	felem_scalar128(tmp2, `8`);
1117	/-*
1118	* tmp2[i] < 817(2^118 + 2^74 + 2^28)
1119	* = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1120	* < 2^126
1121	*/
1122	felem_diff128(tmp, tmp2);
1123	/-*
1124	* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1125	* = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1126	* 2^74 + 2^69 + 2^34 + 2^30
1127	* < 2^128
1128	*/
1129	felem_reduce(y_out, tmp);
1130	}
1131
1132	/ copy_conditional copies in to out iff mask is all ones. /
1133	static void copy_conditional(felem out, const felem in, limb mask)
1134	{
1135	unsigned i;
1136	for (i = `0`; i < NLIMBS; ++i) {
1137	const limb tmp = mask & (in[i] ^ out[i]);
1138	out[i] ^= tmp;
1139	}
1140	}
1141
1142	/-*
1143	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1144	*
1145	* The method is taken from
1146	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1147	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1148	*
1149	* This function includes a branch for checking whether the two input points
1150	* are equal (while not equal to the point at infinity). See comment below
1151	* on constant-time.
1152	*/
1153	static void point_add(felem x3, felem y3, felem z3,
1154	const felem x1, const felem y1, const felem z1,
1155	const int mixed, const felem x2, const felem y2,
1156	const felem z2)
1157	{
1158	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1159	largefelem tmp, tmp2;
1160	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1161
1162	z1_is_zero = felem_is_zero(z1);
1163	z2_is_zero = felem_is_zero(z2);
1164
1165	/ ftmp = z1z1 = z1*2 /*
1166	felem_square(tmp, z1);
1167	felem_reduce(ftmp, tmp);
1168
1169	if (!mixed) {
1170	/ ftmp2 = z2z2 = z2*2 /*
1171	felem_square(tmp, z2);
1172	felem_reduce(ftmp2, tmp);
1173
1174	/ u1 = ftmp3 = x1z2z2 /*
1175	felem_mul(tmp, x1, ftmp2);
1176	felem_reduce(ftmp3, tmp);
1177
1178	/ ftmp5 = z1 + z2 /
1179	felem_assign(ftmp5, z1);
1180	felem_sum64(ftmp5, z2);
1181	/ ftmp5[i] < 2^61 /
1182
1183	/ ftmp5 = (z1 + z2)*2 - z1z1 - z2z2 = 2z1z2 /
1184	felem_square(tmp, ftmp5);
1185	/ tmp[i] < 172^122 /*
1186	felem_diff_128_64(tmp, ftmp);
1187	/ tmp[i] < 172^122 + 2^63 /*
1188	felem_diff_128_64(tmp, ftmp2);
1189	/ tmp[i] < 172^122 + 2^64 /*
1190	felem_reduce(ftmp5, tmp);
1191
1192	/ ftmp2 = z2 * z2z2 /
1193	felem_mul(tmp, ftmp2, z2);
1194	felem_reduce(ftmp2, tmp);
1195
1196	/ s1 = ftmp6 = y1 * z2*3 /*
1197	felem_mul(tmp, y1, ftmp2);
1198	felem_reduce(ftmp6, tmp);
1199	} else {
1200	/*
1201	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
1202	*/
1203
1204	/ u1 = ftmp3 = x1z2z2 /*
1205	felem_assign(ftmp3, x1);
1206
1207	/ ftmp5 = 2z1z2 /*
1208	felem_scalar(ftmp5, z1, `2`);
1209
1210	/ s1 = ftmp6 = y1 * z2*3 /*
1211	felem_assign(ftmp6, y1);
1212	}
1213
1214	/ u2 = x2z1z1 /*
1215	felem_mul(tmp, x2, ftmp);
1216	/ tmp[i] < 172^120 /*
1217
1218	/ h = ftmp4 = u2 - u1 /
1219	felem_diff_128_64(tmp, ftmp3);
1220	/ tmp[i] < 172^120 + 2^63 /*
1221	felem_reduce(ftmp4, tmp);
1222
1223	x_equal = felem_is_zero(ftmp4);
1224
1225	/ z_out = ftmp5 * h /
1226	felem_mul(tmp, ftmp5, ftmp4);
1227	felem_reduce(z_out, tmp);
1228
1229	/ ftmp = z1 * z1z1 /
1230	felem_mul(tmp, ftmp, z1);
1231	felem_reduce(ftmp, tmp);
1232
1233	/ s2 = tmp = y2 * z1*3 /*
1234	felem_mul(tmp, y2, ftmp);
1235	/ tmp[i] < 172^120 /*
1236
1237	/ r = ftmp5 = (s2 - s1)2 /*
1238	felem_diff_128_64(tmp, ftmp6);
1239	/ tmp[i] < 172^120 + 2^63 /*
1240	felem_reduce(ftmp5, tmp);
1241	y_equal = felem_is_zero(ftmp5);
1242	felem_scalar64(ftmp5, `2`);
1243	/ ftmp5[i] < 2^61 /
1244
1245	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1246	/*
1247	* This is obviously not constant-time but it will almost-never happen
1248	* for ECDH / ECDSA. The case where it can happen is during scalar-mult
1249	* where the intermediate value gets very close to the group order.
1250	* Since \|ec_GFp_nistp_recode_scalar_bits\| produces signed digits for
1251	* the scalar, it's possible for the intermediate value to be a small
1252	* negative multiple of the base point, and for the final signed digit
1253	* to be the same value. We believe that this only occurs for the scalar
1254	* 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1255	* ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1256	* 71e913863f7, in that case the penultimate intermediate is -9G and
1257	* the final digit is also -9G. Since this only happens for a single
1258	* scalar, the timing leak is irrelevant. (Any attacker who wanted to
1259	* check whether a secret scalar was that exact value, can already do
1260	* so.)
1261	*/
1262	point_double(x3, y3, z3, x1, y1, z1);
1263	return;
1264	}
1265
1266	/ I = ftmp = (2h)*2 /*
1267	felem_assign(ftmp, ftmp4);
1268	felem_scalar64(ftmp, `2`);
1269	/ ftmp[i] < 2^61 /
1270	felem_square(tmp, ftmp);
1271	/ tmp[i] < 172^122 /*
1272	felem_reduce(ftmp, tmp);
1273
1274	/ J = ftmp2 = h * I /
1275	felem_mul(tmp, ftmp4, ftmp);
1276	felem_reduce(ftmp2, tmp);
1277
1278	/ V = ftmp4 = U1 * I /
1279	felem_mul(tmp, ftmp3, ftmp);
1280	felem_reduce(ftmp4, tmp);
1281
1282	/ x_out = r*2 - J - 2V /*
1283	felem_square(tmp, ftmp5);
1284	/ tmp[i] < 172^122 /*
1285	felem_diff_128_64(tmp, ftmp2);
1286	/ tmp[i] < 172^122 + 2^63 /*
1287	felem_assign(ftmp3, ftmp4);
1288	felem_scalar64(ftmp4, `2`);
1289	/ ftmp4[i] < 2^61 /
1290	felem_diff_128_64(tmp, ftmp4);
1291	/ tmp[i] < 172^122 + 2^64 /*
1292	felem_reduce(x_out, tmp);
1293
1294	/ y_out = r(V-x_out) - 2 * s1 * J /
1295	felem_diff64(ftmp3, x_out);
1296	/*
1297	* ftmp3[i] < 2^60 + 2^60 = 2^61
1298	*/
1299	felem_mul(tmp, ftmp5, ftmp3);
1300	/ tmp[i] < 172^122 /*
1301	felem_mul(tmp2, ftmp6, ftmp2);
1302	/ tmp2[i] < 172^120 /*
1303	felem_scalar128(tmp2, `2`);
1304	/ tmp2[i] < 172^121 /*
1305	felem_diff128(tmp, tmp2);
1306	/-*
1307	* tmp[i] < 2^127 - 2^69 + 17*2^122
1308	* = 2^126 - 2^122 - 2^6 - 2^2 - 1
1309	* < 2^127
1310	*/
1311	felem_reduce(y_out, tmp);
1312
1313	copy_conditional(x_out, x2, z1_is_zero);
1314	copy_conditional(x_out, x1, z2_is_zero);
1315	copy_conditional(y_out, y2, z1_is_zero);
1316	copy_conditional(y_out, y1, z2_is_zero);
1317	copy_conditional(z_out, z2, z1_is_zero);
1318	copy_conditional(z_out, z1, z2_is_zero);
1319	felem_assign(x3, x_out);
1320	felem_assign(y3, y_out);
1321	felem_assign(z3, z_out);
1322	}
1323
/*-
 * Base point pre computation
 * --------------------------
 *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
 *
 * For the base point table, z is usually 1 (0 for the point at infinity).
 * This table has 16 elements:
 * index | bits    | point
 * ------+---------+------------------------------
 *     0 | 0 0 0 0 | 0G
 *     1 | 0 0 0 1 | 1G
 *     2 | 0 0 1 0 | 2^130G
 *     3 | 0 0 1 1 | (2^130 + 1)G
 *     4 | 0 1 0 0 | 2^260G
 *     5 | 0 1 0 1 | (2^260 + 1)G
 *     6 | 0 1 1 0 | (2^260 + 2^130)G
 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
 *     8 | 1 0 0 0 | 2^390G
 *     9 | 1 0 0 1 | (2^390 + 1)G
 *    10 | 1 0 1 0 | (2^390 + 2^130)G
 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
 *    12 | 1 1 0 0 | (2^390 + 2^260)G
 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
 *
 * The reason for this is so that we can clock bits into four different
 * locations when doing simple scalar multiplies against the base point.
 *
 * Tables for other points have table[i] = i*G for i in 0 .. 16.
 */
1357
1358	/ gmul is the table of precomputed base points /
1359	static const felem gmul[`16`][`3`] = {
1360	{{`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`},
1361	{`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`},
1362	{`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1363	{{`0x017e7e31c2e5bd66`, `0x022cf0615a90a6fe`, `0x00127a2ffa8de334`,
1364	`0x01dfbf9d64a3f877`, `0x006b4d3dbaa14b5e`, `0x014fed487e0a2bd8`,
1365	`0x015b4429c6481390`, `0x03a73678fb2d988e`, `0x00c6858e06b70404`},
1366	{`0x00be94769fd16650`, `0x031c21a89cb09022`, `0x039013fad0761353`,
1367	`0x02657bd099031542`, `0x03273e662c97ee72`, `0x01e6d11a05ebef45`,
1368	`0x03d1bd998f544495`, `0x03001172297ed0b1`, `0x011839296a789a3b`},
1369	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1370	{{`0x0373faacbc875bae`, `0x00f325023721c671`, `0x00f666fd3dbde5ad`,
1371	`0x01a6932363f88ea7`, `0x01fc6d9e13f9c47b`, `0x03bcbffc2bbf734e`,
1372	`0x013ee3c3647f3a92`, `0x029409fefe75d07d`, `0x00ef9199963d85e5`},
1373	{`0x011173743ad5b178`, `0x02499c7c21bf7d46`, `0x035beaeabb8b1a58`,
1374	`0x00f989c4752ea0a3`, `0x0101e1de48a9c1a3`, `0x01a20076be28ba6c`,
1375	`0x02f8052e5eb2de95`, `0x01bfe8f82dea117c`, `0x0160074d3c36ddb7`},
1376	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1377	{{`0x012f3fc373393b3b`, `0x03d3d6172f1419fa`, `0x02adc943c0b86873`,
1378	`0x00d475584177952b`, `0x012a4d1673750ee2`, `0x00512517a0f13b0c`,
1379	`0x02b184671a7b1734`, `0x0315b84236f1a50a`, `0x00a4afc472edbdb9`},
1380	{`0x00152a7077f385c4`, `0x03044007d8d1c2ee`, `0x0065829d61d52b52`,
1381	`0x00494ff6b6631d0d`, `0x00a11d94d5f06bcf`, `0x02d2f89474d9282e`,
1382	`0x0241c5727c06eeb9`, `0x0386928710fbdb9d`, `0x01f883f727b0dfbe`},
1383	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1384	{{`0x019b0c3c9185544d`, `0x006243a37c9d97db`, `0x02ee3cbe030a2ad2`,
1385	`0x00cfdd946bb51e0d`, `0x0271c00932606b91`, `0x03f817d1ec68c561`,
1386	`0x03f37009806a369c`, `0x03c1f30baf184fd5`, `0x01091022d6d2f065`},
1387	{`0x0292c583514c45ed`, `0x0316fca51f9a286c`, `0x00300af507c1489a`,
1388	`0x0295f69008298cf1`, `0x02c0ed8274943d7b`, `0x016509b9b47a431e`,
1389	`0x02bc9de9634868ce`, `0x005b34929bffcb09`, `0x000c1a0121681524`},
1390	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1391	{{`0x0286abc0292fb9f2`, `0x02665eee9805b3f7`, `0x01ed7455f17f26d6`,
1392	`0x0346355b83175d13`, `0x006284944cd0a097`, `0x0191895bcdec5e51`,
1393	`0x02e288370afda7d9`, `0x03b22312bfefa67a`, `0x01d104d3fc0613fe`},
1394	{`0x0092421a12f7e47f`, `0x0077a83fa373c501`, `0x03bd25c5f696bd0d`,
1395	`0x035c41e4d5459761`, `0x01ca0d1742b24f53`, `0x00aaab27863a509c`,
1396	`0x018b6de47df73917`, `0x025c0b771705cd01`, `0x01fd51d566d760a7`},
1397	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1398	{{`0x01dd92ff6b0d1dbd`, `0x039c5e2e8f8afa69`, `0x0261ed13242c3b27`,
1399	`0x0382c6e67026e6a0`, `0x01d60b10be2089f9`, `0x03c15f3dce86723f`,
1400	`0x03c764a32d2a062d`, `0x017307eac0fad056`, `0x018207c0b96c5256`},
1401	{`0x0196a16d60e13154`, `0x03e6ce74c0267030`, `0x00ddbf2b4e52a5aa`,
1402	`0x012738241bbf31c8`, `0x00ebe8dc04685a28`, `0x024c2ad6d380d4a2`,
1403	`0x035ee062a6e62d0e`, `0x0029ed74af7d3a0f`, `0x00eef32aec142ebd`},
1404	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1405	{{`0x00c31ec398993b39`, `0x03a9f45bcda68253`, `0x00ac733c24c70890`,
1406	`0x00872b111401ff01`, `0x01d178c23195eafb`, `0x03bca2c816b87f74`,
1407	`0x0261a9af46fbad7a`, `0x0324b2a8dd3d28f9`, `0x00918121d8f24e23`},
1408	{`0x032bc8c1ca983cd7`, `0x00d869dfb08fc8c6`, `0x01693cb61fce1516`,
1409	`0x012a5ea68f4e88a8`, `0x010869cab88d7ae3`, `0x009081ad277ceee1`,
1410	`0x033a77166d064cdc`, `0x03955235a1fb3a95`, `0x01251a4a9b25b65e`},
1411	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1412	{{`0x00148a3a1b27f40b`, `0x0123186df1b31fdc`, `0x00026e7beaad34ce`,
1413	`0x01db446ac1d3dbba`, `0x0299c1a33437eaec`, `0x024540610183cbb7`,
1414	`0x0173bb0e9ce92e46`, `0x02b937e43921214b`, `0x01ab0436a9bf01b5`},
1415	{`0x0383381640d46948`, `0x008dacbf0e7f330f`, `0x03602122bcc3f318`,
1416	`0x01ee596b200620d6`, `0x03bd0585fda430b3`, `0x014aed77fd123a83`,
1417	`0x005ace749e52f742`, `0x0390fe041da2b842`, `0x0189a8ceb3299242`},
1418	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1419	{{`0x012a19d6b3282473`, `0x00c0915918b423ce`, `0x023a954eb94405ae`,
1420	`0x00529f692be26158`, `0x0289fa1b6fa4b2aa`, `0x0198ae4ceea346ef`,
1421	`0x0047d8cdfbdedd49`, `0x00cc8c8953f0f6b8`, `0x001424abbff49203`},
1422	{`0x0256732a1115a03a`, `0x0351bc38665c6733`, `0x03f7b950fb4a6447`,
1423	`0x000afffa94c22155`, `0x025763d0a4dab540`, `0x000511e92d4fc283`,
1424	`0x030a7e9eda0ee96c`, `0x004c3cd93a28bf0a`, `0x017edb3a8719217f`},
1425	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1426	{{`0x011de5675a88e673`, `0x031d7d0f5e567fbe`, `0x0016b2062c970ae5`,
1427	`0x03f4a2be49d90aa7`, `0x03cef0bd13822866`, `0x03f0923dcf774a6c`,
1428	`0x0284bebc4f322f72`, `0x016ab2645302bb2c`, `0x01793f95dace0e2a`},
1429	{`0x010646e13527a28f`, `0x01ca1babd59dc5e7`, `0x01afedfd9a5595df`,
1430	`0x01f15785212ea6b1`, `0x0324e5d64f6ae3f4`, `0x02d680f526d00645`,
1431	`0x0127920fadf627a7`, `0x03b383f75df4f684`, `0x0089e0057e783b0a`},
1432	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1433	{{`0x00f334b9eb3c26c6`, `0x0298fdaa98568dce`, `0x01c2d24843a82292`,
1434	`0x020bcb24fa1b0711`, `0x02cbdb3d2b1875e6`, `0x0014907598f89422`,
1435	`0x03abe3aa43b26664`, `0x02cbf47f720bc168`, `0x0133b5e73014b79b`},
1436	{`0x034aab5dab05779d`, `0x00cdc5d71fee9abb`, `0x0399f16bd4bd9d30`,
1437	`0x03582fa592d82647`, `0x02be1cdfb775b0e9`, `0x0034f7cea32e94cb`,
1438	`0x0335a7f08f56f286`, `0x03b707e9565d1c8b`, `0x0015c946ea5b614f`},
1439	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1440	{{`0x024676f6cff72255`, `0x00d14625cac96378`, `0x00532b6008bc3767`,
1441	`0x01fc16721b985322`, `0x023355ea1b091668`, `0x029de7afdc0317c3`,
1442	`0x02fc8a7ca2da037c`, `0x02de1217d74a6f30`, `0x013f7173175b73bf`},
1443	{`0x0344913f441490b5`, `0x0200f9e272b61eca`, `0x0258a246b1dd55d2`,
1444	`0x03753db9ea496f36`, `0x025e02937a09c5ef`, `0x030cbd3d14012692`,
1445	`0x01793a67e70dc72a`, `0x03ec1d37048a662e`, `0x006550f700c32a8d`},
1446	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1447	{{`0x00d3f48a347eba27`, `0x008e636649b61bd8`, `0x00d3b93716778fb3`,
1448	`0x004d1915757bd209`, `0x019d5311a3da44e0`, `0x016d1afcbbe6aade`,
1449	`0x0241bf5f73265616`, `0x0384672e5d50d39b`, `0x005009fee522b684`},
1450	{`0x029b4fab064435fe`, `0x018868ee095bbb07`, `0x01ea3d6936cc92b8`,
1451	`0x000608b00f78a2f3`, `0x02db911073d1c20f`, `0x018205938470100a`,
1452	`0x01f1e4964cbe6ff2`, `0x021a19a29eed4663`, `0x01414485f42afa81`},
1453	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1454	{{`0x01612b3a17f63e34`, `0x03813992885428e6`, `0x022b3c215b5a9608`,
1455	`0x029b4057e19f2fcb`, `0x0384059a587af7e6`, `0x02d6400ace6fe610`,
1456	`0x029354d896e8e331`, `0x00c047ee6dfba65e`, `0x0037720542e9d49d`},
1457	{`0x02ce9eed7c5e9278`, `0x0374ed703e79643b`, `0x01316c54c4072006`,
1458	`0x005aaa09054b2ee8`, `0x002824000c840d57`, `0x03d4eba24771ed86`,
1459	`0x0189c50aabc3bdae`, `0x0338c01541e15510`, `0x00466d56e38eed42`},
1460	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}},
1461	{{`0x007efd8330ad8bd6`, `0x02465ed48047710b`, `0x0034c6606b215e0c`,
1462	`0x016ae30c53cbf839`, `0x01fa17bd37161216`, `0x018ead4e61ce8ab9`,
1463	`0x005482ed5f5dee46`, `0x037543755bba1d7f`, `0x005e5ac7e70a9d0f`},
1464	{`0x0117e1bb2fdcb2a2`, `0x03deea36249f40c4`, `0x028d09b4a6246cb7`,
1465	`0x03524b8855bcf756`, `0x023d7d109d5ceb58`, `0x0178e43e3223ef9c`,
1466	`0x0154536a0c6e966a`, `0x037964d1286ee9fe`, `0x0199bcd90e125055`},
1467	{`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}}
1468	};
1469
1470	/*
1471	* select_point selects the \|idx\|th point from a precomputation table and
1472	* copies it to out.
1473	*/
1474	/ pre_comp below is of the size provided in \|size\| /
1475	static void select_point(const limb idx, unsigned int size,
1476	const felem pre_comp[][`3`], felem out[`3`])
1477	{
1478	unsigned i, j;
1479	limb *outlimbs = &out[`0`][`0`];
1480
1481	memset(out, `0`, sizeof(out) `3`);
1482
1483	for (i = `0`; i < size; i++) {
1484	const limb *inlimbs = &pre_comp[i][`0`][`0`];
1485	limb mask = i ^ idx;
1486	mask \|= mask >> `4`;
1487	mask \|= mask >> `2`;
1488	mask \|= mask >> `1`;
1489	mask &= `1`;
1490	mask--;
1491	for (j = `0`; j < NLIMBS * `3`; j++)
1492	outlimbs[j] \|= inlimbs[j] & mask;
1493	}
1494	}
1495
1496	/ get_bit returns the \|i\|th bit in \|in\| /
1497	static char get_bit(const felem_bytearray in, int i)
1498	{
1499	if (i < `0`)
1500	return `0`;
1501	return (in[i >> `3`] >> (i & `7`)) & `1`;
1502	}
1503
1504	/*
1505	* Interleaved point multiplication using precomputed point multiples: The
1506	* small point multiples 0P, 1P, ..., 16*P are in pre_comp[], the scalars
1507	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1508	* generator, using certain (large) precomputed multiples in g_pre_comp.
1509	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
1510	*/
1511	static void batch_mul(felem x_out, felem y_out, felem z_out,
1512	const felem_bytearray scalars[],
1513	const unsigned num_points, const u8 *g_scalar,
1514	const int mixed, const felem pre_comp[][`17`][`3`],
1515	const felem g_pre_comp[`16`][`3`])
1516	{
1517	int i, skip;
1518	unsigned num, gen_mul = (g_scalar != NULL);
1519	felem nq[`3`], tmp[`4`];
1520	limb bits;
1521	u8 sign, digit;
1522
1523	/ set nq to the point at infinity /
1524	memset(nq, `0`, sizeof(nq));
1525
1526	/*
1527	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
1528	* of the generator (last quarter of rounds) and additions of other
1529	* points multiples (every 5th round).
1530	*/
1531	skip = `1`; / save two point operations in the first*
1532	* round */
1533	for (i = (num_points ? `520` : `130`); i >= `0`; --i) {
1534	/ double /
1535	if (!skip)
1536	point_double(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`]);
1537
1538	/ add multiples of the generator /
1539	if (gen_mul && (i <= `130`)) {
1540	bits = get_bit(g_scalar, i + `390`) << `3`;
1541	if (i < `130`) {
1542	bits \|= get_bit(g_scalar, i + `260`) << `2`;
1543	bits \|= get_bit(g_scalar, i + `130`) << `1`;
1544	bits \|= get_bit(g_scalar, i);
1545	}
1546	/ select the point to add, in constant time /
1547	select_point(bits, `16`, g_pre_comp, tmp);
1548	if (!skip) {
1549	/ The 1 argument below is for "mixed" /
1550	point_add(nq[`0`], nq[`1`], nq[`2`],
1551	nq[`0`], nq[`1`], nq[`2`], `1`, tmp[`0`], tmp[`1`], tmp[`2`]);
1552	} else {
1553	memcpy(nq, tmp, `3` * sizeof(felem));
1554	skip = `0`;
1555	}
1556	}
1557
1558	/ do other additions every 5 doublings /
1559	if (num_points && (i % `5` == `0`)) {
1560	/ loop over all scalars /
1561	for (num = `0`; num < num_points; ++num) {
1562	bits = get_bit(scalars[num], i + `4`) << `5`;
1563	bits \|= get_bit(scalars[num], i + `3`) << `4`;
1564	bits \|= get_bit(scalars[num], i + `2`) << `3`;
1565	bits \|= get_bit(scalars[num], i + `1`) << `2`;
1566	bits \|= get_bit(scalars[num], i) << `1`;
1567	bits \|= get_bit(scalars[num], i - `1`);
1568	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1569
1570	/*
1571	* select the point to add or subtract, in constant time
1572	*/
1573	select_point(digit, `17`, pre_comp[num], tmp);
1574	felem_neg(tmp[`3`], tmp[`1`]); / (X, -Y, Z) is the negative*
1575	* point */
1576	copy_conditional(tmp[`1`], tmp[`3`], (-(limb) sign));
1577
1578	if (!skip) {
1579	point_add(nq[`0`], nq[`1`], nq[`2`],
1580	nq[`0`], nq[`1`], nq[`2`],
1581	mixed, tmp[`0`], tmp[`1`], tmp[`2`]);
1582	} else {
1583	memcpy(nq, tmp, `3` * sizeof(felem));
1584	skip = `0`;
1585	}
1586	}
1587	}
1588	}
1589	felem_assign(x_out, nq[`0`]);
1590	felem_assign(y_out, nq[`1`]);
1591	felem_assign(z_out, nq[`2`]);
1592	}
1593
1594	/ Precomputation for the group generator. /
1595	struct nistp521_pre_comp_st {
1596	felem g_pre_comp[`16`][`3`];
1597	CRYPTO_REF_COUNT references;
1598	CRYPTO_RWLOCK *lock;
1599	};
1600
1601	const EC_METHOD EC_GFp_nistp521_method(void*)
1602	{
1603	static const EC_METHOD ret = {
1604	EC_FLAGS_DEFAULT_OCT,
1605	NID_X9_62_prime_field,
1606	ec_GFp_nistp521_group_init,
1607	ec_GFp_simple_group_finish,
1608	ec_GFp_simple_group_clear_finish,
1609	ec_GFp_nist_group_copy,
1610	ec_GFp_nistp521_group_set_curve,
1611	ec_GFp_simple_group_get_curve,
1612	ec_GFp_simple_group_get_degree,
1613	ec_group_simple_order_bits,
1614	ec_GFp_simple_group_check_discriminant,
1615	ec_GFp_simple_point_init,
1616	ec_GFp_simple_point_finish,
1617	ec_GFp_simple_point_clear_finish,
1618	ec_GFp_simple_point_copy,
1619	ec_GFp_simple_point_set_to_infinity,
1620	ec_GFp_simple_set_Jprojective_coordinates_GFp,
1621	ec_GFp_simple_get_Jprojective_coordinates_GFp,
1622	ec_GFp_simple_point_set_affine_coordinates,
1623	ec_GFp_nistp521_point_get_affine_coordinates,
1624	`0` / point_set_compressed_coordinates / ,
1625	`0` / point2oct / ,
1626	`0` / oct2point / ,
1627	ec_GFp_simple_add,
1628	ec_GFp_simple_dbl,
1629	ec_GFp_simple_invert,
1630	ec_GFp_simple_is_at_infinity,
1631	ec_GFp_simple_is_on_curve,
1632	ec_GFp_simple_cmp,
1633	ec_GFp_simple_make_affine,
1634	ec_GFp_simple_points_make_affine,
1635	ec_GFp_nistp521_points_mul,
1636	ec_GFp_nistp521_precompute_mult,
1637	ec_GFp_nistp521_have_precompute_mult,
1638	ec_GFp_nist_field_mul,
1639	ec_GFp_nist_field_sqr,
1640	`0` / field_div / ,
1641	ec_GFp_simple_field_inv,
1642	`0` / field_encode / ,
1643	`0` / field_decode / ,
1644	`0`, / field_set_to_one /
1645	ec_key_simple_priv2oct,
1646	ec_key_simple_oct2priv,
1647	`0`, / set private /
1648	ec_key_simple_generate_key,
1649	ec_key_simple_check_key,
1650	ec_key_simple_generate_public_key,
1651	`0`, / keycopy /
1652	`0`, / keyfinish /
1653	ecdh_simple_compute_key,
1654	ecdsa_simple_sign_setup,
1655	ecdsa_simple_sign_sig,
1656	ecdsa_simple_verify_sig,
1657	`0`, / field_inverse_mod_ord /
1658	`0`, / blind_coordinates /
1659	`0`, / ladder_pre /
1660	`0`, / ladder_step /
1661	`0` / ladder_post /
1662	};
1663
1664	return &ret;
1665	}
1666
/******************************************************************************/
/*
 * FUNCTIONS TO MANAGE PRECOMPUTATION
 */
1671
1672	static NISTP521_PRE_COMP nistp521_pre_comp_new(void*)
1673	{
1674	NISTP521_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
1675
1676	if (ret == NULL) {
1677	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1678	return ret;
1679	}
1680
1681	ret->references = `1`;
1682
1683	ret->lock = CRYPTO_THREAD_lock_new();
1684	if (ret->lock == NULL) {
1685	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1686	OPENSSL_free(ret);
1687	return NULL;
1688	}
1689	return ret;
1690	}
1691
1692	NISTP521_PRE_COMP EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP p)
1693	{
1694	int i;
1695	if (p != NULL)
1696	CRYPTO_UP_REF(&p->references, &i, p->lock);
1697	return p;
1698	}
1699
1700	void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1701	{
1702	int i;
1703
1704	if (p == NULL)
1705	return;
1706
1707	CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1708	REF_PRINT_COUNT("EC_nistp521", x);
1709	if (i > `0`)
1710	return;
1711	REF_ASSERT_ISNT(i < `0`);
1712
1713	CRYPTO_THREAD_lock_free(p->lock);
1714	OPENSSL_free(p);
1715	}
1716
/******************************************************************************/
/*
 * OPENSSL EC_METHOD FUNCTIONS
 */
1721
1722	int ec_GFp_nistp521_group_init(EC_GROUP *group)
1723	{
1724	int ret;
1725	ret = ec_GFp_simple_group_init(group);
1726	group->a_is_minus3 = `1`;
1727	return ret;
1728	}
1729
1730	int ec_GFp_nistp521_group_set_curve(EC_GROUP group, const* BIGNUM *p,
1731	const BIGNUM a, const* BIGNUM *b,
1732	BN_CTX *ctx)
1733	{
1734	int ret = `0`;
1735	BIGNUM curve_p, curve_a, *curve_b;
1736	#ifndef FIPS_MODE
1737	BN_CTX *new_ctx = NULL;
1738
1739	if (ctx == NULL)
1740	ctx = new_ctx = BN_CTX_new();
1741	#endif
1742	if (ctx == NULL)
1743	return `0`;
1744
1745	BN_CTX_start(ctx);
1746	curve_p = BN_CTX_get(ctx);
1747	curve_a = BN_CTX_get(ctx);
1748	curve_b = BN_CTX_get(ctx);
1749	if (curve_b == NULL)
1750	goto err;
1751	BN_bin2bn(nistp521_curve_params[`0`], sizeof(felem_bytearray), curve_p);
1752	BN_bin2bn(nistp521_curve_params[`1`], sizeof(felem_bytearray), curve_a);
1753	BN_bin2bn(nistp521_curve_params[`2`], sizeof(felem_bytearray), curve_b);
1754	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
1755	ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1756	EC_R_WRONG_CURVE_PARAMETERS);
1757	goto err;
1758	}
1759	group->field_mod_func = BN_nist_mod_521;
1760	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1761	err:
1762	BN_CTX_end(ctx);
1763	#ifndef FIPS_MODE
1764	BN_CTX_free(new_ctx);
1765	#endif
1766	return ret;
1767	}
1768
1769	/*
1770	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1771	* (X/Z^2, Y/Z^3)
1772	*/
1773	int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1774	const EC_POINT *point,
1775	BIGNUM x, BIGNUM y,
1776	BN_CTX *ctx)
1777	{
1778	felem z1, z2, x_in, y_in, x_out, y_out;
1779	largefelem tmp;
1780
1781	if (EC_POINT_is_at_infinity(group, point)) {
1782	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1783	EC_R_POINT_AT_INFINITY);
1784	return `0`;
1785	}
1786	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
1787	(!BN_to_felem(z1, point->Z)))
1788	return `0`;
1789	felem_inv(z2, z1);
1790	felem_square(tmp, z2);
1791	felem_reduce(z1, tmp);
1792	felem_mul(tmp, x_in, z1);
1793	felem_reduce(x_in, tmp);
1794	felem_contract(x_out, x_in);
1795	if (x != NULL) {
1796	if (!felem_to_BN(x, x_out)) {
1797	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1798	ERR_R_BN_LIB);
1799	return `0`;
1800	}
1801	}
1802	felem_mul(tmp, z1, z2);
1803	felem_reduce(z1, tmp);
1804	felem_mul(tmp, y_in, z1);
1805	felem_reduce(y_in, tmp);
1806	felem_contract(y_out, y_in);
1807	if (y != NULL) {
1808	if (!felem_to_BN(y, y_out)) {
1809	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1810	ERR_R_BN_LIB);
1811	return `0`;
1812	}
1813	}
1814	return `1`;
1815	}
1816
1817	/ points below is of size \|num\|, and tmp_felems is of size \|num+1/ /
1818	static void make_points_affine(size_t num, felem points[][`3`],
1819	felem tmp_felems[])
1820	{
1821	/*
1822	* Runs in constant time, unless an input is the point at infinity (which
1823	* normally shouldn't happen).
1824	*/
1825	ec_GFp_nistp_points_make_affine_internal(num,
1826	points,
1827	sizeof(felem),
1828	tmp_felems,
1829	(void ()(void* *))felem_one,
1830	felem_is_zero_int,
1831	(void ()(void* , const* void *))
1832	felem_assign,
1833	(void ()(void* , const* void *))
1834	felem_square_reduce, (void (*)
1835	(void *,
1836	const void
1837	*,
1838	const void
1839	*))
1840	felem_mul_reduce,
1841	(void ()(void* , const* void *))
1842	felem_inv,
1843	(void ()(void* , const* void *))
1844	felem_contract);
1845	}
1846
1847	/*
1848	* Computes scalargenerator + \sum scalars[i]points[i], ignoring NULL
1849	* values Result is stored in r (r can equal one of the inputs).
1850	*/
1851	int ec_GFp_nistp521_points_mul(const EC_GROUP group, EC_POINT r,
1852	const BIGNUM *scalar, size_t num,
1853	const EC_POINT *points[],
1854	const BIGNUM scalars[], BN_CTX ctx)
1855	{
1856	int ret = `0`;
1857	int j;
1858	int mixed = `0`;
1859	BIGNUM x, y, z, tmp_scalar;
1860	felem_bytearray g_secret;
1861	felem_bytearray *secrets = NULL;
1862	felem (*pre_comp)[`17`][`3`] = NULL;
1863	felem *tmp_felems = NULL;
1864	unsigned i;
1865	int num_bytes;
1866	int have_pre_comp = `0`;
1867	size_t num_points = num;
1868	felem x_in, y_in, z_in, x_out, y_out, z_out;
1869	NISTP521_PRE_COMP *pre = NULL;
1870	felem(*g_pre_comp)[`3`] = NULL;
1871	EC_POINT *generator = NULL;
1872	const EC_POINT *p = NULL;
1873	const BIGNUM *p_scalar = NULL;
1874
1875	BN_CTX_start(ctx);
1876	x = BN_CTX_get(ctx);
1877	y = BN_CTX_get(ctx);
1878	z = BN_CTX_get(ctx);
1879	tmp_scalar = BN_CTX_get(ctx);
1880	if (tmp_scalar == NULL)
1881	goto err;
1882
1883	if (scalar != NULL) {
1884	pre = group->pre_comp.nistp521;
1885	if (pre)
1886	/ we have precomputation, try to use it /
1887	g_pre_comp = &pre->g_pre_comp[`0`];
1888	else
1889	/ try to use the standard precomputation /
1890	g_pre_comp = (felem(*)[`3`]) gmul;
1891	generator = EC_POINT_new(group);
1892	if (generator == NULL)
1893	goto err;
1894	/ get the generator from precomputation /
1895	if (!felem_to_BN(x, g_pre_comp[`1`][`0`]) \|\|
1896	!felem_to_BN(y, g_pre_comp[`1`][`1`]) \|\|
1897	!felem_to_BN(z, g_pre_comp[`1`][`2`])) {
1898	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1899	goto err;
1900	}
1901	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1902	generator, x, y, z,
1903	ctx))
1904	goto err;
1905	if (`0` == EC_POINT_cmp(group, generator, group->generator, ctx))
1906	/ precomputation matches generator /
1907	have_pre_comp = `1`;
1908	else
1909	/*
1910	* we don't have valid precomputation: treat the generator as a
1911	* random point
1912	*/
1913	num_points++;
1914	}
1915
1916	if (num_points > `0`) {
1917	if (num_points >= `2`) {
1918	/*
1919	* unless we precompute multiples for just one point, converting
1920	* those into affine form is time well spent
1921	*/
1922	mixed = `1`;
1923	}
1924	secrets = OPENSSL_zalloc(sizeof(secrets) num_points);
1925	pre_comp = OPENSSL_zalloc(sizeof(pre_comp) num_points);
1926	if (mixed)
1927	tmp_felems =
1928	OPENSSL_malloc(sizeof(tmp_felems) (num_points * `17` + `1`));
1929	if ((secrets == NULL) \|\| (pre_comp == NULL)
1930	\|\| (mixed && (tmp_felems == NULL))) {
1931	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1932	goto err;
1933	}
1934
1935	/*
1936	* we treat NULL scalars as 0, and NULL points as points at infinity,
1937	* i.e., they contribute nothing to the linear combination
1938	*/
1939	for (i = `0`; i < num_points; ++i) {
1940	if (i == num) {
1941	/*
1942	* we didn't have a valid precomputation, so we pick the
1943	* generator
1944	*/
1945	p = EC_GROUP_get0_generator(group);
1946	p_scalar = scalar;
1947	} else {
1948	/ the i^th point /
1949	p = points[i];
1950	p_scalar = scalars[i];
1951	}
1952	if ((p_scalar != NULL) && (p != NULL)) {
1953	/ reduce scalar to 0 <= scalar < 2^521 /
1954	if ((BN_num_bits(p_scalar) > `521`)
1955	\|\| (BN_is_negative(p_scalar))) {
1956	/*
1957	* this is an unusual input, and we don't guarantee
1958	* constant-timeness
1959	*/
1960	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1961	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1962	goto err;
1963	}
1964	num_bytes = BN_bn2lebinpad(tmp_scalar,
1965	secrets[i], sizeof(secrets[i]));
1966	} else {
1967	num_bytes = BN_bn2lebinpad(p_scalar,
1968	secrets[i], sizeof(secrets[i]));
1969	}
1970	if (num_bytes < `0`) {
1971	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1972	goto err;
1973	}
1974	/ precompute multiples /
1975	if ((!BN_to_felem(x_out, p->X)) \|\|
1976	(!BN_to_felem(y_out, p->Y)) \|\|
1977	(!BN_to_felem(z_out, p->Z)))
1978	goto err;
1979	memcpy(pre_comp[i][`1`][`0`], x_out, sizeof(felem));
1980	memcpy(pre_comp[i][`1`][`1`], y_out, sizeof(felem));
1981	memcpy(pre_comp[i][`1`][`2`], z_out, sizeof(felem));
1982	for (j = `2`; j <= `16`; ++j) {
1983	if (j & `1`) {
1984	point_add(pre_comp[i][j][`0`], pre_comp[i][j][`1`],
1985	pre_comp[i][j][`2`], pre_comp[i][`1`][`0`],
1986	pre_comp[i][`1`][`1`], pre_comp[i][`1`][`2`], `0`,
1987	pre_comp[i][j - `1`][`0`],
1988	pre_comp[i][j - `1`][`1`],
1989	pre_comp[i][j - `1`][`2`]);
1990	} else {
1991	point_double(pre_comp[i][j][`0`], pre_comp[i][j][`1`],
1992	pre_comp[i][j][`2`], pre_comp[i][j / `2`][`0`],
1993	pre_comp[i][j / `2`][`1`],
1994	pre_comp[i][j / `2`][`2`]);
1995	}
1996	}
1997	}
1998	}
1999	if (mixed)
2000	make_points_affine(num_points * `17`, pre_comp[`0`], tmp_felems);
2001	}
2002
2003	/ the scalar for the generator /
2004	if ((scalar != NULL) && (have_pre_comp)) {
2005	memset(g_secret, `0`, sizeof(g_secret));
2006	/ reduce scalar to 0 <= scalar < 2^521 /
2007	if ((BN_num_bits(scalar) > `521`) \|\| (BN_is_negative(scalar))) {
2008	/*
2009	* this is an unusual input, and we don't guarantee
2010	* constant-timeness
2011	*/
2012	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2013	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2014	goto err;
2015	}
2016	num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2017	} else {
2018	num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2019	}
2020	/ do the multiplication with generator precomputation /
2021	batch_mul(x_out, y_out, z_out,
2022	(const felem_bytearray(*))secrets, num_points,
2023	g_secret,
2024	mixed, (const felem(*)[`17`][`3`])pre_comp,
2025	(const felem(*)[`3`])g_pre_comp);
2026	} else {
2027	/ do the multiplication without generator precomputation /
2028	batch_mul(x_out, y_out, z_out,
2029	(const felem_bytearray(*))secrets, num_points,
2030	NULL, mixed, (const felem(*)[`17`][`3`])pre_comp, NULL);
2031	}
2032	/ reduce the output to its unique minimal representation /
2033	felem_contract(x_in, x_out);
2034	felem_contract(y_in, y_out);
2035	felem_contract(z_in, z_out);
2036	if ((!felem_to_BN(x, x_in)) \|\| (!felem_to_BN(y, y_in)) \|\|
2037	(!felem_to_BN(z, z_in))) {
2038	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2039	goto err;
2040	}
2041	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2042
2043	err:
2044	BN_CTX_end(ctx);
2045	EC_POINT_free(generator);
2046	OPENSSL_free(secrets);
2047	OPENSSL_free(pre_comp);
2048	OPENSSL_free(tmp_felems);
2049	return ret;
2050	}
2051
2052	int ec_GFp_nistp521_precompute_mult(EC_GROUP group, BN_CTX ctx)
2053	{
2054	int ret = `0`;
2055	NISTP521_PRE_COMP *pre = NULL;
2056	int i, j;
2057	BIGNUM x, y;
2058	EC_POINT *generator = NULL;
2059	felem tmp_felems[`16`];
2060	#ifndef FIPS_MODE
2061	BN_CTX *new_ctx = NULL;
2062	#endif
2063
2064	/ throw away old precomputation /
2065	EC_pre_comp_free(group);
2066
2067	#ifndef FIPS_MODE
2068	if (ctx == NULL)
2069	ctx = new_ctx = BN_CTX_new();
2070	#endif
2071	if (ctx == NULL)
2072	return `0`;
2073
2074	BN_CTX_start(ctx);
2075	x = BN_CTX_get(ctx);
2076	y = BN_CTX_get(ctx);
2077	if (y == NULL)
2078	goto err;
2079	/ get the generator /
2080	if (group->generator == NULL)
2081	goto err;
2082	generator = EC_POINT_new(group);
2083	if (generator == NULL)
2084	goto err;
2085	BN_bin2bn(nistp521_curve_params[`3`], sizeof(felem_bytearray), x);
2086	BN_bin2bn(nistp521_curve_params[`4`], sizeof(felem_bytearray), y);
2087	if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2088	goto err;
2089	if ((pre = nistp521_pre_comp_new()) == NULL)
2090	goto err;
2091	/*
2092	* if the generator is the standard one, use built-in precomputation
2093	*/
2094	if (`0` == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2095	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2096	goto done;
2097	}
2098	if ((!BN_to_felem(pre->g_pre_comp[`1`][`0`], group->generator->X)) \|\|
2099	(!BN_to_felem(pre->g_pre_comp[`1`][`1`], group->generator->Y)) \|\|
2100	(!BN_to_felem(pre->g_pre_comp[`1`][`2`], group->generator->Z)))
2101	goto err;
2102	/ compute 2^130G, 2^260G, 2^390G /*
2103	for (i = `1`; i <= `4`; i <<= `1`) {
2104	point_double(pre->g_pre_comp[`2` * i][`0`], pre->g_pre_comp[`2` * i][`1`],
2105	pre->g_pre_comp[`2` * i][`2`], pre->g_pre_comp[i][`0`],
2106	pre->g_pre_comp[i][`1`], pre->g_pre_comp[i][`2`]);
2107	for (j = `0`; j < `129`; ++j) {
2108	point_double(pre->g_pre_comp[`2` * i][`0`],
2109	pre->g_pre_comp[`2` * i][`1`],
2110	pre->g_pre_comp[`2` * i][`2`],
2111	pre->g_pre_comp[`2` * i][`0`],
2112	pre->g_pre_comp[`2` * i][`1`],
2113	pre->g_pre_comp[`2` * i][`2`]);
2114	}
2115	}
2116	/ g_pre_comp[0] is the point at infinity /
2117	memset(pre->g_pre_comp[`0`], `0`, sizeof(pre->g_pre_comp[`0`]));
2118	/ the remaining multiples /
2119	/ 2^130G + 2^260G /
2120	point_add(pre->g_pre_comp[`6`][`0`], pre->g_pre_comp[`6`][`1`],
2121	pre->g_pre_comp[`6`][`2`], pre->g_pre_comp[`4`][`0`],
2122	pre->g_pre_comp[`4`][`1`], pre->g_pre_comp[`4`][`2`],
2123	`0`, pre->g_pre_comp[`2`][`0`], pre->g_pre_comp[`2`][`1`],
2124	pre->g_pre_comp[`2`][`2`]);
2125	/ 2^130G + 2^390G /
2126	point_add(pre->g_pre_comp[`10`][`0`], pre->g_pre_comp[`10`][`1`],
2127	pre->g_pre_comp[`10`][`2`], pre->g_pre_comp[`8`][`0`],
2128	pre->g_pre_comp[`8`][`1`], pre->g_pre_comp[`8`][`2`],
2129	`0`, pre->g_pre_comp[`2`][`0`], pre->g_pre_comp[`2`][`1`],
2130	pre->g_pre_comp[`2`][`2`]);
2131	/ 2^260G + 2^390G /
2132	point_add(pre->g_pre_comp[`12`][`0`], pre->g_pre_comp[`12`][`1`],
2133	pre->g_pre_comp[`12`][`2`], pre->g_pre_comp[`8`][`0`],
2134	pre->g_pre_comp[`8`][`1`], pre->g_pre_comp[`8`][`2`],
2135	`0`, pre->g_pre_comp[`4`][`0`], pre->g_pre_comp[`4`][`1`],
2136	pre->g_pre_comp[`4`][`2`]);
2137	/ 2^130G + 2^260G + 2^390G /*
2138	point_add(pre->g_pre_comp[`14`][`0`], pre->g_pre_comp[`14`][`1`],
2139	pre->g_pre_comp[`14`][`2`], pre->g_pre_comp[`12`][`0`],
2140	pre->g_pre_comp[`12`][`1`], pre->g_pre_comp[`12`][`2`],
2141	`0`, pre->g_pre_comp[`2`][`0`], pre->g_pre_comp[`2`][`1`],
2142	pre->g_pre_comp[`2`][`2`]);
2143	for (i = `1`; i < `8`; ++i) {
2144	/ odd multiples: add G /
2145	point_add(pre->g_pre_comp[`2` * i + `1`][`0`],
2146	pre->g_pre_comp[`2` * i + `1`][`1`],
2147	pre->g_pre_comp[`2` * i + `1`][`2`], pre->g_pre_comp[`2` * i][`0`],
2148	pre->g_pre_comp[`2` * i][`1`], pre->g_pre_comp[`2` * i][`2`], `0`,
2149	pre->g_pre_comp[`1`][`0`], pre->g_pre_comp[`1`][`1`],
2150	pre->g_pre_comp[`1`][`2`]);
2151	}
2152	make_points_affine(`15`, &(pre->g_pre_comp[`1`]), tmp_felems);
2153
2154	done:
2155	SETPRECOMP(group, nistp521, pre);
2156	ret = `1`;
2157	pre = NULL;
2158	err:
2159	BN_CTX_end(ctx);
2160	EC_POINT_free(generator);
2161	#ifndef FIPS_MODE
2162	BN_CTX_free(new_ctx);
2163	#endif
2164	EC_nistp521_pre_comp_free(pre);
2165	return ret;
2166	}
2167
2168	int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2169	{
2170	return HAVEPRECOMP(group, nistp521);
2171	}
2172
2173	#endif
2174

/* Source: ClickHouse/contrib/openssl/crypto/ec/ecp_nistp521.c */