ecp_nistz256.c source code [ClickHouse/contrib/openssl/crypto/ec/ecp_nistz256.c]

1	/*
2	* Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3	* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4	* Copyright (c) 2015, CloudFlare, Inc.
5	*
6	* Licensed under the Apache License 2.0 (the "License"). You may not use
7	* this file except in compliance with the License. You can obtain a copy
8	* in the file LICENSE in the source distribution or at
9	* https://www.openssl.org/source/license.html
10	*
11	* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12	* (1) Intel Corporation, Israel Development Center, Haifa, Israel
13	* (2) University of Haifa, Israel
14	* (3) CloudFlare, Inc.
15	*
16	* Reference:
17	* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18	* 256 Bit Primes"
19	*/
20
21	#include <string.h>
22
23	#include "internal/cryptlib.h"
24	#include "crypto/bn.h"
25	#include "ec_local.h"
26	#include "internal/refcount.h"
27
28	#if BN_BITS2 != 64
29	# define TOBN(hi,lo) lo,hi
30	#else
31	# define TOBN(hi,lo) ((BN_ULONG)hi<<32\|lo)
32	#endif
33
34	#if defined(__GNUC__)
35	# define ALIGN32 __attribute((aligned(32)))
36	#elif defined(_MSC_VER)
37	# define ALIGN32 __declspec(align(32))
38	#else
39	# define ALIGN32
40	#endif
41
42	#define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N)
43	#define P256_LIMBS (256/BN_BITS2)
44
45	typedef unsigned short u16;
46
47	typedef struct {
48	BN_ULONG X[P256_LIMBS];
49	BN_ULONG Y[P256_LIMBS];
50	BN_ULONG Z[P256_LIMBS];
51	} P256_POINT;
52
53	typedef struct {
54	BN_ULONG X[P256_LIMBS];
55	BN_ULONG Y[P256_LIMBS];
56	} P256_POINT_AFFINE;
57
58	typedef P256_POINT_AFFINE PRECOMP256_ROW[`64`];
59
60	/ structure for precomputed multiples of the generator /
61	struct nistz256_pre_comp_st {
62	const EC_GROUP group; /* Parent EC_GROUP object /
63	size_t w; / Window size /
64	/*
65	* Constant time access to the X and Y coordinates of the pre-computed,
66	* generator multiplies, in the Montgomery domain. Pre-calculated
67	* multiplies are stored in affine form.
68	*/
69	PRECOMP256_ROW *precomp;
70	void *precomp_storage;
71	CRYPTO_REF_COUNT references;
72	CRYPTO_RWLOCK *lock;
73	};
74
75	/ Functions implemented in assembly /
76	/*
77	* Most of below mentioned functions preserve the property of inputs
78	* being fully reduced, i.e. being in [0, modulus) range. Simply put if
79	* inputs are fully reduced, then output is too. Note that reverse is
80	* not true, in sense that given partially reduced inputs output can be
81	* either, not unlikely reduced. And "most" in first sentence refers to
82	* the fact that given the calculations flow one can tolerate that
83	* addition, 1st function below, produces partially reduced result if
84	* multiplications by 2 and 3, which customarily use addition, fully
85	* reduce it. This effectively gives two options: a) addition produces
86	* fully reduced result [as long as inputs are, just like remaining
87	* functions]; b) addition is allowed to produce partially reduced
88	* result, but multiplications by 2 and 3 perform additional reduction
89	* step. Choice between the two can be platform-specific, but it was a)
90	* in all cases so far...
91	*/
92	/ Modular add: res = a+b mod P /
93	void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
94	const BN_ULONG a[P256_LIMBS],
95	const BN_ULONG b[P256_LIMBS]);
96	/ Modular mul by 2: res = 2a mod P /*
97	void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
98	const BN_ULONG a[P256_LIMBS]);
99	/ Modular mul by 3: res = 3a mod P /*
100	void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
101	const BN_ULONG a[P256_LIMBS]);
102
103	/ Modular div by 2: res = a/2 mod P /
104	void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
105	const BN_ULONG a[P256_LIMBS]);
106	/ Modular sub: res = a-b mod P /
107	void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
108	const BN_ULONG a[P256_LIMBS],
109	const BN_ULONG b[P256_LIMBS]);
110	/ Modular neg: res = -a mod P /
111	void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
112	/ Montgomery mul: res = ab2^-256 mod P /
113	void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
114	const BN_ULONG a[P256_LIMBS],
115	const BN_ULONG b[P256_LIMBS]);
116	/ Montgomery sqr: res = aa2^-256 mod P /
117	void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
118	const BN_ULONG a[P256_LIMBS]);
119	/ Convert a number from Montgomery domain, by multiplying with 1 /
120	void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
121	const BN_ULONG in[P256_LIMBS]);
122	/ Convert a number to Montgomery domain, by multiplying with 2^512 mod P/
123	void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
124	const BN_ULONG in[P256_LIMBS]);
125	/ Functions that perform constant time access to the precomputed tables /
126	void ecp_nistz256_scatter_w5(P256_POINT *val,
127	const P256_POINT in_t, int* idx);
128	void ecp_nistz256_gather_w5(P256_POINT *val,
129	const P256_POINT in_t, int* idx);
130	void ecp_nistz256_scatter_w7(P256_POINT_AFFINE *val,
131	const P256_POINT_AFFINE in_t, int* idx);
132	void ecp_nistz256_gather_w7(P256_POINT_AFFINE *val,
133	const P256_POINT_AFFINE in_t, int* idx);
134
135	/ One converted into the Montgomery domain /
136	static const BN_ULONG ONE[P256_LIMBS] = {
137	TOBN(`0x00000000`, `0x00000001`), TOBN(`0xffffffff`, `0x00000000`),
138	TOBN(`0xffffffff`, `0xffffffff`), TOBN(`0x00000000`, `0xfffffffe`)
139	};
140
141	static NISTZ256_PRE_COMP ecp_nistz256_pre_comp_new(const* EC_GROUP *group);
142
143	/ Precomputed tables for the default generator /
144	extern const PRECOMP256_ROW ecp_nistz256_precomputed[`37`];
145
146	/ Recode window to a signed digit, see ecp_nistputil.c for details /
147	static unsigned int _booth_recode_w5(unsigned int in)
148	{
149	unsigned int s, d;
150
151	s = ~((in >> `5`) - `1`);
152	d = (`1` << `6`) - in - `1`;
153	d = (d & s) \| (in & ~s);
154	d = (d >> `1`) + (d & `1`);
155
156	return (d << `1`) + (s & `1`);
157	}
158
159	static unsigned int _booth_recode_w7(unsigned int in)
160	{
161	unsigned int s, d;
162
163	s = ~((in >> `7`) - `1`);
164	d = (`1` << `8`) - in - `1`;
165	d = (d & s) \| (in & ~s);
166	d = (d >> `1`) + (d & `1`);
167
168	return (d << `1`) + (s & `1`);
169	}
170
171	static void copy_conditional(BN_ULONG dst[P256_LIMBS],
172	const BN_ULONG src[P256_LIMBS], BN_ULONG move)
173	{
174	BN_ULONG mask1 = `0`-move;
175	BN_ULONG mask2 = ~mask1;
176
177	dst[`0`] = (src[`0`] & mask1) ^ (dst[`0`] & mask2);
178	dst[`1`] = (src[`1`] & mask1) ^ (dst[`1`] & mask2);
179	dst[`2`] = (src[`2`] & mask1) ^ (dst[`2`] & mask2);
180	dst[`3`] = (src[`3`] & mask1) ^ (dst[`3`] & mask2);
181	if (P256_LIMBS == `8`) {
182	dst[`4`] = (src[`4`] & mask1) ^ (dst[`4`] & mask2);
183	dst[`5`] = (src[`5`] & mask1) ^ (dst[`5`] & mask2);
184	dst[`6`] = (src[`6`] & mask1) ^ (dst[`6`] & mask2);
185	dst[`7`] = (src[`7`] & mask1) ^ (dst[`7`] & mask2);
186	}
187	}
188
189	static BN_ULONG is_zero(BN_ULONG in)
190	{
191	in \|= (`0` - in);
192	in = ~in;
193	in >>= BN_BITS2 - `1`;
194	return in;
195	}
196
197	static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS],
198	const BN_ULONG b[P256_LIMBS])
199	{
200	BN_ULONG res;
201
202	res = a[`0`] ^ b[`0`];
203	res \|= a[`1`] ^ b[`1`];
204	res \|= a[`2`] ^ b[`2`];
205	res \|= a[`3`] ^ b[`3`];
206	if (P256_LIMBS == `8`) {
207	res \|= a[`4`] ^ b[`4`];
208	res \|= a[`5`] ^ b[`5`];
209	res \|= a[`6`] ^ b[`6`];
210	res \|= a[`7`] ^ b[`7`];
211	}
212
213	return is_zero(res);
214	}
215
216	static BN_ULONG is_one(const BIGNUM *z)
217	{
218	BN_ULONG res = `0`;
219	BN_ULONG *a = bn_get_words(z);
220
221	if (bn_get_top(z) == (P256_LIMBS - P256_LIMBS / `8`)) {
222	res = a[`0`] ^ ONE[`0`];
223	res \|= a[`1`] ^ ONE[`1`];
224	res \|= a[`2`] ^ ONE[`2`];
225	res \|= a[`3`] ^ ONE[`3`];
226	if (P256_LIMBS == `8`) {
227	res \|= a[`4`] ^ ONE[`4`];
228	res \|= a[`5`] ^ ONE[`5`];
229	res \|= a[`6`] ^ ONE[`6`];
230	/*
231	* no check for a[7] (being zero) on 32-bit platforms,
232	* because value of "one" takes only 7 limbs.
233	*/
234	}
235	res = is_zero(res);
236	}
237
238	return res;
239	}
240
241	/*
242	* For reference, this macro is used only when new ecp_nistz256 assembly
243	* module is being developed. For example, configure with
244	* -DECP_NISTZ256_REFERENCE_IMPLEMENTATION and implement only functions
245	* performing simplest arithmetic operations on 256-bit vectors. Then
246	* work on implementation of higher-level functions performing point
247	* operations. Then remove ECP_NISTZ256_REFERENCE_IMPLEMENTATION
248	* and never define it again. (The correct macro denoting presence of
249	* ecp_nistz256 module is ECP_NISTZ256_ASM.)
250	*/
251	#ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION
252	void ecp_nistz256_point_double(P256_POINT r, const* P256_POINT *a);
253	void ecp_nistz256_point_add(P256_POINT *r,
254	const P256_POINT a, const* P256_POINT *b);
255	void ecp_nistz256_point_add_affine(P256_POINT *r,
256	const P256_POINT *a,
257	const P256_POINT_AFFINE *b);
258	#else
259	/ Point double: r = 2a /*
260	static void ecp_nistz256_point_double(P256_POINT r, const* P256_POINT *a)
261	{
262	BN_ULONG S[P256_LIMBS];
263	BN_ULONG M[P256_LIMBS];
264	BN_ULONG Zsqr[P256_LIMBS];
265	BN_ULONG tmp0[P256_LIMBS];
266
267	const BN_ULONG *in_x = a->X;
268	const BN_ULONG *in_y = a->Y;
269	const BN_ULONG *in_z = a->Z;
270
271	BN_ULONG *res_x = r->X;
272	BN_ULONG *res_y = r->Y;
273	BN_ULONG *res_z = r->Z;
274
275	ecp_nistz256_mul_by_2(S, in_y);
276
277	ecp_nistz256_sqr_mont(Zsqr, in_z);
278
279	ecp_nistz256_sqr_mont(S, S);
280
281	ecp_nistz256_mul_mont(res_z, in_z, in_y);
282	ecp_nistz256_mul_by_2(res_z, res_z);
283
284	ecp_nistz256_add(M, in_x, Zsqr);
285	ecp_nistz256_sub(Zsqr, in_x, Zsqr);
286
287	ecp_nistz256_sqr_mont(res_y, S);
288	ecp_nistz256_div_by_2(res_y, res_y);
289
290	ecp_nistz256_mul_mont(M, M, Zsqr);
291	ecp_nistz256_mul_by_3(M, M);
292
293	ecp_nistz256_mul_mont(S, S, in_x);
294	ecp_nistz256_mul_by_2(tmp0, S);
295
296	ecp_nistz256_sqr_mont(res_x, M);
297
298	ecp_nistz256_sub(res_x, res_x, tmp0);
299	ecp_nistz256_sub(S, S, res_x);
300
301	ecp_nistz256_mul_mont(S, S, M);
302	ecp_nistz256_sub(res_y, S, res_y);
303	}
304
305	/ Point addition: r = a+b /
306	static void ecp_nistz256_point_add(P256_POINT *r,
307	const P256_POINT a, const* P256_POINT *b)
308	{
309	BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
310	BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS];
311	BN_ULONG Z1sqr[P256_LIMBS];
312	BN_ULONG Z2sqr[P256_LIMBS];
313	BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
314	BN_ULONG Hsqr[P256_LIMBS];
315	BN_ULONG Rsqr[P256_LIMBS];
316	BN_ULONG Hcub[P256_LIMBS];
317
318	BN_ULONG res_x[P256_LIMBS];
319	BN_ULONG res_y[P256_LIMBS];
320	BN_ULONG res_z[P256_LIMBS];
321
322	BN_ULONG in1infty, in2infty;
323
324	const BN_ULONG *in1_x = a->X;
325	const BN_ULONG *in1_y = a->Y;
326	const BN_ULONG *in1_z = a->Z;
327
328	const BN_ULONG *in2_x = b->X;
329	const BN_ULONG *in2_y = b->Y;
330	const BN_ULONG *in2_z = b->Z;
331
332	/*
333	* Infinity in encoded as (,,0)
334	*/
335	in1infty = (in1_z[`0`] \| in1_z[`1`] \| in1_z[`2`] \| in1_z[`3`]);
336	if (P256_LIMBS == `8`)
337	in1infty \|= (in1_z[`4`] \| in1_z[`5`] \| in1_z[`6`] \| in1_z[`7`]);
338
339	in2infty = (in2_z[`0`] \| in2_z[`1`] \| in2_z[`2`] \| in2_z[`3`]);
340	if (P256_LIMBS == `8`)
341	in2infty \|= (in2_z[`4`] \| in2_z[`5`] \| in2_z[`6`] \| in2_z[`7`]);
342
343	in1infty = is_zero(in1infty);
344	in2infty = is_zero(in2infty);
345
346	ecp_nistz256_sqr_mont(Z2sqr, in2_z); / Z2^2 /
347	ecp_nistz256_sqr_mont(Z1sqr, in1_z); / Z1^2 /
348
349	ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); / S1 = Z2^3 /
350	ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); / S2 = Z1^3 /
351
352	ecp_nistz256_mul_mont(S1, S1, in1_y); / S1 = Y1Z2^3 /*
353	ecp_nistz256_mul_mont(S2, S2, in2_y); / S2 = Y2Z1^3 /*
354	ecp_nistz256_sub(R, S2, S1); / R = S2 - S1 /
355
356	ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); / U1 = X1Z2^2 /*
357	ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); / U2 = X2Z1^2 /*
358	ecp_nistz256_sub(H, U2, U1); / H = U2 - U1 /
359
360	/*
361	* This should not happen during sign/ecdh, so no constant time violation
362	*/
363	if (is_equal(U1, U2) && !in1infty && !in2infty) {
364	if (is_equal(S1, S2)) {
365	ecp_nistz256_point_double(r, a);
366	return;
367	} else {
368	memset(r, `0`, sizeof(*r));
369	return;
370	}
371	}
372
373	ecp_nistz256_sqr_mont(Rsqr, R); / R^2 /
374	ecp_nistz256_mul_mont(res_z, H, in1_z); / Z3 = HZ1Z2 /
375	ecp_nistz256_sqr_mont(Hsqr, H); / H^2 /
376	ecp_nistz256_mul_mont(res_z, res_z, in2_z); / Z3 = HZ1Z2 /
377	ecp_nistz256_mul_mont(Hcub, Hsqr, H); / H^3 /
378
379	ecp_nistz256_mul_mont(U2, U1, Hsqr); / U1H^2 /*
380	ecp_nistz256_mul_by_2(Hsqr, U2); / 2U1H^2 /
381
382	ecp_nistz256_sub(res_x, Rsqr, Hsqr);
383	ecp_nistz256_sub(res_x, res_x, Hcub);
384
385	ecp_nistz256_sub(res_y, U2, res_x);
386
387	ecp_nistz256_mul_mont(S2, S1, Hcub);
388	ecp_nistz256_mul_mont(res_y, R, res_y);
389	ecp_nistz256_sub(res_y, res_y, S2);
390
391	copy_conditional(res_x, in2_x, in1infty);
392	copy_conditional(res_y, in2_y, in1infty);
393	copy_conditional(res_z, in2_z, in1infty);
394
395	copy_conditional(res_x, in1_x, in2infty);
396	copy_conditional(res_y, in1_y, in2infty);
397	copy_conditional(res_z, in1_z, in2infty);
398
399	memcpy(r->X, res_x, sizeof(res_x));
400	memcpy(r->Y, res_y, sizeof(res_y));
401	memcpy(r->Z, res_z, sizeof(res_z));
402	}
403
404	/ Point addition when b is known to be affine: r = a+b /
405	static void ecp_nistz256_point_add_affine(P256_POINT *r,
406	const P256_POINT *a,
407	const P256_POINT_AFFINE *b)
408	{
409	BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
410	BN_ULONG Z1sqr[P256_LIMBS];
411	BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
412	BN_ULONG Hsqr[P256_LIMBS];
413	BN_ULONG Rsqr[P256_LIMBS];
414	BN_ULONG Hcub[P256_LIMBS];
415
416	BN_ULONG res_x[P256_LIMBS];
417	BN_ULONG res_y[P256_LIMBS];
418	BN_ULONG res_z[P256_LIMBS];
419
420	BN_ULONG in1infty, in2infty;
421
422	const BN_ULONG *in1_x = a->X;
423	const BN_ULONG *in1_y = a->Y;
424	const BN_ULONG *in1_z = a->Z;
425
426	const BN_ULONG *in2_x = b->X;
427	const BN_ULONG *in2_y = b->Y;
428
429	/*
430	* Infinity in encoded as (,,0)
431	*/
432	in1infty = (in1_z[`0`] \| in1_z[`1`] \| in1_z[`2`] \| in1_z[`3`]);
433	if (P256_LIMBS == `8`)
434	in1infty \|= (in1_z[`4`] \| in1_z[`5`] \| in1_z[`6`] \| in1_z[`7`]);
435
436	/*
437	* In affine representation we encode infinity as (0,0), which is
438	* not on the curve, so it is OK
439	*/
440	in2infty = (in2_x[`0`] \| in2_x[`1`] \| in2_x[`2`] \| in2_x[`3`] \|
441	in2_y[`0`] \| in2_y[`1`] \| in2_y[`2`] \| in2_y[`3`]);
442	if (P256_LIMBS == `8`)
443	in2infty \|= (in2_x[`4`] \| in2_x[`5`] \| in2_x[`6`] \| in2_x[`7`] \|
444	in2_y[`4`] \| in2_y[`5`] \| in2_y[`6`] \| in2_y[`7`]);
445
446	in1infty = is_zero(in1infty);
447	in2infty = is_zero(in2infty);
448
449	ecp_nistz256_sqr_mont(Z1sqr, in1_z); / Z1^2 /
450
451	ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); / U2 = X2Z1^2 /*
452	ecp_nistz256_sub(H, U2, in1_x); / H = U2 - U1 /
453
454	ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); / S2 = Z1^3 /
455
456	ecp_nistz256_mul_mont(res_z, H, in1_z); / Z3 = HZ1Z2 /
457
458	ecp_nistz256_mul_mont(S2, S2, in2_y); / S2 = Y2Z1^3 /*
459	ecp_nistz256_sub(R, S2, in1_y); / R = S2 - S1 /
460
461	ecp_nistz256_sqr_mont(Hsqr, H); / H^2 /
462	ecp_nistz256_sqr_mont(Rsqr, R); / R^2 /
463	ecp_nistz256_mul_mont(Hcub, Hsqr, H); / H^3 /
464
465	ecp_nistz256_mul_mont(U2, in1_x, Hsqr); / U1H^2 /*
466	ecp_nistz256_mul_by_2(Hsqr, U2); / 2U1H^2 /
467
468	ecp_nistz256_sub(res_x, Rsqr, Hsqr);
469	ecp_nistz256_sub(res_x, res_x, Hcub);
470	ecp_nistz256_sub(H, U2, res_x);
471
472	ecp_nistz256_mul_mont(S2, in1_y, Hcub);
473	ecp_nistz256_mul_mont(H, H, R);
474	ecp_nistz256_sub(res_y, H, S2);
475
476	copy_conditional(res_x, in2_x, in1infty);
477	copy_conditional(res_x, in1_x, in2infty);
478
479	copy_conditional(res_y, in2_y, in1infty);
480	copy_conditional(res_y, in1_y, in2infty);
481
482	copy_conditional(res_z, ONE, in1infty);
483	copy_conditional(res_z, in1_z, in2infty);
484
485	memcpy(r->X, res_x, sizeof(res_x));
486	memcpy(r->Y, res_y, sizeof(res_y));
487	memcpy(r->Z, res_z, sizeof(res_z));
488	}
489	#endif
490
491	/ r = in^-1 mod p /
492	static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS],
493	const BN_ULONG in[P256_LIMBS])
494	{
495	/*
496	* The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff
497	* ffffffff ffffffff We use FLT and used poly-2 as exponent
498	*/
499	BN_ULONG p2[P256_LIMBS];
500	BN_ULONG p4[P256_LIMBS];
501	BN_ULONG p8[P256_LIMBS];
502	BN_ULONG p16[P256_LIMBS];
503	BN_ULONG p32[P256_LIMBS];
504	BN_ULONG res[P256_LIMBS];
505	int i;
506
507	ecp_nistz256_sqr_mont(res, in);
508	ecp_nistz256_mul_mont(p2, res, in); / 3p /*
509
510	ecp_nistz256_sqr_mont(res, p2);
511	ecp_nistz256_sqr_mont(res, res);
512	ecp_nistz256_mul_mont(p4, res, p2); / fp /*
513
514	ecp_nistz256_sqr_mont(res, p4);
515	ecp_nistz256_sqr_mont(res, res);
516	ecp_nistz256_sqr_mont(res, res);
517	ecp_nistz256_sqr_mont(res, res);
518	ecp_nistz256_mul_mont(p8, res, p4); / ffp /*
519
520	ecp_nistz256_sqr_mont(res, p8);
521	for (i = `0`; i < `7`; i++)
522	ecp_nistz256_sqr_mont(res, res);
523	ecp_nistz256_mul_mont(p16, res, p8); / ffffp /*
524
525	ecp_nistz256_sqr_mont(res, p16);
526	for (i = `0`; i < `15`; i++)
527	ecp_nistz256_sqr_mont(res, res);
528	ecp_nistz256_mul_mont(p32, res, p16); / ffffffffp /*
529
530	ecp_nistz256_sqr_mont(res, p32);
531	for (i = `0`; i < `31`; i++)
532	ecp_nistz256_sqr_mont(res, res);
533	ecp_nistz256_mul_mont(res, res, in);
534
535	for (i = `0`; i < `32` * `4`; i++)
536	ecp_nistz256_sqr_mont(res, res);
537	ecp_nistz256_mul_mont(res, res, p32);
538
539	for (i = `0`; i < `32`; i++)
540	ecp_nistz256_sqr_mont(res, res);
541	ecp_nistz256_mul_mont(res, res, p32);
542
543	for (i = `0`; i < `16`; i++)
544	ecp_nistz256_sqr_mont(res, res);
545	ecp_nistz256_mul_mont(res, res, p16);
546
547	for (i = `0`; i < `8`; i++)
548	ecp_nistz256_sqr_mont(res, res);
549	ecp_nistz256_mul_mont(res, res, p8);
550
551	ecp_nistz256_sqr_mont(res, res);
552	ecp_nistz256_sqr_mont(res, res);
553	ecp_nistz256_sqr_mont(res, res);
554	ecp_nistz256_sqr_mont(res, res);
555	ecp_nistz256_mul_mont(res, res, p4);
556
557	ecp_nistz256_sqr_mont(res, res);
558	ecp_nistz256_sqr_mont(res, res);
559	ecp_nistz256_mul_mont(res, res, p2);
560
561	ecp_nistz256_sqr_mont(res, res);
562	ecp_nistz256_sqr_mont(res, res);
563	ecp_nistz256_mul_mont(res, res, in);
564
565	memcpy(r, res, sizeof(res));
566	}
567
568	/*
569	* ecp_nistz256_bignum_to_field_elem copies the contents of \|in\| to \|out\| and
570	* returns one if it fits. Otherwise it returns zero.
571	*/
572	__owur static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS],
573	const BIGNUM *in)
574	{
575	return bn_copy_words(out, in, P256_LIMBS);
576	}
577
578	/ r = sum(scalar[i]point[i]) /*
579	__owur static int ecp_nistz256_windowed_mul(const EC_GROUP *group,
580	P256_POINT *r,
581	const BIGNUM **scalar,
582	const EC_POINT **point,
583	size_t num, BN_CTX *ctx)
584	{
585	size_t i;
586	int j, ret = `0`;
587	unsigned int idx;
588	unsigned char (*p_str)[`33`] = NULL;
589	const unsigned int window_size = `5`;
590	const unsigned int mask = (`1` << (window_size + `1`)) - `1`;
591	unsigned int wvalue;
592	P256_POINT temp; /* place for 5 temporary points /
593	const BIGNUM **scalars = NULL;
594	P256_POINT (*table)[`16`] = NULL;
595	void *table_storage = NULL;
596
597	if ((num * `16` + `6`) > OPENSSL_MALLOC_MAX_NELEMS(P256_POINT)
598	\|\| (table_storage =
599	OPENSSL_malloc((num * `16` + `5`) * sizeof(P256_POINT) + `64`)) == NULL
600	\|\| (p_str =
601	OPENSSL_malloc(num * `33` * sizeof(unsigned char))) == NULL
602	\|\| (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) {
603	ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE);
604	goto err;
605	}
606
607	table = (void *)ALIGNPTR(table_storage, `64`);
608	temp = (P256_POINT *)(table + num);
609
610	for (i = `0`; i < num; i++) {
611	P256_POINT *row = table[i];
612
613	/ This is an unusual input, we don't guarantee constant-timeness. /
614	if ((BN_num_bits(scalar[i]) > `256`) \|\| BN_is_negative(scalar[i])) {
615	BIGNUM *mod;
616
617	if ((mod = BN_CTX_get(ctx)) == NULL)
618	goto err;
619	if (!BN_nnmod(mod, scalar[i], group->order, ctx)) {
620	ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB);
621	goto err;
622	}
623	scalars[i] = mod;
624	} else
625	scalars[i] = scalar[i];
626
627	for (j = `0`; j < bn_get_top(scalars[i]) * BN_BYTES; j += BN_BYTES) {
628	BN_ULONG d = bn_get_words(scalars[i])[j / BN_BYTES];
629
630	p_str[i][j + `0`] = (unsigned char)d;
631	p_str[i][j + `1`] = (unsigned char)(d >> `8`);
632	p_str[i][j + `2`] = (unsigned char)(d >> `16`);
633	p_str[i][j + `3`] = (unsigned char)(d >>= `24`);
634	if (BN_BYTES == `8`) {
635	d >>= `8`;
636	p_str[i][j + `4`] = (unsigned char)d;
637	p_str[i][j + `5`] = (unsigned char)(d >> `8`);
638	p_str[i][j + `6`] = (unsigned char)(d >> `16`);
639	p_str[i][j + `7`] = (unsigned char)(d >> `24`);
640	}
641	}
642	for (; j < `33`; j++)
643	p_str[i][j] = `0`;
644
645	if (!ecp_nistz256_bignum_to_field_elem(temp[`0`].X, point[i]->X)
646	\|\| !ecp_nistz256_bignum_to_field_elem(temp[`0`].Y, point[i]->Y)
647	\|\| !ecp_nistz256_bignum_to_field_elem(temp[`0`].Z, point[i]->Z)) {
648	ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL,
649	EC_R_COORDINATES_OUT_OF_RANGE);
650	goto err;
651	}
652
653	/*
654	* row[0] is implicitly (0,0,0) (the point at infinity), therefore it
655	* is not stored. All other values are actually stored with an offset
656	* of -1 in table.
657	*/
658
659	ecp_nistz256_scatter_w5 (row, &temp[`0`], `1`);
660	ecp_nistz256_point_double(&temp[`1`], &temp[`0`]); /1+1=2 /
661	ecp_nistz256_scatter_w5 (row, &temp[`1`], `2`);
662	ecp_nistz256_point_add (&temp[`2`], &temp[`1`], &temp[`0`]); /2+1=3 /
663	ecp_nistz256_scatter_w5 (row, &temp[`2`], `3`);
664	ecp_nistz256_point_double(&temp[`1`], &temp[`1`]); /22=4 /*
665	ecp_nistz256_scatter_w5 (row, &temp[`1`], `4`);
666	ecp_nistz256_point_double(&temp[`2`], &temp[`2`]); /23=6 /*
667	ecp_nistz256_scatter_w5 (row, &temp[`2`], `6`);
668	ecp_nistz256_point_add (&temp[`3`], &temp[`1`], &temp[`0`]); /4+1=5 /
669	ecp_nistz256_scatter_w5 (row, &temp[`3`], `5`);
670	ecp_nistz256_point_add (&temp[`4`], &temp[`2`], &temp[`0`]); /6+1=7 /
671	ecp_nistz256_scatter_w5 (row, &temp[`4`], `7`);
672	ecp_nistz256_point_double(&temp[`1`], &temp[`1`]); /24=8 /*
673	ecp_nistz256_scatter_w5 (row, &temp[`1`], `8`);
674	ecp_nistz256_point_double(&temp[`2`], &temp[`2`]); /26=12 /*
675	ecp_nistz256_scatter_w5 (row, &temp[`2`], `12`);
676	ecp_nistz256_point_double(&temp[`3`], &temp[`3`]); /25=10 /*
677	ecp_nistz256_scatter_w5 (row, &temp[`3`], `10`);
678	ecp_nistz256_point_double(&temp[`4`], &temp[`4`]); /27=14 /*
679	ecp_nistz256_scatter_w5 (row, &temp[`4`], `14`);
680	ecp_nistz256_point_add (&temp[`2`], &temp[`2`], &temp[`0`]); /12+1=13/
681	ecp_nistz256_scatter_w5 (row, &temp[`2`], `13`);
682	ecp_nistz256_point_add (&temp[`3`], &temp[`3`], &temp[`0`]); /10+1=11/
683	ecp_nistz256_scatter_w5 (row, &temp[`3`], `11`);
684	ecp_nistz256_point_add (&temp[`4`], &temp[`4`], &temp[`0`]); /14+1=15/
685	ecp_nistz256_scatter_w5 (row, &temp[`4`], `15`);
686	ecp_nistz256_point_add (&temp[`2`], &temp[`1`], &temp[`0`]); /8+1=9 /
687	ecp_nistz256_scatter_w5 (row, &temp[`2`], `9`);
688	ecp_nistz256_point_double(&temp[`1`], &temp[`1`]); /28=16 /*
689	ecp_nistz256_scatter_w5 (row, &temp[`1`], `16`);
690	}
691
692	idx = `255`;
693
694	wvalue = p_str[`0`][(idx - `1`) / `8`];
695	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
696
697	/*
698	* We gather to temp[0], because we know it's position relative
699	* to table
700	*/
701	ecp_nistz256_gather_w5(&temp[`0`], table[`0`], _booth_recode_w5(wvalue) >> `1`);
702	memcpy(r, &temp[`0`], sizeof(temp[`0`]));
703
704	while (idx >= `5`) {
705	for (i = (idx == `255` ? `1` : `0`); i < num; i++) {
706	unsigned int off = (idx - `1`) / `8`;
707
708	wvalue = p_str[i][off] \| p_str[i][off + `1`] << `8`;
709	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
710
711	wvalue = _booth_recode_w5(wvalue);
712
713	ecp_nistz256_gather_w5(&temp[`0`], table[i], wvalue >> `1`);
714
715	ecp_nistz256_neg(temp[`1`].Y, temp[`0`].Y);
716	copy_conditional(temp[`0`].Y, temp[`1`].Y, (wvalue & `1`));
717
718	ecp_nistz256_point_add(r, r, &temp[`0`]);
719	}
720
721	idx -= window_size;
722
723	ecp_nistz256_point_double(r, r);
724	ecp_nistz256_point_double(r, r);
725	ecp_nistz256_point_double(r, r);
726	ecp_nistz256_point_double(r, r);
727	ecp_nistz256_point_double(r, r);
728	}
729
730	/ Final window /
731	for (i = `0`; i < num; i++) {
732	wvalue = p_str[i][`0`];
733	wvalue = (wvalue << `1`) & mask;
734
735	wvalue = _booth_recode_w5(wvalue);
736
737	ecp_nistz256_gather_w5(&temp[`0`], table[i], wvalue >> `1`);
738
739	ecp_nistz256_neg(temp[`1`].Y, temp[`0`].Y);
740	copy_conditional(temp[`0`].Y, temp[`1`].Y, wvalue & `1`);
741
742	ecp_nistz256_point_add(r, r, &temp[`0`]);
743	}
744
745	ret = `1`;
746	err:
747	OPENSSL_free(table_storage);
748	OPENSSL_free(p_str);
749	OPENSSL_free(scalars);
750	return ret;
751	}
752
753	/ Coordinates of G, for which we have precomputed tables /
754	static const BN_ULONG def_xG[P256_LIMBS] = {
755	TOBN(`0x79e730d4`, `0x18a9143c`), TOBN(`0x75ba95fc`, `0x5fedb601`),
756	TOBN(`0x79fb732b`, `0x77622510`), TOBN(`0x18905f76`, `0xa53755c6`)
757	};
758
759	static const BN_ULONG def_yG[P256_LIMBS] = {
760	TOBN(`0xddf25357`, `0xce95560a`), TOBN(`0x8b4ab8e4`, `0xba19e45c`),
761	TOBN(`0xd2e88688`, `0xdd21f325`), TOBN(`0x8571ff18`, `0x25885d85`)
762	};
763
764	/*
765	* ecp_nistz256_is_affine_G returns one if \|generator\| is the standard, P-256
766	* generator.
767	*/
768	static int ecp_nistz256_is_affine_G(const EC_POINT *generator)
769	{
770	return (bn_get_top(generator->X) == P256_LIMBS) &&
771	(bn_get_top(generator->Y) == P256_LIMBS) &&
772	is_equal(bn_get_words(generator->X), def_xG) &&
773	is_equal(bn_get_words(generator->Y), def_yG) &&
774	is_one(generator->Z);
775	}
776
777	__owur static int ecp_nistz256_mult_precompute(EC_GROUP group, BN_CTX ctx)
778	{
779	/*
780	* We precompute a table for a Booth encoded exponent (wNAF) based
781	* computation. Each table holds 64 values for safe access, with an
782	* implicit value of infinity at index zero. We use window of size 7, and
783	* therefore require ceil(256/7) = 37 tables.
784	*/
785	const BIGNUM *order;
786	EC_POINT P = NULL, T = NULL;
787	const EC_POINT *generator;
788	NISTZ256_PRE_COMP *pre_comp;
789	BN_CTX *new_ctx = NULL;
790	int i, j, k, ret = `0`;
791	size_t w;
792
793	PRECOMP256_ROW *preComputedTable = NULL;
794	unsigned char *precomp_storage = NULL;
795
796	/ if there is an old NISTZ256_PRE_COMP object, throw it away /
797	EC_pre_comp_free(group);
798	generator = EC_GROUP_get0_generator(group);
799	if (generator == NULL) {
800	ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR);
801	return `0`;
802	}
803
804	if (ecp_nistz256_is_affine_G(generator)) {
805	/*
806	* No need to calculate tables for the standard generator because we
807	* have them statically.
808	*/
809	return `1`;
810	}
811
812	if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL)
813	return `0`;
814
815	if (ctx == NULL) {
816	ctx = new_ctx = BN_CTX_new_ex(group->libctx);
817	if (ctx == NULL)
818	goto err;
819	}
820
821	BN_CTX_start(ctx);
822
823	order = EC_GROUP_get0_order(group);
824	if (order == NULL)
825	goto err;
826
827	if (BN_is_zero(order)) {
828	ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER);
829	goto err;
830	}
831
832	w = `7`;
833
834	if ((precomp_storage =
835	OPENSSL_malloc(`37` * `64` * sizeof(P256_POINT_AFFINE) + `64`)) == NULL) {
836	ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE);
837	goto err;
838	}
839
840	preComputedTable = (void *)ALIGNPTR(precomp_storage, `64`);
841
842	P = EC_POINT_new(group);
843	T = EC_POINT_new(group);
844	if (P == NULL \|\| T == NULL)
845	goto err;
846
847	/*
848	* The zero entry is implicitly infinity, and we skip it, storing other
849	* values with -1 offset.
850	*/
851	if (!EC_POINT_copy(T, generator))
852	goto err;
853
854	for (k = `0`; k < `64`; k++) {
855	if (!EC_POINT_copy(P, T))
856	goto err;
857	for (j = `0`; j < `37`; j++) {
858	P256_POINT_AFFINE temp;
859	/*
860	* It would be faster to use EC_POINTs_make_affine and
861	* make multiple points affine at the same time.
862	*/
863	if (!EC_POINT_make_affine(group, P, ctx))
864	goto err;
865	if (!ecp_nistz256_bignum_to_field_elem(temp.X, P->X) \|\|
866	!ecp_nistz256_bignum_to_field_elem(temp.Y, P->Y)) {
867	ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE,
868	EC_R_COORDINATES_OUT_OF_RANGE);
869	goto err;
870	}
871	ecp_nistz256_scatter_w7(preComputedTable[j], &temp, k);
872	for (i = `0`; i < `7`; i++) {
873	if (!EC_POINT_dbl(group, P, P, ctx))
874	goto err;
875	}
876	}
877	if (!EC_POINT_add(group, T, T, generator, ctx))
878	goto err;
879	}
880
881	pre_comp->group = group;
882	pre_comp->w = w;
883	pre_comp->precomp = preComputedTable;
884	pre_comp->precomp_storage = precomp_storage;
885	precomp_storage = NULL;
886	SETPRECOMP(group, nistz256, pre_comp);
887	pre_comp = NULL;
888	ret = `1`;
889
890	err:
891	BN_CTX_end(ctx);
892	BN_CTX_free(new_ctx);
893
894	EC_nistz256_pre_comp_free(pre_comp);
895	OPENSSL_free(precomp_storage);
896	EC_POINT_free(P);
897	EC_POINT_free(T);
898	return ret;
899	}
900
901	/*
902	* Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great
903	* code processing 4 points in parallel, corresponding serial operation
904	* is several times slower, because it uses 29x29=58-bit multiplication
905	* as opposite to 64x64=128-bit in integer-only scalar case. As result
906	* it doesn't provide significant performance improvement. Note that
907	* just defining ECP_NISTZ256_AVX2 is not sufficient to make it work,
908	* you'd need to compile even asm/ecp_nistz256-avx.pl module.
909	*/
910	#if defined(ECP_NISTZ256_AVX2)
911	# if !(defined(__x86_64) \|\| defined(__x86_64__) \|\| \
912	defined(_M_AMD64) \|\| defined(_M_X64)) \|\| \
913	!(defined(__GNUC__) \|\| defined(_MSC_VER)) /* this is for ALIGN32 */
914	# undef ECP_NISTZ256_AVX2
915	# else
916	/ Constant time access, loading four values, from four consecutive tables /
917	void ecp_nistz256_avx2_multi_gather_w7(void result, const* void *in,
918	int index0, int index1, int index2,
919	int index3);
920	void ecp_nistz256_avx2_transpose_convert(void RESULTx4, const* void *in);
921	void ecp_nistz256_avx2_convert_transpose_back(void result, const* void *Ax4);
922	void ecp_nistz256_avx2_point_add_affine_x4(void RESULTx4, const* void *Ax4,
923	const void *Bx4);
924	void ecp_nistz256_avx2_point_add_affines_x4(void RESULTx4, const* void *Ax4,
925	const void *Bx4);
926	void ecp_nistz256_avx2_to_mont(void RESULTx4, const* void *Ax4);
927	void ecp_nistz256_avx2_from_mont(void RESULTx4, const* void *Ax4);
928	void ecp_nistz256_avx2_set1(void *RESULTx4);
929	int ecp_nistz_avx2_eligible(void);
930
931	static void booth_recode_w7(unsigned char *sign,
932	unsigned char digit, unsigned* char in)
933	{
934	unsigned char s, d;
935
936	s = ~((in >> `7`) - `1`);
937	d = (`1` << `8`) - in - `1`;
938	d = (d & s) \| (in & ~s);
939	d = (d >> `1`) + (d & `1`);
940
941	*sign = s & `1`;
942	*digit = d;
943	}
944
945	/*
946	* ecp_nistz256_avx2_mul_g performs multiplication by G, using only the
947	* precomputed table. It does 4 affine point additions in parallel,
948	* significantly speeding up point multiplication for a fixed value.
949	*/
950	static void ecp_nistz256_avx2_mul_g(P256_POINT *r,
951	unsigned char p_str[`33`],
952	const P256_POINT_AFFINE(*preComputedTable)[`64`])
953	{
954	const unsigned int window_size = `7`;
955	const unsigned int mask = (`1` << (window_size + `1`)) - `1`;
956	unsigned int wvalue;
957	/ Using 4 windows at a time /
958	unsigned char sign0, digit0;
959	unsigned char sign1, digit1;
960	unsigned char sign2, digit2;
961	unsigned char sign3, digit3;
962	unsigned int idx = `0`;
963	BN_ULONG tmp[P256_LIMBS];
964	int i;
965
966	ALIGN32 BN_ULONG aX4[`4` * `9` * `3`] = { `0` };
967	ALIGN32 BN_ULONG bX4[`4` * `9` * `2`] = { `0` };
968	ALIGN32 P256_POINT_AFFINE point_arr[`4`];
969	ALIGN32 P256_POINT res_point_arr[`4`];
970
971	/ Initial four windows /
972	wvalue = ((u16 ) & p_str[`0`]);
973	wvalue = (wvalue << `1`) & mask;
974	idx += window_size;
975	booth_recode_w7(&sign0, &digit0, wvalue);
976	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
977	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
978	idx += window_size;
979	booth_recode_w7(&sign1, &digit1, wvalue);
980	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
981	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
982	idx += window_size;
983	booth_recode_w7(&sign2, &digit2, wvalue);
984	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
985	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
986	idx += window_size;
987	booth_recode_w7(&sign3, &digit3, wvalue);
988
989	ecp_nistz256_avx2_multi_gather_w7(point_arr, preComputedTable[`0`],
990	digit0, digit1, digit2, digit3);
991
992	ecp_nistz256_neg(tmp, point_arr[`0`].Y);
993	copy_conditional(point_arr[`0`].Y, tmp, sign0);
994	ecp_nistz256_neg(tmp, point_arr[`1`].Y);
995	copy_conditional(point_arr[`1`].Y, tmp, sign1);
996	ecp_nistz256_neg(tmp, point_arr[`2`].Y);
997	copy_conditional(point_arr[`2`].Y, tmp, sign2);
998	ecp_nistz256_neg(tmp, point_arr[`3`].Y);
999	copy_conditional(point_arr[`3`].Y, tmp, sign3);
1000
1001	ecp_nistz256_avx2_transpose_convert(aX4, point_arr);
1002	ecp_nistz256_avx2_to_mont(aX4, aX4);
1003	ecp_nistz256_avx2_to_mont(&aX4[`4` * `9`], &aX4[`4` * `9`]);
1004	ecp_nistz256_avx2_set1(&aX4[`4` * `9` * `2`]);
1005
1006	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1007	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1008	idx += window_size;
1009	booth_recode_w7(&sign0, &digit0, wvalue);
1010	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1011	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1012	idx += window_size;
1013	booth_recode_w7(&sign1, &digit1, wvalue);
1014	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1015	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1016	idx += window_size;
1017	booth_recode_w7(&sign2, &digit2, wvalue);
1018	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1019	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1020	idx += window_size;
1021	booth_recode_w7(&sign3, &digit3, wvalue);
1022
1023	ecp_nistz256_avx2_multi_gather_w7(point_arr, preComputedTable[`4` * `1`],
1024	digit0, digit1, digit2, digit3);
1025
1026	ecp_nistz256_neg(tmp, point_arr[`0`].Y);
1027	copy_conditional(point_arr[`0`].Y, tmp, sign0);
1028	ecp_nistz256_neg(tmp, point_arr[`1`].Y);
1029	copy_conditional(point_arr[`1`].Y, tmp, sign1);
1030	ecp_nistz256_neg(tmp, point_arr[`2`].Y);
1031	copy_conditional(point_arr[`2`].Y, tmp, sign2);
1032	ecp_nistz256_neg(tmp, point_arr[`3`].Y);
1033	copy_conditional(point_arr[`3`].Y, tmp, sign3);
1034
1035	ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
1036	ecp_nistz256_avx2_to_mont(bX4, bX4);
1037	ecp_nistz256_avx2_to_mont(&bX4[`4` * `9`], &bX4[`4` * `9`]);
1038	/ Optimized when both inputs are affine /
1039	ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4);
1040
1041	for (i = `2`; i < `9`; i++) {
1042	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1043	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1044	idx += window_size;
1045	booth_recode_w7(&sign0, &digit0, wvalue);
1046	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1047	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1048	idx += window_size;
1049	booth_recode_w7(&sign1, &digit1, wvalue);
1050	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1051	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1052	idx += window_size;
1053	booth_recode_w7(&sign2, &digit2, wvalue);
1054	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1055	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1056	idx += window_size;
1057	booth_recode_w7(&sign3, &digit3, wvalue);
1058
1059	ecp_nistz256_avx2_multi_gather_w7(point_arr,
1060	preComputedTable[`4` * i],
1061	digit0, digit1, digit2, digit3);
1062
1063	ecp_nistz256_neg(tmp, point_arr[`0`].Y);
1064	copy_conditional(point_arr[`0`].Y, tmp, sign0);
1065	ecp_nistz256_neg(tmp, point_arr[`1`].Y);
1066	copy_conditional(point_arr[`1`].Y, tmp, sign1);
1067	ecp_nistz256_neg(tmp, point_arr[`2`].Y);
1068	copy_conditional(point_arr[`2`].Y, tmp, sign2);
1069	ecp_nistz256_neg(tmp, point_arr[`3`].Y);
1070	copy_conditional(point_arr[`3`].Y, tmp, sign3);
1071
1072	ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
1073	ecp_nistz256_avx2_to_mont(bX4, bX4);
1074	ecp_nistz256_avx2_to_mont(&bX4[`4` * `9`], &bX4[`4` * `9`]);
1075
1076	ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4);
1077	}
1078
1079	ecp_nistz256_avx2_from_mont(&aX4[`4` * `9` * `0`], &aX4[`4` * `9` * `0`]);
1080	ecp_nistz256_avx2_from_mont(&aX4[`4` * `9` * `1`], &aX4[`4` * `9` * `1`]);
1081	ecp_nistz256_avx2_from_mont(&aX4[`4` * `9` * `2`], &aX4[`4` * `9` * `2`]);
1082
1083	ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4);
1084	/ Last window is performed serially /
1085	wvalue = ((u16 ) & p_str[(idx - `1`) / `8`]);
1086	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1087	booth_recode_w7(&sign0, &digit0, wvalue);
1088	ecp_nistz256_gather_w7((P256_POINT_AFFINE *)r,
1089	preComputedTable[`36`], digit0);
1090	ecp_nistz256_neg(tmp, r->Y);
1091	copy_conditional(r->Y, tmp, sign0);
1092	memcpy(r->Z, ONE, sizeof(ONE));
1093	/ Sum the four windows /
1094	ecp_nistz256_point_add(r, r, &res_point_arr[`0`]);
1095	ecp_nistz256_point_add(r, r, &res_point_arr[`1`]);
1096	ecp_nistz256_point_add(r, r, &res_point_arr[`2`]);
1097	ecp_nistz256_point_add(r, r, &res_point_arr[`3`]);
1098	}
1099	# endif
1100	#endif
1101
1102	__owur static int ecp_nistz256_set_from_affine(EC_POINT out, const* EC_GROUP *group,
1103	const P256_POINT_AFFINE *in,
1104	BN_CTX *ctx)
1105	{
1106	int ret = `0`;
1107
1108	if ((ret = bn_set_words(out->X, in->X, P256_LIMBS))
1109	&& (ret = bn_set_words(out->Y, in->Y, P256_LIMBS))
1110	&& (ret = bn_set_words(out->Z, ONE, P256_LIMBS)))
1111	out->Z_is_one = `1`;
1112
1113	return ret;
1114	}
1115
1116	/ r = scalarG + sum(scalars[i]points[i]) /
1117	__owur static int ecp_nistz256_points_mul(const EC_GROUP *group,
1118	EC_POINT *r,
1119	const BIGNUM *scalar,
1120	size_t num,
1121	const EC_POINT *points[],
1122	const BIGNUM scalars[], BN_CTX ctx)
1123	{
1124	int i = `0`, ret = `0`, no_precomp_for_generator = `0`, p_is_infinity = `0`;
1125	unsigned char p_str[`33`] = { `0` };
1126	const PRECOMP256_ROW *preComputedTable = NULL;
1127	const NISTZ256_PRE_COMP *pre_comp = NULL;
1128	const EC_POINT *generator = NULL;
1129	const BIGNUM **new_scalars = NULL;
1130	const EC_POINT **new_points = NULL;
1131	unsigned int idx = `0`;
1132	const unsigned int window_size = `7`;
1133	const unsigned int mask = (`1` << (window_size + `1`)) - `1`;
1134	unsigned int wvalue;
1135	ALIGN32 union {
1136	P256_POINT p;
1137	P256_POINT_AFFINE a;
1138	} t, p;
1139	BIGNUM *tmp_scalar;
1140
1141	if ((num + `1`) == `0` \|\| (num + `1`) > OPENSSL_MALLOC_MAX_NELEMS(void *)) {
1142	ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1143	return `0`;
1144	}
1145
1146	BN_CTX_start(ctx);
1147
1148	if (scalar) {
1149	generator = EC_GROUP_get0_generator(group);
1150	if (generator == NULL) {
1151	ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR);
1152	goto err;
1153	}
1154
1155	/ look if we can use precomputed multiples of generator /
1156	pre_comp = group->pre_comp.nistz256;
1157
1158	if (pre_comp) {
1159	/*
1160	* If there is a precomputed table for the generator, check that
1161	* it was generated with the same generator.
1162	*/
1163	EC_POINT *pre_comp_generator = EC_POINT_new(group);
1164	if (pre_comp_generator == NULL)
1165	goto err;
1166
1167	ecp_nistz256_gather_w7(&p.a, pre_comp->precomp[`0`], `1`);
1168	if (!ecp_nistz256_set_from_affine(pre_comp_generator,
1169	group, &p.a, ctx)) {
1170	EC_POINT_free(pre_comp_generator);
1171	goto err;
1172	}
1173
1174	if (`0` == EC_POINT_cmp(group, generator, pre_comp_generator, ctx))
1175	preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp;
1176
1177	EC_POINT_free(pre_comp_generator);
1178	}
1179
1180	if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) {
1181	/*
1182	* If there is no precomputed data, but the generator is the
1183	* default, a hardcoded table of precomputed data is used. This
1184	* is because applications, such as Apache, do not use
1185	* EC_KEY_precompute_mult.
1186	*/
1187	preComputedTable = ecp_nistz256_precomputed;
1188	}
1189
1190	if (preComputedTable) {
1191	if ((BN_num_bits(scalar) > `256`)
1192	\|\| BN_is_negative(scalar)) {
1193	if ((tmp_scalar = BN_CTX_get(ctx)) == NULL)
1194	goto err;
1195
1196	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1197	ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB);
1198	goto err;
1199	}
1200	scalar = tmp_scalar;
1201	}
1202
1203	for (i = `0`; i < bn_get_top(scalar) * BN_BYTES; i += BN_BYTES) {
1204	BN_ULONG d = bn_get_words(scalar)[i / BN_BYTES];
1205
1206	p_str[i + `0`] = (unsigned char)d;
1207	p_str[i + `1`] = (unsigned char)(d >> `8`);
1208	p_str[i + `2`] = (unsigned char)(d >> `16`);
1209	p_str[i + `3`] = (unsigned char)(d >>= `24`);
1210	if (BN_BYTES == `8`) {
1211	d >>= `8`;
1212	p_str[i + `4`] = (unsigned char)d;
1213	p_str[i + `5`] = (unsigned char)(d >> `8`);
1214	p_str[i + `6`] = (unsigned char)(d >> `16`);
1215	p_str[i + `7`] = (unsigned char)(d >> `24`);
1216	}
1217	}
1218
1219	for (; i < `33`; i++)
1220	p_str[i] = `0`;
1221
1222	#if defined(ECP_NISTZ256_AVX2)
1223	if (ecp_nistz_avx2_eligible()) {
1224	ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable);
1225	} else
1226	#endif
1227	{
1228	BN_ULONG infty;
1229
1230	/ First window /
1231	wvalue = (p_str[`0`] << `1`) & mask;
1232	idx += window_size;
1233
1234	wvalue = _booth_recode_w7(wvalue);
1235
1236	ecp_nistz256_gather_w7(&p.a, preComputedTable[`0`],
1237	wvalue >> `1`);
1238
1239	ecp_nistz256_neg(p.p.Z, p.p.Y);
1240	copy_conditional(p.p.Y, p.p.Z, wvalue & `1`);
1241
1242	/*
1243	* Since affine infinity is encoded as (0,0) and
1244	* Jacobian ias (,,0), we need to harmonize them
1245	* by assigning "one" or zero to Z.
1246	*/
1247	infty = (p.p.X[`0`] \| p.p.X[`1`] \| p.p.X[`2`] \| p.p.X[`3`] \|
1248	p.p.Y[`0`] \| p.p.Y[`1`] \| p.p.Y[`2`] \| p.p.Y[`3`]);
1249	if (P256_LIMBS == `8`)
1250	infty \|= (p.p.X[`4`] \| p.p.X[`5`] \| p.p.X[`6`] \| p.p.X[`7`] \|
1251	p.p.Y[`4`] \| p.p.Y[`5`] \| p.p.Y[`6`] \| p.p.Y[`7`]);
1252
1253	infty = `0` - is_zero(infty);
1254	infty = ~infty;
1255
1256	p.p.Z[`0`] = ONE[`0`] & infty;
1257	p.p.Z[`1`] = ONE[`1`] & infty;
1258	p.p.Z[`2`] = ONE[`2`] & infty;
1259	p.p.Z[`3`] = ONE[`3`] & infty;
1260	if (P256_LIMBS == `8`) {
1261	p.p.Z[`4`] = ONE[`4`] & infty;
1262	p.p.Z[`5`] = ONE[`5`] & infty;
1263	p.p.Z[`6`] = ONE[`6`] & infty;
1264	p.p.Z[`7`] = ONE[`7`] & infty;
1265	}
1266
1267	for (i = `1`; i < `37`; i++) {
1268	unsigned int off = (idx - `1`) / `8`;
1269	wvalue = p_str[off] \| p_str[off + `1`] << `8`;
1270	wvalue = (wvalue >> ((idx - `1`) % `8`)) & mask;
1271	idx += window_size;
1272
1273	wvalue = _booth_recode_w7(wvalue);
1274
1275	ecp_nistz256_gather_w7(&t.a,
1276	preComputedTable[i], wvalue >> `1`);
1277
1278	ecp_nistz256_neg(t.p.Z, t.a.Y);
1279	copy_conditional(t.a.Y, t.p.Z, wvalue & `1`);
1280
1281	ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
1282	}
1283	}
1284	} else {
1285	p_is_infinity = `1`;
1286	no_precomp_for_generator = `1`;
1287	}
1288	} else
1289	p_is_infinity = `1`;
1290
1291	if (no_precomp_for_generator) {
1292	/*
1293	* Without a precomputed table for the generator, it has to be
1294	* handled like a normal point.
1295	*/
1296	new_scalars = OPENSSL_malloc((num + `1`) * sizeof(BIGNUM *));
1297	if (new_scalars == NULL) {
1298	ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1299	goto err;
1300	}
1301
1302	new_points = OPENSSL_malloc((num + `1`) * sizeof(EC_POINT *));
1303	if (new_points == NULL) {
1304	ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1305	goto err;
1306	}
1307
1308	memcpy(new_scalars, scalars, num * sizeof(BIGNUM *));
1309	new_scalars[num] = scalar;
1310	memcpy(new_points, points, num * sizeof(EC_POINT *));
1311	new_points[num] = generator;
1312
1313	scalars = new_scalars;
1314	points = new_points;
1315	num++;
1316	}
1317
1318	if (num) {
1319	P256_POINT *out = &t.p;
1320	if (p_is_infinity)
1321	out = &p.p;
1322
1323	if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx))
1324	goto err;
1325
1326	if (!p_is_infinity)
1327	ecp_nistz256_point_add(&p.p, &p.p, out);
1328	}
1329
1330	/ Not constant-time, but we're only operating on the public output. /
1331	if (!bn_set_words(r->X, p.p.X, P256_LIMBS) \|\|
1332	!bn_set_words(r->Y, p.p.Y, P256_LIMBS) \|\|
1333	!bn_set_words(r->Z, p.p.Z, P256_LIMBS)) {
1334	goto err;
1335	}
1336	r->Z_is_one = is_one(r->Z) & `1`;
1337
1338	ret = `1`;
1339
1340	err:
1341	BN_CTX_end(ctx);
1342	OPENSSL_free(new_points);
1343	OPENSSL_free(new_scalars);
1344	return ret;
1345	}
1346
1347	__owur static int ecp_nistz256_get_affine(const EC_GROUP *group,
1348	const EC_POINT *point,
1349	BIGNUM x, BIGNUM y, BN_CTX *ctx)
1350	{
1351	BN_ULONG z_inv2[P256_LIMBS];
1352	BN_ULONG z_inv3[P256_LIMBS];
1353	BN_ULONG x_aff[P256_LIMBS];
1354	BN_ULONG y_aff[P256_LIMBS];
1355	BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS];
1356	BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS];
1357
1358	if (EC_POINT_is_at_infinity(group, point)) {
1359	ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY);
1360	return `0`;
1361	}
1362
1363	if (!ecp_nistz256_bignum_to_field_elem(point_x, point->X) \|\|
1364	!ecp_nistz256_bignum_to_field_elem(point_y, point->Y) \|\|
1365	!ecp_nistz256_bignum_to_field_elem(point_z, point->Z)) {
1366	ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE);
1367	return `0`;
1368	}
1369
1370	ecp_nistz256_mod_inverse(z_inv3, point_z);
1371	ecp_nistz256_sqr_mont(z_inv2, z_inv3);
1372	ecp_nistz256_mul_mont(x_aff, z_inv2, point_x);
1373
1374	if (x != NULL) {
1375	ecp_nistz256_from_mont(x_ret, x_aff);
1376	if (!bn_set_words(x, x_ret, P256_LIMBS))
1377	return `0`;
1378	}
1379
1380	if (y != NULL) {
1381	ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
1382	ecp_nistz256_mul_mont(y_aff, z_inv3, point_y);
1383	ecp_nistz256_from_mont(y_ret, y_aff);
1384	if (!bn_set_words(y, y_ret, P256_LIMBS))
1385	return `0`;
1386	}
1387
1388	return `1`;
1389	}
1390
1391	static NISTZ256_PRE_COMP ecp_nistz256_pre_comp_new(const* EC_GROUP *group)
1392	{
1393	NISTZ256_PRE_COMP *ret = NULL;
1394
1395	if (!group)
1396	return NULL;
1397
1398	ret = OPENSSL_zalloc(sizeof(*ret));
1399
1400	if (ret == NULL) {
1401	ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1402	return ret;
1403	}
1404
1405	ret->group = group;
1406	ret->w = `6`; / default /
1407	ret->references = `1`;
1408
1409	ret->lock = CRYPTO_THREAD_lock_new();
1410	if (ret->lock == NULL) {
1411	ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1412	OPENSSL_free(ret);
1413	return NULL;
1414	}
1415	return ret;
1416	}
1417
1418	NISTZ256_PRE_COMP EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP p)
1419	{
1420	int i;
1421	if (p != NULL)
1422	CRYPTO_UP_REF(&p->references, &i, p->lock);
1423	return p;
1424	}
1425
1426	void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *pre)
1427	{
1428	int i;
1429
1430	if (pre == NULL)
1431	return;
1432
1433	CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
1434	REF_PRINT_COUNT("EC_nistz256", pre);
1435	if (i > `0`)
1436	return;
1437	REF_ASSERT_ISNT(i < `0`);
1438
1439	OPENSSL_free(pre->precomp_storage);
1440	CRYPTO_THREAD_lock_free(pre->lock);
1441	OPENSSL_free(pre);
1442	}
1443
1444
1445	static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
1446	{
1447	/ There is a hard-coded table for the default generator. /
1448	const EC_POINT *generator = EC_GROUP_get0_generator(group);
1449
1450	if (generator != NULL && ecp_nistz256_is_affine_G(generator)) {
1451	/ There is a hard-coded table for the default generator. /
1452	return `1`;
1453	}
1454
1455	return HAVEPRECOMP(group, nistz256);
1456	}
1457
1458	#if defined(__x86_64) \|\| defined(__x86_64__) \|\| \
1459	defined(_M_AMD64) \|\| defined(_M_X64) \|\| \
1460	defined(__powerpc64__) \|\| defined(_ARCH_PP64) \|\| \
1461	defined(__aarch64__)
1462	/*
1463	* Montgomery mul modulo Order(P): res = ab2^-256 mod Order(P)
1464	*/
1465	void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
1466	const BN_ULONG a[P256_LIMBS],
1467	const BN_ULONG b[P256_LIMBS]);
1468	void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
1469	const BN_ULONG a[P256_LIMBS],
1470	BN_ULONG rep);
1471
1472	static int ecp_nistz256_inv_mod_ord(const EC_GROUP group, BIGNUM r,
1473	const BIGNUM x, BN_CTX ctx)
1474	{
1475	/ RR = 2^512 mod ord(p256) /
1476	static const BN_ULONG RR[P256_LIMBS] = {
1477	TOBN(`0x83244c95`,`0xbe79eea2`), TOBN(`0x4699799c`,`0x49bd6fa6`),
1478	TOBN(`0x2845b239`,`0x2b6bec59`), TOBN(`0x66e12d94`,`0xf3d95620`)
1479	};
1480	/ The constant 1 (unlike ONE that is one in Montgomery representation) /
1481	static const BN_ULONG one[P256_LIMBS] = {
1482	TOBN(`0`,`1`), TOBN(`0`,`0`), TOBN(`0`,`0`), TOBN(`0`,`0`)
1483	};
1484	/*
1485	* We don't use entry 0 in the table, so we omit it and address
1486	* with -1 offset.
1487	*/
1488	BN_ULONG table[`15`][P256_LIMBS];
1489	BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
1490	int i, ret = `0`;
1491	enum {
1492	i_1 = `0`, i_10, i_11, i_101, i_111, i_1010, i_1111,
1493	i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32
1494	};
1495
1496	/*
1497	* Catch allocation failure early.
1498	*/
1499	if (bn_wexpand(r, P256_LIMBS) == NULL) {
1500	ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
1501	goto err;
1502	}
1503
1504	if ((BN_num_bits(x) > `256`) \|\| BN_is_negative(x)) {
1505	BIGNUM *tmp;
1506
1507	if ((tmp = BN_CTX_get(ctx)) == NULL
1508	\|\| !BN_nnmod(tmp, x, group->order, ctx)) {
1509	ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
1510	goto err;
1511	}
1512	x = tmp;
1513	}
1514
1515	if (!ecp_nistz256_bignum_to_field_elem(t, x)) {
1516	ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE);
1517	goto err;
1518	}
1519
1520	ecp_nistz256_ord_mul_mont(table[`0`], t, RR);
1521	#if 0
1522	/*
1523	* Original sparse-then-fixed-window algorithm, retained for reference.
1524	*/
1525	for (i = `2`; i < `16`; i += `2`) {
1526	ecp_nistz256_ord_sqr_mont(table[i-`1`], table[i/`2`-`1`], `1`);
1527	ecp_nistz256_ord_mul_mont(table[i], table[i-`1`], table[`0`]);
1528	}
1529
1530	/*
1531	* The top 128bit of the exponent are highly redudndant, so we
1532	* perform an optimized flow
1533	*/
1534	ecp_nistz256_ord_sqr_mont(t, table[`15`-`1`], `4`); / f0 /
1535	ecp_nistz256_ord_mul_mont(t, t, table[`15`-`1`]); / ff /
1536
1537	ecp_nistz256_ord_sqr_mont(out, t, `8`); / ff00 /
1538	ecp_nistz256_ord_mul_mont(out, out, t); / ffff /
1539
1540	ecp_nistz256_ord_sqr_mont(t, out, `16`); / ffff0000 /
1541	ecp_nistz256_ord_mul_mont(t, t, out); / ffffffff /
1542
1543	ecp_nistz256_ord_sqr_mont(out, t, `64`); / ffffffff0000000000000000 /
1544	ecp_nistz256_ord_mul_mont(out, out, t); / ffffffff00000000ffffffff /
1545
1546	ecp_nistz256_ord_sqr_mont(out, out, `32`); / ffffffff00000000ffffffff00000000 /
1547	ecp_nistz256_ord_mul_mont(out, out, t); / ffffffff00000000ffffffffffffffff /
1548
1549	/*
1550	* The bottom 128 bit of the exponent are processed with fixed 4-bit window
1551	*/
1552	for(i = `0`; i < `32`; i++) {
1553	/ expLo - the low 128 bits of the exponent we use (ord(p256) - 2),*
1554	* split into nibbles */
1555	static const unsigned char expLo[`32`] = {
1556	`0xb`,`0xc`,`0xe`,`0x6`,`0xf`,`0xa`,`0xa`,`0xd`,`0xa`,`0x7`,`0x1`,`0x7`,`0x9`,`0xe`,`0x8`,`0x4`,
1557	`0xf`,`0x3`,`0xb`,`0x9`,`0xc`,`0xa`,`0xc`,`0x2`,`0xf`,`0xc`,`0x6`,`0x3`,`0x2`,`0x5`,`0x4`,`0xf`
1558	};
1559
1560	ecp_nistz256_ord_sqr_mont(out, out, `4`);
1561	/ The exponent is public, no need in constant-time access /
1562	ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-`1`]);
1563	}
1564	#else
1565	/*
1566	* https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
1567	*
1568	* Even though this code path spares 12 squarings, 4.5%, and 13
1569	* multiplications, 25%, on grand scale sign operation is not that
1570	* much faster, not more that 2%...
1571	*/
1572
1573	/ pre-calculate powers /
1574	ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], `1`);
1575
1576	ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
1577
1578	ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
1579
1580	ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
1581
1582	ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], `1`);
1583
1584	ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
1585
1586	ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], `1`);
1587	ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
1588
1589	ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], `1`);
1590
1591	ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
1592
1593	ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
1594
1595	ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], `2`);
1596	ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
1597
1598	ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], `8`);
1599	ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
1600
1601	ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], `16`);
1602	ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
1603
1604	/ calculations /
1605	ecp_nistz256_ord_sqr_mont(out, table[i_x32], `64`);
1606	ecp_nistz256_ord_mul_mont(out, out, table[i_x32]);
1607
1608	for (i = `0`; i < `27`; i++) {
1609	static const struct { unsigned char p, i; } chain[`27`] = {
1610	{ `32`, i_x32 }, { `6`, i_101111 }, { `5`, i_111 },
1611	{ `4`, i_11 }, { `5`, i_1111 }, { `5`, i_10101 },
1612	{ `4`, i_101 }, { `3`, i_101 }, { `3`, i_101 },
1613	{ `5`, i_111 }, { `9`, i_101111 }, { `6`, i_1111 },
1614	{ `2`, i_1 }, { `5`, i_1 }, { `6`, i_1111 },
1615	{ `5`, i_111 }, { `4`, i_111 }, { `5`, i_111 },
1616	{ `5`, i_101 }, { `3`, i_11 }, { `10`, i_101111 },
1617	{ `2`, i_11 }, { `5`, i_11 }, { `5`, i_11 },
1618	{ `3`, i_1 }, { `7`, i_10101 }, { `6`, i_1111 }
1619	};
1620
1621	ecp_nistz256_ord_sqr_mont(out, out, chain[i].p);
1622	ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]);
1623	}
1624	#endif
1625	ecp_nistz256_ord_mul_mont(out, out, one);
1626
1627	/*
1628	* Can't fail, but check return code to be consistent anyway.
1629	*/
1630	if (!bn_set_words(r, out, P256_LIMBS))
1631	goto err;
1632
1633	ret = `1`;
1634	err:
1635	return ret;
1636	}
1637	#else
1638	# define ecp_nistz256_inv_mod_ord NULL
1639	#endif
1640
1641	const EC_METHOD EC_GFp_nistz256_method(void*)
1642	{
1643	static const EC_METHOD ret = {
1644	EC_FLAGS_DEFAULT_OCT,
1645	NID_X9_62_prime_field,
1646	ec_GFp_mont_group_init,
1647	ec_GFp_mont_group_finish,
1648	ec_GFp_mont_group_clear_finish,
1649	ec_GFp_mont_group_copy,
1650	ec_GFp_mont_group_set_curve,
1651	ec_GFp_simple_group_get_curve,
1652	ec_GFp_simple_group_get_degree,
1653	ec_group_simple_order_bits,
1654	ec_GFp_simple_group_check_discriminant,
1655	ec_GFp_simple_point_init,
1656	ec_GFp_simple_point_finish,
1657	ec_GFp_simple_point_clear_finish,
1658	ec_GFp_simple_point_copy,
1659	ec_GFp_simple_point_set_to_infinity,
1660	ec_GFp_simple_set_Jprojective_coordinates_GFp,
1661	ec_GFp_simple_get_Jprojective_coordinates_GFp,
1662	ec_GFp_simple_point_set_affine_coordinates,
1663	ecp_nistz256_get_affine,
1664	`0`, `0`, `0`,
1665	ec_GFp_simple_add,
1666	ec_GFp_simple_dbl,
1667	ec_GFp_simple_invert,
1668	ec_GFp_simple_is_at_infinity,
1669	ec_GFp_simple_is_on_curve,
1670	ec_GFp_simple_cmp,
1671	ec_GFp_simple_make_affine,
1672	ec_GFp_simple_points_make_affine,
1673	ecp_nistz256_points_mul, / mul /
1674	ecp_nistz256_mult_precompute, / precompute_mult /
1675	ecp_nistz256_window_have_precompute_mult, / have_precompute_mult /
1676	ec_GFp_mont_field_mul,
1677	ec_GFp_mont_field_sqr,
1678	`0`, / field_div /
1679	ec_GFp_mont_field_inv,
1680	ec_GFp_mont_field_encode,
1681	ec_GFp_mont_field_decode,
1682	ec_GFp_mont_field_set_to_one,
1683	ec_key_simple_priv2oct,
1684	ec_key_simple_oct2priv,
1685	`0`, / set private /
1686	ec_key_simple_generate_key,
1687	ec_key_simple_check_key,
1688	ec_key_simple_generate_public_key,
1689	`0`, / keycopy /
1690	`0`, / keyfinish /
1691	ecdh_simple_compute_key,
1692	ecdsa_simple_sign_setup,
1693	ecdsa_simple_sign_sig,
1694	ecdsa_simple_verify_sig,
1695	ecp_nistz256_inv_mod_ord, / can be #define-d NULL /
1696	`0`, / blind_coordinates /
1697	`0`, / ladder_pre /
1698	`0`, / ladder_step /
1699	`0` / ladder_post /
1700	};
1701
1702	return &ret;
1703	}
1704

Browse the source code of ClickHouse/contrib/openssl/crypto/ec/ecp_nistz256.c