p256-x86_64.c source code [engine/third_party/boringssl/src/crypto/fipsmodule/ec/p256-x86_64.c]

1	/*
2	* Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3	* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4	*
5	* Licensed under the OpenSSL license (the "License"). You may not use
6	* this file except in compliance with the License. You can obtain a copy
7	* in the file LICENSE in the source distribution or at
8	* https://www.openssl.org/source/license.html
9	*
10	* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11	* (1) Intel Corporation, Israel Development Center, Haifa, Israel
12	* (2) University of Haifa, Israel
13	*
14	* Reference:
15	* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
16	* 256 Bit Primes"
17	*/
18
19	#include <openssl/ec.h>
20
21	#include <assert.h>
22	#include <stdint.h>
23	#include <string.h>
24
25	#include <openssl/bn.h>
26	#include <openssl/cpu.h>
27	#include <openssl/crypto.h>
28	#include <openssl/err.h>
29
30	#include "../bn/internal.h"
31	#include "../delocate.h"
32	#include "../../internal.h"
33	#include "internal.h"
34	#include "p256-x86_64.h"
35
36
37	#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
38	!defined(OPENSSL_SMALL)
39
40	typedef P256_POINT_AFFINE PRECOMP256_ROW[`64`];
41
42	// One converted into the Montgomery domain
43	static const BN_ULONG ONE[P256_LIMBS] = {
44	TOBN(`0x00000000`, `0x00000001`), TOBN(`0xffffffff`, `0x00000000`),
45	TOBN(`0xffffffff`, `0xffffffff`), TOBN(`0x00000000`, `0xfffffffe`),
46	};
47
48	// Precomputed tables for the default generator
49	#include "p256-x86_64-table.h"
50
// booth_recode_w5 recodes a 6-bit window |in| to a signed digit, see
// |ec_GFp_nistp_recode_scalar_bits| in util.c for details.
//
// The result packs the digit as (magnitude << 1) + sign: bit 0 is one when
// bit 5 of |in| is set (the digit is negative) and the remaining bits hold the
// magnitude, which is at most 16. Callers in this file always pass a value
// already masked to six bits.
static unsigned booth_recode_w5(unsigned in) {
  unsigned s, d;

  // |s| is all-ones if bit 5 of |in| is set and zero otherwise.
  s = ~((in >> 5) - 1);
  // Select 63 - |in| when |s| is set, |in| otherwise, without branching.
  d = (1 << 6) - in - 1;
  d = (d & s) | (in & ~s);
  // Halve, rounding up.
  d = (d >> 1) + (d & 1);

  return (d << 1) + (s & 1);
}
63
// booth_recode_w7 recodes an 8-bit window |in| to a signed digit, in the same
// way |booth_recode_w5| does for 6-bit windows.
//
// The result packs the digit as (magnitude << 1) + sign: bit 0 is one when
// bit 7 of |in| is set (the digit is negative) and the remaining bits hold the
// magnitude, which is at most 64. Callers in this file always pass a value
// already masked to eight bits.
static unsigned booth_recode_w7(unsigned in) {
  unsigned s, d;

  // |s| is all-ones if bit 7 of |in| is set and zero otherwise.
  s = ~((in >> 7) - 1);
  // Select 255 - |in| when |s| is set, |in| otherwise, without branching.
  d = (1 << 8) - in - 1;
  d = (d & s) | (in & ~s);
  // Halve, rounding up.
  d = (d >> 1) + (d & 1);

  return (d << 1) + (s & 1);
}
74
75	// copy_conditional copies \|src\| to \|dst\| if \|move\| is one and leaves it as-is
76	// if \|move\| is zero.
77	//
78	// WARNING: this breaks the usual convention of constant-time functions
79	// returning masks.
80	static void copy_conditional(BN_ULONG dst[P256_LIMBS],
81	const BN_ULONG src[P256_LIMBS], BN_ULONG move) {
82	BN_ULONG mask1 = ((BN_ULONG)`0`) - move;
83	BN_ULONG mask2 = ~mask1;
84
85	dst[`0`] = (src[`0`] & mask1) ^ (dst[`0`] & mask2);
86	dst[`1`] = (src[`1`] & mask1) ^ (dst[`1`] & mask2);
87	dst[`2`] = (src[`2`] & mask1) ^ (dst[`2`] & mask2);
88	dst[`3`] = (src[`3`] & mask1) ^ (dst[`3`] & mask2);
89	if (P256_LIMBS == `8`) {
90	dst[`4`] = (src[`4`] & mask1) ^ (dst[`4`] & mask2);
91	dst[`5`] = (src[`5`] & mask1) ^ (dst[`5`] & mask2);
92	dst[`6`] = (src[`6`] & mask1) ^ (dst[`6`] & mask2);
93	dst[`7`] = (src[`7`] & mask1) ^ (dst[`7`] & mask2);
94	}
95	}
96
97	// is_not_zero returns one iff in != 0 and zero otherwise.
98	//
99	// WARNING: this breaks the usual convention of constant-time functions
100	// returning masks.
101	//
102	// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64)
103	// (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f)
104	// )
105	//
106	// (declare-fun x () (_ BitVec 64))
107	//
108	// (assert (and (= x #x0000000000000000) (= (is_not_zero x) #x0000000000000001)))
109	// (check-sat)
110	//
111	// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) #x0000000000000000)))
112	// (check-sat)
113	//
114	static BN_ULONG is_not_zero(BN_ULONG in) {
115	in \|= (`0` - in);
116	in >>= BN_BITS2 - `1`;
117	return in;
118	}
119
120	// ecp_nistz256_mod_inverse_mont sets \|r\| to (\|in\| 2^-256)^-1 * 2^256 mod p.*
121	// That is, \|r\| is the modular inverse of \|in\| for input and output in the
122	// Montgomery domain.
123	static void ecp_nistz256_mod_inverse_mont(BN_ULONG r[P256_LIMBS],
124	const BN_ULONG in[P256_LIMBS]) {
125	/ The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff*
126	ffffffff
127	We use FLT and used poly-2 as exponent /*
128	BN_ULONG p2[P256_LIMBS];
129	BN_ULONG p4[P256_LIMBS];
130	BN_ULONG p8[P256_LIMBS];
131	BN_ULONG p16[P256_LIMBS];
132	BN_ULONG p32[P256_LIMBS];
133	BN_ULONG res[P256_LIMBS];
134	int i;
135
136	ecp_nistz256_sqr_mont(res, in);
137	ecp_nistz256_mul_mont(p2, res, in); // 3p*
138
139	ecp_nistz256_sqr_mont(res, p2);
140	ecp_nistz256_sqr_mont(res, res);
141	ecp_nistz256_mul_mont(p4, res, p2); // fp*
142
143	ecp_nistz256_sqr_mont(res, p4);
144	ecp_nistz256_sqr_mont(res, res);
145	ecp_nistz256_sqr_mont(res, res);
146	ecp_nistz256_sqr_mont(res, res);
147	ecp_nistz256_mul_mont(p8, res, p4); // ffp*
148
149	ecp_nistz256_sqr_mont(res, p8);
150	for (i = `0`; i < `7`; i++) {
151	ecp_nistz256_sqr_mont(res, res);
152	}
153	ecp_nistz256_mul_mont(p16, res, p8); // ffffp*
154
155	ecp_nistz256_sqr_mont(res, p16);
156	for (i = `0`; i < `15`; i++) {
157	ecp_nistz256_sqr_mont(res, res);
158	}
159	ecp_nistz256_mul_mont(p32, res, p16); // ffffffffp*
160
161	ecp_nistz256_sqr_mont(res, p32);
162	for (i = `0`; i < `31`; i++) {
163	ecp_nistz256_sqr_mont(res, res);
164	}
165	ecp_nistz256_mul_mont(res, res, in);
166
167	for (i = `0`; i < `32` * `4`; i++) {
168	ecp_nistz256_sqr_mont(res, res);
169	}
170	ecp_nistz256_mul_mont(res, res, p32);
171
172	for (i = `0`; i < `32`; i++) {
173	ecp_nistz256_sqr_mont(res, res);
174	}
175	ecp_nistz256_mul_mont(res, res, p32);
176
177	for (i = `0`; i < `16`; i++) {
178	ecp_nistz256_sqr_mont(res, res);
179	}
180	ecp_nistz256_mul_mont(res, res, p16);
181
182	for (i = `0`; i < `8`; i++) {
183	ecp_nistz256_sqr_mont(res, res);
184	}
185	ecp_nistz256_mul_mont(res, res, p8);
186
187	ecp_nistz256_sqr_mont(res, res);
188	ecp_nistz256_sqr_mont(res, res);
189	ecp_nistz256_sqr_mont(res, res);
190	ecp_nistz256_sqr_mont(res, res);
191	ecp_nistz256_mul_mont(res, res, p4);
192
193	ecp_nistz256_sqr_mont(res, res);
194	ecp_nistz256_sqr_mont(res, res);
195	ecp_nistz256_mul_mont(res, res, p2);
196
197	ecp_nistz256_sqr_mont(res, res);
198	ecp_nistz256_sqr_mont(res, res);
199	ecp_nistz256_mul_mont(r, res, in);
200	}
201
202	// r = p p_scalar*
203	static void ecp_nistz256_windowed_mul(const EC_GROUP group, P256_POINT r,
204	const EC_RAW_POINT *p,
205	const EC_SCALAR *p_scalar) {
206	assert(p != NULL);
207	assert(p_scalar != NULL);
208	assert(group->field.width == P256_LIMBS);
209
210	static const unsigned kWindowSize = `5`;
211	static const unsigned kMask = (`1` << (`5` / kWindowSize / + `1`)) - `1`;
212
213	// A \|P256_POINT\| is (3 32) = 96 bytes, and the 64-byte alignment should*
214	// add no more than 63 bytes of overhead. Thus, \|table\| should require
215	// ~1599 ((96 16) + 63) bytes of stack space.*
216	alignas(`64`) P256_POINT table[`16`];
217	uint8_t p_str[`33`];
218	OPENSSL_memcpy(p_str, p_scalar->bytes, `32`);
219	p_str[`32`] = `0`;
220
221	// table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
222	// not stored. All other values are actually stored with an offset of -1 in
223	// table.
224	P256_POINT *row = table;
225	assert(group->field.width == P256_LIMBS);
226	OPENSSL_memcpy(row[`1` - `1`].X, p->X.words, P256_LIMBS * sizeof(BN_ULONG));
227	OPENSSL_memcpy(row[`1` - `1`].Y, p->Y.words, P256_LIMBS * sizeof(BN_ULONG));
228	OPENSSL_memcpy(row[`1` - `1`].Z, p->Z.words, P256_LIMBS * sizeof(BN_ULONG));
229
230	ecp_nistz256_point_double(&row[`2` - `1`], &row[`1` - `1`]);
231	ecp_nistz256_point_add(&row[`3` - `1`], &row[`2` - `1`], &row[`1` - `1`]);
232	ecp_nistz256_point_double(&row[`4` - `1`], &row[`2` - `1`]);
233	ecp_nistz256_point_double(&row[`6` - `1`], &row[`3` - `1`]);
234	ecp_nistz256_point_double(&row[`8` - `1`], &row[`4` - `1`]);
235	ecp_nistz256_point_double(&row[`12` - `1`], &row[`6` - `1`]);
236	ecp_nistz256_point_add(&row[`5` - `1`], &row[`4` - `1`], &row[`1` - `1`]);
237	ecp_nistz256_point_add(&row[`7` - `1`], &row[`6` - `1`], &row[`1` - `1`]);
238	ecp_nistz256_point_add(&row[`9` - `1`], &row[`8` - `1`], &row[`1` - `1`]);
239	ecp_nistz256_point_add(&row[`13` - `1`], &row[`12` - `1`], &row[`1` - `1`]);
240	ecp_nistz256_point_double(&row[`14` - `1`], &row[`7` - `1`]);
241	ecp_nistz256_point_double(&row[`10` - `1`], &row[`5` - `1`]);
242	ecp_nistz256_point_add(&row[`15` - `1`], &row[`14` - `1`], &row[`1` - `1`]);
243	ecp_nistz256_point_add(&row[`11` - `1`], &row[`10` - `1`], &row[`1` - `1`]);
244	ecp_nistz256_point_double(&row[`16` - `1`], &row[`8` - `1`]);
245
246	BN_ULONG tmp[P256_LIMBS];
247	alignas(`32`) P256_POINT h;
248	unsigned index = `255`;
249	unsigned wvalue = p_str[(index - `1`) / `8`];
250	wvalue = (wvalue >> ((index - `1`) % `8`)) & kMask;
251
252	ecp_nistz256_select_w5(r, table, booth_recode_w5(wvalue) >> `1`);
253
254	while (index >= `5`) {
255	if (index != `255`) {
256	unsigned off = (index - `1`) / `8`;
257
258	wvalue = p_str[off] \| p_str[off + `1`] << `8`;
259	wvalue = (wvalue >> ((index - `1`) % `8`)) & kMask;
260
261	wvalue = booth_recode_w5(wvalue);
262
263	ecp_nistz256_select_w5(&h, table, wvalue >> `1`);
264
265	ecp_nistz256_neg(tmp, h.Y);
266	copy_conditional(h.Y, tmp, (wvalue & `1`));
267
268	ecp_nistz256_point_add(r, r, &h);
269	}
270
271	index -= kWindowSize;
272
273	ecp_nistz256_point_double(r, r);
274	ecp_nistz256_point_double(r, r);
275	ecp_nistz256_point_double(r, r);
276	ecp_nistz256_point_double(r, r);
277	ecp_nistz256_point_double(r, r);
278	}
279
280	// Final window
281	wvalue = p_str[`0`];
282	wvalue = (wvalue << `1`) & kMask;
283
284	wvalue = booth_recode_w5(wvalue);
285
286	ecp_nistz256_select_w5(&h, table, wvalue >> `1`);
287
288	ecp_nistz256_neg(tmp, h.Y);
289	copy_conditional(h.Y, tmp, wvalue & `1`);
290
291	ecp_nistz256_point_add(r, r, &h);
292	}
293
294	typedef union {
295	P256_POINT p;
296	P256_POINT_AFFINE a;
297	} p256_point_union_t;
298
// calc_first_wvalue returns the Booth-recoded value of the lowest 7-bit
// window of the scalar in |p_str| and sets |*index| to the bit position of
// the next window. The lowest window has no bit below it, so the scalar's
// first byte is shifted left by one to supply a zero in that position.
static unsigned calc_first_wvalue(unsigned *index, const uint8_t p_str[33]) {
  static const unsigned kWindowSize = 7;
  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
  *index = kWindowSize;

  unsigned wvalue = (p_str[0] << 1) & kMask;
  return booth_recode_w7(wvalue);
}
307
// calc_wvalue returns the Booth-recoded value of the 7-bit window of |p_str|
// that starts at bit |*index| (together with the bit just below it), and
// advances |*index| by one window. It reads |p_str[off]| and |p_str[off + 1]|,
// which is why callers pass a 33-byte buffer whose last byte is zero.
static unsigned calc_wvalue(unsigned *index, const uint8_t p_str[33]) {
  static const unsigned kWindowSize = 7;
  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;

  const unsigned off = (*index - 1) / 8;
  unsigned wvalue = p_str[off] | p_str[off + 1] << 8;
  wvalue = (wvalue >> ((*index - 1) % 8)) & kMask;
  *index += kWindowSize;

  return booth_recode_w7(wvalue);
}
319
320	static void ecp_nistz256_point_mul(const EC_GROUP group, EC_RAW_POINT r,
321	const EC_RAW_POINT *p,
322	const EC_SCALAR *scalar) {
323	alignas(`32`) P256_POINT out;
324	ecp_nistz256_windowed_mul(group, &out, p, scalar);
325
326	assert(group->field.width == P256_LIMBS);
327	OPENSSL_memcpy(r->X.words, out.X, P256_LIMBS * sizeof(BN_ULONG));
328	OPENSSL_memcpy(r->Y.words, out.Y, P256_LIMBS * sizeof(BN_ULONG));
329	OPENSSL_memcpy(r->Z.words, out.Z, P256_LIMBS * sizeof(BN_ULONG));
330	}
331
332	static void ecp_nistz256_point_mul_base(const EC_GROUP group, EC_RAW_POINT r,
333	const EC_SCALAR *scalar) {
334	alignas(`32`) p256_point_union_t t, p;
335
336	uint8_t p_str[`33`];
337	OPENSSL_memcpy(p_str, scalar->bytes, `32`);
338	p_str[`32`] = `0`;
339
340	// First window
341	unsigned index = `0`;
342	unsigned wvalue = calc_first_wvalue(&index, p_str);
343
344	ecp_nistz256_select_w7(&p.a, ecp_nistz256_precomputed[`0`], wvalue >> `1`);
345	ecp_nistz256_neg(p.p.Z, p.p.Y);
346	copy_conditional(p.p.Y, p.p.Z, wvalue & `1`);
347
348	// Convert \|p\| from affine to Jacobian coordinates. We set Z to zero if \|p\|
349	// is infinity and \|ONE\| otherwise. \|p\| was computed from the table, so it
350	// is infinity iff \|wvalue >> 1\| is zero.
351	OPENSSL_memset(p.p.Z, `0`, sizeof(p.p.Z));
352	copy_conditional(p.p.Z, ONE, is_not_zero(wvalue >> `1`));
353
354	for (int i = `1`; i < `37`; i++) {
355	wvalue = calc_wvalue(&index, p_str);
356
357	ecp_nistz256_select_w7(&t.a, ecp_nistz256_precomputed[i], wvalue >> `1`);
358
359	ecp_nistz256_neg(t.p.Z, t.a.Y);
360	copy_conditional(t.a.Y, t.p.Z, wvalue & `1`);
361
362	// Note \|ecp_nistz256_point_add_affine\| does not work if \|p.p\| and \|t.a\|
363	// are the same non-infinity point.
364	ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
365	}
366
367	assert(group->field.width == P256_LIMBS);
368	OPENSSL_memcpy(r->X.words, p.p.X, P256_LIMBS * sizeof(BN_ULONG));
369	OPENSSL_memcpy(r->Y.words, p.p.Y, P256_LIMBS * sizeof(BN_ULONG));
370	OPENSSL_memcpy(r->Z.words, p.p.Z, P256_LIMBS * sizeof(BN_ULONG));
371	}
372
373	static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
374	EC_RAW_POINT *r,
375	const EC_SCALAR *g_scalar,
376	const EC_RAW_POINT *p_,
377	const EC_SCALAR *p_scalar) {
378	assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL);
379
380	alignas(`32`) p256_point_union_t t, p;
381	uint8_t p_str[`33`];
382	OPENSSL_memcpy(p_str, g_scalar->bytes, `32`);
383	p_str[`32`] = `0`;
384
385	// First window
386	unsigned index = `0`;
387	unsigned wvalue = calc_first_wvalue(&index, p_str);
388
389	// Convert \|p\| from affine to Jacobian coordinates. We set Z to zero if \|p\|
390	// is infinity and \|ONE\| otherwise. \|p\| was computed from the table, so it
391	// is infinity iff \|wvalue >> 1\| is zero.
392	if ((wvalue >> `1`) != `0`) {
393	OPENSSL_memcpy(&p.a, &ecp_nistz256_precomputed[`0`][(wvalue >> `1`) - `1`],
394	sizeof(p.a));
395	OPENSSL_memcpy(&p.p.Z, ONE, sizeof(p.p.Z));
396	} else {
397	OPENSSL_memset(&p.a, `0`, sizeof(p.a));
398	OPENSSL_memset(p.p.Z, `0`, sizeof(p.p.Z));
399	}
400
401	if ((wvalue & `1`) == `1`) {
402	ecp_nistz256_neg(p.p.Y, p.p.Y);
403	}
404
405	for (int i = `1`; i < `37`; i++) {
406	wvalue = calc_wvalue(&index, p_str);
407
408	if ((wvalue >> `1`) == `0`) {
409	continue;
410	}
411
412	OPENSSL_memcpy(&t.a, &ecp_nistz256_precomputed[i][(wvalue >> `1`) - `1`],
413	sizeof(p.a));
414
415	if ((wvalue & `1`) == `1`) {
416	ecp_nistz256_neg(t.a.Y, t.a.Y);
417	}
418
419	// Note \|ecp_nistz256_point_add_affine\| does not work if \|p.p\| and \|t.a\|
420	// are the same non-infinity point, so it is important that we compute the
421	// \|g_scalar\| term before the \|p_scalar\| term.
422	ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
423	}
424
425	ecp_nistz256_windowed_mul(group, &t.p, p_, p_scalar);
426	ecp_nistz256_point_add(&p.p, &p.p, &t.p);
427
428	assert(group->field.width == P256_LIMBS);
429	OPENSSL_memcpy(r->X.words, p.p.X, P256_LIMBS * sizeof(BN_ULONG));
430	OPENSSL_memcpy(r->Y.words, p.p.Y, P256_LIMBS * sizeof(BN_ULONG));
431	OPENSSL_memcpy(r->Z.words, p.p.Z, P256_LIMBS * sizeof(BN_ULONG));
432	}
433
434	static int ecp_nistz256_get_affine(const EC_GROUP *group,
435	const EC_RAW_POINT point, EC_FELEM x,
436	EC_FELEM *y) {
437	if (ec_GFp_simple_is_at_infinity(group, point)) {
438	OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
439	return `0`;
440	}
441
442	BN_ULONG z_inv2[P256_LIMBS];
443	BN_ULONG z_inv3[P256_LIMBS];
444	assert(group->field.width == P256_LIMBS);
445	ecp_nistz256_mod_inverse_mont(z_inv3, point->Z.words);
446	ecp_nistz256_sqr_mont(z_inv2, z_inv3);
447
448	// Instead of using \|ecp_nistz256_from_mont\| to convert the \|x\| coordinate
449	// and then calling \|ecp_nistz256_from_mont\| again to convert the \|y\|
450	// coordinate below, convert the common factor \|z_inv2\| once now, saving one
451	// reduction.
452	ecp_nistz256_from_mont(z_inv2, z_inv2);
453
454	if (x != NULL) {
455	ecp_nistz256_mul_mont(x->words, z_inv2, point->X.words);
456	}
457
458	if (y != NULL) {
459	ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
460	ecp_nistz256_mul_mont(y->words, z_inv3, point->Y.words);
461	}
462
463	return `1`;
464	}
465
466	static void ecp_nistz256_add(const EC_GROUP group, EC_RAW_POINT r,
467	const EC_RAW_POINT a_, const* EC_RAW_POINT *b_) {
468	P256_POINT a, b;
469	OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG));
470	OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
471	OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
472	OPENSSL_memcpy(b.X, b_->X.words, P256_LIMBS * sizeof(BN_ULONG));
473	OPENSSL_memcpy(b.Y, b_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
474	OPENSSL_memcpy(b.Z, b_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
475	ecp_nistz256_point_add(&a, &a, &b);
476	OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG));
477	OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG));
478	OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG));
479	}
480
481	static void ecp_nistz256_dbl(const EC_GROUP group, EC_RAW_POINT r,
482	const EC_RAW_POINT *a_) {
483	P256_POINT a;
484	OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG));
485	OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
486	OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
487	ecp_nistz256_point_double(&a, &a);
488	OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG));
489	OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG));
490	OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG));
491	}
492
493	static void ecp_nistz256_inv_mod_ord(const EC_GROUP group, EC_SCALAR out,
494	const EC_SCALAR *in) {
495	// table[i] stores a power of \|in\| corresponding to the matching enum value.
496	enum {
497	// The following indices specify the power in binary.
498	i_1 = `0`,
499	i_10,
500	i_11,
501	i_101,
502	i_111,
503	i_1010,
504	i_1111,
505	i_10101,
506	i_101010,
507	i_101111,
508	// The following indices specify 2^N-1, or N ones in a row.
509	i_x6,
510	i_x8,
511	i_x16,
512	i_x32
513	};
514	BN_ULONG table[`15`][P256_LIMBS];
515
516	// https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
517	//
518	// Even though this code path spares 12 squarings, 4.5%, and 13
519	// multiplications, 25%, the overall sign operation is not that much faster,
520	// not more that 2%. Most of the performance of this function comes from the
521	// scalar operations.
522
523	// Pre-calculate powers.
524	OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG));
525
526	ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], `1`);
527
528	ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
529
530	ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
531
532	ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
533
534	ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], `1`);
535
536	ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
537
538	ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], `1`);
539	ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
540
541	ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], `1`);
542
543	ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
544
545	ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
546
547	ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], `2`);
548	ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
549
550	ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], `8`);
551	ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
552
553	ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], `16`);
554	ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
555
556	// Compute \|in\| raised to the order-2.
557	ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], `64`);
558	ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]);
559	static const struct {
560	uint8_t p, i;
561	} kChain[`27`] = {{`32`, i_x32}, {`6`, i_101111}, {`5`, i_111}, {`4`, i_11},
562	{`5`, i_1111}, {`5`, i_10101}, {`4`, i_101}, {`3`, i_101},
563	{`3`, i_101}, {`5`, i_111}, {`9`, i_101111}, {`6`, i_1111},
564	{`2`, i_1}, {`5`, i_1}, {`6`, i_1111}, {`5`, i_111},
565	{`4`, i_111}, {`5`, i_111}, {`5`, i_101}, {`3`, i_11},
566	{`10`, i_101111}, {`2`, i_11}, {`5`, i_11}, {`5`, i_11},
567	{`3`, i_1}, {`7`, i_10101}, {`6`, i_1111}};
568	for (size_t i = `0`; i < OPENSSL_ARRAY_SIZE(kChain); i++) {
569	ecp_nistz256_ord_sqr_mont(out->words, out->words, kChain[i].p);
570	ecp_nistz256_ord_mul_mont(out->words, out->words, table[kChain[i].i]);
571	}
572	}
573
574	static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group,
575	EC_SCALAR *out,
576	const EC_SCALAR *in) {
577	if ((OPENSSL_ia32cap_get()[`1`] & (`1` << `28`)) == `0`) {
578	// No AVX support; fallback to generic code.
579	return ec_GFp_simple_mont_inv_mod_ord_vartime(group, out, in);
580	}
581
582	assert(group->order.width == P256_LIMBS);
583	if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) {
584	return `0`;
585	}
586
587	// The result should be returned in the Montgomery domain.
588	ec_scalar_to_montgomery(group, out, out);
589	return `1`;
590	}
591
592	static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group,
593	const EC_RAW_POINT *p,
594	const EC_SCALAR *r) {
595	if (ec_GFp_simple_is_at_infinity(group, p)) {
596	return `0`;
597	}
598
599	assert(group->order.width == P256_LIMBS);
600	assert(group->field.width == P256_LIMBS);
601
602	// We wish to compare X/Z^2 with r. This is equivalent to comparing X with
603	// rZ^2. Note that X and Z are represented in Montgomery form, while r is*
604	// not.
605	BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS];
606	ecp_nistz256_mul_mont(Z2_mont, p->Z.words, p->Z.words);
607	ecp_nistz256_mul_mont(r_Z2, r->words, Z2_mont);
608	ecp_nistz256_from_mont(X, p->X.words);
609
610	if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == `0`) {
611	return `1`;
612	}
613
614	// During signing the x coefficient is reduced modulo the group order.
615	// Therefore there is a small possibility, less than 1/2^128, that group_order
616	// < p.x < P. in that case we need not only to compare against \|r\| but also to
617	// compare against r+group_order.
618	if (bn_less_than_words(r->words, group->field_minus_order.words,
619	P256_LIMBS)) {
620	// We can ignore the carry because: r + group_order < p < 2^256.
621	bn_add_words(r_Z2, r->words, group->order.d, P256_LIMBS);
622	ecp_nistz256_mul_mont(r_Z2, r_Z2, Z2_mont);
623	if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == `0`) {
624	return `1`;
625	}
626	}
627
628	return `0`;
629	}
630
631	DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
632	out->group_init = ec_GFp_mont_group_init;
633	out->group_finish = ec_GFp_mont_group_finish;
634	out->group_set_curve = ec_GFp_mont_group_set_curve;
635	out->point_get_affine_coordinates = ecp_nistz256_get_affine;
636	out->add = ecp_nistz256_add;
637	out->dbl = ecp_nistz256_dbl;
638	out->mul = ecp_nistz256_point_mul;
639	out->mul_base = ecp_nistz256_point_mul_base;
640	out->mul_public = ecp_nistz256_points_mul_public;
641	out->felem_mul = ec_GFp_mont_felem_mul;
642	out->felem_sqr = ec_GFp_mont_felem_sqr;
643	out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
644	out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
645	out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
646	out->scalar_inv_montgomery_vartime = ecp_nistz256_mont_inv_mod_ord_vartime;
647	out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
648	}
649
650	#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
651	!defined(OPENSSL_SMALL) */
652

Browse the source code of engine/third_party/boringssl/src/crypto/fipsmodule/ec/p256-x86_64.c