p224-64.c source code [engine/third_party/boringssl/src/crypto/fipsmodule/ec/p224-64.c]

/* Copyright (c) 2015, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
15	// A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
16	//
17	// Inspired by Daniel J. Bernstein's public domain nistp224 implementation
18	// and Adam Langley's public domain 64-bit C implementation of curve25519.
19
20	#include <openssl/base.h>
21
22	#include <openssl/bn.h>
23	#include <openssl/ec.h>
24	#include <openssl/err.h>
25	#include <openssl/mem.h>
26
27	#include <string.h>
28
29	#include "internal.h"
30	#include "../delocate.h"
31	#include "../../internal.h"
32
33
34	#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL)
35
// Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
// using 64-bit coefficients called 'limbs', and sometimes (for multiplication
// results) as b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 +
// 2^336*b_6 using 128-bit coefficients called 'widelimbs'. A 4-p224_limb
// representation is a 'p224_felem'; a 7-p224_widelimb representation is a
// 'p224_widefelem'. Even within felems, bits of adjacent limbs overlap, and we
// don't always reduce the representations: we ensure that inputs to each
// p224_felem multiplication satisfy a_i < 2^60, so outputs satisfy b_i <
// 4*2^60*2^60, and fit into a 128-bit word without overflow. The coefficients
// are then again partially reduced to obtain a p224_felem satisfying a_i <
// 2^57. We only reduce to the unique minimal representation at the end of the
// computation.
48
49	typedef uint64_t p224_limb;
50	typedef uint128_t p224_widelimb;
51
52	typedef p224_limb p224_felem[`4`];
53	typedef p224_widelimb p224_widefelem[`7`];
54
// Field element represented as a byte array. 28*8 = 224 bits is also the
// group order size for the elliptic curve, and we also use this type for
// scalars for point multiplication.
typedef uint8_t p224_felem_bytearray[28];
59
60	// Precomputed multiples of the standard generator
61	// Points are given in coordinates (X, Y, Z) where Z normally is 1
62	// (0 for the point at infinity).
63	// For each field element, slice a_0 is word 0, etc.
64	//
65	// The table has 2 16 elements, starting with the following:*
66	// index \| bits \| point
67	// ------+---------+------------------------------
68	// 0 \| 0 0 0 0 \| 0G
69	// 1 \| 0 0 0 1 \| 1G
70	// 2 \| 0 0 1 0 \| 2^56G
71	// 3 \| 0 0 1 1 \| (2^56 + 1)G
72	// 4 \| 0 1 0 0 \| 2^112G
73	// 5 \| 0 1 0 1 \| (2^112 + 1)G
74	// 6 \| 0 1 1 0 \| (2^112 + 2^56)G
75	// 7 \| 0 1 1 1 \| (2^112 + 2^56 + 1)G
76	// 8 \| 1 0 0 0 \| 2^168G
77	// 9 \| 1 0 0 1 \| (2^168 + 1)G
78	// 10 \| 1 0 1 0 \| (2^168 + 2^56)G
79	// 11 \| 1 0 1 1 \| (2^168 + 2^56 + 1)G
80	// 12 \| 1 1 0 0 \| (2^168 + 2^112)G
81	// 13 \| 1 1 0 1 \| (2^168 + 2^112 + 1)G
82	// 14 \| 1 1 1 0 \| (2^168 + 2^112 + 2^56)G
83	// 15 \| 1 1 1 1 \| (2^168 + 2^112 + 2^56 + 1)G
84	// followed by a copy of this with each element multiplied by 2^28.
85	//
86	// The reason for this is so that we can clock bits into four different
87	// locations when doing simple scalar multiplies against the base point,
88	// and then another four locations using the second 16 elements.
89	static const p224_felem g_p224_pre_comp[`2`][`16`][`3`] = {
90	{{{`0`, `0`, `0`, `0`}, {`0`, `0`, `0`, `0`}, {`0`, `0`, `0`, `0`}},
91	{{`0x3280d6115c1d21`, `0xc1d356c2112234`, `0x7f321390b94a03`, `0xb70e0cbd6bb4bf`},
92	{`0xd5819985007e34`, `0x75a05a07476444`, `0xfb4c22dfe6cd43`, `0xbd376388b5f723`},
93	{`1`, `0`, `0`, `0`}},
94	{{`0xfd9675666ebbe9`, `0xbca7664d40ce5e`, `0x2242df8d8a2a43`, `0x1f49bbb0f99bc5`},
95	{`0x29e0b892dc9c43`, `0xece8608436e662`, `0xdc858f185310d0`, `0x9812dd4eb8d321`},
96	{`1`, `0`, `0`, `0`}},
97	{{`0x6d3e678d5d8eb8`, `0x559eed1cb362f1`, `0x16e9a3bbce8a3f`, `0xeedcccd8c2a748`},
98	{`0xf19f90ed50266d`, `0xabf2b4bf65f9df`, `0x313865468fafec`, `0x5cb379ba910a17`},
99	{`1`, `0`, `0`, `0`}},
100	{{`0x0641966cab26e3`, `0x91fb2991fab0a0`, `0xefec27a4e13a0b`, `0x0499aa8a5f8ebe`},
101	{`0x7510407766af5d`, `0x84d929610d5450`, `0x81d77aae82f706`, `0x6916f6d4338c5b`},
102	{`1`, `0`, `0`, `0`}},
103	{{`0xea95ac3b1f15c6`, `0x086000905e82d4`, `0xdd323ae4d1c8b1`, `0x932b56be7685a3`},
104	{`0x9ef93dea25dbbf`, `0x41665960f390f0`, `0xfdec76dbe2a8a7`, `0x523e80f019062a`},
105	{`1`, `0`, `0`, `0`}},
106	{{`0x822fdd26732c73`, `0xa01c83531b5d0f`, `0x363f37347c1ba4`, `0xc391b45c84725c`},
107	{`0xbbd5e1b2d6ad24`, `0xddfbcde19dfaec`, `0xc393da7e222a7f`, `0x1efb7890ede244`},
108	{`1`, `0`, `0`, `0`}},
109	{{`0x4c9e90ca217da1`, `0xd11beca79159bb`, `0xff8d33c2c98b7c`, `0x2610b39409f849`},
110	{`0x44d1352ac64da0`, `0xcdbb7b2c46b4fb`, `0x966c079b753c89`, `0xfe67e4e820b112`},
111	{`1`, `0`, `0`, `0`}},
112	{{`0xe28cae2df5312d`, `0xc71b61d16f5c6e`, `0x79b7619a3e7c4c`, `0x05c73240899b47`},
113	{`0x9f7f6382c73e3a`, `0x18615165c56bda`, `0x641fab2116fd56`, `0x72855882b08394`},
114	{`1`, `0`, `0`, `0`}},
115	{{`0x0469182f161c09`, `0x74a98ca8d00fb5`, `0xb89da93489a3e0`, `0x41c98768fb0c1d`},
116	{`0xe5ea05fb32da81`, `0x3dce9ffbca6855`, `0x1cfe2d3fbf59e6`, `0x0e5e03408738a7`},
117	{`1`, `0`, `0`, `0`}},
118	{{`0xdab22b2333e87f`, `0x4430137a5dd2f6`, `0xe03ab9f738beb8`, `0xcb0c5d0dc34f24`},
119	{`0x764a7df0c8fda5`, `0x185ba5c3fa2044`, `0x9281d688bcbe50`, `0xc40331df893881`},
120	{`1`, `0`, `0`, `0`}},
121	{{`0xb89530796f0f60`, `0xade92bd26909a3`, `0x1a0c83fb4884da`, `0x1765bf22a5a984`},
122	{`0x772a9ee75db09e`, `0x23bc6c67cec16f`, `0x4c1edba8b14e2f`, `0xe2a215d9611369`},
123	{`1`, `0`, `0`, `0`}},
124	{{`0x571e509fb5efb3`, `0xade88696410552`, `0xc8ae85fada74fe`, `0x6c7e4be83bbde3`},
125	{`0xff9f51160f4652`, `0xb47ce2495a6539`, `0xa2946c53b582f4`, `0x286d2db3ee9a60`},
126	{`1`, `0`, `0`, `0`}},
127	{{`0x40bbd5081a44af`, `0x0995183b13926c`, `0xbcefba6f47f6d0`, `0x215619e9cc0057`},
128	{`0x8bc94d3b0df45e`, `0xf11c54a3694f6f`, `0x8631b93cdfe8b5`, `0xe7e3f4b0982db9`},
129	{`1`, `0`, `0`, `0`}},
130	{{`0xb17048ab3e1c7b`, `0xac38f36ff8a1d8`, `0x1c29819435d2c6`, `0xc813132f4c07e9`},
131	{`0x2891425503b11f`, `0x08781030579fea`, `0xf5426ba5cc9674`, `0x1e28ebf18562bc`},
132	{`1`, `0`, `0`, `0`}},
133	{{`0x9f31997cc864eb`, `0x06cd91d28b5e4c`, `0xff17036691a973`, `0xf1aef351497c58`},
134	{`0xdd1f2d600564ff`, `0xdead073b1402db`, `0x74a684435bd693`, `0xeea7471f962558`},
135	{`1`, `0`, `0`, `0`}}},
136	{{{`0`, `0`, `0`, `0`}, {`0`, `0`, `0`, `0`}, {`0`, `0`, `0`, `0`}},
137	{{`0x9665266dddf554`, `0x9613d78b60ef2d`, `0xce27a34cdba417`, `0xd35ab74d6afc31`},
138	{`0x85ccdd22deb15e`, `0x2137e5783a6aab`, `0xa141cffd8c93c6`, `0x355a1830e90f2d`},
139	{`1`, `0`, `0`, `0`}},
140	{{`0x1a494eadaade65`, `0xd6da4da77fe53c`, `0xe7992996abec86`, `0x65c3553c6090e3`},
141	{`0xfa610b1fb09346`, `0xf1c6540b8a4aaf`, `0xc51a13ccd3cbab`, `0x02995b1b18c28a`},
142	{`1`, `0`, `0`, `0`}},
143	{{`0x7874568e7295ef`, `0x86b419fbe38d04`, `0xdc0690a7550d9a`, `0xd3966a44beac33`},
144	{`0x2b7280ec29132f`, `0xbeaa3b6a032df3`, `0xdc7dd88ae41200`, `0xd25e2513e3a100`},
145	{`1`, `0`, `0`, `0`}},
146	{{`0x924857eb2efafd`, `0xac2bce41223190`, `0x8edaa1445553fc`, `0x825800fd3562d5`},
147	{`0x8d79148ea96621`, `0x23a01c3dd9ed8d`, `0xaf8b219f9416b5`, `0xd8db0cc277daea`},
148	{`1`, `0`, `0`, `0`}},
149	{{`0x76a9c3b1a700f0`, `0xe9acd29bc7e691`, `0x69212d1a6b0327`, `0x6322e97fe154be`},
150	{`0x469fc5465d62aa`, `0x8d41ed18883b05`, `0x1f8eae66c52b88`, `0xe4fcbe9325be51`},
151	{`1`, `0`, `0`, `0`}},
152	{{`0x825fdf583cac16`, `0x020b857c7b023a`, `0x683c17744b0165`, `0x14ffd0a2daf2f1`},
153	{`0x323b36184218f9`, `0x4944ec4e3b47d4`, `0xc15b3080841acf`, `0x0bced4b01a28bb`},
154	{`1`, `0`, `0`, `0`}},
155	{{`0x92ac22230df5c4`, `0x52f33b4063eda8`, `0xcb3f19870c0c93`, `0x40064f2ba65233`},
156	{`0xfe16f0924f8992`, `0x012da25af5b517`, `0x1a57bb24f723a6`, `0x06f8bc76760def`},
157	{`1`, `0`, `0`, `0`}},
158	{{`0x4a7084f7817cb9`, `0xbcab0738ee9a78`, `0x3ec11e11d9c326`, `0xdc0fe90e0f1aae`},
159	{`0xcf639ea5f98390`, `0x5c350aa22ffb74`, `0x9afae98a4047b7`, `0x956ec2d617fc45`},
160	{`1`, `0`, `0`, `0`}},
161	{{`0x4306d648c1be6a`, `0x9247cd8bc9a462`, `0xf5595e377d2f2e`, `0xbd1c3caff1a52e`},
162	{`0x045e14472409d0`, `0x29f3e17078f773`, `0x745a602b2d4f7d`, `0x191837685cdfbb`},
163	{`1`, `0`, `0`, `0`}},
164	{{`0x5b6ee254a8cb79`, `0x4953433f5e7026`, `0xe21faeb1d1def4`, `0xc4c225785c09de`},
165	{`0x307ce7bba1e518`, `0x31b125b1036db8`, `0x47e91868839e8f`, `0xc765866e33b9f3`},
166	{`1`, `0`, `0`, `0`}},
167	{{`0x3bfece24f96906`, `0x4794da641e5093`, `0xde5df64f95db26`, `0x297ecd89714b05`},
168	{`0x701bd3ebb2c3aa`, `0x7073b4f53cb1d5`, `0x13c5665658af16`, `0x9895089d66fe58`},
169	{`1`, `0`, `0`, `0`}},
170	{{`0x0fef05f78c4790`, `0x2d773633b05d2e`, `0x94229c3a951c94`, `0xbbbd70df4911bb`},
171	{`0xb2c6963d2c1168`, `0x105f47a72b0d73`, `0x9fdf6111614080`, `0x7b7e94b39e67b0`},
172	{`1`, `0`, `0`, `0`}},
173	{{`0xad1a7d6efbe2b3`, `0xf012482c0da69d`, `0x6b3bdf12438345`, `0x40d7558d7aa4d9`},
174	{`0x8a09fffb5c6d3d`, `0x9a356e5d9ffd38`, `0x5973f15f4f9b1c`, `0xdcd5f59f63c3ea`},
175	{`1`, `0`, `0`, `0`}},
176	{{`0xacf39f4c5ca7ab`, `0x4c8071cc5fd737`, `0xc64e3602cd1184`, `0x0acd4644c9abba`},
177	{`0x6c011a36d8bf6e`, `0xfecd87ba24e32a`, `0x19f6f56574fad8`, `0x050b204ced9405`},
178	{`1`, `0`, `0`, `0`}},
179	{{`0xed4f1cae7d9a96`, `0x5ceef7ad94c40a`, `0x778e4a3bf3ef9b`, `0x7405783dc3b55e`},
180	{`0x32477c61b6e8c6`, `0xb46a97570f018b`, `0x91176d0a7e95d1`, `0x3df90fbc4c7d0e`},
181	{`1`, `0`, `0`, `0`}}}};
182
// Returns the eight bytes at |in| reinterpreted as a uint64_t in the host's
// native byte order. The value is copied byte-wise into a local rather than
// read through a casted pointer.
static uint64_t p224_load_u64(const uint8_t in[8]) {
  uint64_t ret;
  OPENSSL_memcpy(&ret, in, sizeof(ret));
  return ret;
}
188
189	// Helper functions to convert field elements to/from internal representation
190	static void p224_bin28_to_felem(p224_felem out, const uint8_t in[`28`]) {
191	out[`0`] = p224_load_u64(in) & `0x00ffffffffffffff`;
192	out[`1`] = p224_load_u64(in + `7`) & `0x00ffffffffffffff`;
193	out[`2`] = p224_load_u64(in + `14`) & `0x00ffffffffffffff`;
194	out[`3`] = p224_load_u64(in + `20`) >> `8`;
195	}
196
197	static void p224_felem_to_bin28(uint8_t out[`28`], const p224_felem in) {
198	for (size_t i = `0`; i < `7`; ++i) {
199	out[i] = in[`0`] >> (`8` * i);
200	out[i + `7`] = in[`1`] >> (`8` * i);
201	out[i + `14`] = in[`2`] >> (`8` * i);
202	out[i + `21`] = in[`3`] >> (`8` * i);
203	}
204	}
205
206	static void p224_generic_to_felem(p224_felem out, const EC_FELEM *in) {
207	p224_bin28_to_felem(out, in->bytes);
208	}
209
210	// Requires 0 <= in < 2p (always call p224_felem_reduce first)*
211	static void p224_felem_to_generic(EC_FELEM out, const* p224_felem in) {
212	// Reduce to unique minimal representation.
213	static const int64_t two56 = ((p224_limb)`1`) << `56`;
214	// 0 <= in < 2p, p = 2^224 - 2^96 + 1*
215	// if in > p , reduce in = in - 2^224 + 2^96 - 1
216	int64_t tmp[`4`], a;
217	tmp[`0`] = in[`0`];
218	tmp[`1`] = in[`1`];
219	tmp[`2`] = in[`2`];
220	tmp[`3`] = in[`3`];
221	// Case 1: a = 1 iff in >= 2^224
222	a = (in[`3`] >> `56`);
223	tmp[`0`] -= a;
224	tmp[`1`] += a << `40`;
225	tmp[`3`] &= `0x00ffffffffffffff`;
226	// Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1 and
227	// the lower part is non-zero
228	a = ((in[`3`] & in[`2`] & (in[`1`] \| `0x000000ffffffffff`)) + `1`) \|
229	(((int64_t)(in[`0`] + (in[`1`] & `0x000000ffffffffff`)) - `1`) >> `63`);
230	a &= `0x00ffffffffffffff`;
231	// turn a into an all-one mask (if a = 0) or an all-zero mask
232	a = (a - `1`) >> `63`;
233	// subtract 2^224 - 2^96 + 1 if a is all-one
234	tmp[`3`] &= a ^ `0xffffffffffffffff`;
235	tmp[`2`] &= a ^ `0xffffffffffffffff`;
236	tmp[`1`] &= (a ^ `0xffffffffffffffff`) \| `0x000000ffffffffff`;
237	tmp[`0`] -= `1` & a;
238
239	// eliminate negative coefficients: if tmp[0] is negative, tmp[1] must
240	// be non-zero, so we only need one step
241	a = tmp[`0`] >> `63`;
242	tmp[`0`] += two56 & a;
243	tmp[`1`] -= `1` & a;
244
245	// carry 1 -> 2 -> 3
246	tmp[`2`] += tmp[`1`] >> `56`;
247	tmp[`1`] &= `0x00ffffffffffffff`;
248
249	tmp[`3`] += tmp[`2`] >> `56`;
250	tmp[`2`] &= `0x00ffffffffffffff`;
251
252	// Now 0 <= tmp < p
253	p224_felem tmp2;
254	tmp2[`0`] = tmp[`0`];
255	tmp2[`1`] = tmp[`1`];
256	tmp2[`2`] = tmp[`2`];
257	tmp2[`3`] = tmp[`3`];
258
259	p224_felem_to_bin28(out->bytes, tmp2);
260	// 224 is not a multiple of 64, so zero the remaining bytes.
261	OPENSSL_memset(out->bytes + `28`, `0`, `32` - `28`);
262	}
263
264
265	// Field operations, using the internal representation of field elements.
266	// NB! These operations are specific to our point multiplication and cannot be
267	// expected to be correct in general - e.g., multiplication with a large scalar
268	// will cause an overflow.
269
270	static void p224_felem_assign(p224_felem out, const p224_felem in) {
271	out[`0`] = in[`0`];
272	out[`1`] = in[`1`];
273	out[`2`] = in[`2`];
274	out[`3`] = in[`3`];
275	}
276
277	// Sum two field elements: out += in
278	static void p224_felem_sum(p224_felem out, const p224_felem in) {
279	out[`0`] += in[`0`];
280	out[`1`] += in[`1`];
281	out[`2`] += in[`2`];
282	out[`3`] += in[`3`];
283	}
284
285	// Subtract field elements: out -= in
286	// Assumes in[i] < 2^57
287	static void p224_felem_diff(p224_felem out, const p224_felem in) {
288	static const p224_limb two58p2 =
289	(((p224_limb)`1`) << `58`) + (((p224_limb)`1`) << `2`);
290	static const p224_limb two58m2 =
291	(((p224_limb)`1`) << `58`) - (((p224_limb)`1`) << `2`);
292	static const p224_limb two58m42m2 =
293	(((p224_limb)`1`) << `58`) - (((p224_limb)`1`) << `42`) - (((p224_limb)`1`) << `2`);
294
295	// Add 0 mod 2^224-2^96+1 to ensure out > in
296	out[`0`] += two58p2;
297	out[`1`] += two58m42m2;
298	out[`2`] += two58m2;
299	out[`3`] += two58m2;
300
301	out[`0`] -= in[`0`];
302	out[`1`] -= in[`1`];
303	out[`2`] -= in[`2`];
304	out[`3`] -= in[`3`];
305	}
306
307	// Subtract in unreduced 128-bit mode: out -= in
308	// Assumes in[i] < 2^119
309	static void p224_widefelem_diff(p224_widefelem out, const p224_widefelem in) {
310	static const p224_widelimb two120 = ((p224_widelimb)`1`) << `120`;
311	static const p224_widelimb two120m64 =
312	(((p224_widelimb)`1`) << `120`) - (((p224_widelimb)`1`) << `64`);
313	static const p224_widelimb two120m104m64 = (((p224_widelimb)`1`) << `120`) -
314	(((p224_widelimb)`1`) << `104`) -
315	(((p224_widelimb)`1`) << `64`);
316
317	// Add 0 mod 2^224-2^96+1 to ensure out > in
318	out[`0`] += two120;
319	out[`1`] += two120m64;
320	out[`2`] += two120m64;
321	out[`3`] += two120;
322	out[`4`] += two120m104m64;
323	out[`5`] += two120m64;
324	out[`6`] += two120m64;
325
326	out[`0`] -= in[`0`];
327	out[`1`] -= in[`1`];
328	out[`2`] -= in[`2`];
329	out[`3`] -= in[`3`];
330	out[`4`] -= in[`4`];
331	out[`5`] -= in[`5`];
332	out[`6`] -= in[`6`];
333	}
334
335	// Subtract in mixed mode: out128 -= in64
336	// in[i] < 2^63
337	static void p224_felem_diff_128_64(p224_widefelem out, const p224_felem in) {
338	static const p224_widelimb two64p8 =
339	(((p224_widelimb)`1`) << `64`) + (((p224_widelimb)`1`) << `8`);
340	static const p224_widelimb two64m8 =
341	(((p224_widelimb)`1`) << `64`) - (((p224_widelimb)`1`) << `8`);
342	static const p224_widelimb two64m48m8 = (((p224_widelimb)`1`) << `64`) -
343	(((p224_widelimb)`1`) << `48`) -
344	(((p224_widelimb)`1`) << `8`);
345
346	// Add 0 mod 2^224-2^96+1 to ensure out > in
347	out[`0`] += two64p8;
348	out[`1`] += two64m48m8;
349	out[`2`] += two64m8;
350	out[`3`] += two64m8;
351
352	out[`0`] -= in[`0`];
353	out[`1`] -= in[`1`];
354	out[`2`] -= in[`2`];
355	out[`3`] -= in[`3`];
356	}
357
358	// Multiply a field element by a scalar: out = out scalar*
359	// The scalars we actually use are small, so results fit without overflow
360	static void p224_felem_scalar(p224_felem out, const p224_limb scalar) {
361	out[`0`] *= scalar;
362	out[`1`] *= scalar;
363	out[`2`] *= scalar;
364	out[`3`] *= scalar;
365	}
366
367	// Multiply an unreduced field element by a scalar: out = out scalar*
368	// The scalars we actually use are small, so results fit without overflow
369	static void p224_widefelem_scalar(p224_widefelem out,
370	const p224_widelimb scalar) {
371	out[`0`] *= scalar;
372	out[`1`] *= scalar;
373	out[`2`] *= scalar;
374	out[`3`] *= scalar;
375	out[`4`] *= scalar;
376	out[`5`] *= scalar;
377	out[`6`] *= scalar;
378	}
379
380	// Square a field element: out = in^2
381	static void p224_felem_square(p224_widefelem out, const p224_felem in) {
382	p224_limb tmp0, tmp1, tmp2;
383	tmp0 = `2` * in[`0`];
384	tmp1 = `2` * in[`1`];
385	tmp2 = `2` * in[`2`];
386	out[`0`] = ((p224_widelimb)in[`0`]) * in[`0`];
387	out[`1`] = ((p224_widelimb)in[`0`]) * tmp1;
388	out[`2`] = ((p224_widelimb)in[`0`]) * tmp2 + ((p224_widelimb)in[`1`]) * in[`1`];
389	out[`3`] = ((p224_widelimb)in[`3`]) * tmp0 + ((p224_widelimb)in[`1`]) * tmp2;
390	out[`4`] = ((p224_widelimb)in[`3`]) * tmp1 + ((p224_widelimb)in[`2`]) * in[`2`];
391	out[`5`] = ((p224_widelimb)in[`3`]) * tmp2;
392	out[`6`] = ((p224_widelimb)in[`3`]) * in[`3`];
393	}
394
395	// Multiply two field elements: out = in1 in2*
396	static void p224_felem_mul(p224_widefelem out, const p224_felem in1,
397	const p224_felem in2) {
398	out[`0`] = ((p224_widelimb)in1[`0`]) * in2[`0`];
399	out[`1`] = ((p224_widelimb)in1[`0`]) * in2[`1`] + ((p224_widelimb)in1[`1`]) * in2[`0`];
400	out[`2`] = ((p224_widelimb)in1[`0`]) * in2[`2`] + ((p224_widelimb)in1[`1`]) * in2[`1`] +
401	((p224_widelimb)in1[`2`]) * in2[`0`];
402	out[`3`] = ((p224_widelimb)in1[`0`]) * in2[`3`] + ((p224_widelimb)in1[`1`]) * in2[`2`] +
403	((p224_widelimb)in1[`2`]) * in2[`1`] + ((p224_widelimb)in1[`3`]) * in2[`0`];
404	out[`4`] = ((p224_widelimb)in1[`1`]) * in2[`3`] + ((p224_widelimb)in1[`2`]) * in2[`2`] +
405	((p224_widelimb)in1[`3`]) * in2[`1`];
406	out[`5`] = ((p224_widelimb)in1[`2`]) * in2[`3`] + ((p224_widelimb)in1[`3`]) * in2[`2`];
407	out[`6`] = ((p224_widelimb)in1[`3`]) * in2[`3`];
408	}
409
410	// Reduce seven 128-bit coefficients to four 64-bit coefficients.
411	// Requires in[i] < 2^126,
412	// ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
413	static void p224_felem_reduce(p224_felem out, const p224_widefelem in) {
414	static const p224_widelimb two127p15 =
415	(((p224_widelimb)`1`) << `127`) + (((p224_widelimb)`1`) << `15`);
416	static const p224_widelimb two127m71 =
417	(((p224_widelimb)`1`) << `127`) - (((p224_widelimb)`1`) << `71`);
418	static const p224_widelimb two127m71m55 = (((p224_widelimb)`1`) << `127`) -
419	(((p224_widelimb)`1`) << `71`) -
420	(((p224_widelimb)`1`) << `55`);
421	p224_widelimb output[`5`];
422
423	// Add 0 mod 2^224-2^96+1 to ensure all differences are positive
424	output[`0`] = in[`0`] + two127p15;
425	output[`1`] = in[`1`] + two127m71m55;
426	output[`2`] = in[`2`] + two127m71;
427	output[`3`] = in[`3`];
428	output[`4`] = in[`4`];
429
430	// Eliminate in[4], in[5], in[6]
431	output[`4`] += in[`6`] >> `16`;
432	output[`3`] += (in[`6`] & `0xffff`) << `40`;
433	output[`2`] -= in[`6`];
434
435	output[`3`] += in[`5`] >> `16`;
436	output[`2`] += (in[`5`] & `0xffff`) << `40`;
437	output[`1`] -= in[`5`];
438
439	output[`2`] += output[`4`] >> `16`;
440	output[`1`] += (output[`4`] & `0xffff`) << `40`;
441	output[`0`] -= output[`4`];
442
443	// Carry 2 -> 3 -> 4
444	output[`3`] += output[`2`] >> `56`;
445	output[`2`] &= `0x00ffffffffffffff`;
446
447	output[`4`] = output[`3`] >> `56`;
448	output[`3`] &= `0x00ffffffffffffff`;
449
450	// Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72
451
452	// Eliminate output[4]
453	output[`2`] += output[`4`] >> `16`;
454	// output[2] < 2^56 + 2^56 = 2^57
455	output[`1`] += (output[`4`] & `0xffff`) << `40`;
456	output[`0`] -= output[`4`];
457
458	// Carry 0 -> 1 -> 2 -> 3
459	output[`1`] += output[`0`] >> `56`;
460	out[`0`] = output[`0`] & `0x00ffffffffffffff`;
461
462	output[`2`] += output[`1`] >> `56`;
463	// output[2] < 2^57 + 2^72
464	out[`1`] = output[`1`] & `0x00ffffffffffffff`;
465	output[`3`] += output[`2`] >> `56`;
466	// output[3] <= 2^56 + 2^16
467	out[`2`] = output[`2`] & `0x00ffffffffffffff`;
468
469	// out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
470	// out[3] <= 2^56 + 2^16 (due to final carry),
471	// so out < 2p*
472	out[`3`] = output[`3`];
473	}
474
475	// Get negative value: out = -in
476	// Requires in[i] < 2^63,
477	// ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
478	static void p224_felem_neg(p224_felem out, const p224_felem in) {
479	p224_widefelem tmp = {`0`};
480	p224_felem_diff_128_64(tmp, in);
481	p224_felem_reduce(out, tmp);
482	}
483
484	// Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
485	// elements are reduced to in < 2^225, so we only need to check three cases: 0,
486	// 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
487	static p224_limb p224_felem_is_zero(const p224_felem in) {
488	p224_limb zero = in[`0`] \| in[`1`] \| in[`2`] \| in[`3`];
489	zero = (((int64_t)(zero)-`1`) >> `63`) & `1`;
490
491	p224_limb two224m96p1 = (in[`0`] ^ `1`) \| (in[`1`] ^ `0x00ffff0000000000`) \|
492	(in[`2`] ^ `0x00ffffffffffffff`) \|
493	(in[`3`] ^ `0x00ffffffffffffff`);
494	two224m96p1 = (((int64_t)(two224m96p1)-`1`) >> `63`) & `1`;
495	p224_limb two225m97p2 = (in[`0`] ^ `2`) \| (in[`1`] ^ `0x00fffe0000000000`) \|
496	(in[`2`] ^ `0x00ffffffffffffff`) \|
497	(in[`3`] ^ `0x01ffffffffffffff`);
498	two225m97p2 = (((int64_t)(two225m97p2)-`1`) >> `63`) & `1`;
499	return (zero \| two224m96p1 \| two225m97p2);
500	}
501
502	// Invert a field element
503	// Computation chain copied from djb's code
504	static void p224_felem_inv(p224_felem out, const p224_felem in) {
505	p224_felem ftmp, ftmp2, ftmp3, ftmp4;
506	p224_widefelem tmp;
507
508	p224_felem_square(tmp, in);
509	p224_felem_reduce(ftmp, tmp); // 2
510	p224_felem_mul(tmp, in, ftmp);
511	p224_felem_reduce(ftmp, tmp); // 2^2 - 1
512	p224_felem_square(tmp, ftmp);
513	p224_felem_reduce(ftmp, tmp); // 2^3 - 2
514	p224_felem_mul(tmp, in, ftmp);
515	p224_felem_reduce(ftmp, tmp); // 2^3 - 1
516	p224_felem_square(tmp, ftmp);
517	p224_felem_reduce(ftmp2, tmp); // 2^4 - 2
518	p224_felem_square(tmp, ftmp2);
519	p224_felem_reduce(ftmp2, tmp); // 2^5 - 4
520	p224_felem_square(tmp, ftmp2);
521	p224_felem_reduce(ftmp2, tmp); // 2^6 - 8
522	p224_felem_mul(tmp, ftmp2, ftmp);
523	p224_felem_reduce(ftmp, tmp); // 2^6 - 1
524	p224_felem_square(tmp, ftmp);
525	p224_felem_reduce(ftmp2, tmp); // 2^7 - 2
526	for (size_t i = `0`; i < `5`; ++i) { // 2^12 - 2^6
527	p224_felem_square(tmp, ftmp2);
528	p224_felem_reduce(ftmp2, tmp);
529	}
530	p224_felem_mul(tmp, ftmp2, ftmp);
531	p224_felem_reduce(ftmp2, tmp); // 2^12 - 1
532	p224_felem_square(tmp, ftmp2);
533	p224_felem_reduce(ftmp3, tmp); // 2^13 - 2
534	for (size_t i = `0`; i < `11`; ++i) { // 2^24 - 2^12
535	p224_felem_square(tmp, ftmp3);
536	p224_felem_reduce(ftmp3, tmp);
537	}
538	p224_felem_mul(tmp, ftmp3, ftmp2);
539	p224_felem_reduce(ftmp2, tmp); // 2^24 - 1
540	p224_felem_square(tmp, ftmp2);
541	p224_felem_reduce(ftmp3, tmp); // 2^25 - 2
542	for (size_t i = `0`; i < `23`; ++i) { // 2^48 - 2^24
543	p224_felem_square(tmp, ftmp3);
544	p224_felem_reduce(ftmp3, tmp);
545	}
546	p224_felem_mul(tmp, ftmp3, ftmp2);
547	p224_felem_reduce(ftmp3, tmp); // 2^48 - 1
548	p224_felem_square(tmp, ftmp3);
549	p224_felem_reduce(ftmp4, tmp); // 2^49 - 2
550	for (size_t i = `0`; i < `47`; ++i) { // 2^96 - 2^48
551	p224_felem_square(tmp, ftmp4);
552	p224_felem_reduce(ftmp4, tmp);
553	}
554	p224_felem_mul(tmp, ftmp3, ftmp4);
555	p224_felem_reduce(ftmp3, tmp); // 2^96 - 1
556	p224_felem_square(tmp, ftmp3);
557	p224_felem_reduce(ftmp4, tmp); // 2^97 - 2
558	for (size_t i = `0`; i < `23`; ++i) { // 2^120 - 2^24
559	p224_felem_square(tmp, ftmp4);
560	p224_felem_reduce(ftmp4, tmp);
561	}
562	p224_felem_mul(tmp, ftmp2, ftmp4);
563	p224_felem_reduce(ftmp2, tmp); // 2^120 - 1
564	for (size_t i = `0`; i < `6`; ++i) { // 2^126 - 2^6
565	p224_felem_square(tmp, ftmp2);
566	p224_felem_reduce(ftmp2, tmp);
567	}
568	p224_felem_mul(tmp, ftmp2, ftmp);
569	p224_felem_reduce(ftmp, tmp); // 2^126 - 1
570	p224_felem_square(tmp, ftmp);
571	p224_felem_reduce(ftmp, tmp); // 2^127 - 2
572	p224_felem_mul(tmp, ftmp, in);
573	p224_felem_reduce(ftmp, tmp); // 2^127 - 1
574	for (size_t i = `0`; i < `97`; ++i) { // 2^224 - 2^97
575	p224_felem_square(tmp, ftmp);
576	p224_felem_reduce(ftmp, tmp);
577	}
578	p224_felem_mul(tmp, ftmp, ftmp3);
579	p224_felem_reduce(out, tmp); // 2^224 - 2^96 - 1
580	}
581
582	// Copy in constant time:
583	// if icopy == 1, copy in to out,
584	// if icopy == 0, copy out to itself.
585	static void p224_copy_conditional(p224_felem out, const p224_felem in,
586	p224_limb icopy) {
587	// icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
588	const p224_limb copy = -icopy;
589	for (size_t i = `0`; i < `4`; ++i) {
590	const p224_limb tmp = copy & (in[i] ^ out[i]);
591	out[i] ^= tmp;
592	}
593	}
594
595	// ELLIPTIC CURVE POINT OPERATIONS
596	//
597	// Points are represented in Jacobian projective coordinates:
598	// (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
599	// or to the point at infinity if Z == 0.
600
601	// Double an elliptic curve point:
602	// (X', Y', Z') = 2 (X, Y, Z), where*
603	// X' = (3 (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2*
604	// Y' = 3 (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2*
605	// Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 Y * Z*
606	// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
607	// while x_out == y_in is not (maybe this works, but it's not tested).
608	static void p224_point_double(p224_felem x_out, p224_felem y_out,
609	p224_felem z_out, const p224_felem x_in,
610	const p224_felem y_in, const p224_felem z_in) {
611	p224_widefelem tmp, tmp2;
612	p224_felem delta, gamma, beta, alpha, ftmp, ftmp2;
613
614	p224_felem_assign(ftmp, x_in);
615	p224_felem_assign(ftmp2, x_in);
616
617	// delta = z^2
618	p224_felem_square(tmp, z_in);
619	p224_felem_reduce(delta, tmp);
620
621	// gamma = y^2
622	p224_felem_square(tmp, y_in);
623	p224_felem_reduce(gamma, tmp);
624
625	// beta = xgamma*
626	p224_felem_mul(tmp, x_in, gamma);
627	p224_felem_reduce(beta, tmp);
628
629	// alpha = 3(x-delta)(x+delta)
630	p224_felem_diff(ftmp, delta);
631	// ftmp[i] < 2^57 + 2^58 + 2 < 2^59
632	p224_felem_sum(ftmp2, delta);
633	// ftmp2[i] < 2^57 + 2^57 = 2^58
634	p224_felem_scalar(ftmp2, `3`);
635	// ftmp2[i] < 3 2^58 < 2^60*
636	p224_felem_mul(tmp, ftmp, ftmp2);
637	// tmp[i] < 2^60 2^59 * 4 = 2^121*
638	p224_felem_reduce(alpha, tmp);
639
640	// x' = alpha^2 - 8beta*
641	p224_felem_square(tmp, alpha);
642	// tmp[i] < 4 2^57 * 2^57 = 2^116*
643	p224_felem_assign(ftmp, beta);
644	p224_felem_scalar(ftmp, `8`);
645	// ftmp[i] < 8 2^57 = 2^60*
646	p224_felem_diff_128_64(tmp, ftmp);
647	// tmp[i] < 2^116 + 2^64 + 8 < 2^117
648	p224_felem_reduce(x_out, tmp);
649
650	// z' = (y + z)^2 - gamma - delta
651	p224_felem_sum(delta, gamma);
652	// delta[i] < 2^57 + 2^57 = 2^58
653	p224_felem_assign(ftmp, y_in);
654	p224_felem_sum(ftmp, z_in);
655	// ftmp[i] < 2^57 + 2^57 = 2^58
656	p224_felem_square(tmp, ftmp);
657	// tmp[i] < 4 2^58 * 2^58 = 2^118*
658	p224_felem_diff_128_64(tmp, delta);
659	// tmp[i] < 2^118 + 2^64 + 8 < 2^119
660	p224_felem_reduce(z_out, tmp);
661
662	// y' = alpha(4beta - x') - 8gamma^2*
663	p224_felem_scalar(beta, `4`);
664	// beta[i] < 4 2^57 = 2^59*
665	p224_felem_diff(beta, x_out);
666	// beta[i] < 2^59 + 2^58 + 2 < 2^60
667	p224_felem_mul(tmp, alpha, beta);
668	// tmp[i] < 4 2^57 * 2^60 = 2^119*
669	p224_felem_square(tmp2, gamma);
670	// tmp2[i] < 4 2^57 * 2^57 = 2^116*
671	p224_widefelem_scalar(tmp2, `8`);
672	// tmp2[i] < 8 2^116 = 2^119*
673	p224_widefelem_diff(tmp, tmp2);
674	// tmp[i] < 2^119 + 2^120 < 2^121
675	p224_felem_reduce(y_out, tmp);
676	}
677
// Add two elliptic curve points:
// (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
// X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
//       2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
// Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 *
//       X_1)^2 - X_3) -
//       Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
// Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
686	//
687	// This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
688
689	// This function is not entirely constant-time: it includes a branch for
690	// checking whether the two input points are equal, (while not equal to the
691	// point at infinity). This case never happens during single point
692	// multiplication, so there is no timing leak for ECDH or ECDSA signing.
693	static void p224_point_add(p224_felem x3, p224_felem y3, p224_felem z3,
694	const p224_felem x1, const p224_felem y1,
695	const p224_felem z1, const int mixed,
696	const p224_felem x2, const p224_felem y2,
697	const p224_felem z2) {
698	p224_felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
699	p224_widefelem tmp, tmp2;
700	p224_limb z1_is_zero, z2_is_zero, x_equal, y_equal;
701
702	if (!mixed) {
703	// ftmp2 = z2^2
704	p224_felem_square(tmp, z2);
705	p224_felem_reduce(ftmp2, tmp);
706
707	// ftmp4 = z2^3
708	p224_felem_mul(tmp, ftmp2, z2);
709	p224_felem_reduce(ftmp4, tmp);
710
711	// ftmp4 = z2^3y1*
712	p224_felem_mul(tmp2, ftmp4, y1);
713	p224_felem_reduce(ftmp4, tmp2);
714
715	// ftmp2 = z2^2x1*
716	p224_felem_mul(tmp2, ftmp2, x1);
717	p224_felem_reduce(ftmp2, tmp2);
718	} else {
719	// We'll assume z2 = 1 (special case z2 = 0 is handled later)
720
721	// ftmp4 = z2^3y1*
722	p224_felem_assign(ftmp4, y1);
723
724	// ftmp2 = z2^2x1*
725	p224_felem_assign(ftmp2, x1);
726	}
727
728	// ftmp = z1^2
729	p224_felem_square(tmp, z1);
730	p224_felem_reduce(ftmp, tmp);
731
732	// ftmp3 = z1^3
733	p224_felem_mul(tmp, ftmp, z1);
734	p224_felem_reduce(ftmp3, tmp);
735
736	// tmp = z1^3y2*
737	p224_felem_mul(tmp, ftmp3, y2);
738	// tmp[i] < 4 2^57 * 2^57 = 2^116*
739
740	// ftmp3 = z1^3y2 - z2^3y1
741	p224_felem_diff_128_64(tmp, ftmp4);
742	// tmp[i] < 2^116 + 2^64 + 8 < 2^117
743	p224_felem_reduce(ftmp3, tmp);
744
745	// tmp = z1^2x2*
746	p224_felem_mul(tmp, ftmp, x2);
747	// tmp[i] < 4 2^57 * 2^57 = 2^116*
748
749	// ftmp = z1^2x2 - z2^2x1
750	p224_felem_diff_128_64(tmp, ftmp2);
751	// tmp[i] < 2^116 + 2^64 + 8 < 2^117
752	p224_felem_reduce(ftmp, tmp);
753
754	// the formulae are incorrect if the points are equal
755	// so we check for this and do doubling if this happens
756	x_equal = p224_felem_is_zero(ftmp);
757	y_equal = p224_felem_is_zero(ftmp3);
758	z1_is_zero = p224_felem_is_zero(z1);
759	z2_is_zero = p224_felem_is_zero(z2);
760	// In affine coordinates, (X_1, Y_1) == (X_2, Y_2)
761	p224_limb is_nontrivial_double =
762	x_equal & y_equal & (`1` - z1_is_zero) & (`1` - z2_is_zero);
763	if (is_nontrivial_double) {
764	p224_point_double(x3, y3, z3, x1, y1, z1);
765	return;
766	}
767
768	// ftmp5 = z1z2*
769	if (!mixed) {
770	p224_felem_mul(tmp, z1, z2);
771	p224_felem_reduce(ftmp5, tmp);
772	} else {
773	// special case z2 = 0 is handled later
774	p224_felem_assign(ftmp5, z1);
775	}
776
777	// z_out = (z1^2x2 - z2^2x1)(z1z2)
778	p224_felem_mul(tmp, ftmp, ftmp5);
779	p224_felem_reduce(z_out, tmp);
780
781	// ftmp = (z1^2x2 - z2^2x1)^2
782	p224_felem_assign(ftmp5, ftmp);
783	p224_felem_square(tmp, ftmp);
784	p224_felem_reduce(ftmp, tmp);
785
786	// ftmp5 = (z1^2x2 - z2^2x1)^3
787	p224_felem_mul(tmp, ftmp, ftmp5);
788	p224_felem_reduce(ftmp5, tmp);
789
790	// ftmp2 = z2^2x1(z1^2x2 - z2^2x1)^2
791	p224_felem_mul(tmp, ftmp2, ftmp);
792	p224_felem_reduce(ftmp2, tmp);
793
794	// tmp = z2^3y1(z1^2x2 - z2^2x1)^3
795	p224_felem_mul(tmp, ftmp4, ftmp5);
796	// tmp[i] < 4 2^57 * 2^57 = 2^116*
797
798	// tmp2 = (z1^3y2 - z2^3y1)^2
799	p224_felem_square(tmp2, ftmp3);
800	// tmp2[i] < 4 2^57 * 2^57 < 2^116*
801
802	// tmp2 = (z1^3y2 - z2^3y1)^2 - (z1^2x2 - z2^2x1)^3
803	p224_felem_diff_128_64(tmp2, ftmp5);
804	// tmp2[i] < 2^116 + 2^64 + 8 < 2^117
805
806	// ftmp5 = 2z2^2x1(z1^2x2 - z2^2x1)^2*
807	p224_felem_assign(ftmp5, ftmp2);
808	p224_felem_scalar(ftmp5, `2`);
809	// ftmp5[i] < 2 2^57 = 2^58*
810
811	/ x_out = (z1^3y2 - z2^3y1)^2 - (z1^2x2 - z2^2x1)^3 -*
812	2z2^2x1(z1^2x2 - z2^2x1)^2 /
813	p224_felem_diff_128_64(tmp2, ftmp5);
814	// tmp2[i] < 2^117 + 2^64 + 8 < 2^118
815	p224_felem_reduce(x_out, tmp2);
816
817	// ftmp2 = z2^2x1(z1^2x2 - z2^2x1)^2 - x_out
818	p224_felem_diff(ftmp2, x_out);
819	// ftmp2[i] < 2^57 + 2^58 + 2 < 2^59
820
821	// tmp2 = (z1^3y2 - z2^3y1)(z2^2x1(z1^2x2 - z2^2x1)^2 - x_out)*
822	p224_felem_mul(tmp2, ftmp3, ftmp2);
823	// tmp2[i] < 4 2^57 * 2^59 = 2^118*
824
825	/ y_out = (z1^3y2 - z2^3y1)(z2^2x1(z1^2x2 - z2^2x1)^2 - x_out) -
826	z2^3y1(z1^2x2 - z2^2x1)^3 /*
827	p224_widefelem_diff(tmp2, tmp);
828	// tmp2[i] < 2^118 + 2^120 < 2^121
829	p224_felem_reduce(y_out, tmp2);
830
831	// the result (x_out, y_out, z_out) is incorrect if one of the inputs is
832	// the point at infinity, so we need to check for this separately
833
834	// if point 1 is at infinity, copy point 2 to output, and vice versa
835	p224_copy_conditional(x_out, x2, z1_is_zero);
836	p224_copy_conditional(x_out, x1, z2_is_zero);
837	p224_copy_conditional(y_out, y2, z1_is_zero);
838	p224_copy_conditional(y_out, y1, z2_is_zero);
839	p224_copy_conditional(z_out, z2, z1_is_zero);
840	p224_copy_conditional(z_out, z1, z2_is_zero);
841	p224_felem_assign(x3, x_out);
842	p224_felem_assign(y3, y_out);
843	p224_felem_assign(z3, z_out);
844	}
845
846	// p224_select_point selects the \|idx\|th point from a precomputation table and
847	// copies it to out.
848	static void p224_select_point(const uint64_t idx, size_t size,
849	const p224_felem pre_comp[/size/][`3`],
850	p224_felem out[`3`]) {
851	p224_limb *outlimbs = &out[`0`][`0`];
852	OPENSSL_memset(outlimbs, `0`, `3` * sizeof(p224_felem));
853
854	for (size_t i = `0`; i < size; i++) {
855	const p224_limb *inlimbs = &pre_comp[i][`0`][`0`];
856	uint64_t mask = i ^ idx;
857	mask \|= mask >> `4`;
858	mask \|= mask >> `2`;
859	mask \|= mask >> `1`;
860	mask &= `1`;
861	mask--;
862	for (size_t j = `0`; j < `4` * `3`; j++) {
863	outlimbs[j] \|= inlimbs[j] & mask;
864	}
865	}
866	}
867
868	// p224_get_bit returns the \|i\|th bit in \|in\|
869	static char p224_get_bit(const p224_felem_bytearray in, size_t i) {
870	if (i >= `224`) {
871	return `0`;
872	}
873	return (in[i >> `3`] >> (i & `7`)) & `1`;
874	}
875
876	// Takes the Jacobian coordinates (X, Y, Z) of a point and returns
877	// (X', Y') = (X/Z^2, Y/Z^3)
878	static int ec_GFp_nistp224_point_get_affine_coordinates(
879	const EC_GROUP group, const* EC_RAW_POINT point, EC_FELEM x,
880	EC_FELEM *y) {
881	if (ec_GFp_simple_is_at_infinity(group, point)) {
882	OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
883	return `0`;
884	}
885
886	p224_felem z1, z2;
887	p224_widefelem tmp;
888	p224_generic_to_felem(z1, &point->Z);
889	p224_felem_inv(z2, z1);
890	p224_felem_square(tmp, z2);
891	p224_felem_reduce(z1, tmp);
892
893	if (x != NULL) {
894	p224_felem x_in, x_out;
895	p224_generic_to_felem(x_in, &point->X);
896	p224_felem_mul(tmp, x_in, z1);
897	p224_felem_reduce(x_out, tmp);
898	p224_felem_to_generic(x, x_out);
899	}
900
901	if (y != NULL) {
902	p224_felem y_in, y_out;
903	p224_generic_to_felem(y_in, &point->Y);
904	p224_felem_mul(tmp, z1, z2);
905	p224_felem_reduce(z1, tmp);
906	p224_felem_mul(tmp, y_in, z1);
907	p224_felem_reduce(y_out, tmp);
908	p224_felem_to_generic(y, y_out);
909	}
910
911	return `1`;
912	}
913
914	static void ec_GFp_nistp224_add(const EC_GROUP group, EC_RAW_POINT r,
915	const EC_RAW_POINT a, const* EC_RAW_POINT *b) {
916	p224_felem x1, y1, z1, x2, y2, z2;
917	p224_generic_to_felem(x1, &a->X);
918	p224_generic_to_felem(y1, &a->Y);
919	p224_generic_to_felem(z1, &a->Z);
920	p224_generic_to_felem(x2, &b->X);
921	p224_generic_to_felem(y2, &b->Y);
922	p224_generic_to_felem(z2, &b->Z);
923	p224_point_add(x1, y1, z1, x1, y1, z1, `0` / both Jacobian /, x2, y2, z2);
924	// The outputs are already reduced, but still need to be contracted.
925	p224_felem_to_generic(&r->X, x1);
926	p224_felem_to_generic(&r->Y, y1);
927	p224_felem_to_generic(&r->Z, z1);
928	}
929
930	static void ec_GFp_nistp224_dbl(const EC_GROUP group, EC_RAW_POINT r,
931	const EC_RAW_POINT *a) {
932	p224_felem x, y, z;
933	p224_generic_to_felem(x, &a->X);
934	p224_generic_to_felem(y, &a->Y);
935	p224_generic_to_felem(z, &a->Z);
936	p224_point_double(x, y, z, x, y, z);
937	// The outputs are already reduced, but still need to be contracted.
938	p224_felem_to_generic(&r->X, x);
939	p224_felem_to_generic(&r->Y, y);
940	p224_felem_to_generic(&r->Z, z);
941	}
942
943	static void ec_GFp_nistp224_make_precomp(p224_felem out[`17`][`3`],
944	const EC_RAW_POINT *p) {
945	OPENSSL_memset(out[`0`], `0`, sizeof(p224_felem) * `3`);
946
947	p224_generic_to_felem(out[`1`][`0`], &p->X);
948	p224_generic_to_felem(out[`1`][`1`], &p->Y);
949	p224_generic_to_felem(out[`1`][`2`], &p->Z);
950
951	for (size_t j = `2`; j <= `16`; ++j) {
952	if (j & `1`) {
953	p224_point_add(out[j][`0`], out[j][`1`], out[j][`2`], out[`1`][`0`], out[`1`][`1`],
954	out[`1`][`2`], `0`, out[j - `1`][`0`], out[j - `1`][`1`], out[j - `1`][`2`]);
955	} else {
956	p224_point_double(out[j][`0`], out[j][`1`], out[j][`2`], out[j / `2`][`0`],
957	out[j / `2`][`1`], out[j / `2`][`2`]);
958	}
959	}
960	}
961
962	static void ec_GFp_nistp224_point_mul(const EC_GROUP group, EC_RAW_POINT r,
963	const EC_RAW_POINT *p,
964	const EC_SCALAR *scalar) {
965	p224_felem p_pre_comp[`17`][`3`];
966	ec_GFp_nistp224_make_precomp(p_pre_comp, p);
967
968	// Set nq to the point at infinity.
969	p224_felem nq[`3`], tmp[`4`];
970	OPENSSL_memset(nq, `0`, `3` * sizeof(p224_felem));
971
972	int skip = `1`; // Save two point operations in the first round.
973	for (size_t i = `220`; i < `221`; i--) {
974	if (!skip) {
975	p224_point_double(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`]);
976	}
977
978	// Add every 5 doublings.
979	if (i % `5` == `0`) {
980	uint64_t bits = p224_get_bit(scalar->bytes, i + `4`) << `5`;
981	bits \|= p224_get_bit(scalar->bytes, i + `3`) << `4`;
982	bits \|= p224_get_bit(scalar->bytes, i + `2`) << `3`;
983	bits \|= p224_get_bit(scalar->bytes, i + `1`) << `2`;
984	bits \|= p224_get_bit(scalar->bytes, i) << `1`;
985	bits \|= p224_get_bit(scalar->bytes, i - `1`);
986	uint8_t sign, digit;
987	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
988
989	// Select the point to add or subtract.
990	p224_select_point(digit, `17`, (const p224_felem(*)[`3`])p_pre_comp, tmp);
991	p224_felem_neg(tmp[`3`], tmp[`1`]); // (X, -Y, Z) is the negative point
992	p224_copy_conditional(tmp[`1`], tmp[`3`], sign);
993
994	if (!skip) {
995	p224_point_add(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`], `0` / mixed /,
996	tmp[`0`], tmp[`1`], tmp[`2`]);
997	} else {
998	OPENSSL_memcpy(nq, tmp, `3` * sizeof(p224_felem));
999	skip = `0`;
1000	}
1001	}
1002	}
1003
1004	// Reduce the output to its unique minimal representation.
1005	p224_felem_to_generic(&r->X, nq[`0`]);
1006	p224_felem_to_generic(&r->Y, nq[`1`]);
1007	p224_felem_to_generic(&r->Z, nq[`2`]);
1008	}
1009
1010	static void ec_GFp_nistp224_point_mul_base(const EC_GROUP *group,
1011	EC_RAW_POINT *r,
1012	const EC_SCALAR *scalar) {
1013	// Set nq to the point at infinity.
1014	p224_felem nq[`3`], tmp[`3`];
1015	OPENSSL_memset(nq, `0`, `3` * sizeof(p224_felem));
1016
1017	int skip = `1`; // Save two point operations in the first round.
1018	for (size_t i = `27`; i < `28`; i--) {
1019	// double
1020	if (!skip) {
1021	p224_point_double(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`]);
1022	}
1023
1024	// First, look 28 bits upwards.
1025	uint64_t bits = p224_get_bit(scalar->bytes, i + `196`) << `3`;
1026	bits \|= p224_get_bit(scalar->bytes, i + `140`) << `2`;
1027	bits \|= p224_get_bit(scalar->bytes, i + `84`) << `1`;
1028	bits \|= p224_get_bit(scalar->bytes, i + `28`);
1029	// Select the point to add, in constant time.
1030	p224_select_point(bits, `16`, g_p224_pre_comp[`1`], tmp);
1031
1032	if (!skip) {
1033	p224_point_add(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`], `1` / mixed /,
1034	tmp[`0`], tmp[`1`], tmp[`2`]);
1035	} else {
1036	OPENSSL_memcpy(nq, tmp, `3` * sizeof(p224_felem));
1037	skip = `0`;
1038	}
1039
1040	// Second, look at the current position/
1041	bits = p224_get_bit(scalar->bytes, i + `168`) << `3`;
1042	bits \|= p224_get_bit(scalar->bytes, i + `112`) << `2`;
1043	bits \|= p224_get_bit(scalar->bytes, i + `56`) << `1`;
1044	bits \|= p224_get_bit(scalar->bytes, i);
1045	// Select the point to add, in constant time.
1046	p224_select_point(bits, `16`, g_p224_pre_comp[`0`], tmp);
1047	p224_point_add(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`], `1` / mixed /,
1048	tmp[`0`], tmp[`1`], tmp[`2`]);
1049	}
1050
1051	// Reduce the output to its unique minimal representation.
1052	p224_felem_to_generic(&r->X, nq[`0`]);
1053	p224_felem_to_generic(&r->Y, nq[`1`]);
1054	p224_felem_to_generic(&r->Z, nq[`2`]);
1055	}
1056
1057	static void ec_GFp_nistp224_point_mul_public(const EC_GROUP *group,
1058	EC_RAW_POINT *r,
1059	const EC_SCALAR *g_scalar,
1060	const EC_RAW_POINT *p,
1061	const EC_SCALAR *p_scalar) {
1062	// TODO(davidben): If P-224 ECDSA verify performance ever matters, using
1063	// \|ec_compute_wNAF\| for \|p_scalar\| would likely be an easy improvement.
1064	p224_felem p_pre_comp[`17`][`3`];
1065	ec_GFp_nistp224_make_precomp(p_pre_comp, p);
1066
1067	// Set nq to the point at infinity.
1068	p224_felem nq[`3`], tmp[`3`];
1069	OPENSSL_memset(nq, `0`, `3` * sizeof(p224_felem));
1070
1071	// Loop over both scalars msb-to-lsb, interleaving additions of multiples of
1072	// the generator (two in each of the last 28 rounds) and additions of p (every
1073	// 5th round).
1074	int skip = `1`; // Save two point operations in the first round.
1075	for (size_t i = `220`; i < `221`; i--) {
1076	if (!skip) {
1077	p224_point_double(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`]);
1078	}
1079
1080	// Add multiples of the generator.
1081	if (i <= `27`) {
1082	// First, look 28 bits upwards.
1083	uint64_t bits = p224_get_bit(g_scalar->bytes, i + `196`) << `3`;
1084	bits \|= p224_get_bit(g_scalar->bytes, i + `140`) << `2`;
1085	bits \|= p224_get_bit(g_scalar->bytes, i + `84`) << `1`;
1086	bits \|= p224_get_bit(g_scalar->bytes, i + `28`);
1087
1088	p224_point_add(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`], `1` / mixed /,
1089	g_p224_pre_comp[`1`][bits][`0`], g_p224_pre_comp[`1`][bits][`1`],
1090	g_p224_pre_comp[`1`][bits][`2`]);
1091	assert(!skip);
1092
1093	// Second, look at the current position.
1094	bits = p224_get_bit(g_scalar->bytes, i + `168`) << `3`;
1095	bits \|= p224_get_bit(g_scalar->bytes, i + `112`) << `2`;
1096	bits \|= p224_get_bit(g_scalar->bytes, i + `56`) << `1`;
1097	bits \|= p224_get_bit(g_scalar->bytes, i);
1098	p224_point_add(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`], `1` / mixed /,
1099	g_p224_pre_comp[`0`][bits][`0`], g_p224_pre_comp[`0`][bits][`1`],
1100	g_p224_pre_comp[`0`][bits][`2`]);
1101	}
1102
1103	// Incorporate \|p_scalar\| every 5 doublings.
1104	if (i % `5` == `0`) {
1105	uint64_t bits = p224_get_bit(p_scalar->bytes, i + `4`) << `5`;
1106	bits \|= p224_get_bit(p_scalar->bytes, i + `3`) << `4`;
1107	bits \|= p224_get_bit(p_scalar->bytes, i + `2`) << `3`;
1108	bits \|= p224_get_bit(p_scalar->bytes, i + `1`) << `2`;
1109	bits \|= p224_get_bit(p_scalar->bytes, i) << `1`;
1110	bits \|= p224_get_bit(p_scalar->bytes, i - `1`);
1111	uint8_t sign, digit;
1112	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1113
1114	// Select the point to add or subtract.
1115	OPENSSL_memcpy(tmp, p_pre_comp[digit], `3` * sizeof(p224_felem));
1116	if (sign) {
1117	p224_felem_neg(tmp[`1`], tmp[`1`]); // (X, -Y, Z) is the negative point
1118	}
1119
1120	if (!skip) {
1121	p224_point_add(nq[`0`], nq[`1`], nq[`2`], nq[`0`], nq[`1`], nq[`2`], `0` / mixed /,
1122	tmp[`0`], tmp[`1`], tmp[`2`]);
1123	} else {
1124	OPENSSL_memcpy(nq, tmp, `3` * sizeof(p224_felem));
1125	skip = `0`;
1126	}
1127	}
1128	}
1129
1130	// Reduce the output to its unique minimal representation.
1131	p224_felem_to_generic(&r->X, nq[`0`]);
1132	p224_felem_to_generic(&r->Y, nq[`1`]);
1133	p224_felem_to_generic(&r->Z, nq[`2`]);
1134	}
1135
1136	static void ec_GFp_nistp224_felem_mul(const EC_GROUP group, EC_FELEM r,
1137	const EC_FELEM a, const* EC_FELEM *b) {
1138	p224_felem felem1, felem2;
1139	p224_widefelem wide;
1140	p224_generic_to_felem(felem1, a);
1141	p224_generic_to_felem(felem2, b);
1142	p224_felem_mul(wide, felem1, felem2);
1143	p224_felem_reduce(felem1, wide);
1144	p224_felem_to_generic(r, felem1);
1145	}
1146
1147	static void ec_GFp_nistp224_felem_sqr(const EC_GROUP group, EC_FELEM r,
1148	const EC_FELEM *a) {
1149	p224_felem felem;
1150	p224_generic_to_felem(felem, a);
1151	p224_widefelem wide;
1152	p224_felem_square(wide, felem);
1153	p224_felem_reduce(felem, wide);
1154	p224_felem_to_generic(r, felem);
1155	}
1156
1157	static int ec_GFp_nistp224_bignum_to_felem(const EC_GROUP group, EC_FELEM out,
1158	const BIGNUM *in) {
1159	return bn_copy_words(out->words, group->field.width, in);
1160	}
1161
1162	static int ec_GFp_nistp224_felem_to_bignum(const EC_GROUP group, BIGNUM out,
1163	const EC_FELEM *in) {
1164	return bn_set_words(out, in->words, group->field.width);
1165	}
1166
1167	DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp224_method) {
1168	out->group_init = ec_GFp_simple_group_init;
1169	out->group_finish = ec_GFp_simple_group_finish;
1170	out->group_set_curve = ec_GFp_simple_group_set_curve;
1171	out->point_get_affine_coordinates =
1172	ec_GFp_nistp224_point_get_affine_coordinates;
1173	out->add = ec_GFp_nistp224_add;
1174	out->dbl = ec_GFp_nistp224_dbl;
1175	out->mul = ec_GFp_nistp224_point_mul;
1176	out->mul_base = ec_GFp_nistp224_point_mul_base;
1177	out->mul_public = ec_GFp_nistp224_point_mul_public;
1178	out->felem_mul = ec_GFp_nistp224_felem_mul;
1179	out->felem_sqr = ec_GFp_nistp224_felem_sqr;
1180	out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem;
1181	out->felem_to_bignum = ec_GFp_nistp224_felem_to_bignum;
1182	out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
1183	out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
1184	out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
1185	}
1186
1187	#endif // BORINGSSL_HAS_UINT128 && !SMALL
1188

Browse the source code of engine/third_party/boringssl/src/crypto/fipsmodule/ec/p224-64.c