poly1305.c source code [ClickHouse/contrib/openssl/crypto/poly1305/poly1305.c]

1	/*
2	* Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the Apache License 2.0 (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	#include <stdlib.h>
11	#include <string.h>
12	#include <openssl/crypto.h>
13
14	#include "crypto/poly1305.h"
15
16	size_t Poly1305_ctx_size(void)
17	{
18	return sizeof(struct poly1305_context);
19	}
20
/*
 * Pick a 32-bit unsigned integer in little endian order.
 *
 * Reads the four bytes p[0]..p[3] and assembles them with p[0] as the
 * least significant byte. Each byte is masked to 8 bits before shifting.
 */
static unsigned int U8TOU32(const unsigned char *p)
{
    return (((unsigned int)(p[0] & 0xff)) |
            ((unsigned int)(p[1] & 0xff) << 8) |
            ((unsigned int)(p[2] & 0xff) << 16) |
            ((unsigned int)(p[3] & 0xff) << 24));
}
29
/*
 * Implementations can be classified by the number of significant bits in
 * the words making up the multi-precision value, or in other words by the
 * radix or base of the numerical representation, e.g. base 2^64, base 2^32,
 * base 2^26. A complementary characteristic is how wide the result of
 * multiplying a pair of digits is, e.g. it takes 128 bits to accommodate
 * the multiplication result in the base 2^64 case. The two characteristics
 * are used interchangeably to describe an implementation. The interface,
 * however, is designed to isolate this, so that low-level primitives
 * implemented in assembly can be self-contained/self-coherent.
 */
41	#ifndef POLY1305_ASM
/*
 * Even though there is an __int128 reference implementation targeting
 * 64-bit platforms provided below, it's not obvious that it's the optimal
 * choice for every one of them. Depending on the instruction set, the
 * overall number of instructions can be comparable to that of the __int64
 * implementation. The number of multiplication instructions would be lower,
 * but not necessarily the overall count. And in an out-of-order execution
 * context, it is the latter that can be crucial...
 *
 * On a related note, the Poly1305 author, D. J. Bernstein, discusses and
 * provides floating-point implementations of the algorithm in question.
 * That made a lot of sense at the time of introduction, because most
 * then-modern processors didn't have a pipelined integer multiplier.
 * [Not to mention that some had non-constant timing for integer
 * multiplications.] Floating-point instructions, on the other hand, could
 * be issued every cycle, which made better performance achievable.
 * Nowadays, with SIMD and/or out-of-order execution, shared or
 * even emulated FPU, it's more complicated, and a floating-point
 * implementation is not necessarily the optimal choice in every situation,
 * rather the contrary...
 *
 * <appro@openssl.org>
 */
65
/* Word type for 32-bit limbs and the pad bit (assumes unsigned int is at least 32 bits wide). */
typedef unsigned int u32;
67
68	/*
69	* poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks
70	* of \|inp\| no longer than \|len\|. Behaviour for \|len\| not divisible by
71	* block size is unspecified in general case, even though in reference
72	* implementation the trailing chunk is simply ignored. Per algorithm
73	* specification, every input block, complete or last partial, is to be
74	* padded with a bit past most significant byte. The latter kind is then
75	* padded with zeros till block size. This last partial block padding
76	* is caller(*)'s responsibility, and because of this the last partial
77	* block is always processed with separate call with \|len\| set to
78	* POLY1305_BLOCK_SIZE and \|padbit\| to 0. In all other cases \|padbit\|
79	* should be set to 1 to perform implicit padding with 128th bit.
80	* poly1305_blocks does not actually check for this constraint though,
81	* it's caller(*)'s responsibility to comply.
82	*
83	* (*) In the context "caller" is not application code, but higher
84	* level Poly1305_* from this very module, so that quirks are
85	* handled locally.
86	*/
87	static void
88	poly1305_blocks(void ctx, const* unsigned char *inp, size_t len, u32 padbit);
89
/*
 * Type-agnostic "rip-off" from constant_time.h
 *
 * Yields 1 when a < b and 0 otherwise, using only bitwise operations,
 * a subtraction and a shift (no branch). It is used right after
 * "a += b" to recover the carry out of that addition. Both arguments
 * must be unsigned values of the same type, at least as wide as int,
 * and free of side effects, since each is evaluated more than once.
 */
# define CONSTANT_TIME_CARRY(a,b) ( \
         ((a) ^ (((a) ^ (b)) | (((a) - (b)) ^ (b)))) >> (sizeof(a) * 8 - 1) \
         )
96
97	# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \
98	(defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)
99
/* 64-bit limb type; this branch is only compiled when long is 8 bytes. */
typedef unsigned long u64;
/* Double-width type that holds the sum or product of two u64 limbs. */
typedef __uint128_t u128;

/*
 * Internal state of the base 2^64 implementation, overlaid on the
 * caller-supplied opaque context storage.
 */
typedef struct {
    u64 h[3];                   /* accumulator, least significant limb first */
    u64 r[2];                   /* multiplier taken from the key, clamped */
} poly1305_internal;
107
108	/ pick 32-bit unsigned integer in little endian order /
109	static u64 U8TOU64(const unsigned char *p)
110	{
111	return (((u64)(p[`0`] & `0xff`)) \|
112	((u64)(p[`1`] & `0xff`) << `8`) \|
113	((u64)(p[`2`] & `0xff`) << `16`) \|
114	((u64)(p[`3`] & `0xff`) << `24`) \|
115	((u64)(p[`4`] & `0xff`) << `32`) \|
116	((u64)(p[`5`] & `0xff`) << `40`) \|
117	((u64)(p[`6`] & `0xff`) << `48`) \|
118	((u64)(p[`7`] & `0xff`) << `56`));
119	}
120
121	/ store a 32-bit unsigned integer in little endian /
122	static void U64TO8(unsigned char *p, u64 v)
123	{
124	p[`0`] = (unsigned char)((v) & `0xff`);
125	p[`1`] = (unsigned char)((v >> `8`) & `0xff`);
126	p[`2`] = (unsigned char)((v >> `16`) & `0xff`);
127	p[`3`] = (unsigned char)((v >> `24`) & `0xff`);
128	p[`4`] = (unsigned char)((v >> `32`) & `0xff`);
129	p[`5`] = (unsigned char)((v >> `40`) & `0xff`);
130	p[`6`] = (unsigned char)((v >> `48`) & `0xff`);
131	p[`7`] = (unsigned char)((v >> `56`) & `0xff`);
132	}
133
134	static void poly1305_init(void ctx, const* unsigned char key[`16`])
135	{
136	poly1305_internal st = (poly1305_internal ) ctx;
137
138	/ h = 0 /
139	st->h[`0`] = `0`;
140	st->h[`1`] = `0`;
141	st->h[`2`] = `0`;
142
143	/ r &= 0xffffffc0ffffffc0ffffffc0fffffff /
144	st->r[`0`] = U8TOU64(&key[`0`]) & `0x0ffffffc0fffffff`;
145	st->r[`1`] = U8TOU64(&key[`8`]) & `0x0ffffffc0ffffffc`;
146	}
147
148	static void
149	poly1305_blocks(void ctx, const* unsigned char *inp, size_t len, u32 padbit)
150	{
151	poly1305_internal st = (poly1305_internal )ctx;
152	u64 r0, r1;
153	u64 s1;
154	u64 h0, h1, h2, c;
155	u128 d0, d1;
156
157	r0 = st->r[`0`];
158	r1 = st->r[`1`];
159
160	s1 = r1 + (r1 >> `2`);
161
162	h0 = st->h[`0`];
163	h1 = st->h[`1`];
164	h2 = st->h[`2`];
165
166	while (len >= POLY1305_BLOCK_SIZE) {
167	/ h += m[i] /
168	h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + `0`));
169	h1 = (u64)(d1 = (u128)h1 + (d0 >> `64`) + U8TOU64(inp + `8`));
170	/*
171	* padbit can be zero only when original len was
172	* POLY1306_BLOCK_SIZE, but we don't check
173	*/
174	h2 += (u64)(d1 >> `64`) + padbit;
175
176	/ h = r "%" p, where "%" stands for "partial remainder" /*
177	d0 = ((u128)h0 * r0) +
178	((u128)h1 * s1);
179	d1 = ((u128)h0 * r1) +
180	((u128)h1 * r0) +
181	(h2 * s1);
182	h2 = (h2 * r0);
183
184	/ last reduction step: /
185	/ a) h2:h0 = h2<<128 + d1<<64 + d0 /
186	h0 = (u64)d0;
187	h1 = (u64)(d1 += d0 >> `64`);
188	h2 += (u64)(d1 >> `64`);
189	/ b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 /
190	c = (h2 >> `2`) + (h2 & ~`3UL`);
191	h2 &= `3`;
192	h0 += c;
193	h1 += (c = CONSTANT_TIME_CARRY(h0,c));
194	h2 += CONSTANT_TIME_CARRY(h1,c);
195	/*
196	* Occasional overflows to 3rd bit of h2 are taken care of
197	* "naturally". If after this point we end up at the top of
198	* this loop, then the overflow bit will be accounted for
199	* in next iteration. If we end up in poly1305_emit, then
200	* comparison to modulus below will still count as "carry
201	* into 131st bit", so that properly reduced value will be
202	* picked in conditional move.
203	*/
204
205	inp += POLY1305_BLOCK_SIZE;
206	len -= POLY1305_BLOCK_SIZE;
207	}
208
209	st->h[`0`] = h0;
210	st->h[`1`] = h1;
211	st->h[`2`] = h2;
212	}
213
214	static void poly1305_emit(void ctx, unsigned* char mac[`16`],
215	const u32 nonce[`4`])
216	{
217	poly1305_internal st = (poly1305_internal ) ctx;
218	u64 h0, h1, h2;
219	u64 g0, g1, g2;
220	u128 t;
221	u64 mask;
222
223	h0 = st->h[`0`];
224	h1 = st->h[`1`];
225	h2 = st->h[`2`];
226
227	/ compare to modulus by computing h + -p /
228	g0 = (u64)(t = (u128)h0 + `5`);
229	g1 = (u64)(t = (u128)h1 + (t >> `64`));
230	g2 = h2 + (u64)(t >> `64`);
231
232	/ if there was carry into 131st bit, h1:h0 = g1:g0 /
233	mask = `0` - (g2 >> `2`);
234	g0 &= mask;
235	g1 &= mask;
236	mask = ~mask;
237	h0 = (h0 & mask) \| g0;
238	h1 = (h1 & mask) \| g1;
239
240	/ mac = (h + nonce) % (2^128) /
241	h0 = (u64)(t = (u128)h0 + nonce[`0`] + ((u64)nonce[`1`]<<`32`));
242	h1 = (u64)(t = (u128)h1 + nonce[`2`] + ((u64)nonce[`3`]<<`32`) + (t >> `64`));
243
244	U64TO8(mac + `0`, h0);
245	U64TO8(mac + `8`, h1);
246	}
247
248	# else
249
/* Select the 64-bit unsigned type used for double-width intermediates. */
#  if defined(_WIN32) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
#  elif defined(__arch64__)
typedef unsigned long u64;
#  else
typedef unsigned long long u64;
#  endif
257
258	typedef struct {
259	u32 h[`5`];
260	u32 r[`4`];
261	} poly1305_internal;
262
/*
 * Store a 32-bit unsigned integer in little endian order.
 *
 * Writes the four bytes p[0]..p[3], least significant byte of |v| first.
 */
static void U32TO8(unsigned char *p, unsigned int v)
{
    p[0] = (unsigned char)((v) & 0xff);
    p[1] = (unsigned char)((v >> 8) & 0xff);
    p[2] = (unsigned char)((v >> 16) & 0xff);
    p[3] = (unsigned char)((v >> 24) & 0xff);
}
271
272	static void poly1305_init(void ctx, const* unsigned char key[`16`])
273	{
274	poly1305_internal st = (poly1305_internal ) ctx;
275
276	/ h = 0 /
277	st->h[`0`] = `0`;
278	st->h[`1`] = `0`;
279	st->h[`2`] = `0`;
280	st->h[`3`] = `0`;
281	st->h[`4`] = `0`;
282
283	/ r &= 0xffffffc0ffffffc0ffffffc0fffffff /
284	st->r[`0`] = U8TOU32(&key[`0`]) & `0x0fffffff`;
285	st->r[`1`] = U8TOU32(&key[`4`]) & `0x0ffffffc`;
286	st->r[`2`] = U8TOU32(&key[`8`]) & `0x0ffffffc`;
287	st->r[`3`] = U8TOU32(&key[`12`]) & `0x0ffffffc`;
288	}
289
290	static void
291	poly1305_blocks(void ctx, const* unsigned char *inp, size_t len, u32 padbit)
292	{
293	poly1305_internal st = (poly1305_internal )ctx;
294	u32 r0, r1, r2, r3;
295	u32 s1, s2, s3;
296	u32 h0, h1, h2, h3, h4, c;
297	u64 d0, d1, d2, d3;
298
299	r0 = st->r[`0`];
300	r1 = st->r[`1`];
301	r2 = st->r[`2`];
302	r3 = st->r[`3`];
303
304	s1 = r1 + (r1 >> `2`);
305	s2 = r2 + (r2 >> `2`);
306	s3 = r3 + (r3 >> `2`);
307
308	h0 = st->h[`0`];
309	h1 = st->h[`1`];
310	h2 = st->h[`2`];
311	h3 = st->h[`3`];
312	h4 = st->h[`4`];
313
314	while (len >= POLY1305_BLOCK_SIZE) {
315	/ h += m[i] /
316	h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + `0`));
317	h1 = (u32)(d1 = (u64)h1 + (d0 >> `32`) + U8TOU32(inp + `4`));
318	h2 = (u32)(d2 = (u64)h2 + (d1 >> `32`) + U8TOU32(inp + `8`));
319	h3 = (u32)(d3 = (u64)h3 + (d2 >> `32`) + U8TOU32(inp + `12`));
320	h4 += (u32)(d3 >> `32`) + padbit;
321
322	/ h = r "%" p, where "%" stands for "partial remainder" /*
323	d0 = ((u64)h0 * r0) +
324	((u64)h1 * s3) +
325	((u64)h2 * s2) +
326	((u64)h3 * s1);
327	d1 = ((u64)h0 * r1) +
328	((u64)h1 * r0) +
329	((u64)h2 * s3) +
330	((u64)h3 * s2) +
331	(h4 * s1);
332	d2 = ((u64)h0 * r2) +
333	((u64)h1 * r1) +
334	((u64)h2 * r0) +
335	((u64)h3 * s3) +
336	(h4 * s2);
337	d3 = ((u64)h0 * r3) +
338	((u64)h1 * r2) +
339	((u64)h2 * r1) +
340	((u64)h3 * r0) +
341	(h4 * s3);
342	h4 = (h4 * r0);
343
344	/ last reduction step: /
345	/ a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 /
346	h0 = (u32)d0;
347	h1 = (u32)(d1 += d0 >> `32`);
348	h2 = (u32)(d2 += d1 >> `32`);
349	h3 = (u32)(d3 += d2 >> `32`);
350	h4 += (u32)(d3 >> `32`);
351	/ b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 /
352	c = (h4 >> `2`) + (h4 & ~`3U`);
353	h4 &= `3`;
354	h0 += c;
355	h1 += (c = CONSTANT_TIME_CARRY(h0,c));
356	h2 += (c = CONSTANT_TIME_CARRY(h1,c));
357	h3 += (c = CONSTANT_TIME_CARRY(h2,c));
358	h4 += CONSTANT_TIME_CARRY(h3,c);
359	/*
360	* Occasional overflows to 3rd bit of h4 are taken care of
361	* "naturally". If after this point we end up at the top of
362	* this loop, then the overflow bit will be accounted for
363	* in next iteration. If we end up in poly1305_emit, then
364	* comparison to modulus below will still count as "carry
365	* into 131st bit", so that properly reduced value will be
366	* picked in conditional move.
367	*/
368
369	inp += POLY1305_BLOCK_SIZE;
370	len -= POLY1305_BLOCK_SIZE;
371	}
372
373	st->h[`0`] = h0;
374	st->h[`1`] = h1;
375	st->h[`2`] = h2;
376	st->h[`3`] = h3;
377	st->h[`4`] = h4;
378	}
379
380	static void poly1305_emit(void ctx, unsigned* char mac[`16`],
381	const u32 nonce[`4`])
382	{
383	poly1305_internal st = (poly1305_internal ) ctx;
384	u32 h0, h1, h2, h3, h4;
385	u32 g0, g1, g2, g3, g4;
386	u64 t;
387	u32 mask;
388
389	h0 = st->h[`0`];
390	h1 = st->h[`1`];
391	h2 = st->h[`2`];
392	h3 = st->h[`3`];
393	h4 = st->h[`4`];
394
395	/ compare to modulus by computing h + -p /
396	g0 = (u32)(t = (u64)h0 + `5`);
397	g1 = (u32)(t = (u64)h1 + (t >> `32`));
398	g2 = (u32)(t = (u64)h2 + (t >> `32`));
399	g3 = (u32)(t = (u64)h3 + (t >> `32`));
400	g4 = h4 + (u32)(t >> `32`);
401
402	/ if there was carry into 131st bit, h3:h0 = g3:g0 /
403	mask = `0` - (g4 >> `2`);
404	g0 &= mask;
405	g1 &= mask;
406	g2 &= mask;
407	g3 &= mask;
408	mask = ~mask;
409	h0 = (h0 & mask) \| g0;
410	h1 = (h1 & mask) \| g1;
411	h2 = (h2 & mask) \| g2;
412	h3 = (h3 & mask) \| g3;
413
414	/ mac = (h + nonce) % (2^128) /
415	h0 = (u32)(t = (u64)h0 + nonce[`0`]);
416	h1 = (u32)(t = (u64)h1 + (t >> `32`) + nonce[`1`]);
417	h2 = (u32)(t = (u64)h2 + (t >> `32`) + nonce[`2`]);
418	h3 = (u32)(t = (u64)h3 + (t >> `32`) + nonce[`3`]);
419
420	U32TO8(mac + `0`, h0);
421	U32TO8(mac + `4`, h1);
422	U32TO8(mac + `8`, h2);
423	U32TO8(mac + `12`, h3);
424	}
425	# endif
426	#else
/*
 * Prototypes of the primitives that are supplied externally when
 * POLY1305_ASM is defined. Unlike the C reference poly1305_init, this
 * variant returns an int and takes an extra |func| argument.
 */
int poly1305_init(void *ctx, const unsigned char key[16], void *func);
void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
                     unsigned int padbit);
void poly1305_emit(void *ctx, unsigned char mac[16],
                   const unsigned int nonce[4]);
432	#endif
433
434	void Poly1305_Init(POLY1305 ctx, const* unsigned char key[`32`])
435	{
436	ctx->nonce[`0`] = U8TOU32(&key[`16`]);
437	ctx->nonce[`1`] = U8TOU32(&key[`20`]);
438	ctx->nonce[`2`] = U8TOU32(&key[`24`]);
439	ctx->nonce[`3`] = U8TOU32(&key[`28`]);
440
441	#ifndef POLY1305_ASM
442	poly1305_init(ctx->opaque, key);
443	#else
444	/*
445	* Unlike reference poly1305_init assembly counterpart is expected
446	* to return a value: non-zero if it initializes ctx->func, and zero
447	* otherwise. Latter is to simplify assembly in cases when there no
448	* multiple code paths to switch between.
449	*/
450	if (!poly1305_init(ctx->opaque, key, &ctx->func)) {
451	ctx->func.blocks = poly1305_blocks;
452	ctx->func.emit = poly1305_emit;
453	}
454	#endif
455
456	ctx->num = `0`;
457
458	}
459
#ifdef POLY1305_ASM
/*
 * This "eclipses" poly1305_blocks and poly1305_emit, but it's a
 * conscious choice imposed by -Wshadow compiler warnings. From here on
 * the two names expand to calls through the local function pointers
 * poly1305_blocks_p and poly1305_emit_p, which each user must declare.
 */
# define poly1305_blocks (*poly1305_blocks_p)
# define poly1305_emit   (*poly1305_emit_p)
#endif
468
469	void Poly1305_Update(POLY1305 ctx, const* unsigned char *inp, size_t len)
470	{
471	#ifdef POLY1305_ASM
472	/*
473	* As documented, poly1305_blocks is never called with input
474	* longer than single block and padbit argument set to 0. This
475	* property is fluently used in assembly modules to optimize
476	* padbit handling on loop boundary.
477	*/
478	poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;
479	#endif
480	size_t rem, num;
481
482	if ((num = ctx->num)) {
483	rem = POLY1305_BLOCK_SIZE - num;
484	if (len >= rem) {
485	memcpy(ctx->data + num, inp, rem);
486	poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, `1`);
487	inp += rem;
488	len -= rem;
489	} else {
490	/ Still not enough data to process a block. /
491	memcpy(ctx->data + num, inp, len);
492	ctx->num = num + len;
493	return;
494	}
495	}
496
497	rem = len % POLY1305_BLOCK_SIZE;
498	len -= rem;
499
500	if (len >= POLY1305_BLOCK_SIZE) {
501	poly1305_blocks(ctx->opaque, inp, len, `1`);
502	inp += len;
503	}
504
505	if (rem)
506	memcpy(ctx->data, inp, rem);
507
508	ctx->num = rem;
509	}
510
511	void Poly1305_Final(POLY1305 ctx, unsigned* char mac[`16`])
512	{
513	#ifdef POLY1305_ASM
514	poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;
515	poly1305_emit_f poly1305_emit_p = ctx->func.emit;
516	#endif
517	size_t num;
518
519	if ((num = ctx->num)) {
520	ctx->data[num++] = `1`; / pad bit /
521	while (num < POLY1305_BLOCK_SIZE)
522	ctx->data[num++] = `0`;
523	poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, `0`);
524	}
525
526	poly1305_emit(ctx->opaque, mac, ctx->nonce);
527
528	/ zero out the state /
529	OPENSSL_cleanse(ctx, sizeof(*ctx));
530	}
531

Browse the source code of ClickHouse/contrib/openssl/crypto/poly1305/poly1305.c