gcm128.c source code [ClickHouse/contrib/openssl/crypto/modes/gcm128.c]

1	/*
2	* Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the Apache License 2.0 (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	#include <string.h>
11	#include <openssl/crypto.h>
12	#include "internal/cryptlib.h"
13	#include "crypto/modes.h"
14
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Redefine GETU32/PUTU32, because alignment is ensured: each macro
 * accesses its operand as one u32 through a cast pointer and passes the
 * value through BSWAP4.
 */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)     (*(u32 *)(p) = BSWAP4(v))
#endif
22
/*
 * PACK(s) moves a 16-bit constant into the top 16 bits of a size_t, so the
 * remainder tables below can be XORed into the high word without a
 * per-use shift on 64-bit size_t builds.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))

/*
 * REDUCE1BIT(V) shifts the 128-bit value V (fields .hi/.lo) right by one
 * bit and, when the bit shifted out of V.lo was set, XORs the constant
 * 0xe1 into the top byte of V.hi.  The two branches compute the same
 * result; they only differ in the width of the temporary mask, selected
 * by sizeof(size_t).  V is evaluated several times, so it must be a plain
 * lvalue without side effects.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
36
/*-
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
 * whole spectrum of possible table driven implementations. Why? In
 * non-"Shoup's" case memory access pattern is segmented in such manner,
 * that it's trivial to see that cache timing information can reveal
 * fair portion of intermediate hash value. Given that ciphertext is
 * always available to attacker, it's possible for him to attempt to
 * deduce secret parameter H and if successful, tamper with messages
 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
 * not as trivial, but there is no reason to believe that it's resistant
 * to cache-timing attack. And the thing about "8-bit" implementation is
 * that it consumes 16 (sixteen) times more memory, 4KB per individual
 * key + 1KB shared. Well, on pros side it should be twice as fast as
 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
 * was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet "4-bit" procedure is preferred, because it's
 * believed to provide better security-performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows large enough free
 *   results in VM working set trimming, meaning that consequent
 *   malloc would immediately incur working set expansion);
 * - larger table has larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from same
 *   thread in Hyper-Threading world);
 *
 * Value of 1 is not appropriate for performance reasons.
 */
71	#if TABLE_BITS==8
72
73	static void gcm_init_8bit(u128 Htable[`256`], u64 H[`2`])
74	{
75	int i, j;
76	u128 V;
77
78	Htable[`0`].hi = `0`;
79	Htable[`0`].lo = `0`;
80	V.hi = H[`0`];
81	V.lo = H[`1`];
82
83	for (Htable[`128`] = V, i = `64`; i > `0`; i >>= `1`) {
84	REDUCE1BIT(V);
85	Htable[i] = V;
86	}
87
88	for (i = `2`; i < `256`; i <<= `1`) {
89	u128 Hi = Htable + i, H0 = Hi;
90	for (j = `1`; j < i; ++j) {
91	Hi[j].hi = H0.hi ^ Htable[j].hi;
92	Hi[j].lo = H0.lo ^ Htable[j].lo;
93	}
94	}
95	}
96
97	static void gcm_gmult_8bit(u64 Xi[`2`], const u128 Htable[`256`])
98	{
99	u128 Z = { `0`, `0` };
100	const u8 xi = (const* u8 *)Xi + `15`;
101	size_t rem, n = *xi;
102	const union {
103	long one;
104	char little;
105	} is_endian = { `1` };
106	static const size_t rem_8bit[`256`] = {
107	PACK(`0x0000`), PACK(`0x01C2`), PACK(`0x0384`), PACK(`0x0246`),
108	PACK(`0x0708`), PACK(`0x06CA`), PACK(`0x048C`), PACK(`0x054E`),
109	PACK(`0x0E10`), PACK(`0x0FD2`), PACK(`0x0D94`), PACK(`0x0C56`),
110	PACK(`0x0918`), PACK(`0x08DA`), PACK(`0x0A9C`), PACK(`0x0B5E`),
111	PACK(`0x1C20`), PACK(`0x1DE2`), PACK(`0x1FA4`), PACK(`0x1E66`),
112	PACK(`0x1B28`), PACK(`0x1AEA`), PACK(`0x18AC`), PACK(`0x196E`),
113	PACK(`0x1230`), PACK(`0x13F2`), PACK(`0x11B4`), PACK(`0x1076`),
114	PACK(`0x1538`), PACK(`0x14FA`), PACK(`0x16BC`), PACK(`0x177E`),
115	PACK(`0x3840`), PACK(`0x3982`), PACK(`0x3BC4`), PACK(`0x3A06`),
116	PACK(`0x3F48`), PACK(`0x3E8A`), PACK(`0x3CCC`), PACK(`0x3D0E`),
117	PACK(`0x3650`), PACK(`0x3792`), PACK(`0x35D4`), PACK(`0x3416`),
118	PACK(`0x3158`), PACK(`0x309A`), PACK(`0x32DC`), PACK(`0x331E`),
119	PACK(`0x2460`), PACK(`0x25A2`), PACK(`0x27E4`), PACK(`0x2626`),
120	PACK(`0x2368`), PACK(`0x22AA`), PACK(`0x20EC`), PACK(`0x212E`),
121	PACK(`0x2A70`), PACK(`0x2BB2`), PACK(`0x29F4`), PACK(`0x2836`),
122	PACK(`0x2D78`), PACK(`0x2CBA`), PACK(`0x2EFC`), PACK(`0x2F3E`),
123	PACK(`0x7080`), PACK(`0x7142`), PACK(`0x7304`), PACK(`0x72C6`),
124	PACK(`0x7788`), PACK(`0x764A`), PACK(`0x740C`), PACK(`0x75CE`),
125	PACK(`0x7E90`), PACK(`0x7F52`), PACK(`0x7D14`), PACK(`0x7CD6`),
126	PACK(`0x7998`), PACK(`0x785A`), PACK(`0x7A1C`), PACK(`0x7BDE`),
127	PACK(`0x6CA0`), PACK(`0x6D62`), PACK(`0x6F24`), PACK(`0x6EE6`),
128	PACK(`0x6BA8`), PACK(`0x6A6A`), PACK(`0x682C`), PACK(`0x69EE`),
129	PACK(`0x62B0`), PACK(`0x6372`), PACK(`0x6134`), PACK(`0x60F6`),
130	PACK(`0x65B8`), PACK(`0x647A`), PACK(`0x663C`), PACK(`0x67FE`),
131	PACK(`0x48C0`), PACK(`0x4902`), PACK(`0x4B44`), PACK(`0x4A86`),
132	PACK(`0x4FC8`), PACK(`0x4E0A`), PACK(`0x4C4C`), PACK(`0x4D8E`),
133	PACK(`0x46D0`), PACK(`0x4712`), PACK(`0x4554`), PACK(`0x4496`),
134	PACK(`0x41D8`), PACK(`0x401A`), PACK(`0x425C`), PACK(`0x439E`),
135	PACK(`0x54E0`), PACK(`0x5522`), PACK(`0x5764`), PACK(`0x56A6`),
136	PACK(`0x53E8`), PACK(`0x522A`), PACK(`0x506C`), PACK(`0x51AE`),
137	PACK(`0x5AF0`), PACK(`0x5B32`), PACK(`0x5974`), PACK(`0x58B6`),
138	PACK(`0x5DF8`), PACK(`0x5C3A`), PACK(`0x5E7C`), PACK(`0x5FBE`),
139	PACK(`0xE100`), PACK(`0xE0C2`), PACK(`0xE284`), PACK(`0xE346`),
140	PACK(`0xE608`), PACK(`0xE7CA`), PACK(`0xE58C`), PACK(`0xE44E`),
141	PACK(`0xEF10`), PACK(`0xEED2`), PACK(`0xEC94`), PACK(`0xED56`),
142	PACK(`0xE818`), PACK(`0xE9DA`), PACK(`0xEB9C`), PACK(`0xEA5E`),
143	PACK(`0xFD20`), PACK(`0xFCE2`), PACK(`0xFEA4`), PACK(`0xFF66`),
144	PACK(`0xFA28`), PACK(`0xFBEA`), PACK(`0xF9AC`), PACK(`0xF86E`),
145	PACK(`0xF330`), PACK(`0xF2F2`), PACK(`0xF0B4`), PACK(`0xF176`),
146	PACK(`0xF438`), PACK(`0xF5FA`), PACK(`0xF7BC`), PACK(`0xF67E`),
147	PACK(`0xD940`), PACK(`0xD882`), PACK(`0xDAC4`), PACK(`0xDB06`),
148	PACK(`0xDE48`), PACK(`0xDF8A`), PACK(`0xDDCC`), PACK(`0xDC0E`),
149	PACK(`0xD750`), PACK(`0xD692`), PACK(`0xD4D4`), PACK(`0xD516`),
150	PACK(`0xD058`), PACK(`0xD19A`), PACK(`0xD3DC`), PACK(`0xD21E`),
151	PACK(`0xC560`), PACK(`0xC4A2`), PACK(`0xC6E4`), PACK(`0xC726`),
152	PACK(`0xC268`), PACK(`0xC3AA`), PACK(`0xC1EC`), PACK(`0xC02E`),
153	PACK(`0xCB70`), PACK(`0xCAB2`), PACK(`0xC8F4`), PACK(`0xC936`),
154	PACK(`0xCC78`), PACK(`0xCDBA`), PACK(`0xCFFC`), PACK(`0xCE3E`),
155	PACK(`0x9180`), PACK(`0x9042`), PACK(`0x9204`), PACK(`0x93C6`),
156	PACK(`0x9688`), PACK(`0x974A`), PACK(`0x950C`), PACK(`0x94CE`),
157	PACK(`0x9F90`), PACK(`0x9E52`), PACK(`0x9C14`), PACK(`0x9DD6`),
158	PACK(`0x9898`), PACK(`0x995A`), PACK(`0x9B1C`), PACK(`0x9ADE`),
159	PACK(`0x8DA0`), PACK(`0x8C62`), PACK(`0x8E24`), PACK(`0x8FE6`),
160	PACK(`0x8AA8`), PACK(`0x8B6A`), PACK(`0x892C`), PACK(`0x88EE`),
161	PACK(`0x83B0`), PACK(`0x8272`), PACK(`0x8034`), PACK(`0x81F6`),
162	PACK(`0x84B8`), PACK(`0x857A`), PACK(`0x873C`), PACK(`0x86FE`),
163	PACK(`0xA9C0`), PACK(`0xA802`), PACK(`0xAA44`), PACK(`0xAB86`),
164	PACK(`0xAEC8`), PACK(`0xAF0A`), PACK(`0xAD4C`), PACK(`0xAC8E`),
165	PACK(`0xA7D0`), PACK(`0xA612`), PACK(`0xA454`), PACK(`0xA596`),
166	PACK(`0xA0D8`), PACK(`0xA11A`), PACK(`0xA35C`), PACK(`0xA29E`),
167	PACK(`0xB5E0`), PACK(`0xB422`), PACK(`0xB664`), PACK(`0xB7A6`),
168	PACK(`0xB2E8`), PACK(`0xB32A`), PACK(`0xB16C`), PACK(`0xB0AE`),
169	PACK(`0xBBF0`), PACK(`0xBA32`), PACK(`0xB874`), PACK(`0xB9B6`),
170	PACK(`0xBCF8`), PACK(`0xBD3A`), PACK(`0xBF7C`), PACK(`0xBEBE`)
171	};
172
173	while (`1`) {
174	Z.hi ^= Htable[n].hi;
175	Z.lo ^= Htable[n].lo;
176
177	if ((u8 *)Xi == xi)
178	break;
179
180	n = *(--xi);
181
182	rem = (size_t)Z.lo & `0xff`;
183	Z.lo = (Z.hi << `56`) \| (Z.lo >> `8`);
184	Z.hi = (Z.hi >> `8`);
185	if (sizeof(size_t) == `8`)
186	Z.hi ^= rem_8bit[rem];
187	else
188	Z.hi ^= (u64)rem_8bit[rem] << `32`;
189	}
190
191	if (is_endian.little) {
192	# ifdef BSWAP8
193	Xi[`0`] = BSWAP8(Z.hi);
194	Xi[`1`] = BSWAP8(Z.lo);
195	# else
196	u8 p = (u8 )Xi;
197	u32 v;
198	v = (u32)(Z.hi >> `32`);
199	PUTU32(p, v);
200	v = (u32)(Z.hi);
201	PUTU32(p + `4`, v);
202	v = (u32)(Z.lo >> `32`);
203	PUTU32(p + `8`, v);
204	v = (u32)(Z.lo);
205	PUTU32(p + `12`, v);
206	# endif
207	} else {
208	Xi[`0`] = Z.hi;
209	Xi[`1`] = Z.lo;
210	}
211	}
212
213	# define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
214
215	#elif TABLE_BITS==4
216
217	static void gcm_init_4bit(u128 Htable[`16`], u64 H[`2`])
218	{
219	u128 V;
220	# if defined(OPENSSL_SMALL_FOOTPRINT)
221	int i;
222	# endif
223
224	Htable[`0`].hi = `0`;
225	Htable[`0`].lo = `0`;
226	V.hi = H[`0`];
227	V.lo = H[`1`];
228
229	# if defined(OPENSSL_SMALL_FOOTPRINT)
230	for (Htable[`8`] = V, i = `4`; i > `0`; i >>= `1`) {
231	REDUCE1BIT(V);
232	Htable[i] = V;
233	}
234
235	for (i = `2`; i < `16`; i <<= `1`) {
236	u128 *Hi = Htable + i;
237	int j;
238	for (V = *Hi, j = `1`; j < i; ++j) {
239	Hi[j].hi = V.hi ^ Htable[j].hi;
240	Hi[j].lo = V.lo ^ Htable[j].lo;
241	}
242	}
243	# else
244	Htable[`8`] = V;
245	REDUCE1BIT(V);
246	Htable[`4`] = V;
247	REDUCE1BIT(V);
248	Htable[`2`] = V;
249	REDUCE1BIT(V);
250	Htable[`1`] = V;
251	Htable[`3`].hi = V.hi ^ Htable[`2`].hi, Htable[`3`].lo = V.lo ^ Htable[`2`].lo;
252	V = Htable[`4`];
253	Htable[`5`].hi = V.hi ^ Htable[`1`].hi, Htable[`5`].lo = V.lo ^ Htable[`1`].lo;
254	Htable[`6`].hi = V.hi ^ Htable[`2`].hi, Htable[`6`].lo = V.lo ^ Htable[`2`].lo;
255	Htable[`7`].hi = V.hi ^ Htable[`3`].hi, Htable[`7`].lo = V.lo ^ Htable[`3`].lo;
256	V = Htable[`8`];
257	Htable[`9`].hi = V.hi ^ Htable[`1`].hi, Htable[`9`].lo = V.lo ^ Htable[`1`].lo;
258	Htable[`10`].hi = V.hi ^ Htable[`2`].hi, Htable[`10`].lo = V.lo ^ Htable[`2`].lo;
259	Htable[`11`].hi = V.hi ^ Htable[`3`].hi, Htable[`11`].lo = V.lo ^ Htable[`3`].lo;
260	Htable[`12`].hi = V.hi ^ Htable[`4`].hi, Htable[`12`].lo = V.lo ^ Htable[`4`].lo;
261	Htable[`13`].hi = V.hi ^ Htable[`5`].hi, Htable[`13`].lo = V.lo ^ Htable[`5`].lo;
262	Htable[`14`].hi = V.hi ^ Htable[`6`].hi, Htable[`14`].lo = V.lo ^ Htable[`6`].lo;
263	Htable[`15`].hi = V.hi ^ Htable[`7`].hi, Htable[`15`].lo = V.lo ^ Htable[`7`].lo;
264	# endif
265	# if defined(GHASH_ASM) && (defined(__arm__) \|\| defined(__arm))
266	/*
267	* ARM assembler expects specific dword order in Htable.
268	*/
269	{
270	int j;
271	const union {
272	long one;
273	char little;
274	} is_endian = { `1` };
275
276	if (is_endian.little)
277	for (j = `0`; j < `16`; ++j) {
278	V = Htable[j];
279	Htable[j].hi = V.lo;
280	Htable[j].lo = V.hi;
281	} else
282	for (j = `0`; j < `16`; ++j) {
283	V = Htable[j];
284	Htable[j].hi = V.lo << `32` \| V.lo >> `32`;
285	Htable[j].lo = V.hi << `32` \| V.hi >> `32`;
286	}
287	}
288	# endif
289	}
290
291	# ifndef GHASH_ASM
292	static const size_t rem_4bit[`16`] = {
293	PACK(`0x0000`), PACK(`0x1C20`), PACK(`0x3840`), PACK(`0x2460`),
294	PACK(`0x7080`), PACK(`0x6CA0`), PACK(`0x48C0`), PACK(`0x54E0`),
295	PACK(`0xE100`), PACK(`0xFD20`), PACK(`0xD940`), PACK(`0xC560`),
296	PACK(`0x9180`), PACK(`0x8DA0`), PACK(`0xA9C0`), PACK(`0xB5E0`)
297	};
298
299	static void gcm_gmult_4bit(u64 Xi[`2`], const u128 Htable[`16`])
300	{
301	u128 Z;
302	int cnt = `15`;
303	size_t rem, nlo, nhi;
304	const union {
305	long one;
306	char little;
307	} is_endian = { `1` };
308
309	nlo = ((const u8 *)Xi)[`15`];
310	nhi = nlo >> `4`;
311	nlo &= `0xf`;
312
313	Z.hi = Htable[nlo].hi;
314	Z.lo = Htable[nlo].lo;
315
316	while (`1`) {
317	rem = (size_t)Z.lo & `0xf`;
318	Z.lo = (Z.hi << `60`) \| (Z.lo >> `4`);
319	Z.hi = (Z.hi >> `4`);
320	if (sizeof(size_t) == `8`)
321	Z.hi ^= rem_4bit[rem];
322	else
323	Z.hi ^= (u64)rem_4bit[rem] << `32`;
324
325	Z.hi ^= Htable[nhi].hi;
326	Z.lo ^= Htable[nhi].lo;
327
328	if (--cnt < `0`)
329	break;
330
331	nlo = ((const u8 *)Xi)[cnt];
332	nhi = nlo >> `4`;
333	nlo &= `0xf`;
334
335	rem = (size_t)Z.lo & `0xf`;
336	Z.lo = (Z.hi << `60`) \| (Z.lo >> `4`);
337	Z.hi = (Z.hi >> `4`);
338	if (sizeof(size_t) == `8`)
339	Z.hi ^= rem_4bit[rem];
340	else
341	Z.hi ^= (u64)rem_4bit[rem] << `32`;
342
343	Z.hi ^= Htable[nlo].hi;
344	Z.lo ^= Htable[nlo].lo;
345	}
346
347	if (is_endian.little) {
348	# ifdef BSWAP8
349	Xi[`0`] = BSWAP8(Z.hi);
350	Xi[`1`] = BSWAP8(Z.lo);
351	# else
352	u8 p = (u8 )Xi;
353	u32 v;
354	v = (u32)(Z.hi >> `32`);
355	PUTU32(p, v);
356	v = (u32)(Z.hi);
357	PUTU32(p + `4`, v);
358	v = (u32)(Z.lo >> `32`);
359	PUTU32(p + `8`, v);
360	v = (u32)(Z.lo);
361	PUTU32(p + `12`, v);
362	# endif
363	} else {
364	Xi[`0`] = Z.hi;
365	Xi[`1`] = Z.lo;
366	}
367	}
368
369	# if !defined(OPENSSL_SMALL_FOOTPRINT)
370	/*
371	* Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en\|de]crypt for
372	* details... Compiler-generated code doesn't seem to give any
373	* performance improvement, at least not on x86[_64]. It's here
374	* mostly as reference and a placeholder for possible future
375	* non-trivial optimization[s]...
376	*/
377	static void gcm_ghash_4bit(u64 Xi[`2`], const u128 Htable[`16`],
378	const u8 *inp, size_t len)
379	{
380	u128 Z;
381	int cnt;
382	size_t rem, nlo, nhi;
383	const union {
384	long one;
385	char little;
386	} is_endian = { `1` };
387
388	# if 1
389	do {
390	cnt = `15`;
391	nlo = ((const u8 *)Xi)[`15`];
392	nlo ^= inp[`15`];
393	nhi = nlo >> `4`;
394	nlo &= `0xf`;
395
396	Z.hi = Htable[nlo].hi;
397	Z.lo = Htable[nlo].lo;
398
399	while (`1`) {
400	rem = (size_t)Z.lo & `0xf`;
401	Z.lo = (Z.hi << `60`) \| (Z.lo >> `4`);
402	Z.hi = (Z.hi >> `4`);
403	if (sizeof(size_t) == `8`)
404	Z.hi ^= rem_4bit[rem];
405	else
406	Z.hi ^= (u64)rem_4bit[rem] << `32`;
407
408	Z.hi ^= Htable[nhi].hi;
409	Z.lo ^= Htable[nhi].lo;
410
411	if (--cnt < `0`)
412	break;
413
414	nlo = ((const u8 *)Xi)[cnt];
415	nlo ^= inp[cnt];
416	nhi = nlo >> `4`;
417	nlo &= `0xf`;
418
419	rem = (size_t)Z.lo & `0xf`;
420	Z.lo = (Z.hi << `60`) \| (Z.lo >> `4`);
421	Z.hi = (Z.hi >> `4`);
422	if (sizeof(size_t) == `8`)
423	Z.hi ^= rem_4bit[rem];
424	else
425	Z.hi ^= (u64)rem_4bit[rem] << `32`;
426
427	Z.hi ^= Htable[nlo].hi;
428	Z.lo ^= Htable[nlo].lo;
429	}
430	# else
431	/*
432	* Extra 256+16 bytes per-key plus 512 bytes shared tables
433	* [should] give ~50% improvement... One could have PACK()-ed
434	* the rem_8bit even here, but the priority is to minimize
435	* cache footprint...
436	*/
437	u128 Hshr4[`16`]; / Htable shifted right by 4 bits /
438	u8 Hshl4[`16`]; / Htable shifted left by 4 bits /
439	static const unsigned short rem_8bit[`256`] = {
440	`0x0000`, `0x01C2`, `0x0384`, `0x0246`, `0x0708`, `0x06CA`, `0x048C`, `0x054E`,
441	`0x0E10`, `0x0FD2`, `0x0D94`, `0x0C56`, `0x0918`, `0x08DA`, `0x0A9C`, `0x0B5E`,
442	`0x1C20`, `0x1DE2`, `0x1FA4`, `0x1E66`, `0x1B28`, `0x1AEA`, `0x18AC`, `0x196E`,
443	`0x1230`, `0x13F2`, `0x11B4`, `0x1076`, `0x1538`, `0x14FA`, `0x16BC`, `0x177E`,
444	`0x3840`, `0x3982`, `0x3BC4`, `0x3A06`, `0x3F48`, `0x3E8A`, `0x3CCC`, `0x3D0E`,
445	`0x3650`, `0x3792`, `0x35D4`, `0x3416`, `0x3158`, `0x309A`, `0x32DC`, `0x331E`,
446	`0x2460`, `0x25A2`, `0x27E4`, `0x2626`, `0x2368`, `0x22AA`, `0x20EC`, `0x212E`,
447	`0x2A70`, `0x2BB2`, `0x29F4`, `0x2836`, `0x2D78`, `0x2CBA`, `0x2EFC`, `0x2F3E`,
448	`0x7080`, `0x7142`, `0x7304`, `0x72C6`, `0x7788`, `0x764A`, `0x740C`, `0x75CE`,
449	`0x7E90`, `0x7F52`, `0x7D14`, `0x7CD6`, `0x7998`, `0x785A`, `0x7A1C`, `0x7BDE`,
450	`0x6CA0`, `0x6D62`, `0x6F24`, `0x6EE6`, `0x6BA8`, `0x6A6A`, `0x682C`, `0x69EE`,
451	`0x62B0`, `0x6372`, `0x6134`, `0x60F6`, `0x65B8`, `0x647A`, `0x663C`, `0x67FE`,
452	`0x48C0`, `0x4902`, `0x4B44`, `0x4A86`, `0x4FC8`, `0x4E0A`, `0x4C4C`, `0x4D8E`,
453	`0x46D0`, `0x4712`, `0x4554`, `0x4496`, `0x41D8`, `0x401A`, `0x425C`, `0x439E`,
454	`0x54E0`, `0x5522`, `0x5764`, `0x56A6`, `0x53E8`, `0x522A`, `0x506C`, `0x51AE`,
455	`0x5AF0`, `0x5B32`, `0x5974`, `0x58B6`, `0x5DF8`, `0x5C3A`, `0x5E7C`, `0x5FBE`,
456	`0xE100`, `0xE0C2`, `0xE284`, `0xE346`, `0xE608`, `0xE7CA`, `0xE58C`, `0xE44E`,
457	`0xEF10`, `0xEED2`, `0xEC94`, `0xED56`, `0xE818`, `0xE9DA`, `0xEB9C`, `0xEA5E`,
458	`0xFD20`, `0xFCE2`, `0xFEA4`, `0xFF66`, `0xFA28`, `0xFBEA`, `0xF9AC`, `0xF86E`,
459	`0xF330`, `0xF2F2`, `0xF0B4`, `0xF176`, `0xF438`, `0xF5FA`, `0xF7BC`, `0xF67E`,
460	`0xD940`, `0xD882`, `0xDAC4`, `0xDB06`, `0xDE48`, `0xDF8A`, `0xDDCC`, `0xDC0E`,
461	`0xD750`, `0xD692`, `0xD4D4`, `0xD516`, `0xD058`, `0xD19A`, `0xD3DC`, `0xD21E`,
462	`0xC560`, `0xC4A2`, `0xC6E4`, `0xC726`, `0xC268`, `0xC3AA`, `0xC1EC`, `0xC02E`,
463	`0xCB70`, `0xCAB2`, `0xC8F4`, `0xC936`, `0xCC78`, `0xCDBA`, `0xCFFC`, `0xCE3E`,
464	`0x9180`, `0x9042`, `0x9204`, `0x93C6`, `0x9688`, `0x974A`, `0x950C`, `0x94CE`,
465	`0x9F90`, `0x9E52`, `0x9C14`, `0x9DD6`, `0x9898`, `0x995A`, `0x9B1C`, `0x9ADE`,
466	`0x8DA0`, `0x8C62`, `0x8E24`, `0x8FE6`, `0x8AA8`, `0x8B6A`, `0x892C`, `0x88EE`,
467	`0x83B0`, `0x8272`, `0x8034`, `0x81F6`, `0x84B8`, `0x857A`, `0x873C`, `0x86FE`,
468	`0xA9C0`, `0xA802`, `0xAA44`, `0xAB86`, `0xAEC8`, `0xAF0A`, `0xAD4C`, `0xAC8E`,
469	`0xA7D0`, `0xA612`, `0xA454`, `0xA596`, `0xA0D8`, `0xA11A`, `0xA35C`, `0xA29E`,
470	`0xB5E0`, `0xB422`, `0xB664`, `0xB7A6`, `0xB2E8`, `0xB32A`, `0xB16C`, `0xB0AE`,
471	`0xBBF0`, `0xBA32`, `0xB874`, `0xB9B6`, `0xBCF8`, `0xBD3A`, `0xBF7C`, `0xBEBE`
472	};
473	/*
474	* This pre-processing phase slows down procedure by approximately
475	* same time as it makes each loop spin faster. In other words
476	* single block performance is approximately same as straightforward
477	* "4-bit" implementation, and then it goes only faster...
478	*/
479	for (cnt = `0`; cnt < `16`; ++cnt) {
480	Z.hi = Htable[cnt].hi;
481	Z.lo = Htable[cnt].lo;
482	Hshr4[cnt].lo = (Z.hi << `60`) \| (Z.lo >> `4`);
483	Hshr4[cnt].hi = (Z.hi >> `4`);
484	Hshl4[cnt] = (u8)(Z.lo << `4`);
485	}
486
487	do {
488	for (Z.lo = `0`, Z.hi = `0`, cnt = `15`; cnt; --cnt) {
489	nlo = ((const u8 *)Xi)[cnt];
490	nlo ^= inp[cnt];
491	nhi = nlo >> `4`;
492	nlo &= `0xf`;
493
494	Z.hi ^= Htable[nlo].hi;
495	Z.lo ^= Htable[nlo].lo;
496
497	rem = (size_t)Z.lo & `0xff`;
498
499	Z.lo = (Z.hi << `56`) \| (Z.lo >> `8`);
500	Z.hi = (Z.hi >> `8`);
501
502	Z.hi ^= Hshr4[nhi].hi;
503	Z.lo ^= Hshr4[nhi].lo;
504	Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << `48`;
505	}
506
507	nlo = ((const u8 *)Xi)[`0`];
508	nlo ^= inp[`0`];
509	nhi = nlo >> `4`;
510	nlo &= `0xf`;
511
512	Z.hi ^= Htable[nlo].hi;
513	Z.lo ^= Htable[nlo].lo;
514
515	rem = (size_t)Z.lo & `0xf`;
516
517	Z.lo = (Z.hi << `60`) \| (Z.lo >> `4`);
518	Z.hi = (Z.hi >> `4`);
519
520	Z.hi ^= Htable[nhi].hi;
521	Z.lo ^= Htable[nhi].lo;
522	Z.hi ^= ((u64)rem_8bit[rem << `4`]) << `48`;
523	# endif
524
525	if (is_endian.little) {
526	# ifdef BSWAP8
527	Xi[`0`] = BSWAP8(Z.hi);
528	Xi[`1`] = BSWAP8(Z.lo);
529	# else
530	u8 p = (u8 )Xi;
531	u32 v;
532	v = (u32)(Z.hi >> `32`);
533	PUTU32(p, v);
534	v = (u32)(Z.hi);
535	PUTU32(p + `4`, v);
536	v = (u32)(Z.lo >> `32`);
537	PUTU32(p + `8`, v);
538	v = (u32)(Z.lo);
539	PUTU32(p + `12`, v);
540	# endif
541	} else {
542	Xi[`0`] = Z.hi;
543	Xi[`1`] = Z.lo;
544	}
545	} while (inp += `16`, len -= `16`);
546	}
547	# endif
548	# else
549	void gcm_gmult_4bit(u64 Xi[`2`], const u128 Htable[`16`]);
550	void gcm_ghash_4bit(u64 Xi[`2`], const u128 Htable[`16`], const u8 *inp,
551	size_t len);
552	# endif
553
554	# define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
555	# if defined(GHASH_ASM) \|\| !defined(OPENSSL_SMALL_FOOTPRINT)
556	# define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
557	/*
558	* GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing
559	* effect. In other words idea is to hash data while it's still in L1 cache
560	* after encryption pass...
561	*/
562	# define GHASH_CHUNK (3*1024)
563	# endif
564
565	#else /* TABLE_BITS */
566
567	static void gcm_gmult_1bit(u64 Xi[`2`], const u64 H[`2`])
568	{
569	u128 V, Z = { `0`, `0` };
570	long X;
571	int i, j;
572	const long xi = (const* long *)Xi;
573	const union {
574	long one;
575	char little;
576	} is_endian = { `1` };
577
578	V.hi = H[`0`]; / H is in host byte order, no byte swapping /
579	V.lo = H[`1`];
580
581	for (j = `0`; j < `16` / sizeof(long); ++j) {
582	if (is_endian.little) {
583	if (sizeof(long) == `8`) {
584	# ifdef BSWAP8
585	X = (long)(BSWAP8(xi[j]));
586	# else
587	const u8 p = (const* u8 *)(xi + j);
588	X = (long)((u64)GETU32(p) << `32` \| GETU32(p + `4`));
589	# endif
590	} else {
591	const u8 p = (const* u8 *)(xi + j);
592	X = (long)GETU32(p);
593	}
594	} else
595	X = xi[j];
596
597	for (i = `0`; i < `8` * sizeof(long); ++i, X <<= `1`) {
598	u64 M = (u64)(X >> (`8` * sizeof(long) - `1`));
599	Z.hi ^= V.hi & M;
600	Z.lo ^= V.lo & M;
601
602	REDUCE1BIT(V);
603	}
604	}
605
606	if (is_endian.little) {
607	# ifdef BSWAP8
608	Xi[`0`] = BSWAP8(Z.hi);
609	Xi[`1`] = BSWAP8(Z.lo);
610	# else
611	u8 p = (u8 )Xi;
612	u32 v;
613	v = (u32)(Z.hi >> `32`);
614	PUTU32(p, v);
615	v = (u32)(Z.hi);
616	PUTU32(p + `4`, v);
617	v = (u32)(Z.lo >> `32`);
618	PUTU32(p + `8`, v);
619	v = (u32)(Z.lo);
620	PUTU32(p + `12`, v);
621	# endif
622	} else {
623	Xi[`0`] = Z.hi;
624	Xi[`1`] = Z.lo;
625	}
626	}
627
628	# define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
629
630	#endif
631
/*
 * Per-architecture GHASH back ends.  Each branch names the platform with a
 * GHASH_ASM_* macro, defines GCM_FUNCREF_4BIT (so GCM_MUL/GHASH are routed
 * through function pointers further down) and declares the init/gmult/ghash
 * routines that CRYPTO_gcm128_init() chooses between.  The routines are
 * only declared here, not defined.
 */
#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
/* On 32-bit x86 the "avx" names are aliases for the clmul routines. */
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif
703
/*
 * When a back end was selected at run time, route GCM_MUL and (if it was
 * defined) GHASH through the local function pointers gcm_gmult_p and
 * gcm_ghash_p.  Every function that uses these macros must therefore
 * declare those pointers, which the functions below load from ctx->gmult
 * and ctx->ghash.
 */
#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
712
713	void CRYPTO_gcm128_init(GCM128_CONTEXT ctx, void* *key, block128_f block)
714	{
715	const union {
716	long one;
717	char little;
718	} is_endian = { `1` };
719
720	memset(ctx, `0`, sizeof(*ctx));
721	ctx->block = block;
722	ctx->key = key;
723
724	(*block) (ctx->H.c, ctx->H.c, key);
725
726	if (is_endian.little) {
727	/ H is stored in host byte order /
728	#ifdef BSWAP8
729	ctx->H.u[`0`] = BSWAP8(ctx->H.u[`0`]);
730	ctx->H.u[`1`] = BSWAP8(ctx->H.u[`1`]);
731	#else
732	u8 *p = ctx->H.c;
733	u64 hi, lo;
734	hi = (u64)GETU32(p) << `32` \| GETU32(p + `4`);
735	lo = (u64)GETU32(p + `8`) << `32` \| GETU32(p + `12`);
736	ctx->H.u[`0`] = hi;
737	ctx->H.u[`1`] = lo;
738	#endif
739	}
740	#if TABLE_BITS==8
741	gcm_init_8bit(ctx->Htable, ctx->H.u);
742	#elif TABLE_BITS==4
743	# if defined(GHASH)
744	# define CTX__GHASH(f) (ctx->ghash = (f))
745	# else
746	# define CTX__GHASH(f) (ctx->ghash = NULL)
747	# endif
748	# if defined(GHASH_ASM_X86_OR_64)
749	# if !defined(GHASH_ASM_X86) \|\| defined(OPENSSL_IA32_SSE2)
750	if (OPENSSL_ia32cap_P[`1`] & (`1` << `1`)) { / check PCLMULQDQ bit /
751	if (((OPENSSL_ia32cap_P[`1`] >> `22`) & `0x41`) == `0x41`) { / AVX+MOVBE /
752	gcm_init_avx(ctx->Htable, ctx->H.u);
753	ctx->gmult = gcm_gmult_avx;
754	CTX__GHASH(gcm_ghash_avx);
755	} else {
756	gcm_init_clmul(ctx->Htable, ctx->H.u);
757	ctx->gmult = gcm_gmult_clmul;
758	CTX__GHASH(gcm_ghash_clmul);
759	}
760	return;
761	}
762	# endif
763	gcm_init_4bit(ctx->Htable, ctx->H.u);
764	# if defined(GHASH_ASM_X86) /* x86 only */
765	# if defined(OPENSSL_IA32_SSE2)
766	if (OPENSSL_ia32cap_P[`0`] & (`1` << `25`)) { / check SSE bit /
767	# else
768	if (OPENSSL_ia32cap_P[`0`] & (`1` << `23`)) { / check MMX bit /
769	# endif
770	ctx->gmult = gcm_gmult_4bit_mmx;
771	CTX__GHASH(gcm_ghash_4bit_mmx);
772	} else {
773	ctx->gmult = gcm_gmult_4bit_x86;
774	CTX__GHASH(gcm_ghash_4bit_x86);
775	}
776	# else
777	ctx->gmult = gcm_gmult_4bit;
778	CTX__GHASH(gcm_ghash_4bit);
779	# endif
780	# elif defined(GHASH_ASM_ARM)
781	# ifdef PMULL_CAPABLE
782	if (PMULL_CAPABLE) {
783	gcm_init_v8(ctx->Htable, ctx->H.u);
784	ctx->gmult = gcm_gmult_v8;
785	CTX__GHASH(gcm_ghash_v8);
786	} else
787	# endif
788	# ifdef NEON_CAPABLE
789	if (NEON_CAPABLE) {
790	gcm_init_neon(ctx->Htable, ctx->H.u);
791	ctx->gmult = gcm_gmult_neon;
792	CTX__GHASH(gcm_ghash_neon);
793	} else
794	# endif
795	{
796	gcm_init_4bit(ctx->Htable, ctx->H.u);
797	ctx->gmult = gcm_gmult_4bit;
798	CTX__GHASH(gcm_ghash_4bit);
799	}
800	# elif defined(GHASH_ASM_SPARC)
801	if (OPENSSL_sparcv9cap_P[`0`] & SPARCV9_VIS3) {
802	gcm_init_vis3(ctx->Htable, ctx->H.u);
803	ctx->gmult = gcm_gmult_vis3;
804	CTX__GHASH(gcm_ghash_vis3);
805	} else {
806	gcm_init_4bit(ctx->Htable, ctx->H.u);
807	ctx->gmult = gcm_gmult_4bit;
808	CTX__GHASH(gcm_ghash_4bit);
809	}
810	# elif defined(GHASH_ASM_PPC)
811	if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
812	gcm_init_p8(ctx->Htable, ctx->H.u);
813	ctx->gmult = gcm_gmult_p8;
814	CTX__GHASH(gcm_ghash_p8);
815	} else {
816	gcm_init_4bit(ctx->Htable, ctx->H.u);
817	ctx->gmult = gcm_gmult_4bit;
818	CTX__GHASH(gcm_ghash_4bit);
819	}
820	# else
821	gcm_init_4bit(ctx->Htable, ctx->H.u);
822	# endif
823	# undef CTX__GHASH
824	#endif
825	}
826
827	void CRYPTO_gcm128_setiv(GCM128_CONTEXT ctx, const* unsigned char *iv,
828	size_t len)
829	{
830	const union {
831	long one;
832	char little;
833	} is_endian = { `1` };
834	unsigned int ctr;
835	#ifdef GCM_FUNCREF_4BIT
836	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
837	#endif
838
839	ctx->len.u[`0`] = `0`; / AAD length /
840	ctx->len.u[`1`] = `0`; / message length /
841	ctx->ares = `0`;
842	ctx->mres = `0`;
843
844	if (len == `12`) {
845	memcpy(ctx->Yi.c, iv, `12`);
846	ctx->Yi.c[`12`] = `0`;
847	ctx->Yi.c[`13`] = `0`;
848	ctx->Yi.c[`14`] = `0`;
849	ctx->Yi.c[`15`] = `1`;
850	ctr = `1`;
851	} else {
852	size_t i;
853	u64 len0 = len;
854
855	/ Borrow ctx->Xi to calculate initial Yi /
856	ctx->Xi.u[`0`] = `0`;
857	ctx->Xi.u[`1`] = `0`;
858
859	while (len >= `16`) {
860	for (i = `0`; i < `16`; ++i)
861	ctx->Xi.c[i] ^= iv[i];
862	GCM_MUL(ctx);
863	iv += `16`;
864	len -= `16`;
865	}
866	if (len) {
867	for (i = `0`; i < len; ++i)
868	ctx->Xi.c[i] ^= iv[i];
869	GCM_MUL(ctx);
870	}
871	len0 <<= `3`;
872	if (is_endian.little) {
873	#ifdef BSWAP8
874	ctx->Xi.u[`1`] ^= BSWAP8(len0);
875	#else
876	ctx->Xi.c[`8`] ^= (u8)(len0 >> `56`);
877	ctx->Xi.c[`9`] ^= (u8)(len0 >> `48`);
878	ctx->Xi.c[`10`] ^= (u8)(len0 >> `40`);
879	ctx->Xi.c[`11`] ^= (u8)(len0 >> `32`);
880	ctx->Xi.c[`12`] ^= (u8)(len0 >> `24`);
881	ctx->Xi.c[`13`] ^= (u8)(len0 >> `16`);
882	ctx->Xi.c[`14`] ^= (u8)(len0 >> `8`);
883	ctx->Xi.c[`15`] ^= (u8)(len0);
884	#endif
885	} else {
886	ctx->Xi.u[`1`] ^= len0;
887	}
888
889	GCM_MUL(ctx);
890
891	if (is_endian.little)
892	#ifdef BSWAP4
893	ctr = BSWAP4(ctx->Xi.d[`3`]);
894	#else
895	ctr = GETU32(ctx->Xi.c + `12`);
896	#endif
897	else
898	ctr = ctx->Xi.d[`3`];
899
900	/ Copy borrowed Xi to Yi /
901	ctx->Yi.u[`0`] = ctx->Xi.u[`0`];
902	ctx->Yi.u[`1`] = ctx->Xi.u[`1`];
903	}
904
905	ctx->Xi.u[`0`] = `0`;
906	ctx->Xi.u[`1`] = `0`;
907
908	(*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
909	++ctr;
910	if (is_endian.little)
911	#ifdef BSWAP4
912	ctx->Yi.d[`3`] = BSWAP4(ctr);
913	#else
914	PUTU32(ctx->Yi.c + `12`, ctr);
915	#endif
916	else
917	ctx->Yi.d[`3`] = ctr;
918	}
919
920	int CRYPTO_gcm128_aad(GCM128_CONTEXT ctx, const* unsigned char *aad,
921	size_t len)
922	{
923	size_t i;
924	unsigned int n;
925	u64 alen = ctx->len.u[`0`];
926	#ifdef GCM_FUNCREF_4BIT
927	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
928	# ifdef GHASH
929	void (gcm_ghash_p) (u64 Xi[`2`], const* u128 Htable[`16`],
930	const u8 *inp, size_t len) = ctx->ghash;
931	# endif
932	#endif
933
934	if (ctx->len.u[`1`])
935	return -`2`;
936
937	alen += len;
938	if (alen > (U64(`1`) << `61`) \|\| (sizeof(len) == `8` && alen < len))
939	return -`1`;
940	ctx->len.u[`0`] = alen;
941
942	n = ctx->ares;
943	if (n) {
944	while (n && len) {
945	ctx->Xi.c[n] ^= *(aad++);
946	--len;
947	n = (n + `1`) % `16`;
948	}
949	if (n == `0`)
950	GCM_MUL(ctx);
951	else {
952	ctx->ares = n;
953	return `0`;
954	}
955	}
956	#ifdef GHASH
957	if ((i = (len & (size_t)-`16`))) {
958	GHASH(ctx, aad, i);
959	aad += i;
960	len -= i;
961	}
962	#else
963	while (len >= `16`) {
964	for (i = `0`; i < `16`; ++i)
965	ctx->Xi.c[i] ^= aad[i];
966	GCM_MUL(ctx);
967	aad += `16`;
968	len -= `16`;
969	}
970	#endif
971	if (len) {
972	n = (unsigned int)len;
973	for (i = `0`; i < len; ++i)
974	ctx->Xi.c[i] ^= aad[i];
975	}
976
977	ctx->ares = n;
978	return `0`;
979	}
980
981	int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
982	const unsigned char in, unsigned* char *out,
983	size_t len)
984	{
985	const union {
986	long one;
987	char little;
988	} is_endian = { `1` };
989	unsigned int n, ctr, mres;
990	size_t i;
991	u64 mlen = ctx->len.u[`1`];
992	block128_f block = ctx->block;
993	void *key = ctx->key;
994	#ifdef GCM_FUNCREF_4BIT
995	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
996	# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
997	void (gcm_ghash_p) (u64 Xi[`2`], const* u128 Htable[`16`],
998	const u8 *inp, size_t len) = ctx->ghash;
999	# endif
1000	#endif
1001
1002	mlen += len;
1003	if (mlen > ((U64(`1`) << `36`) - `32`) \|\| (sizeof(len) == `8` && mlen < len))
1004	return -`1`;
1005	ctx->len.u[`1`] = mlen;
1006
1007	mres = ctx->mres;
1008
1009	if (ctx->ares) {
1010	/ First call to encrypt finalizes GHASH(AAD) /
1011	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1012	if (len == `0`) {
1013	GCM_MUL(ctx);
1014	ctx->ares = `0`;
1015	return `0`;
1016	}
1017	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1018	ctx->Xi.u[`0`] = `0`;
1019	ctx->Xi.u[`1`] = `0`;
1020	mres = sizeof(ctx->Xi);
1021	#else
1022	GCM_MUL(ctx);
1023	#endif
1024	ctx->ares = `0`;
1025	}
1026
1027	if (is_endian.little)
1028	#ifdef BSWAP4
1029	ctr = BSWAP4(ctx->Yi.d[`3`]);
1030	#else
1031	ctr = GETU32(ctx->Yi.c + `12`);
1032	#endif
1033	else
1034	ctr = ctx->Yi.d[`3`];
1035
1036	n = mres % `16`;
1037	#if !defined(OPENSSL_SMALL_FOOTPRINT)
1038	if (`16` % sizeof(size_t) == `0`) { / always true actually /
1039	do {
1040	if (n) {
1041	# if defined(GHASH)
1042	while (n && len) {
1043	ctx->Xn[mres++] = (out++) = (in++) ^ ctx->EKi.c[n];
1044	--len;
1045	n = (n + `1`) % `16`;
1046	}
1047	if (n == `0`) {
1048	GHASH(ctx, ctx->Xn, mres);
1049	mres = `0`;
1050	} else {
1051	ctx->mres = mres;
1052	return `0`;
1053	}
1054	# else
1055	while (n && len) {
1056	ctx->Xi.c[n] ^= (out++) = (in++) ^ ctx->EKi.c[n];
1057	--len;
1058	n = (n + `1`) % `16`;
1059	}
1060	if (n == `0`) {
1061	GCM_MUL(ctx);
1062	mres = `0`;
1063	} else {
1064	ctx->mres = n;
1065	return `0`;
1066	}
1067	# endif
1068	}
1069	# if defined(STRICT_ALIGNMENT)
1070	if (((size_t)in \| (size_t)out) % sizeof(size_t) != `0`)
1071	break;
1072	# endif
1073	# if defined(GHASH)
1074	if (len >= `16` && mres) {
1075	GHASH(ctx, ctx->Xn, mres);
1076	mres = `0`;
1077	}
1078	# if defined(GHASH_CHUNK)
1079	while (len >= GHASH_CHUNK) {
1080	size_t j = GHASH_CHUNK;
1081
1082	while (j) {
1083	size_t out_t = (size_t )out;
1084	const size_t in_t = (const* size_t *)in;
1085
1086	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1087	++ctr;
1088	if (is_endian.little)
1089	# ifdef BSWAP4
1090	ctx->Yi.d[`3`] = BSWAP4(ctr);
1091	# else
1092	PUTU32(ctx->Yi.c + `12`, ctr);
1093	# endif
1094	else
1095	ctx->Yi.d[`3`] = ctr;
1096	for (i = `0`; i < `16` / sizeof(size_t); ++i)
1097	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1098	out += `16`;
1099	in += `16`;
1100	j -= `16`;
1101	}
1102	GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1103	len -= GHASH_CHUNK;
1104	}
1105	# endif
1106	if ((i = (len & (size_t)-`16`))) {
1107	size_t j = i;
1108
1109	while (len >= `16`) {
1110	size_t out_t = (size_t )out;
1111	const size_t in_t = (const* size_t *)in;
1112
1113	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1114	++ctr;
1115	if (is_endian.little)
1116	# ifdef BSWAP4
1117	ctx->Yi.d[`3`] = BSWAP4(ctr);
1118	# else
1119	PUTU32(ctx->Yi.c + `12`, ctr);
1120	# endif
1121	else
1122	ctx->Yi.d[`3`] = ctr;
1123	for (i = `0`; i < `16` / sizeof(size_t); ++i)
1124	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1125	out += `16`;
1126	in += `16`;
1127	len -= `16`;
1128	}
1129	GHASH(ctx, out - j, j);
1130	}
1131	# else
1132	while (len >= `16`) {
1133	size_t out_t = (size_t )out;
1134	const size_t in_t = (const* size_t *)in;
1135
1136	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1137	++ctr;
1138	if (is_endian.little)
1139	# ifdef BSWAP4
1140	ctx->Yi.d[`3`] = BSWAP4(ctr);
1141	# else
1142	PUTU32(ctx->Yi.c + `12`, ctr);
1143	# endif
1144	else
1145	ctx->Yi.d[`3`] = ctr;
1146	for (i = `0`; i < `16` / sizeof(size_t); ++i)
1147	ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1148	GCM_MUL(ctx);
1149	out += `16`;
1150	in += `16`;
1151	len -= `16`;
1152	}
1153	# endif
1154	if (len) {
1155	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1156	++ctr;
1157	if (is_endian.little)
1158	# ifdef BSWAP4
1159	ctx->Yi.d[`3`] = BSWAP4(ctr);
1160	# else
1161	PUTU32(ctx->Yi.c + `12`, ctr);
1162	# endif
1163	else
1164	ctx->Yi.d[`3`] = ctr;
1165	# if defined(GHASH)
1166	while (len--) {
1167	ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1168	++n;
1169	}
1170	# else
1171	while (len--) {
1172	ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1173	++n;
1174	}
1175	mres = n;
1176	# endif
1177	}
1178
1179	ctx->mres = mres;
1180	return `0`;
1181	} while (`0`);
1182	}
1183	#endif
1184	for (i = `0`; i < len; ++i) {
1185	if (n == `0`) {
1186	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1187	++ctr;
1188	if (is_endian.little)
1189	#ifdef BSWAP4
1190	ctx->Yi.d[`3`] = BSWAP4(ctr);
1191	#else
1192	PUTU32(ctx->Yi.c + `12`, ctr);
1193	#endif
1194	else
1195	ctx->Yi.d[`3`] = ctr;
1196	}
1197	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1198	ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
1199	n = (n + `1`) % `16`;
1200	if (mres == sizeof(ctx->Xn)) {
1201	GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1202	mres = `0`;
1203	}
1204	#else
1205	ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1206	mres = n = (n + `1`) % `16`;
1207	if (n == `0`)
1208	GCM_MUL(ctx);
1209	#endif
1210	}
1211
1212	ctx->mres = mres;
1213	return `0`;
1214	}
1215
1216	int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1217	const unsigned char in, unsigned* char *out,
1218	size_t len)
1219	{
1220	const union {
1221	long one;
1222	char little;
1223	} is_endian = { `1` };
1224	unsigned int n, ctr, mres;
1225	size_t i;
1226	u64 mlen = ctx->len.u[`1`];
1227	block128_f block = ctx->block;
1228	void *key = ctx->key;
1229	#ifdef GCM_FUNCREF_4BIT
1230	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
1231	# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1232	void (gcm_ghash_p) (u64 Xi[`2`], const* u128 Htable[`16`],
1233	const u8 *inp, size_t len) = ctx->ghash;
1234	# endif
1235	#endif
1236
1237	mlen += len;
1238	if (mlen > ((U64(`1`) << `36`) - `32`) \|\| (sizeof(len) == `8` && mlen < len))
1239	return -`1`;
1240	ctx->len.u[`1`] = mlen;
1241
1242	mres = ctx->mres;
1243
1244	if (ctx->ares) {
1245	/ First call to decrypt finalizes GHASH(AAD) /
1246	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1247	if (len == `0`) {
1248	GCM_MUL(ctx);
1249	ctx->ares = `0`;
1250	return `0`;
1251	}
1252	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1253	ctx->Xi.u[`0`] = `0`;
1254	ctx->Xi.u[`1`] = `0`;
1255	mres = sizeof(ctx->Xi);
1256	#else
1257	GCM_MUL(ctx);
1258	#endif
1259	ctx->ares = `0`;
1260	}
1261
1262	if (is_endian.little)
1263	#ifdef BSWAP4
1264	ctr = BSWAP4(ctx->Yi.d[`3`]);
1265	#else
1266	ctr = GETU32(ctx->Yi.c + `12`);
1267	#endif
1268	else
1269	ctr = ctx->Yi.d[`3`];
1270
1271	n = mres % `16`;
1272	#if !defined(OPENSSL_SMALL_FOOTPRINT)
1273	if (`16` % sizeof(size_t) == `0`) { / always true actually /
1274	do {
1275	if (n) {
1276	# if defined(GHASH)
1277	while (n && len) {
1278	(out++) = (ctx->Xn[mres++] = (in++)) ^ ctx->EKi.c[n];
1279	--len;
1280	n = (n + `1`) % `16`;
1281	}
1282	if (n == `0`) {
1283	GHASH(ctx, ctx->Xn, mres);
1284	mres = `0`;
1285	} else {
1286	ctx->mres = mres;
1287	return `0`;
1288	}
1289	# else
1290	while (n && len) {
1291	u8 c = *(in++);
1292	*(out++) = c ^ ctx->EKi.c[n];
1293	ctx->Xi.c[n] ^= c;
1294	--len;
1295	n = (n + `1`) % `16`;
1296	}
1297	if (n == `0`) {
1298	GCM_MUL(ctx);
1299	mres = `0`;
1300	} else {
1301	ctx->mres = n;
1302	return `0`;
1303	}
1304	# endif
1305	}
1306	# if defined(STRICT_ALIGNMENT)
1307	if (((size_t)in \| (size_t)out) % sizeof(size_t) != `0`)
1308	break;
1309	# endif
1310	# if defined(GHASH)
1311	if (len >= `16` && mres) {
1312	GHASH(ctx, ctx->Xn, mres);
1313	mres = `0`;
1314	}
1315	# if defined(GHASH_CHUNK)
1316	while (len >= GHASH_CHUNK) {
1317	size_t j = GHASH_CHUNK;
1318
1319	GHASH(ctx, in, GHASH_CHUNK);
1320	while (j) {
1321	size_t out_t = (size_t )out;
1322	const size_t in_t = (const* size_t *)in;
1323
1324	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1325	++ctr;
1326	if (is_endian.little)
1327	# ifdef BSWAP4
1328	ctx->Yi.d[`3`] = BSWAP4(ctr);
1329	# else
1330	PUTU32(ctx->Yi.c + `12`, ctr);
1331	# endif
1332	else
1333	ctx->Yi.d[`3`] = ctr;
1334	for (i = `0`; i < `16` / sizeof(size_t); ++i)
1335	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1336	out += `16`;
1337	in += `16`;
1338	j -= `16`;
1339	}
1340	len -= GHASH_CHUNK;
1341	}
1342	# endif
1343	if ((i = (len & (size_t)-`16`))) {
1344	GHASH(ctx, in, i);
1345	while (len >= `16`) {
1346	size_t out_t = (size_t )out;
1347	const size_t in_t = (const* size_t *)in;
1348
1349	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1350	++ctr;
1351	if (is_endian.little)
1352	# ifdef BSWAP4
1353	ctx->Yi.d[`3`] = BSWAP4(ctr);
1354	# else
1355	PUTU32(ctx->Yi.c + `12`, ctr);
1356	# endif
1357	else
1358	ctx->Yi.d[`3`] = ctr;
1359	for (i = `0`; i < `16` / sizeof(size_t); ++i)
1360	out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1361	out += `16`;
1362	in += `16`;
1363	len -= `16`;
1364	}
1365	}
1366	# else
1367	while (len >= `16`) {
1368	size_t out_t = (size_t )out;
1369	const size_t in_t = (const* size_t *)in;
1370
1371	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1372	++ctr;
1373	if (is_endian.little)
1374	# ifdef BSWAP4
1375	ctx->Yi.d[`3`] = BSWAP4(ctr);
1376	# else
1377	PUTU32(ctx->Yi.c + `12`, ctr);
1378	# endif
1379	else
1380	ctx->Yi.d[`3`] = ctr;
1381	for (i = `0`; i < `16` / sizeof(size_t); ++i) {
1382	size_t c = in[i];
1383	out[i] = c ^ ctx->EKi.t[i];
1384	ctx->Xi.t[i] ^= c;
1385	}
1386	GCM_MUL(ctx);
1387	out += `16`;
1388	in += `16`;
1389	len -= `16`;
1390	}
1391	# endif
1392	if (len) {
1393	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1394	++ctr;
1395	if (is_endian.little)
1396	# ifdef BSWAP4
1397	ctx->Yi.d[`3`] = BSWAP4(ctr);
1398	# else
1399	PUTU32(ctx->Yi.c + `12`, ctr);
1400	# endif
1401	else
1402	ctx->Yi.d[`3`] = ctr;
1403	# if defined(GHASH)
1404	while (len--) {
1405	out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1406	++n;
1407	}
1408	# else
1409	while (len--) {
1410	u8 c = in[n];
1411	ctx->Xi.c[n] ^= c;
1412	out[n] = c ^ ctx->EKi.c[n];
1413	++n;
1414	}
1415	mres = n;
1416	# endif
1417	}
1418
1419	ctx->mres = mres;
1420	return `0`;
1421	} while (`0`);
1422	}
1423	#endif
1424	for (i = `0`; i < len; ++i) {
1425	u8 c;
1426	if (n == `0`) {
1427	(*block) (ctx->Yi.c, ctx->EKi.c, key);
1428	++ctr;
1429	if (is_endian.little)
1430	#ifdef BSWAP4
1431	ctx->Yi.d[`3`] = BSWAP4(ctr);
1432	#else
1433	PUTU32(ctx->Yi.c + `12`, ctr);
1434	#endif
1435	else
1436	ctx->Yi.d[`3`] = ctr;
1437	}
1438	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1439	out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
1440	n = (n + `1`) % `16`;
1441	if (mres == sizeof(ctx->Xn)) {
1442	GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1443	mres = `0`;
1444	}
1445	#else
1446	c = in[i];
1447	out[i] = c ^ ctx->EKi.c[n];
1448	ctx->Xi.c[n] ^= c;
1449	mres = n = (n + `1`) % `16`;
1450	if (n == `0`)
1451	GCM_MUL(ctx);
1452	#endif
1453	}
1454
1455	ctx->mres = mres;
1456	return `0`;
1457	}
1458
1459	int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1460	const unsigned char in, unsigned* char *out,
1461	size_t len, ctr128_f stream)
1462	{
1463	#if defined(OPENSSL_SMALL_FOOTPRINT)
1464	return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1465	#else
1466	const union {
1467	long one;
1468	char little;
1469	} is_endian = { `1` };
1470	unsigned int n, ctr, mres;
1471	size_t i;
1472	u64 mlen = ctx->len.u[`1`];
1473	void *key = ctx->key;
1474	# ifdef GCM_FUNCREF_4BIT
1475	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
1476	# ifdef GHASH
1477	void (gcm_ghash_p) (u64 Xi[`2`], const* u128 Htable[`16`],
1478	const u8 *inp, size_t len) = ctx->ghash;
1479	# endif
1480	# endif
1481
1482	mlen += len;
1483	if (mlen > ((U64(`1`) << `36`) - `32`) \|\| (sizeof(len) == `8` && mlen < len))
1484	return -`1`;
1485	ctx->len.u[`1`] = mlen;
1486
1487	mres = ctx->mres;
1488
1489	if (ctx->ares) {
1490	/ First call to encrypt finalizes GHASH(AAD) /
1491	#if defined(GHASH)
1492	if (len == `0`) {
1493	GCM_MUL(ctx);
1494	ctx->ares = `0`;
1495	return `0`;
1496	}
1497	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1498	ctx->Xi.u[`0`] = `0`;
1499	ctx->Xi.u[`1`] = `0`;
1500	mres = sizeof(ctx->Xi);
1501	#else
1502	GCM_MUL(ctx);
1503	#endif
1504	ctx->ares = `0`;
1505	}
1506
1507	if (is_endian.little)
1508	# ifdef BSWAP4
1509	ctr = BSWAP4(ctx->Yi.d[`3`]);
1510	# else
1511	ctr = GETU32(ctx->Yi.c + `12`);
1512	# endif
1513	else
1514	ctr = ctx->Yi.d[`3`];
1515
1516	n = mres % `16`;
1517	if (n) {
1518	# if defined(GHASH)
1519	while (n && len) {
1520	ctx->Xn[mres++] = (out++) = (in++) ^ ctx->EKi.c[n];
1521	--len;
1522	n = (n + `1`) % `16`;
1523	}
1524	if (n == `0`) {
1525	GHASH(ctx, ctx->Xn, mres);
1526	mres = `0`;
1527	} else {
1528	ctx->mres = mres;
1529	return `0`;
1530	}
1531	# else
1532	while (n && len) {
1533	ctx->Xi.c[n] ^= (out++) = (in++) ^ ctx->EKi.c[n];
1534	--len;
1535	n = (n + `1`) % `16`;
1536	}
1537	if (n == `0`) {
1538	GCM_MUL(ctx);
1539	mres = `0`;
1540	} else {
1541	ctx->mres = n;
1542	return `0`;
1543	}
1544	# endif
1545	}
1546	# if defined(GHASH)
1547	if (len >= `16` && mres) {
1548	GHASH(ctx, ctx->Xn, mres);
1549	mres = `0`;
1550	}
1551	# if defined(GHASH_CHUNK)
1552	while (len >= GHASH_CHUNK) {
1553	(*stream) (in, out, GHASH_CHUNK / `16`, key, ctx->Yi.c);
1554	ctr += GHASH_CHUNK / `16`;
1555	if (is_endian.little)
1556	# ifdef BSWAP4
1557	ctx->Yi.d[`3`] = BSWAP4(ctr);
1558	# else
1559	PUTU32(ctx->Yi.c + `12`, ctr);
1560	# endif
1561	else
1562	ctx->Yi.d[`3`] = ctr;
1563	GHASH(ctx, out, GHASH_CHUNK);
1564	out += GHASH_CHUNK;
1565	in += GHASH_CHUNK;
1566	len -= GHASH_CHUNK;
1567	}
1568	# endif
1569	# endif
1570	if ((i = (len & (size_t)-`16`))) {
1571	size_t j = i / `16`;
1572
1573	(*stream) (in, out, j, key, ctx->Yi.c);
1574	ctr += (unsigned int)j;
1575	if (is_endian.little)
1576	# ifdef BSWAP4
1577	ctx->Yi.d[`3`] = BSWAP4(ctr);
1578	# else
1579	PUTU32(ctx->Yi.c + `12`, ctr);
1580	# endif
1581	else
1582	ctx->Yi.d[`3`] = ctr;
1583	in += i;
1584	len -= i;
1585	# if defined(GHASH)
1586	GHASH(ctx, out, i);
1587	out += i;
1588	# else
1589	while (j--) {
1590	for (i = `0`; i < `16`; ++i)
1591	ctx->Xi.c[i] ^= out[i];
1592	GCM_MUL(ctx);
1593	out += `16`;
1594	}
1595	# endif
1596	}
1597	if (len) {
1598	(*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1599	++ctr;
1600	if (is_endian.little)
1601	# ifdef BSWAP4
1602	ctx->Yi.d[`3`] = BSWAP4(ctr);
1603	# else
1604	PUTU32(ctx->Yi.c + `12`, ctr);
1605	# endif
1606	else
1607	ctx->Yi.d[`3`] = ctr;
1608	while (len--) {
1609	# if defined(GHASH)
1610	ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1611	# else
1612	ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1613	# endif
1614	++n;
1615	}
1616	}
1617
1618	ctx->mres = mres;
1619	return `0`;
1620	#endif
1621	}
1622
1623	int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1624	const unsigned char in, unsigned* char *out,
1625	size_t len, ctr128_f stream)
1626	{
1627	#if defined(OPENSSL_SMALL_FOOTPRINT)
1628	return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1629	#else
1630	const union {
1631	long one;
1632	char little;
1633	} is_endian = { `1` };
1634	unsigned int n, ctr, mres;
1635	size_t i;
1636	u64 mlen = ctx->len.u[`1`];
1637	void *key = ctx->key;
1638	# ifdef GCM_FUNCREF_4BIT
1639	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
1640	# ifdef GHASH
1641	void (gcm_ghash_p) (u64 Xi[`2`], const* u128 Htable[`16`],
1642	const u8 *inp, size_t len) = ctx->ghash;
1643	# endif
1644	# endif
1645
1646	mlen += len;
1647	if (mlen > ((U64(`1`) << `36`) - `32`) \|\| (sizeof(len) == `8` && mlen < len))
1648	return -`1`;
1649	ctx->len.u[`1`] = mlen;
1650
1651	mres = ctx->mres;
1652
1653	if (ctx->ares) {
1654	/ First call to decrypt finalizes GHASH(AAD) /
1655	# if defined(GHASH)
1656	if (len == `0`) {
1657	GCM_MUL(ctx);
1658	ctx->ares = `0`;
1659	return `0`;
1660	}
1661	memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1662	ctx->Xi.u[`0`] = `0`;
1663	ctx->Xi.u[`1`] = `0`;
1664	mres = sizeof(ctx->Xi);
1665	# else
1666	GCM_MUL(ctx);
1667	# endif
1668	ctx->ares = `0`;
1669	}
1670
1671	if (is_endian.little)
1672	# ifdef BSWAP4
1673	ctr = BSWAP4(ctx->Yi.d[`3`]);
1674	# else
1675	ctr = GETU32(ctx->Yi.c + `12`);
1676	# endif
1677	else
1678	ctr = ctx->Yi.d[`3`];
1679
1680	n = mres % `16`;
1681	if (n) {
1682	# if defined(GHASH)
1683	while (n && len) {
1684	(out++) = (ctx->Xn[mres++] = (in++)) ^ ctx->EKi.c[n];
1685	--len;
1686	n = (n + `1`) % `16`;
1687	}
1688	if (n == `0`) {
1689	GHASH(ctx, ctx->Xn, mres);
1690	mres = `0`;
1691	} else {
1692	ctx->mres = mres;
1693	return `0`;
1694	}
1695	# else
1696	while (n && len) {
1697	u8 c = *(in++);
1698	*(out++) = c ^ ctx->EKi.c[n];
1699	ctx->Xi.c[n] ^= c;
1700	--len;
1701	n = (n + `1`) % `16`;
1702	}
1703	if (n == `0`) {
1704	GCM_MUL(ctx);
1705	mres = `0`;
1706	} else {
1707	ctx->mres = n;
1708	return `0`;
1709	}
1710	# endif
1711	}
1712	# if defined(GHASH)
1713	if (len >= `16` && mres) {
1714	GHASH(ctx, ctx->Xn, mres);
1715	mres = `0`;
1716	}
1717	# if defined(GHASH_CHUNK)
1718	while (len >= GHASH_CHUNK) {
1719	GHASH(ctx, in, GHASH_CHUNK);
1720	(*stream) (in, out, GHASH_CHUNK / `16`, key, ctx->Yi.c);
1721	ctr += GHASH_CHUNK / `16`;
1722	if (is_endian.little)
1723	# ifdef BSWAP4
1724	ctx->Yi.d[`3`] = BSWAP4(ctr);
1725	# else
1726	PUTU32(ctx->Yi.c + `12`, ctr);
1727	# endif
1728	else
1729	ctx->Yi.d[`3`] = ctr;
1730	out += GHASH_CHUNK;
1731	in += GHASH_CHUNK;
1732	len -= GHASH_CHUNK;
1733	}
1734	# endif
1735	# endif
1736	if ((i = (len & (size_t)-`16`))) {
1737	size_t j = i / `16`;
1738
1739	# if defined(GHASH)
1740	GHASH(ctx, in, i);
1741	# else
1742	while (j--) {
1743	size_t k;
1744	for (k = `0`; k < `16`; ++k)
1745	ctx->Xi.c[k] ^= in[k];
1746	GCM_MUL(ctx);
1747	in += `16`;
1748	}
1749	j = i / `16`;
1750	in -= i;
1751	# endif
1752	(*stream) (in, out, j, key, ctx->Yi.c);
1753	ctr += (unsigned int)j;
1754	if (is_endian.little)
1755	# ifdef BSWAP4
1756	ctx->Yi.d[`3`] = BSWAP4(ctr);
1757	# else
1758	PUTU32(ctx->Yi.c + `12`, ctr);
1759	# endif
1760	else
1761	ctx->Yi.d[`3`] = ctr;
1762	out += i;
1763	in += i;
1764	len -= i;
1765	}
1766	if (len) {
1767	(*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1768	++ctr;
1769	if (is_endian.little)
1770	# ifdef BSWAP4
1771	ctx->Yi.d[`3`] = BSWAP4(ctr);
1772	# else
1773	PUTU32(ctx->Yi.c + `12`, ctr);
1774	# endif
1775	else
1776	ctx->Yi.d[`3`] = ctr;
1777	while (len--) {
1778	# if defined(GHASH)
1779	out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1780	# else
1781	u8 c = in[n];
1782	ctx->Xi.c[mres++] ^= c;
1783	out[n] = c ^ ctx->EKi.c[n];
1784	# endif
1785	++n;
1786	}
1787	}
1788
1789	ctx->mres = mres;
1790	return `0`;
1791	#endif
1792	}
1793
1794	int CRYPTO_gcm128_finish(GCM128_CONTEXT ctx, const* unsigned char *tag,
1795	size_t len)
1796	{
1797	const union {
1798	long one;
1799	char little;
1800	} is_endian = { `1` };
1801	u64 alen = ctx->len.u[`0`] << `3`;
1802	u64 clen = ctx->len.u[`1`] << `3`;
1803	#ifdef GCM_FUNCREF_4BIT
1804	void (gcm_gmult_p) (u64 Xi[`2`], const* u128 Htable[`16`]) = ctx->gmult;
1805	# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1806	void (gcm_ghash_p) (u64 Xi[`2`], const* u128 Htable[`16`],
1807	const u8 *inp, size_t len) = ctx->ghash;
1808	# endif
1809	#endif
1810
1811	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1812	u128 bitlen;
1813	unsigned int mres = ctx->mres;
1814
1815	if (mres) {
1816	unsigned blocks = (mres + `15`) & -`16`;
1817
1818	memset(ctx->Xn + mres, `0`, blocks - mres);
1819	mres = blocks;
1820	if (mres == sizeof(ctx->Xn)) {
1821	GHASH(ctx, ctx->Xn, mres);
1822	mres = `0`;
1823	}
1824	} else if (ctx->ares) {
1825	GCM_MUL(ctx);
1826	}
1827	#else
1828	if (ctx->mres \|\| ctx->ares)
1829	GCM_MUL(ctx);
1830	#endif
1831
1832	if (is_endian.little) {
1833	#ifdef BSWAP8
1834	alen = BSWAP8(alen);
1835	clen = BSWAP8(clen);
1836	#else
1837	u8 *p = ctx->len.c;
1838
1839	ctx->len.u[`0`] = alen;
1840	ctx->len.u[`1`] = clen;
1841
1842	alen = (u64)GETU32(p) << `32` \| GETU32(p + `4`);
1843	clen = (u64)GETU32(p + `8`) << `32` \| GETU32(p + `12`);
1844	#endif
1845	}
1846
1847	#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1848	bitlen.hi = alen;
1849	bitlen.lo = clen;
1850	memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1851	mres += sizeof(bitlen);
1852	GHASH(ctx, ctx->Xn, mres);
1853	#else
1854	ctx->Xi.u[`0`] ^= alen;
1855	ctx->Xi.u[`1`] ^= clen;
1856	GCM_MUL(ctx);
1857	#endif
1858
1859	ctx->Xi.u[`0`] ^= ctx->EK0.u[`0`];
1860	ctx->Xi.u[`1`] ^= ctx->EK0.u[`1`];
1861
1862	if (tag && len <= sizeof(ctx->Xi))
1863	return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1864	else
1865	return -`1`;
1866	}
1867
1868	void CRYPTO_gcm128_tag(GCM128_CONTEXT ctx, unsigned* char *tag, size_t len)
1869	{
1870	CRYPTO_gcm128_finish(ctx, NULL, `0`);
1871	memcpy(tag, ctx->Xi.c,
1872	len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1873	}
1874
1875	GCM128_CONTEXT CRYPTO_gcm128_new(void* *key, block128_f block)
1876	{
1877	GCM128_CONTEXT *ret;
1878
1879	if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1880	CRYPTO_gcm128_init(ret, key, block);
1881
1882	return ret;
1883	}
1884
1885	void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1886	{
1887	OPENSSL_clear_free(ctx, sizeof(*ctx));
1888	}
1889

/* Source: ClickHouse/contrib/openssl/crypto/modes/gcm128.c */