sha1-altivec.c source code [engine/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c]

1	/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2	* All rights reserved.
3	*
4	* This package is an SSL implementation written
5	* by Eric Young (eay@cryptsoft.com).
6	* The implementation was written so as to conform with Netscapes SSL.
7	*
8	* This library is free for commercial and non-commercial use as long as
9	* the following conditions are aheared to. The following conditions
10	* apply to all code found in this distribution, be it the RC4, RSA,
11	* lhash, DES, etc., code; not just the SSL code. The SSL documentation
12	* included with this distribution is covered by the same copyright terms
13	* except that the holder is Tim Hudson (tjh@cryptsoft.com).
14	*
15	* Copyright remains Eric Young's, and as such any Copyright notices in
16	* the code are not to be removed.
17	* If this package is used in a product, Eric Young should be given attribution
18	* as the author of the parts of the library used.
19	* This can be in the form of a textual message at program startup or
20	* in documentation (online or textual) provided with the package.
21	*
22	* Redistribution and use in source and binary forms, with or without
23	* modification, are permitted provided that the following conditions
24	* are met:
25	* 1. Redistributions of source code must retain the copyright
26	* notice, this list of conditions and the following disclaimer.
27	* 2. Redistributions in binary form must reproduce the above copyright
28	* notice, this list of conditions and the following disclaimer in the
29	* documentation and/or other materials provided with the distribution.
30	* 3. All advertising materials mentioning features or use of this software
31	* must display the following acknowledgement:
32	* "This product includes cryptographic software written by
33	* Eric Young (eay@cryptsoft.com)"
34	* The word 'cryptographic' can be left out if the rouines from the library
35	* being used are not cryptographic related :-).
36	* 4. If you include any Windows specific code (or a derivative thereof) from
37	* the apps directory (application code) you must include an acknowledgement:
38	* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39	*
40	* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50	* SUCH DAMAGE.
51	*
52	* The licence and distribution terms for any publically available version or
53	* derivative of this code cannot be changed. i.e. this code cannot simply be
54	* copied and put under another distribution licence
55	* [including the GNU Public Licence.] */
56
57	// Altivec-optimized SHA1 in C. This is tested on ppc64le only.
58	//
59	// References:
60	// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
61	// http://arctic.org/~dean/crypto/sha1.html
62	//
63	// This code used the generic SHA-1 from OpenSSL as a basis and AltiVec
64	// optimisations were added on top.
65
66	#include <openssl/sha.h>
67
68	#if defined(OPENSSL_PPC64LE)
69
70	#include <altivec.h>
71
// Processes |num| consecutive 64-byte blocks starting at |data| and updates
// the five 32-bit words of the hash state in |state| in place. Declared here
// ahead of the definition further down in this file.
void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
73
// Rotates the 32-bit word |a| left by |n| bits and returns the result.
//
// |n| must satisfy 0 < n < 32: with n == 0 the right shift would be by the
// full word width, which is undefined in C. The round macros in this file
// only ever pass 5 and 30.
static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); }
75
76	typedef vector unsigned int vec_uint32_t;
77	typedef vector unsigned char vec_uint8_t;
78
79	// Vector constants
80	static const vec_uint8_t k_swap_endianness = {`3`, `2`, `1`, `0`, `7`, `6`, `5`, `4`,
81	`11`, `10`, `9`, `8`, `15`, `14`, `13`, `12`};
82
83	// Shift amounts for byte and bit shifts and rotations
84	static const vec_uint8_t k_4_bytes = {`32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`,
85	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`};
86	static const vec_uint8_t k_12_bytes = {`96`, `96`, `96`, `96`, `96`, `96`, `96`, `96`,
87	`96`, `96`, `96`, `96`, `96`, `96`, `96`, `96`};
88
89	#define K_00_19 0x5a827999UL
90	#define K_20_39 0x6ed9eba1UL
91	#define K_40_59 0x8f1bbcdcUL
92	#define K_60_79 0xca62c1d6UL
93
94	// Vector versions of the above.
95	static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
96	static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
97	static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
98	static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};
99
100	// vector message scheduling: compute message schedule for round i..i+3 where i
101	// is divisible by 4. We return the schedule w[i..i+3] as a vector. In
102	// addition, we also precompute sum w[i..+3] and an additive constant K. This
103	// is done to offload some computation of f() in the integer execution units.
104	//
105	// Byte shifting code below may not be correct for big-endian systems.
106	static vec_uint32_t sched_00_15(vec_uint32_t pre_added, const* void *data,
107	vec_uint32_t k) {
108	const vector unsigned char unaligned_data =
109	vec_vsx_ld(`0`, (const unsigned char*) data);
110	const vec_uint32_t v = (vec_uint32_t) unaligned_data;
111	const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
112	vec_st(w + k, `0`, pre_added);
113	return w;
114	}
115
116	// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]
117	//
118	// w'[i ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
119	// w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
120	// w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
121	// w'[i+3] = ( 0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
122	//
123	// w[ i] = w'[ i]
124	// w[i+1] = w'[i+1]
125	// w[i+2] = w'[i+2]
126	// w[i+3] = w'[i+3] ^ (w'[i] <<< 1)
127	static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
128	vec_uint32_t minus_8, vec_uint32_t minus_12,
129	vec_uint32_t minus_16, vec_uint32_t k) {
130	const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
131	const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), `8`);
132	const vec_uint32_t k_1_bit = vec_splat_u32(`1`);
133	const vec_uint32_t w_prime =
134	vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
135	const vec_uint32_t w =
136	w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
137	vec_st(w + k, `0`, pre_added);
138	return w;
139	}
140
141	// Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76]
142	// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]), 2) <<< 2
143	static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
144	vec_uint32_t minus_8, vec_uint32_t minus_16,
145	vec_uint32_t minus_28, vec_uint32_t minus_32,
146	vec_uint32_t k) {
147	const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, `8`);
148	const vec_uint32_t k_2_bits = vec_splat_u32(`2`);
149	const vec_uint32_t w =
150	vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
151	vec_st(w + k, `0`, pre_added);
152	return w;
153	}
154
// Round functions, one per group of twenty rounds.
//
// As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel.
//
//   #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
//
// I've just become aware of another tweak to be made, again from Wei Dai, in
// F_40_59, (x&a)|(y&a) -> (x|y)&a

// Bitwise select: each result bit is taken from |c| where |b| is 1 and from
// |d| where |b| is 0.
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
// Bitwise parity of the three inputs.
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
// Bitwise majority of the three inputs.
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
// Rounds 60..79 reuse the parity function.
#define F_60_79(b, c, d) F_20_39(b, c, d)
164
// Round bodies, one per group of twenty rounds. Each one writes the new
// working value to |f|: the sum of the schedule entry w[i], |e|, |a| rotated
// left by 5, and the group's round function applied to (|b|, |c|, |d|). It
// then rotates |b| left by 30 in place.
//
// No K constant appears here because message scheduling already added it
// into w[i].
#define BODY_00_19(i, a, b, c, d, e, f)                            \
  do {                                                             \
    (f) = (e) + w[i] + F_00_19((b), (c), (d)) + rotate((a), 5);    \
    (b) = rotate((b), 30);                                         \
  } while (0)

#define BODY_20_39(i, a, b, c, d, e, f)                            \
  do {                                                             \
    (f) = (e) + w[i] + F_20_39((b), (c), (d)) + rotate((a), 5);    \
    (b) = rotate((b), 30);                                         \
  } while (0)

#define BODY_40_59(i, a, b, c, d, e, f)                            \
  do {                                                             \
    (f) = (e) + w[i] + F_40_59((b), (c), (d)) + rotate((a), 5);    \
    (b) = rotate((b), 30);                                         \
  } while (0)

#define BODY_60_79(i, a, b, c, d, e, f)                            \
  do {                                                             \
    (f) = (e) + w[i] + F_60_79((b), (c), (d)) + rotate((a), 5);    \
    (b) = rotate((b), 30);                                         \
  } while (0)
189
190	void sha1_block_data_order(uint32_t state, const* uint8_t *data, size_t num) {
191	uint32_t A, B, C, D, E, T;
192
193	A = state[`0`];
194	B = state[`1`];
195	C = state[`2`];
196	D = state[`3`];
197	E = state[`4`];
198
199	for (;;) {
200	vec_uint32_t vw[`20`];
201	const uint32_t w = (const* uint32_t *)&vw;
202
203	vec_uint32_t k = K_00_19_x_4;
204	const vec_uint32_t w0 = sched_00_15(vw + `0`, data + `0`, k);
205	BODY_00_19(`0`, A, B, C, D, E, T);
206	BODY_00_19(`1`, T, A, B, C, D, E);
207	BODY_00_19(`2`, E, T, A, B, C, D);
208	BODY_00_19(`3`, D, E, T, A, B, C);
209
210	const vec_uint32_t w4 = sched_00_15(vw + `1`, data + `16`, k);
211	BODY_00_19(`4`, C, D, E, T, A, B);
212	BODY_00_19(`5`, B, C, D, E, T, A);
213	BODY_00_19(`6`, A, B, C, D, E, T);
214	BODY_00_19(`7`, T, A, B, C, D, E);
215
216	const vec_uint32_t w8 = sched_00_15(vw + `2`, data + `32`, k);
217	BODY_00_19(`8`, E, T, A, B, C, D);
218	BODY_00_19(`9`, D, E, T, A, B, C);
219	BODY_00_19(`10`, C, D, E, T, A, B);
220	BODY_00_19(`11`, B, C, D, E, T, A);
221
222	const vec_uint32_t w12 = sched_00_15(vw + `3`, data + `48`, k);
223	BODY_00_19(`12`, A, B, C, D, E, T);
224	BODY_00_19(`13`, T, A, B, C, D, E);
225	BODY_00_19(`14`, E, T, A, B, C, D);
226	BODY_00_19(`15`, D, E, T, A, B, C);
227
228	const vec_uint32_t w16 = sched_16_31(vw + `4`, w12, w8, w4, w0, k);
229	BODY_00_19(`16`, C, D, E, T, A, B);
230	BODY_00_19(`17`, B, C, D, E, T, A);
231	BODY_00_19(`18`, A, B, C, D, E, T);
232	BODY_00_19(`19`, T, A, B, C, D, E);
233
234	k = K_20_39_x_4;
235	const vec_uint32_t w20 = sched_16_31(vw + `5`, w16, w12, w8, w4, k);
236	BODY_20_39(`20`, E, T, A, B, C, D);
237	BODY_20_39(`21`, D, E, T, A, B, C);
238	BODY_20_39(`22`, C, D, E, T, A, B);
239	BODY_20_39(`23`, B, C, D, E, T, A);
240
241	const vec_uint32_t w24 = sched_16_31(vw + `6`, w20, w16, w12, w8, k);
242	BODY_20_39(`24`, A, B, C, D, E, T);
243	BODY_20_39(`25`, T, A, B, C, D, E);
244	BODY_20_39(`26`, E, T, A, B, C, D);
245	BODY_20_39(`27`, D, E, T, A, B, C);
246
247	const vec_uint32_t w28 = sched_16_31(vw + `7`, w24, w20, w16, w12, k);
248	BODY_20_39(`28`, C, D, E, T, A, B);
249	BODY_20_39(`29`, B, C, D, E, T, A);
250	BODY_20_39(`30`, A, B, C, D, E, T);
251	BODY_20_39(`31`, T, A, B, C, D, E);
252
253	const vec_uint32_t w32 = sched_32_79(vw + `8`, w28, w24, w16, w4, w0, k);
254	BODY_20_39(`32`, E, T, A, B, C, D);
255	BODY_20_39(`33`, D, E, T, A, B, C);
256	BODY_20_39(`34`, C, D, E, T, A, B);
257	BODY_20_39(`35`, B, C, D, E, T, A);
258
259	const vec_uint32_t w36 = sched_32_79(vw + `9`, w32, w28, w20, w8, w4, k);
260	BODY_20_39(`36`, A, B, C, D, E, T);
261	BODY_20_39(`37`, T, A, B, C, D, E);
262	BODY_20_39(`38`, E, T, A, B, C, D);
263	BODY_20_39(`39`, D, E, T, A, B, C);
264
265	k = K_40_59_x_4;
266	const vec_uint32_t w40 = sched_32_79(vw + `10`, w36, w32, w24, w12, w8, k);
267	BODY_40_59(`40`, C, D, E, T, A, B);
268	BODY_40_59(`41`, B, C, D, E, T, A);
269	BODY_40_59(`42`, A, B, C, D, E, T);
270	BODY_40_59(`43`, T, A, B, C, D, E);
271
272	const vec_uint32_t w44 = sched_32_79(vw + `11`, w40, w36, w28, w16, w12, k);
273	BODY_40_59(`44`, E, T, A, B, C, D);
274	BODY_40_59(`45`, D, E, T, A, B, C);
275	BODY_40_59(`46`, C, D, E, T, A, B);
276	BODY_40_59(`47`, B, C, D, E, T, A);
277
278	const vec_uint32_t w48 = sched_32_79(vw + `12`, w44, w40, w32, w20, w16, k);
279	BODY_40_59(`48`, A, B, C, D, E, T);
280	BODY_40_59(`49`, T, A, B, C, D, E);
281	BODY_40_59(`50`, E, T, A, B, C, D);
282	BODY_40_59(`51`, D, E, T, A, B, C);
283
284	const vec_uint32_t w52 = sched_32_79(vw + `13`, w48, w44, w36, w24, w20, k);
285	BODY_40_59(`52`, C, D, E, T, A, B);
286	BODY_40_59(`53`, B, C, D, E, T, A);
287	BODY_40_59(`54`, A, B, C, D, E, T);
288	BODY_40_59(`55`, T, A, B, C, D, E);
289
290	const vec_uint32_t w56 = sched_32_79(vw + `14`, w52, w48, w40, w28, w24, k);
291	BODY_40_59(`56`, E, T, A, B, C, D);
292	BODY_40_59(`57`, D, E, T, A, B, C);
293	BODY_40_59(`58`, C, D, E, T, A, B);
294	BODY_40_59(`59`, B, C, D, E, T, A);
295
296	k = K_60_79_x_4;
297	const vec_uint32_t w60 = sched_32_79(vw + `15`, w56, w52, w44, w32, w28, k);
298	BODY_60_79(`60`, A, B, C, D, E, T);
299	BODY_60_79(`61`, T, A, B, C, D, E);
300	BODY_60_79(`62`, E, T, A, B, C, D);
301	BODY_60_79(`63`, D, E, T, A, B, C);
302
303	const vec_uint32_t w64 = sched_32_79(vw + `16`, w60, w56, w48, w36, w32, k);
304	BODY_60_79(`64`, C, D, E, T, A, B);
305	BODY_60_79(`65`, B, C, D, E, T, A);
306	BODY_60_79(`66`, A, B, C, D, E, T);
307	BODY_60_79(`67`, T, A, B, C, D, E);
308
309	const vec_uint32_t w68 = sched_32_79(vw + `17`, w64, w60, w52, w40, w36, k);
310	BODY_60_79(`68`, E, T, A, B, C, D);
311	BODY_60_79(`69`, D, E, T, A, B, C);
312	BODY_60_79(`70`, C, D, E, T, A, B);
313	BODY_60_79(`71`, B, C, D, E, T, A);
314
315	const vec_uint32_t w72 = sched_32_79(vw + `18`, w68, w64, w56, w44, w40, k);
316	BODY_60_79(`72`, A, B, C, D, E, T);
317	BODY_60_79(`73`, T, A, B, C, D, E);
318	BODY_60_79(`74`, E, T, A, B, C, D);
319	BODY_60_79(`75`, D, E, T, A, B, C);
320
321	// We don't use the last value
322	(void)sched_32_79(vw + `19`, w72, w68, w60, w48, w44, k);
323	BODY_60_79(`76`, C, D, E, T, A, B);
324	BODY_60_79(`77`, B, C, D, E, T, A);
325	BODY_60_79(`78`, A, B, C, D, E, T);
326	BODY_60_79(`79`, T, A, B, C, D, E);
327
328	const uint32_t mask = `0xffffffffUL`;
329	state[`0`] = (state[`0`] + E) & mask;
330	state[`1`] = (state[`1`] + T) & mask;
331	state[`2`] = (state[`2`] + A) & mask;
332	state[`3`] = (state[`3`] + B) & mask;
333	state[`4`] = (state[`4`] + C) & mask;
334
335	data += `64`;
336	if (--num == `0`) {
337	break;
338	}
339
340	A = state[`0`];
341	B = state[`1`];
342	C = state[`2`];
343	D = state[`3`];
344	E = state[`4`];
345	}
346	}
347
348	#endif // OPENSSL_PPC64LE
349
// Remove this file's helper macros from the preprocessor namespace: the round
// bodies first, then the round functions, then the round constants.
#undef BODY_00_19
#undef BODY_20_39
#undef BODY_40_59
#undef BODY_60_79

#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79

#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
362

Browse the source code of engine/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c