/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscape's SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are adhered to. The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the routines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publicly available version or
 * derivative of this code cannot be changed. i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.] */

// AltiVec-optimized SHA-1 in C. This is tested on ppc64le only.
//
// References:
// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
// http://arctic.org/~dean/crypto/sha1.html
//
// This code uses the generic SHA-1 implementation from OpenSSL as a basis,
// with AltiVec optimisations layered on top.

#include <openssl/sha.h>

#if defined(OPENSSL_PPC64LE)

#include <altivec.h>

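// sha1_block_data_order processes |num| 64-byte blocks from |data| and updates
// the five 32-bit words of |state|.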
void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);

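// Rotate |a| left by |n| bits, 1 <= n <= 31. (A rotate count of 0 or 32 would
// be undefined behaviour here; the round bodies only use 5 and 30.)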
static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); }

typedef vector unsigned int vec_uint32_t;
typedef vector unsigned char vec_uint8_t;

// Vector constants.
static const vec_uint8_t k_swap_endianness = {3,  2,  1, 0, 7,  6,  5,  4,
                                              11, 10, 9, 8, 15, 14, 13, 12};

// Shift amounts for byte shifts and bit rotations. vec_slo and vec_sro take
// the shift amount as a bit count, of which only whole bytes take effect:
// 32 bits = 4 bytes, 96 bits = 12 bytes.
static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
                                      32, 32, 32, 32, 32, 32, 32, 32};
static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
                                       96, 96, 96, 96, 96, 96, 96, 96};

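// The four SHA-1 round constants, one per 20-round stage.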
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL

// Vector versions of the above.
static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};

// Vector message scheduling: compute the message schedule for rounds i..i+3,
// where i is divisible by 4, and return w[i..i+3] as a vector. In addition,
// precompute the sum of w[i..i+3] and the additive constant K. Doing the
// addition in the vector unit offloads some of the per-round work from the
// integer execution units.
//
// The byte shifting code below may not be correct for big-endian systems.
static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data,
                                vec_uint32_t k) {
  const vector unsigned char unaligned_data =
      vec_vsx_ld(0, (const unsigned char *)data);
  const vec_uint32_t v = (vec_uint32_t)unaligned_data;
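  // Byte-swap each 32-bit lane: the message words are big-endian.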
  const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
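  // Store w[i..i+3] + K; the scalar round bodies consume the pre-added values.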
  vec_st(w + k, 0, pre_added);
  return w;
}

// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]:
//
//   w'[i  ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
//   w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
//   w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
//   w'[i+3] = (     0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
//
//   w[i  ] = w'[i  ]
//   w[i+1] = w'[i+1]
//   w[i+2] = w'[i+2]
//   w[i+3] = w'[i+3] ^ (w'[i] <<< 1)
static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_12,
                                vec_uint32_t minus_16, vec_uint32_t k) {
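  // Each minus_N argument holds w[i-N .. i-N+3]. minus_3 (with a zero in its
  // last lane, matching the w'[i+3] row above) and minus_14 are derived from
  // them below.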
  const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
  const vec_uint32_t minus_14 = vec_sld(minus_12, minus_16, 8);
  const vec_uint32_t k_1_bit = vec_splat_u32(1);
  const vec_uint32_t w_prime =
      vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
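  // Fix up w'[i+3], which is missing its w[i] term: shift w'[i] into the
  // w[i+3] lane, rotate it left by one bit, and XOR it in.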
  const vec_uint32_t w =
      w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
  vec_st(w + k, 0, pre_added);
  return w;
}

// Compute w[i..i+3] for i in [32, 36, 40, ..., 76] using the relation
//
//   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) <<< 2
//
// obtained by applying the standard recurrence to itself (see the references
// above). It removes the dependence on w[i-3..i-1], so all four words can be
// computed with a single set of vector operations.
static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_16,
                                vec_uint32_t minus_28, vec_uint32_t minus_32,
                                vec_uint32_t k) {
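  // minus_6 = { w[i-6], w[i-5], w[i-4], w[i-3] }, spliced from minus_8 and
  // minus_4.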
  const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
  const vec_uint32_t k_2_bits = vec_splat_u32(2);
  const vec_uint32_t w =
      vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
  vec_st(w + k, 0, pre_added);
  return w;
}

// As pointed out by Wei Dai <weidai@eskimo.com>, the round function
//
//   #define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
//
// can be simplified to the form in F_00_19 below. Wei attributes this
// optimisation to Peter Gutmann's SHS code, which in turn credits Rich
// Schroeppel. A further tweak, also from Wei Dai, applies to F_40_59:
// (x & a) | (y & a) can be rewritten as (x | y) & a.
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)

// The additive constants K were pre-added to w[] during message scheduling,
// so the round bodies below do not add them again.
#define BODY_00_19(i, a, b, c, d, e, f)                          \
  do {                                                           \
    (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d));  \
    (b) = rotate((b), 30);                                       \
  } while (0)

#define BODY_20_39(i, a, b, c, d, e, f)                          \
  do {                                                           \
    (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d));  \
    (b) = rotate((b), 30);                                       \
  } while (0)

#define BODY_40_59(i, a, b, c, d, e, f)                          \
  do {                                                           \
    (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d));  \
    (b) = rotate((b), 30);                                       \
  } while (0)

#define BODY_60_79(i, a, b, c, d, e, f)                          \
  do {                                                           \
    (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d));  \
    (b) = rotate((b), 30);                                       \
  } while (0)

void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
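  // The scalar rounds use six rotating variables. Each BODY_* invocation
  // writes its result into the name currently playing the role of 'f', and
  // successive calls permute the argument names rather than moving values
  // between registers.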
  uint32_t A, B, C, D, E, T;

  A = state[0];
  B = state[1];
  C = state[2];
  D = state[3];
  E = state[4];

  for (;;) {
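    // vw holds the 80 pre-added schedule words w[i] + K; w aliases it so the
    // round bodies can read them as scalars.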
    vec_uint32_t vw[20];
    const uint32_t *w = (const uint32_t *)&vw;

    vec_uint32_t k = K_00_19_x_4;
    const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k);
    BODY_00_19(0, A, B, C, D, E, T);
    BODY_00_19(1, T, A, B, C, D, E);
    BODY_00_19(2, E, T, A, B, C, D);
    BODY_00_19(3, D, E, T, A, B, C);

    const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k);
    BODY_00_19(4, C, D, E, T, A, B);
    BODY_00_19(5, B, C, D, E, T, A);
    BODY_00_19(6, A, B, C, D, E, T);
    BODY_00_19(7, T, A, B, C, D, E);

    const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k);
    BODY_00_19(8, E, T, A, B, C, D);
    BODY_00_19(9, D, E, T, A, B, C);
    BODY_00_19(10, C, D, E, T, A, B);
    BODY_00_19(11, B, C, D, E, T, A);

    const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k);
    BODY_00_19(12, A, B, C, D, E, T);
    BODY_00_19(13, T, A, B, C, D, E);
    BODY_00_19(14, E, T, A, B, C, D);
    BODY_00_19(15, D, E, T, A, B, C);

    const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k);
    BODY_00_19(16, C, D, E, T, A, B);
    BODY_00_19(17, B, C, D, E, T, A);
    BODY_00_19(18, A, B, C, D, E, T);
    BODY_00_19(19, T, A, B, C, D, E);

    k = K_20_39_x_4;
    const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k);
    BODY_20_39(20, E, T, A, B, C, D);
    BODY_20_39(21, D, E, T, A, B, C);
    BODY_20_39(22, C, D, E, T, A, B);
    BODY_20_39(23, B, C, D, E, T, A);

    const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k);
    BODY_20_39(24, A, B, C, D, E, T);
    BODY_20_39(25, T, A, B, C, D, E);
    BODY_20_39(26, E, T, A, B, C, D);
    BODY_20_39(27, D, E, T, A, B, C);

    const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k);
    BODY_20_39(28, C, D, E, T, A, B);
    BODY_20_39(29, B, C, D, E, T, A);
    BODY_20_39(30, A, B, C, D, E, T);
    BODY_20_39(31, T, A, B, C, D, E);

    const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k);
    BODY_20_39(32, E, T, A, B, C, D);
    BODY_20_39(33, D, E, T, A, B, C);
    BODY_20_39(34, C, D, E, T, A, B);
    BODY_20_39(35, B, C, D, E, T, A);

    const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k);
    BODY_20_39(36, A, B, C, D, E, T);
    BODY_20_39(37, T, A, B, C, D, E);
    BODY_20_39(38, E, T, A, B, C, D);
    BODY_20_39(39, D, E, T, A, B, C);

    k = K_40_59_x_4;
    const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k);
    BODY_40_59(40, C, D, E, T, A, B);
    BODY_40_59(41, B, C, D, E, T, A);
    BODY_40_59(42, A, B, C, D, E, T);
    BODY_40_59(43, T, A, B, C, D, E);

    const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k);
    BODY_40_59(44, E, T, A, B, C, D);
    BODY_40_59(45, D, E, T, A, B, C);
    BODY_40_59(46, C, D, E, T, A, B);
    BODY_40_59(47, B, C, D, E, T, A);

    const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k);
    BODY_40_59(48, A, B, C, D, E, T);
    BODY_40_59(49, T, A, B, C, D, E);
    BODY_40_59(50, E, T, A, B, C, D);
    BODY_40_59(51, D, E, T, A, B, C);

    const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k);
    BODY_40_59(52, C, D, E, T, A, B);
    BODY_40_59(53, B, C, D, E, T, A);
    BODY_40_59(54, A, B, C, D, E, T);
    BODY_40_59(55, T, A, B, C, D, E);

    const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k);
    BODY_40_59(56, E, T, A, B, C, D);
    BODY_40_59(57, D, E, T, A, B, C);
    BODY_40_59(58, C, D, E, T, A, B);
    BODY_40_59(59, B, C, D, E, T, A);

    k = K_60_79_x_4;
    const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k);
    BODY_60_79(60, A, B, C, D, E, T);
    BODY_60_79(61, T, A, B, C, D, E);
    BODY_60_79(62, E, T, A, B, C, D);
    BODY_60_79(63, D, E, T, A, B, C);

    const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k);
    BODY_60_79(64, C, D, E, T, A, B);
    BODY_60_79(65, B, C, D, E, T, A);
    BODY_60_79(66, A, B, C, D, E, T);
    BODY_60_79(67, T, A, B, C, D, E);

    const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k);
    BODY_60_79(68, E, T, A, B, C, D);
    BODY_60_79(69, D, E, T, A, B, C);
    BODY_60_79(70, C, D, E, T, A, B);
    BODY_60_79(71, B, C, D, E, T, A);

    const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k);
    BODY_60_79(72, A, B, C, D, E, T);
    BODY_60_79(73, T, A, B, C, D, E);
    BODY_60_79(74, E, T, A, B, C, D);
    BODY_60_79(75, D, E, T, A, B, C);

    // Only the pre-added words stored in vw[19] are used below (as w[76..79]);
    // the returned schedule vector itself has no further consumers.
    (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k);
    BODY_60_79(76, C, D, E, T, A, B);
    BODY_60_79(77, B, C, D, E, T, A);
    BODY_60_79(78, A, B, C, D, E, T);
    BODY_60_79(79, T, A, B, C, D, E);

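    // After 80 rounds the rotating names have shifted: E, T, A, B and C now
    // hold the values conventionally called a, b, c, d and e. The mask is a
    // no-op for 32-bit words; it is kept from the width-agnostic generic
    // implementation this file is based on.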
    const uint32_t mask = 0xffffffffUL;
    state[0] = (state[0] + E) & mask;
    state[1] = (state[1] + T) & mask;
    state[2] = (state[2] + A) & mask;
    state[3] = (state[3] + B) & mask;
    state[4] = (state[4] + C) & mask;

    data += 64;
    if (--num == 0) {
      break;
    }

    A = state[0];
    B = state[1];
    C = state[2];
    D = state[3];
    E = state[4];
  }
}

#endif  // OPENSSL_PPC64LE

#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_19
#undef BODY_20_39
#undef BODY_40_59
#undef BODY_60_79
