| 1 | // The input consists of six character sets in the Base64 alphabet, which we |
| 2 | // need to map back to the 6-bit values they represent. There are three ranges, |
| 3 | // two singles, and then there's the rest. |
| 4 | // |
| 5 | // # From To Add Characters |
| 6 | // 1 [43] [62] +19 + |
| 7 | // 2 [47] [63] +16 / |
| 8 | // 3 [48..57] [52..61] +4 0..9 |
| 9 | // 4 [65..90] [0..25] -65 A..Z |
| 10 | // 5 [97..122] [26..51] -71 a..z |
| 11 | // (6) Everything else => invalid input |
| 12 | // |
| 13 | // We will use lookup tables for character validation and offset computation. |
// Remember that 0x2X and 0x0X select the same index in _mm_shuffle_epi8: only
// bit 7 and the low four bits of each control byte are used. This allows
// masking with 0x2F instead of 0x0F, which saves one constant declaration
// (register and/or memory access).
| 17 | // |
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
// The index is the high nibble of the character, except for '/' (0x2F): adding
// 0xFF is a byte-wise -1, which moves '/' from index 0010 to index 0001 so
// that it does not share an entry with '+' (0x2B).
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// 1000 and above = garbage
| 29 | // |
| 30 | // For validation, here's the table. |
| 31 | // A character is valid if and only if the AND of the 2 lookups equals 0: |
| 32 | // |
// hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
//        LUT           0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
//
// 0000   0x10   char    NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
//               andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001   0x10   char    DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
//               andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010   0x01   char     SP    !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
//               andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
//
// 0011   0x02   char      0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
//               andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100   0x04   char      @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
//               andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101   0x08   char      P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
//               andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 0110   0x04   char      `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
//               andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111   0x08   char      p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL
//               andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 1000   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111   0x10   andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

| 67 | |
| 68 | static inline int |
| 69 | dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds) |
| 70 | { |
| 71 | const __m128i lut_lo = _mm_setr_epi8( |
| 72 | 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, |
| 73 | 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); |
| 74 | |
| 75 | const __m128i lut_hi = _mm_setr_epi8( |
| 76 | 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, |
| 77 | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); |
| 78 | |
| 79 | const __m128i lut_roll = _mm_setr_epi8( |
| 80 | 0, 16, 19, 4, -65, -65, -71, -71, |
| 81 | 0, 0, 0, 0, 0, 0, 0, 0); |
| 82 | |
| 83 | const __m128i mask_2F = _mm_set1_epi8(0x2F); |
| 84 | |
| 85 | // Load input: |
| 86 | __m128i str = _mm_loadu_si128((__m128i *) *s); |
| 87 | |
| 88 | // Table lookups: |
| 89 | const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F); |
| 90 | const __m128i lo_nibbles = _mm_and_si128(str, mask_2F); |
| 91 | const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles); |
| 92 | const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles); |
| 93 | |
| 94 | // Check for invalid input: if any "and" values from lo and hi are not |
| 95 | // zero, fall back on bytewise code to do error checking and reporting: |
| 96 | if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) { |
| 97 | return 0; |
| 98 | } |
| 99 | |
| 100 | const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F); |
| 101 | const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles)); |
| 102 | |
| 103 | // Now simply add the delta values to the input: |
| 104 | str = _mm_add_epi8(str, roll); |
| 105 | |
| 106 | // Reshuffle the input to packed 12-byte output format: |
| 107 | str = dec_reshuffle(str); |
| 108 | |
| 109 | // Store the output: |
| 110 | _mm_storeu_si128((__m128i *) *o, str); |
| 111 | |
| 112 | *s += 16; |
| 113 | *o += 12; |
| 114 | *rounds -= 1; |
| 115 | |
| 116 | return 1; |
| 117 | } |
| 118 | |
// Drive the SSSE3 decoder over as many 16-byte input groups as possible.
//
//   s, o   in/out: input and output cursors, advanced by the per-round
//          decoder for every group that decodes successfully.
//   slen   in/out: number of input bytes left; reduced by 16 per decoded group.
//   olen   in/out: number of output bytes produced; raised by 12 per decoded
//          group.
//
// Returns immediately, without touching anything, when fewer than 24 input
// bytes are available. Stops early as soon as a group is rejected by the
// per-round decoder; the rejected group and everything after it is left
// unconsumed for the caller.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	enum {
		IN_PER_ROUND  = 16,	// input bytes consumed per round
		OUT_PER_ROUND = 12,	// output bytes produced per round
		IN_RESERVE    = 8	// input bytes that must be left untouched
	};

	// At least one full round plus the reserve is required:
	if (*slen < IN_PER_ROUND + IN_RESERVE) {
		return;
	}

	// Each round stores a full 16-byte vector while producing only 12
	// bytes, so 4 extra bytes land after the output. Holding back 8 input
	// bytes (6 data bytes and up to two end-of-string markers) ensures that
	// enough data is left to cover that gap.
	size_t todo = (*slen - IN_RESERVE) / IN_PER_ROUND;

	// Book all planned rounds up front; rounds that end up not being
	// executed are refunded at the bottom.
	*slen -= todo * IN_PER_ROUND;
	*olen += todo * OUT_PER_ROUND;

	for (;;) {
		int ok;

		// Run the largest unrolled batch that still fits. Each chain
		// short-circuits at the first group that fails validation.
		if (todo >= 8) {
			ok = dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo);
		}
		else if (todo >= 4) {
			ok = dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo);
		}
		else if (todo >= 2) {
			ok = dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo);
		}
		else {
			// Last single round. Its result does not matter: the
			// loop ends here either way.
			dec_loop_ssse3_inner(s, o, &todo);
			break;
		}

		// Stop on rejected input or when every round has been done:
		if (!ok || todo == 0) {
			break;
		}
	}

	// Refund the rounds that were booked but never executed:
	*slen += todo * IN_PER_ROUND;
	*olen -= todo * OUT_PER_ROUND;
}
| 174 | |