| 1 | #include "Tables.hpp" |
| 2 | |
// Signed modifier table: 8 rows of 4 entries, each row laid out as
// { a, b, -a, -b } with a < b.
// NOTE(review): the (a, b) pairs match the ETC1 intensity modifier table;
// confirm against the code that indexes this table.
const int32_t g_table[8][4] = {
    {  2,   8,  -2,   -8 },
    {  5,  17,  -5,  -17 },
    {  9,  29,  -9,  -29 },
    { 13,  42, -13,  -42 },
    { 18,  60, -18,  -60 },
    { 24,  80, -24,  -80 },
    { 33, 106, -33, -106 },
    { 47, 183, -47, -183 }
};
| 13 | |
// Same layout and values as g_table, with every entry multiplied by 256
// and stored as 64-bit integers.
const int64_t g_table256[8][4] = {
    {  2*256,   8*256,  -2*256,   -8*256 },
    {  5*256,  17*256,  -5*256,  -17*256 },
    {  9*256,  29*256,  -9*256,  -29*256 },
    { 13*256,  42*256, -13*256,  -42*256 },
    { 18*256,  60*256, -18*256,  -60*256 },
    { 24*256,  80*256, -24*256,  -80*256 },
    { 33*256, 106*256, -33*256, -106*256 },
    { 47*256, 183*256, -47*256, -183*256 }
};
| 24 | |
// 4 x 16 table of small ids. Row r only ever contains the two values
// 2*r and 2*r + 1:
//   rows 0 and 2: the first eight columns hold the odd id, the last eight
//                 the even id;
//   rows 1 and 3: the pattern { odd, odd, even, even } repeats four times.
const uint32_t g_id[4][16] = {
    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
    { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
    { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
};
| 31 | |
// Entry i is i * 0x11, i.e. the 4-bit value i repeated in both nibbles of
// a byte (0x0 -> 0x00, 0xA -> 0xAA, 0xF -> 0xFF).
const uint32_t g_avg2[16] = {
    0x00,
    0x11,
    0x22,
    0x33,
    0x44,
    0x55,
    0x66,
    0x77,
    0x88,
    0x99,
    0xAA,
    0xBB,
    0xCC,
    0xDD,
    0xEE,
    0xFF
};
| 50 | |
// 64 precomputed 32-bit words, indexed 0..63. For index i:
//   - bit 31 (0x80000000) is set when i < 32;
//   - bit 23 (0x00800000) is set when (i & 16) == 0;
//   - the low 16 bits are 0xE002 when (i & 15) is one of
//     7, 10, 11, 13, 14, 15, and 0x0402 otherwise.
// Each group of four lines below is one block of 16 consecutive indices.
const uint32_t g_flags[64] = {
    0x80800402, 0x80800402, 0x80800402, 0x80800402,
    0x80800402, 0x80800402, 0x80800402, 0x8080E002,
    0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
    0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,

    0x80000402, 0x80000402, 0x80000402, 0x80000402,
    0x80000402, 0x80000402, 0x80000402, 0x8000E002,
    0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
    0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,

    0x00800402, 0x00800402, 0x00800402, 0x00800402,
    0x00800402, 0x00800402, 0x00800402, 0x0080E002,
    0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
    0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,

    0x00000402, 0x00000402, 0x00000402, 0x00000402,
    0x00000402, 0x00000402, 0x00000402, 0x0000E002,
    0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
    0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
};
| 69 | |
// 16 rows of 8 signed modifiers. In every row columns 0..3 are negative and
// strictly decreasing, and column k + 4 equals -(column k) - 1, so column 3
// is the row minimum and column 7 the row maximum.
// NOTE(review): the values match the ETC2 EAC alpha modifier table; confirm
// against the code that indexes this table.
const int32_t g_alpha[16][8] = {
    { -3, -6,  -9, -15, 2, 5, 8, 14 },
    { -3, -7, -10, -13, 2, 6, 9, 12 },
    { -2, -5,  -8, -13, 1, 4, 7, 12 },
    { -2, -4,  -6, -13, 1, 3, 5, 12 },
    { -3, -6,  -8, -12, 2, 5, 7, 11 },
    { -3, -7,  -9, -11, 2, 6, 8, 10 },
    { -4, -7,  -8, -11, 3, 6, 7, 10 },
    { -3, -5,  -8, -11, 2, 4, 7, 10 },
    { -2, -6,  -8, -10, 1, 5, 7,  9 },
    { -2, -5,  -8, -10, 1, 4, 7,  9 },
    { -2, -4,  -8, -10, 1, 3, 7,  9 },
    { -2, -5,  -7, -10, 1, 4, 6,  9 },
    { -3, -4,  -7, -10, 2, 3, 6,  9 },
    { -1, -2,  -3, -10, 0, 1, 2,  9 },
    { -4, -6,  -8,  -9, 3, 5, 7,  8 },
    { -3, -5,  -7,  -9, 2, 4, 6,  8 }
};
| 88 | |
| 89 | const int32_t g_alphaRange[16] = { |
| 90 | 0x100FF / ( 1 + g_alpha[0][7] - g_alpha[0][3] ), |
| 91 | 0x100FF / ( 1 + g_alpha[1][7] - g_alpha[1][3] ), |
| 92 | 0x100FF / ( 1 + g_alpha[2][7] - g_alpha[2][3] ), |
| 93 | 0x100FF / ( 1 + g_alpha[3][7] - g_alpha[3][3] ), |
| 94 | 0x100FF / ( 1 + g_alpha[4][7] - g_alpha[4][3] ), |
| 95 | 0x100FF / ( 1 + g_alpha[5][7] - g_alpha[5][3] ), |
| 96 | 0x100FF / ( 1 + g_alpha[6][7] - g_alpha[6][3] ), |
| 97 | 0x100FF / ( 1 + g_alpha[7][7] - g_alpha[7][3] ), |
| 98 | 0x100FF / ( 1 + g_alpha[8][7] - g_alpha[8][3] ), |
| 99 | 0x100FF / ( 1 + g_alpha[9][7] - g_alpha[9][3] ), |
| 100 | 0x100FF / ( 1 + g_alpha[10][7] - g_alpha[10][3] ), |
| 101 | 0x100FF / ( 1 + g_alpha[11][7] - g_alpha[11][3] ), |
| 102 | 0x100FF / ( 1 + g_alpha[12][7] - g_alpha[12][3] ), |
| 103 | 0x100FF / ( 1 + g_alpha[13][7] - g_alpha[13][3] ), |
| 104 | 0x100FF / ( 1 + g_alpha[14][7] - g_alpha[14][3] ), |
| 105 | 0x100FF / ( 1 + g_alpha[15][7] - g_alpha[15][3] ), |
| 106 | }; |
| 107 | |
| 108 | #ifdef __SSE4_1__ |
| 109 | const __m128i g_table_SIMD[2] = |
| 110 | { |
| 111 | _mm_setr_epi16( 2, 5, 9, 13, 18, 24, 33, 47), |
| 112 | _mm_setr_epi16( 8, 17, 29, 42, 60, 80, 106, 183) |
| 113 | }; |
| 114 | const __m128i g_table128_SIMD[2] = |
| 115 | { |
| 116 | _mm_setr_epi16( 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128), |
| 117 | _mm_setr_epi16( 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128) |
| 118 | }; |
| 119 | const __m128i g_table256_SIMD[4] = |
| 120 | { |
| 121 | _mm_setr_epi32( 2*256, 5*256, 9*256, 13*256), |
| 122 | _mm_setr_epi32( 8*256, 17*256, 29*256, 42*256), |
| 123 | _mm_setr_epi32( 18*256, 24*256, 33*256, 47*256), |
| 124 | _mm_setr_epi32( 60*256, 80*256, 106*256, 183*256) |
| 125 | }; |
| 126 | |
| 127 | const __m128i g_alpha_SIMD[16] = { |
| 128 | _mm_setr_epi16( g_alpha[ 0][0], g_alpha[ 0][1], g_alpha[ 0][2], g_alpha[ 0][3], g_alpha[ 0][4], g_alpha[ 0][5], g_alpha[ 0][6], g_alpha[ 0][7] ), |
| 129 | _mm_setr_epi16( g_alpha[ 1][0], g_alpha[ 1][1], g_alpha[ 1][2], g_alpha[ 1][3], g_alpha[ 1][4], g_alpha[ 1][5], g_alpha[ 1][6], g_alpha[ 1][7] ), |
| 130 | _mm_setr_epi16( g_alpha[ 2][0], g_alpha[ 2][1], g_alpha[ 2][2], g_alpha[ 2][3], g_alpha[ 2][4], g_alpha[ 2][5], g_alpha[ 2][6], g_alpha[ 2][7] ), |
| 131 | _mm_setr_epi16( g_alpha[ 3][0], g_alpha[ 3][1], g_alpha[ 3][2], g_alpha[ 3][3], g_alpha[ 3][4], g_alpha[ 3][5], g_alpha[ 3][6], g_alpha[ 3][7] ), |
| 132 | _mm_setr_epi16( g_alpha[ 4][0], g_alpha[ 4][1], g_alpha[ 4][2], g_alpha[ 4][3], g_alpha[ 4][4], g_alpha[ 4][5], g_alpha[ 4][6], g_alpha[ 4][7] ), |
| 133 | _mm_setr_epi16( g_alpha[ 5][0], g_alpha[ 5][1], g_alpha[ 5][2], g_alpha[ 5][3], g_alpha[ 5][4], g_alpha[ 5][5], g_alpha[ 5][6], g_alpha[ 5][7] ), |
| 134 | _mm_setr_epi16( g_alpha[ 6][0], g_alpha[ 6][1], g_alpha[ 6][2], g_alpha[ 6][3], g_alpha[ 6][4], g_alpha[ 6][5], g_alpha[ 6][6], g_alpha[ 6][7] ), |
| 135 | _mm_setr_epi16( g_alpha[ 7][0], g_alpha[ 7][1], g_alpha[ 7][2], g_alpha[ 7][3], g_alpha[ 7][4], g_alpha[ 7][5], g_alpha[ 7][6], g_alpha[ 7][7] ), |
| 136 | _mm_setr_epi16( g_alpha[ 8][0], g_alpha[ 8][1], g_alpha[ 8][2], g_alpha[ 8][3], g_alpha[ 8][4], g_alpha[ 8][5], g_alpha[ 8][6], g_alpha[ 8][7] ), |
| 137 | _mm_setr_epi16( g_alpha[ 9][0], g_alpha[ 9][1], g_alpha[ 9][2], g_alpha[ 9][3], g_alpha[ 9][4], g_alpha[ 9][5], g_alpha[ 9][6], g_alpha[ 9][7] ), |
| 138 | _mm_setr_epi16( g_alpha[10][0], g_alpha[10][1], g_alpha[10][2], g_alpha[10][3], g_alpha[10][4], g_alpha[10][5], g_alpha[10][6], g_alpha[10][7] ), |
| 139 | _mm_setr_epi16( g_alpha[11][0], g_alpha[11][1], g_alpha[11][2], g_alpha[11][3], g_alpha[11][4], g_alpha[11][5], g_alpha[11][6], g_alpha[11][7] ), |
| 140 | _mm_setr_epi16( g_alpha[12][0], g_alpha[12][1], g_alpha[12][2], g_alpha[12][3], g_alpha[12][4], g_alpha[12][5], g_alpha[12][6], g_alpha[12][7] ), |
| 141 | _mm_setr_epi16( g_alpha[13][0], g_alpha[13][1], g_alpha[13][2], g_alpha[13][3], g_alpha[13][4], g_alpha[13][5], g_alpha[13][6], g_alpha[13][7] ), |
| 142 | _mm_setr_epi16( g_alpha[14][0], g_alpha[14][1], g_alpha[14][2], g_alpha[14][3], g_alpha[14][4], g_alpha[14][5], g_alpha[14][6], g_alpha[14][7] ), |
| 143 | _mm_setr_epi16( g_alpha[15][0], g_alpha[15][1], g_alpha[15][2], g_alpha[15][3], g_alpha[15][4], g_alpha[15][5], g_alpha[15][6], g_alpha[15][7] ), |
| 144 | }; |
| 145 | |
| 146 | const __m128i g_alphaRange_SIMD = _mm_setr_epi16( |
| 147 | g_alphaRange[0], |
| 148 | g_alphaRange[1], |
| 149 | g_alphaRange[4], |
| 150 | g_alphaRange[5], |
| 151 | g_alphaRange[8], |
| 152 | g_alphaRange[14], |
| 153 | 0, |
| 154 | 0 ); |
| 155 | #endif |
| 156 | |
| 157 | #ifdef __AVX2__ |
| 158 | const __m256i g_alpha_AVX[8] = { |
| 159 | _mm256_setr_epi16( g_alpha[ 0][0], g_alpha[ 1][0], g_alpha[ 2][0], g_alpha[ 3][0], g_alpha[ 4][0], g_alpha[ 5][0], g_alpha[ 6][0], g_alpha[ 7][0], g_alpha[ 8][0], g_alpha[ 9][0], g_alpha[10][0], g_alpha[11][0], g_alpha[12][0], g_alpha[13][0], g_alpha[14][0], g_alpha[15][0] ), |
| 160 | _mm256_setr_epi16( g_alpha[ 0][1], g_alpha[ 1][1], g_alpha[ 2][1], g_alpha[ 3][1], g_alpha[ 4][1], g_alpha[ 5][1], g_alpha[ 6][1], g_alpha[ 7][1], g_alpha[ 8][1], g_alpha[ 9][1], g_alpha[10][1], g_alpha[11][1], g_alpha[12][1], g_alpha[13][1], g_alpha[14][1], g_alpha[15][1] ), |
| 161 | _mm256_setr_epi16( g_alpha[ 0][2], g_alpha[ 1][2], g_alpha[ 2][2], g_alpha[ 3][2], g_alpha[ 4][2], g_alpha[ 5][2], g_alpha[ 6][2], g_alpha[ 7][2], g_alpha[ 8][2], g_alpha[ 9][2], g_alpha[10][2], g_alpha[11][2], g_alpha[12][2], g_alpha[13][2], g_alpha[14][2], g_alpha[15][2] ), |
| 162 | _mm256_setr_epi16( g_alpha[ 0][3], g_alpha[ 1][3], g_alpha[ 2][3], g_alpha[ 3][3], g_alpha[ 4][3], g_alpha[ 5][3], g_alpha[ 6][3], g_alpha[ 7][3], g_alpha[ 8][3], g_alpha[ 9][3], g_alpha[10][3], g_alpha[11][3], g_alpha[12][3], g_alpha[13][3], g_alpha[14][3], g_alpha[15][3] ), |
| 163 | _mm256_setr_epi16( g_alpha[ 0][4], g_alpha[ 1][4], g_alpha[ 2][4], g_alpha[ 3][4], g_alpha[ 4][4], g_alpha[ 5][4], g_alpha[ 6][4], g_alpha[ 7][4], g_alpha[ 8][4], g_alpha[ 9][4], g_alpha[10][4], g_alpha[11][4], g_alpha[12][4], g_alpha[13][4], g_alpha[14][4], g_alpha[15][4] ), |
| 164 | _mm256_setr_epi16( g_alpha[ 0][5], g_alpha[ 1][5], g_alpha[ 2][5], g_alpha[ 3][5], g_alpha[ 4][5], g_alpha[ 5][5], g_alpha[ 6][5], g_alpha[ 7][5], g_alpha[ 8][5], g_alpha[ 9][5], g_alpha[10][5], g_alpha[11][5], g_alpha[12][5], g_alpha[13][5], g_alpha[14][5], g_alpha[15][5] ), |
| 165 | _mm256_setr_epi16( g_alpha[ 0][6], g_alpha[ 1][6], g_alpha[ 2][6], g_alpha[ 3][6], g_alpha[ 4][6], g_alpha[ 5][6], g_alpha[ 6][6], g_alpha[ 7][6], g_alpha[ 8][6], g_alpha[ 9][6], g_alpha[10][6], g_alpha[11][6], g_alpha[12][6], g_alpha[13][6], g_alpha[14][6], g_alpha[15][6] ), |
| 166 | _mm256_setr_epi16( g_alpha[ 0][7], g_alpha[ 1][7], g_alpha[ 2][7], g_alpha[ 3][7], g_alpha[ 4][7], g_alpha[ 5][7], g_alpha[ 6][7], g_alpha[ 7][7], g_alpha[ 8][7], g_alpha[ 9][7], g_alpha[10][7], g_alpha[11][7], g_alpha[12][7], g_alpha[13][7], g_alpha[14][7], g_alpha[15][7] ), |
| 167 | }; |
| 168 | |
| 169 | const __m256i g_alphaRange_AVX = _mm256_setr_epi16( |
| 170 | g_alphaRange[ 0], g_alphaRange[ 1], g_alphaRange[ 2], g_alphaRange[ 3], g_alphaRange[ 4], g_alphaRange[ 5], g_alphaRange[ 6], g_alphaRange[ 7], |
| 171 | g_alphaRange[ 8], g_alphaRange[ 9], g_alphaRange[10], g_alphaRange[11], g_alphaRange[12], g_alphaRange[13], g_alphaRange[14], g_alphaRange[15] |
| 172 | ); |
| 173 | #endif |
| 174 | |
| 175 | #ifdef __ARM_NEON |
| 176 | const int16x8_t g_table128_NEON[2] = |
| 177 | { |
| 178 | { 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128 }, |
| 179 | { 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128 } |
| 180 | }; |
| 181 | |
| 182 | const int32x4_t g_table256_NEON[4] = |
| 183 | { |
| 184 | { 2*256, 5*256, 9*256, 13*256 }, |
| 185 | { 8*256, 17*256, 29*256, 42*256 }, |
| 186 | { 18*256, 24*256, 33*256, 47*256 }, |
| 187 | { 60*256, 80*256, 106*256, 183*256 } |
| 188 | }; |
| 189 | |
| 190 | const int16x8_t g_alpha_NEON[16] = |
| 191 | { |
| 192 | { -3, -6, -9, -15, 2, 5, 8, 14 }, |
| 193 | { -3, -7, -10, -13, 2, 6, 9, 12 }, |
| 194 | { -2, -5, -8, -13, 1, 4, 7, 12 }, |
| 195 | { -2, -4, -6, -13, 1, 3, 5, 12 }, |
| 196 | { -3, -6, -8, -12, 2, 5, 7, 11 }, |
| 197 | { -3, -7, -9, -11, 2, 6, 8, 10 }, |
| 198 | { -4, -7, -8, -11, 3, 6, 7, 10 }, |
| 199 | { -3, -5, -8, -11, 2, 4, 7, 10 }, |
| 200 | { -2, -6, -8, -10, 1, 5, 7, 9 }, |
| 201 | { -2, -5, -8, -10, 1, 4, 7, 9 }, |
| 202 | { -2, -4, -8, -10, 1, 3, 7, 9 }, |
| 203 | { -2, -5, -7, -10, 1, 4, 6, 9 }, |
| 204 | { -3, -4, -7, -10, 2, 3, 6, 9 }, |
| 205 | { -1, -2, -3, -10, 0, 1, 2, 9 }, |
| 206 | { -4, -6, -8, -9, 3, 5, 7, 8 }, |
| 207 | { -3, -5, -7, -9, 2, 4, 6, 8 } |
| 208 | }; |
| 209 | |
| 210 | const int16x8_t g_alphaRange_NEON = |
| 211 | { |
| 212 | (int16_t)g_alphaRange[0], |
| 213 | (int16_t)g_alphaRange[1], |
| 214 | (int16_t)g_alphaRange[4], |
| 215 | (int16_t)g_alphaRange[5], |
| 216 | (int16_t)g_alphaRange[8], |
| 217 | (int16_t)g_alphaRange[14], |
| 218 | 0, |
| 219 | 0 |
| 220 | }; |
| 221 | #endif |
| 222 | |