// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies. Intel makes no representations about the
// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//
// From:
// https://software.intel.com/sites/default/files/m/d/4/1/d/8/UsingIntelAVXToImplementIDCT-r1_5.pdf
// https://software.intel.com/file/29048
//
// Requires SSE2 (the 128-bit integer intrinsics used below are SSE2)
//
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <immintrin.h>

#ifdef _MSC_VER
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif

#define BITS_INV_ACC 4
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); // 1 << (SHIFT_INV_ROW - 1)
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3);   // 1 << (SHIFT_INV_COL - 1)
const short IRND_INV_CORR = IRND_INV_COL - 1;         // correction -1.0 and round
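
// Sanity-check sketch (editorial, not part of the original Intel/jpgd source): assuming
// a C++11 compiler, these compile-time checks document that the rounding constants above
// really are half of the corresponding divisors, i.e. 1 << (SHIFT - 1).
#if __cplusplus >= 201103L
static_assert(IRND_INV_ROW == (1 << (SHIFT_INV_ROW - 1)), "row-pass rounding constant");
static_assert(IRND_INV_COL == (1 << (SHIFT_INV_COL - 1)), "column-pass rounding constant");
#endif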

JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8]) = {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};         // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};         // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // cos * (2<<16) + 0.5
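
// Note (editorial, not in the original source): the four constant vectors above are
// consumed by _mm_mulhi_epi16 in the column pass, which returns the high 16 bits of each
// 16x16-bit product, i.e. (x * c) >> 16. The tangent/cosine values are therefore stored
// scaled by 2^16; tg_3_16 and cos_4_16 are stored with a -1.0 bias (so they fit a signed
// short), which the surrounding adds of the unscaled operand compensate for.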

//-----------------------------------------------------------------------------
// Table for rows 0,4 - constants are multiplied by cos_4_16
// w15 w14 w11 w10 w07 w06 w03 w02
// w29 w28 w25 w24 w21 w20 w17 w16
// w31 w30 w27 w26 w23 w22 w19 w18
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
    16384, 21407, 16384, 8867,
    16384, -8867, 16384, -21407,    // w13 w12 w09 w08
    16384, 8867, -16384, -21407,    // w07 w06 w03 w02
    -16384, 21407, 16384, -8867,    // w15 w14 w11 w10
    22725, 19266, 19266, -4520,     // w21 w20 w17 w16
    12873, -22725, 4520, -12873,    // w29 w28 w25 w24
    12873, 4520, -22725, -12873,    // w23 w22 w19 w18
    4520, 19266, 19266, -22725};    // w31 w30 w27 w26
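
// Note (editorial): in the row pass below, each coefficient table is read as four
// __m128i vectors of eight 16-bit weights and fed to _mm_madd_epi16, which forms 32-bit
// dot products of adjacent coefficient pairs. Rows 0/4 use this table, rows 2/6 use
// shortM128_tab_i_26, and rows 1/7 and 3/5 reuse the same code path with
// shortM128_tab_i_17 and shortM128_tab_i_35 swapped in via the pTab_i_04/pTab_i_26
// pointers.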

// Table for rows 1,7 - constants are multiplied by cos_1_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
    22725, 29692, 22725, 12299,
    22725, -12299, 22725, -29692,   // w13 w12 w09 w08
    22725, 12299, -22725, -29692,   // w07 w06 w03 w02
    -22725, 29692, 22725, -12299,   // w15 w14 w11 w10
    31521, 26722, 26722, -6270,     // w21 w20 w17 w16
    17855, -31521, 6270, -17855,    // w29 w28 w25 w24
    17855, 6270, -31521, -17855,    // w23 w22 w19 w18
    6270, 26722, 26722, -31521};    // w31 w30 w27 w26

// Table for rows 2,6 - constants are multiplied by cos_2_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
    21407, 27969, 21407, 11585,
    21407, -11585, 21407, -27969,   // w13 w12 w09 w08
    21407, 11585, -21407, -27969,   // w07 w06 w03 w02
    -21407, 27969, 21407, -11585,   // w15 w14 w11 w10
    29692, 25172, 25172, -5906,     // w21 w20 w17 w16
    16819, -29692, 5906, -16819,    // w29 w28 w25 w24
    16819, 5906, -29692, -16819,    // w23 w22 w19 w18
    5906, 25172, 25172, -29692};    // w31 w30 w27 w26

// Table for rows 3,5 - constants are multiplied by cos_3_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
    19266, 25172, 19266, 10426,
    19266, -10426, 19266, -25172,   // w13 w12 w09 w08
    19266, 10426, -19266, -25172,   // w07 w06 w03 w02
    -19266, 25172, 19266, -10426,   // w15 w14 w11 w10
    26722, 22654, 22654, -5315,     // w21 w20 w17 w16
    15137, -26722, 5315, -15137,    // w29 w28 w25 w24
    15137, 5315, -26722, -15137,    // w23 w22 w19 w18
    5315, 22654, 22654, -26722};    // w31 w30 w27 w26

JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 };
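
// Note (editorial): the 128s above provide the level shift back to unsigned JPEG
// samples. The IDCT output is centered around zero, so each result row gets +128 added
// before _mm_packus_epi16 saturates it into the 0..255 byte range.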

void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
{
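    // Editorial overview (not in the original source): the transform is a row pass
    // followed by a column pass, in the style of Intel AP-922/AP-945. The row pass
    // computes, for each pair of input rows, four 32-bit dot products per row via
    // PMADDWD against the coefficient tables, rounds, shifts right by SHIFT_INV_ROW
    // (12), and packs back to 16 bits. The column pass is a butterfly on the eight
    // 16-bit row vectors using the tangent/cosine constants, followed by
    // >> SHIFT_INV_COL, a +128 level shift, and a pack to unsigned bytes.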
    __m128i r_xmm0, r_xmm4;
    __m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7;

    // Coefficient tables for the row pass; swapped to the 3,5 / 1,7 tables further down
    short * pTab_i_04 = shortM128_tab_i_04;
    short * pTab_i_26 = shortM128_tab_i_26;

    // Rows 0 and 2
    r_xmm0 = _mm_load_si128((__m128i *) pInput);
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));

    // *** Work on the data in xmm0
    // low shuffle mask = 0xd8 = 11 01 10 00
    // get short 2 and short 0 into ls 32-bits
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);

    // copy short 2 and short 0 to all locations
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);

    // multiply-add those copies against the first 8 table coefficients
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));

    // shuffle mask = 0x55 = 01 01 01 01
    // copy short 3 and short 1 to all locations
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);

    // high shuffle mask = 0xd8 = 11 01 10 00
    // get short 6 and short 4 into bit positions 64-95
    // get short 7 and short 5 into bit positions 96-127
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);

    // multiply-add short 3 and short 1 against table coefficients 16..23
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));

    // shuffle mask = 0xaa = 10 10 10 10
    // copy short 6 and short 4 to all locations
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);

    // shuffle mask = 0xff = 11 11 11 11
    // copy short 7 and short 5 to all locations
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);

    // multiply-add short 6 and short 4 against table coefficients 8..15
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));

    // *** Work on the data in xmm4
    // high shuffle mask = 0xd8 = 11 01 10 00
    // get short 6 and short 4 into bit positions 64-95
    // get short 7 and short 5 into bit positions 96-127
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);

    // add the row rounding constant to the xmm0 dot products (shorts 0 and 2)
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row0 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
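
    // Editorial note: the three blocks below repeat the identical shuffle/PMADDWD/
    // round/shift/pack sequence for the remaining row pairs; only the source rows and
    // the coefficient-table pointers change.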

    // Rows 4 and 6
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));

    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row4 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row6 = _mm_packs_epi32(r_xmm4, r_xmm6);

    // Rows 3 and 1
    pTab_i_04 = shortM128_tab_i_35;
    pTab_i_26 = shortM128_tab_i_17;
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));

    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row3 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row1 = _mm_packs_epi32(r_xmm4, r_xmm6);

    // Rows 5 and 7
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));

    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row5 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row7 = _mm_packs_epi32(r_xmm4, r_xmm6);

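    // Column pass. Editorial note: the eight packed rows are now transformed down the
    // columns with a fixed-point butterfly: the odd rows (1,3,5,7) are rotated via the
    // tangent constants, the even rows (0,2,4,6) via tg_2_16 and plain add/subtract,
    // and cos_4_16 supplies the final pi/4 rotation. Results are rounded with the
    // round_inv_col/round_inv_corr vectors and shifted right by SHIFT_INV_COL.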
    r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
    r_xmm2 = row5;
    r_xmm3 = row3;
    r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);

    r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
    r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
    r_xmm6 = row7;
    r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);

    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm2);
    r_xmm5 = _mm_mulhi_epi16(r_xmm5, row1);
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm3);
    r_xmm7 = row6;

    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
    r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
    r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
    r_xmm1 = r_xmm0;
    r_xmm3 = _mm_mulhi_epi16(r_xmm3, row2);
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
    r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
    r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
    r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
    r_xmm6 = r_xmm5;
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
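
    // Editorial note: at this point the odd-part terms for rows 1,3,5,7 are complete
    // except for the pi/4 rotation; temp3/temp7 below preserve two of them while the
    // even part (rows 0,2,4,6) is computed, and everything is recombined before the
    // final shift.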

    // Intermediate results, needed later
    __m128i temp3, temp7;
    temp7 = r_xmm0;

    r_xmm1 = r_xmm4;
    r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
    r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
    r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);

    // Intermediate results, needed later
    temp3 = r_xmm6;

    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm5);
    r_xmm7 = _mm_adds_epi16(r_xmm7, row2);
    r_xmm3 = _mm_subs_epi16(r_xmm3, row6);
    r_xmm6 = row0;
    r_xmm0 = _mm_mulhi_epi16(r_xmm0, r_xmm1);
    r_xmm5 = row4;
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm6);
    r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);

    r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
    r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));

    r_xmm2 = r_xmm5;
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
    r_xmm1 = r_xmm6;
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
    r_xmm7 = temp7;
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
    r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
    r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
    r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
    r_xmm3 = r_xmm6;
    r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);

    // Store results for row 0
    //_mm_store_si128((__m128i *) pOutput, r_xmm7);
    __m128i r0 = r_xmm7;

    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm7 = r_xmm1;
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);

    // Store results for row 1
    //_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
    __m128i r1 = r_xmm6;

    r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
    r_xmm6 = temp3;
    r_xmm7 = _mm_subs_epi16(r_xmm7, r_xmm0);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);

    // Store results for row 2
    //_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
    __m128i r2 = r_xmm1;

    r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
    r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);

    // Store results for row 7
    //_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
    __m128i r7 = r_xmm5;

    r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
    r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);

    // Store results for row 3
    //_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
    __m128i r3 = r_xmm6;

    r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);

    // Store results for rows 4, 5, and 6
    //_mm_store_si128((__m128i *) (&pOutput[4*8]), r_xmm2);
    //_mm_store_si128((__m128i *) (&pOutput[5*8]), r_xmm7);
    //_mm_store_si128((__m128i *) (&pOutput[6*8]), r_xmm3);

    __m128i r4 = r_xmm2;
    __m128i r5 = r_xmm7;
    __m128i r6 = r_xmm3;

    r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
    r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
    r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
    r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
    r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
    r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
    r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
    r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);

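    // Note (editorial): the casts below assume pOutputUB points at 16-byte-aligned
    // storage, since they perform aligned 128-bit stores of the packed, saturated bytes
    // (two result rows per store).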
    ((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
    ((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
    ((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
    ((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
}
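
// Usage sketch (editorial, not part of the original file; buffer names are illustrative).
// Both buffers must be 16-byte aligned because the function uses aligned SSE2 loads and
// stores:
//
//   JPGD_SIMD_ALIGN(short, coeffs[64]);    // dequantized, de-zigzagged 8x8 DCT block
//   JPGD_SIMD_ALIGN(uint8_t, pixels[64]);  // reconstructed 8x8 block of samples
//   idctSSEShortU8(coeffs, pixels);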