| 1 | /* | 
|---|
| 2 | * The copyright in this software is being made available under the 2-clauses | 
|---|
| 3 | * BSD License, included below. This software may be subject to other third | 
|---|
| 4 | * party and contributor rights, including patent rights, and no such rights | 
|---|
| 5 | * are granted under this license. | 
|---|
| 6 | * | 
|---|
| 7 | * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium | 
|---|
| 8 | * Copyright (c) 2002-2014, Professor Benoit Macq | 
|---|
| 9 | * Copyright (c) 2001-2003, David Janssens | 
|---|
| 10 | * Copyright (c) 2002-2003, Yannick Verschueren | 
|---|
| 11 | * Copyright (c) 2003-2007, Francois-Olivier Devaux | 
|---|
| 12 | * Copyright (c) 2003-2014, Antonin Descampe | 
|---|
| 13 | * Copyright (c) 2005, Herve Drolon, FreeImage Team | 
|---|
| 14 | * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR | 
|---|
| 15 | * Copyright (c) 2012, CS Systemes d'Information, France | 
|---|
| 16 | * All rights reserved. | 
|---|
| 17 | * | 
|---|
| 18 | * Redistribution and use in source and binary forms, with or without | 
|---|
| 19 | * modification, are permitted provided that the following conditions | 
|---|
| 20 | * are met: | 
|---|
| 21 | * 1. Redistributions of source code must retain the above copyright | 
|---|
| 22 | *    notice, this list of conditions and the following disclaimer. | 
|---|
| 23 | * 2. Redistributions in binary form must reproduce the above copyright | 
|---|
| 24 | *    notice, this list of conditions and the following disclaimer in the | 
|---|
| 25 | *    documentation and/or other materials provided with the distribution. | 
|---|
| 26 | * | 
|---|
| 27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' | 
|---|
| 28 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|---|
| 29 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|---|
| 30 | * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|---|
| 31 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|---|
| 32 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|---|
| 33 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|---|
| 34 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|---|
| 35 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|---|
| 36 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|---|
| 37 | * POSSIBILITY OF SUCH DAMAGE. | 
|---|
| 38 | */ | 
|---|
| 39 |  | 
|---|
#include <assert.h>

#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "opj_includes.h"
|---|
| 51 |  | 
|---|
| 52 | /* <summary> */ | 
|---|
| 53 | /* This table contains the norms of the basis function of the reversible MCT. */ | 
|---|
| 54 | /* </summary> */ | 
|---|
| 55 | static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 }; | 
|---|
| 56 |  | 
|---|
| 57 | /* <summary> */ | 
|---|
| 58 | /* This table contains the norms of the basis function of the irreversible MCT. */ | 
|---|
| 59 | /* </summary> */ | 
|---|
| 60 | static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 }; | 
|---|
| 61 |  | 
|---|
| 62 | const OPJ_FLOAT64 * opj_mct_get_mct_norms() | 
|---|
| 63 | { | 
|---|
| 64 | return opj_mct_norms; | 
|---|
| 65 | } | 
|---|
| 66 |  | 
|---|
| 67 | const OPJ_FLOAT64 * opj_mct_get_mct_norms_real() | 
|---|
| 68 | { | 
|---|
| 69 | return opj_mct_norms_real; | 
|---|
| 70 | } | 
|---|
| 71 |  | 
|---|
| 72 | /* <summary> */ | 
|---|
| 73 | /* Forward reversible MCT. */ | 
|---|
| 74 | /* </summary> */ | 
|---|
| 75 | #ifdef __SSE2__ | 
|---|
| 76 | void opj_mct_encode( | 
|---|
| 77 | OPJ_INT32* OPJ_RESTRICT c0, | 
|---|
| 78 | OPJ_INT32* OPJ_RESTRICT c1, | 
|---|
| 79 | OPJ_INT32* OPJ_RESTRICT c2, | 
|---|
| 80 | OPJ_SIZE_T n) | 
|---|
| 81 | { | 
|---|
| 82 | OPJ_SIZE_T i; | 
|---|
| 83 | const OPJ_SIZE_T len = n; | 
|---|
| 84 | /* buffer are aligned on 16 bytes */ | 
|---|
| 85 | assert(((size_t)c0 & 0xf) == 0); | 
|---|
| 86 | assert(((size_t)c1 & 0xf) == 0); | 
|---|
| 87 | assert(((size_t)c2 & 0xf) == 0); | 
|---|
| 88 |  | 
|---|
| 89 | for (i = 0; i < (len & ~3U); i += 4) { | 
|---|
| 90 | __m128i y, u, v; | 
|---|
| 91 | __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); | 
|---|
| 92 | __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); | 
|---|
| 93 | __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); | 
|---|
| 94 | y = _mm_add_epi32(g, g); | 
|---|
| 95 | y = _mm_add_epi32(y, b); | 
|---|
| 96 | y = _mm_add_epi32(y, r); | 
|---|
| 97 | y = _mm_srai_epi32(y, 2); | 
|---|
| 98 | u = _mm_sub_epi32(b, g); | 
|---|
| 99 | v = _mm_sub_epi32(r, g); | 
|---|
| 100 | _mm_store_si128((__m128i *) & (c0[i]), y); | 
|---|
| 101 | _mm_store_si128((__m128i *) & (c1[i]), u); | 
|---|
| 102 | _mm_store_si128((__m128i *) & (c2[i]), v); | 
|---|
| 103 | } | 
|---|
| 104 |  | 
|---|
| 105 | for (; i < len; ++i) { | 
|---|
| 106 | OPJ_INT32 r = c0[i]; | 
|---|
| 107 | OPJ_INT32 g = c1[i]; | 
|---|
| 108 | OPJ_INT32 b = c2[i]; | 
|---|
| 109 | OPJ_INT32 y = (r + (g * 2) + b) >> 2; | 
|---|
| 110 | OPJ_INT32 u = b - g; | 
|---|
| 111 | OPJ_INT32 v = r - g; | 
|---|
| 112 | c0[i] = y; | 
|---|
| 113 | c1[i] = u; | 
|---|
| 114 | c2[i] = v; | 
|---|
| 115 | } | 
|---|
| 116 | } | 
|---|
| 117 | #else | 
|---|
| 118 | void opj_mct_encode( | 
|---|
| 119 | OPJ_INT32* OPJ_RESTRICT c0, | 
|---|
| 120 | OPJ_INT32* OPJ_RESTRICT c1, | 
|---|
| 121 | OPJ_INT32* OPJ_RESTRICT c2, | 
|---|
| 122 | OPJ_SIZE_T n) | 
|---|
| 123 | { | 
|---|
| 124 | OPJ_SIZE_T i; | 
|---|
| 125 | const OPJ_SIZE_T len = n; | 
|---|
| 126 |  | 
|---|
| 127 | for (i = 0; i < len; ++i) { | 
|---|
| 128 | OPJ_INT32 r = c0[i]; | 
|---|
| 129 | OPJ_INT32 g = c1[i]; | 
|---|
| 130 | OPJ_INT32 b = c2[i]; | 
|---|
| 131 | OPJ_INT32 y = (r + (g * 2) + b) >> 2; | 
|---|
| 132 | OPJ_INT32 u = b - g; | 
|---|
| 133 | OPJ_INT32 v = r - g; | 
|---|
| 134 | c0[i] = y; | 
|---|
| 135 | c1[i] = u; | 
|---|
| 136 | c2[i] = v; | 
|---|
| 137 | } | 
|---|
| 138 | } | 
|---|
| 139 | #endif | 
|---|
| 140 |  | 
|---|
| 141 | /* <summary> */ | 
|---|
| 142 | /* Inverse reversible MCT. */ | 
|---|
| 143 | /* </summary> */ | 
|---|
| 144 | #ifdef __SSE2__ | 
|---|
| 145 | void opj_mct_decode( | 
|---|
| 146 | OPJ_INT32* OPJ_RESTRICT c0, | 
|---|
| 147 | OPJ_INT32* OPJ_RESTRICT c1, | 
|---|
| 148 | OPJ_INT32* OPJ_RESTRICT c2, | 
|---|
| 149 | OPJ_SIZE_T n) | 
|---|
| 150 | { | 
|---|
| 151 | OPJ_SIZE_T i; | 
|---|
| 152 | const OPJ_SIZE_T len = n; | 
|---|
| 153 |  | 
|---|
| 154 | for (i = 0; i < (len & ~3U); i += 4) { | 
|---|
| 155 | __m128i r, g, b; | 
|---|
| 156 | __m128i y = _mm_load_si128((const __m128i *) & (c0[i])); | 
|---|
| 157 | __m128i u = _mm_load_si128((const __m128i *) & (c1[i])); | 
|---|
| 158 | __m128i v = _mm_load_si128((const __m128i *) & (c2[i])); | 
|---|
| 159 | g = y; | 
|---|
| 160 | g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); | 
|---|
| 161 | r = _mm_add_epi32(v, g); | 
|---|
| 162 | b = _mm_add_epi32(u, g); | 
|---|
| 163 | _mm_store_si128((__m128i *) & (c0[i]), r); | 
|---|
| 164 | _mm_store_si128((__m128i *) & (c1[i]), g); | 
|---|
| 165 | _mm_store_si128((__m128i *) & (c2[i]), b); | 
|---|
| 166 | } | 
|---|
| 167 | for (; i < len; ++i) { | 
|---|
| 168 | OPJ_INT32 y = c0[i]; | 
|---|
| 169 | OPJ_INT32 u = c1[i]; | 
|---|
| 170 | OPJ_INT32 v = c2[i]; | 
|---|
| 171 | OPJ_INT32 g = y - ((u + v) >> 2); | 
|---|
| 172 | OPJ_INT32 r = v + g; | 
|---|
| 173 | OPJ_INT32 b = u + g; | 
|---|
| 174 | c0[i] = r; | 
|---|
| 175 | c1[i] = g; | 
|---|
| 176 | c2[i] = b; | 
|---|
| 177 | } | 
|---|
| 178 | } | 
|---|
| 179 | #else | 
|---|
| 180 | void opj_mct_decode( | 
|---|
| 181 | OPJ_INT32* OPJ_RESTRICT c0, | 
|---|
| 182 | OPJ_INT32* OPJ_RESTRICT c1, | 
|---|
| 183 | OPJ_INT32* OPJ_RESTRICT c2, | 
|---|
| 184 | OPJ_SIZE_T n) | 
|---|
| 185 | { | 
|---|
| 186 | OPJ_UINT32 i; | 
|---|
| 187 | for (i = 0; i < n; ++i) { | 
|---|
| 188 | OPJ_INT32 y = c0[i]; | 
|---|
| 189 | OPJ_INT32 u = c1[i]; | 
|---|
| 190 | OPJ_INT32 v = c2[i]; | 
|---|
| 191 | OPJ_INT32 g = y - ((u + v) >> 2); | 
|---|
| 192 | OPJ_INT32 r = v + g; | 
|---|
| 193 | OPJ_INT32 b = u + g; | 
|---|
| 194 | c0[i] = r; | 
|---|
| 195 | c1[i] = g; | 
|---|
| 196 | c2[i] = b; | 
|---|
| 197 | } | 
|---|
| 198 | } | 
|---|
| 199 | #endif | 
|---|
| 200 |  | 
|---|
| 201 | /* <summary> */ | 
|---|
| 202 | /* Get norm of basis function of reversible MCT. */ | 
|---|
| 203 | /* </summary> */ | 
|---|
| 204 | OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) | 
|---|
| 205 | { | 
|---|
| 206 | return opj_mct_norms[compno]; | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | /* <summary> */ | 
|---|
| 210 | /* Forward irreversible MCT. */ | 
|---|
| 211 | /* </summary> */ | 
|---|
| 212 | #ifdef __SSE4_1__ | 
|---|
| 213 | void opj_mct_encode_real( | 
|---|
| 214 | OPJ_INT32* OPJ_RESTRICT c0, | 
|---|
| 215 | OPJ_INT32* OPJ_RESTRICT c1, | 
|---|
| 216 | OPJ_INT32* OPJ_RESTRICT c2, | 
|---|
| 217 | OPJ_SIZE_T n) | 
|---|
| 218 | { | 
|---|
| 219 | OPJ_SIZE_T i; | 
|---|
| 220 | const OPJ_SIZE_T len = n; | 
|---|
| 221 |  | 
|---|
| 222 | const __m128i ry = _mm_set1_epi32(2449); | 
|---|
| 223 | const __m128i gy = _mm_set1_epi32(4809); | 
|---|
| 224 | const __m128i by = _mm_set1_epi32(934); | 
|---|
| 225 | const __m128i ru = _mm_set1_epi32(1382); | 
|---|
| 226 | const __m128i gu = _mm_set1_epi32(2714); | 
|---|
| 227 | /* const __m128i bu = _mm_set1_epi32(4096); */ | 
|---|
| 228 | /* const __m128i rv = _mm_set1_epi32(4096); */ | 
|---|
| 229 | const __m128i gv = _mm_set1_epi32(3430); | 
|---|
| 230 | const __m128i bv = _mm_set1_epi32(666); | 
|---|
| 231 | const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), | 
|---|
| 232 | _MM_SHUFFLE(1, 0, 1, 0)); | 
|---|
| 233 |  | 
|---|
| 234 | for (i = 0; i < (len & ~3U); i += 4) { | 
|---|
| 235 | __m128i lo, hi; | 
|---|
| 236 | __m128i y, u, v; | 
|---|
| 237 | __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); | 
|---|
| 238 | __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); | 
|---|
| 239 | __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); | 
|---|
| 240 |  | 
|---|
| 241 | lo = r; | 
|---|
| 242 | hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 243 | lo = _mm_mul_epi32(lo, ry); | 
|---|
| 244 | hi = _mm_mul_epi32(hi, ry); | 
|---|
| 245 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 246 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 247 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 248 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 249 | y = _mm_blend_epi16(lo, hi, 0xCC); | 
|---|
| 250 |  | 
|---|
| 251 | lo = g; | 
|---|
| 252 | hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 253 | lo = _mm_mul_epi32(lo, gy); | 
|---|
| 254 | hi = _mm_mul_epi32(hi, gy); | 
|---|
| 255 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 256 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 257 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 258 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 259 | y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); | 
|---|
| 260 |  | 
|---|
| 261 | lo = b; | 
|---|
| 262 | hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 263 | lo = _mm_mul_epi32(lo, by); | 
|---|
| 264 | hi = _mm_mul_epi32(hi, by); | 
|---|
| 265 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 266 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 267 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 268 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 269 | y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); | 
|---|
| 270 | _mm_store_si128((__m128i *) & (c0[i]), y); | 
|---|
| 271 |  | 
|---|
| 272 | /*lo = b; | 
|---|
| 273 | hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 274 | lo = _mm_mul_epi32(lo, mulround); | 
|---|
| 275 | hi = _mm_mul_epi32(hi, mulround);*/ | 
|---|
| 276 | lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); | 
|---|
| 277 | hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); | 
|---|
| 278 | lo = _mm_slli_epi64(lo, 12); | 
|---|
| 279 | hi = _mm_slli_epi64(hi, 12); | 
|---|
| 280 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 281 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 282 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 283 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 284 | u = _mm_blend_epi16(lo, hi, 0xCC); | 
|---|
| 285 |  | 
|---|
| 286 | lo = r; | 
|---|
| 287 | hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 288 | lo = _mm_mul_epi32(lo, ru); | 
|---|
| 289 | hi = _mm_mul_epi32(hi, ru); | 
|---|
| 290 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 291 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 292 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 293 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 294 | u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); | 
|---|
| 295 |  | 
|---|
| 296 | lo = g; | 
|---|
| 297 | hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 298 | lo = _mm_mul_epi32(lo, gu); | 
|---|
| 299 | hi = _mm_mul_epi32(hi, gu); | 
|---|
| 300 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 301 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 302 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 303 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 304 | u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); | 
|---|
| 305 | _mm_store_si128((__m128i *) & (c1[i]), u); | 
|---|
| 306 |  | 
|---|
| 307 | /*lo = r; | 
|---|
| 308 | hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 309 | lo = _mm_mul_epi32(lo, mulround); | 
|---|
| 310 | hi = _mm_mul_epi32(hi, mulround);*/ | 
|---|
| 311 | lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); | 
|---|
| 312 | hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); | 
|---|
| 313 | lo = _mm_slli_epi64(lo, 12); | 
|---|
| 314 | hi = _mm_slli_epi64(hi, 12); | 
|---|
| 315 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 316 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 317 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 318 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 319 | v = _mm_blend_epi16(lo, hi, 0xCC); | 
|---|
| 320 |  | 
|---|
| 321 | lo = g; | 
|---|
| 322 | hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 323 | lo = _mm_mul_epi32(lo, gv); | 
|---|
| 324 | hi = _mm_mul_epi32(hi, gv); | 
|---|
| 325 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 326 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 327 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 328 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 329 | v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); | 
|---|
| 330 |  | 
|---|
| 331 | lo = b; | 
|---|
| 332 | hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); | 
|---|
| 333 | lo = _mm_mul_epi32(lo, bv); | 
|---|
| 334 | hi = _mm_mul_epi32(hi, bv); | 
|---|
| 335 | lo = _mm_add_epi64(lo, mulround); | 
|---|
| 336 | hi = _mm_add_epi64(hi, mulround); | 
|---|
| 337 | lo = _mm_srli_epi64(lo, 13); | 
|---|
| 338 | hi = _mm_slli_epi64(hi, 32 - 13); | 
|---|
| 339 | v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); | 
|---|
| 340 | _mm_store_si128((__m128i *) & (c2[i]), v); | 
|---|
| 341 | } | 
|---|
| 342 | for (; i < len; ++i) { | 
|---|
| 343 | OPJ_INT32 r = c0[i]; | 
|---|
| 344 | OPJ_INT32 g = c1[i]; | 
|---|
| 345 | OPJ_INT32 b = c2[i]; | 
|---|
| 346 | OPJ_INT32 y =  opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, | 
|---|
| 347 | 4809) + opj_int_fix_mul(b, 934); | 
|---|
| 348 | OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, | 
|---|
| 349 | 2714) + opj_int_fix_mul(b, 4096); | 
|---|
| 350 | OPJ_INT32 v =  opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, | 
|---|
| 351 | 3430) - opj_int_fix_mul(b, 666); | 
|---|
| 352 | c0[i] = y; | 
|---|
| 353 | c1[i] = u; | 
|---|
| 354 | c2[i] = v; | 
|---|
| 355 | } | 
|---|
| 356 | } | 
|---|
| 357 | #else | 
|---|
| 358 | void opj_mct_encode_real( | 
|---|
| 359 | OPJ_INT32* OPJ_RESTRICT c0, | 
|---|
| 360 | OPJ_INT32* OPJ_RESTRICT c1, | 
|---|
| 361 | OPJ_INT32* OPJ_RESTRICT c2, | 
|---|
| 362 | OPJ_SIZE_T n) | 
|---|
| 363 | { | 
|---|
| 364 | OPJ_UINT32 i; | 
|---|
| 365 | for (i = 0; i < n; ++i) { | 
|---|
| 366 | OPJ_INT32 r = c0[i]; | 
|---|
| 367 | OPJ_INT32 g = c1[i]; | 
|---|
| 368 | OPJ_INT32 b = c2[i]; | 
|---|
| 369 | OPJ_INT32 y =  opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, | 
|---|
| 370 | 4809) + opj_int_fix_mul(b, 934); | 
|---|
| 371 | OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, | 
|---|
| 372 | 2714) + opj_int_fix_mul(b, 4096); | 
|---|
| 373 | OPJ_INT32 v =  opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, | 
|---|
| 374 | 3430) - opj_int_fix_mul(b, 666); | 
|---|
| 375 | c0[i] = y; | 
|---|
| 376 | c1[i] = u; | 
|---|
| 377 | c2[i] = v; | 
|---|
| 378 | } | 
|---|
| 379 | } | 
|---|
| 380 | #endif | 
|---|
| 381 |  | 
|---|
| 382 | /* <summary> */ | 
|---|
| 383 | /* Inverse irreversible MCT. */ | 
|---|
| 384 | /* </summary> */ | 
|---|
| 385 | void opj_mct_decode_real( | 
|---|
| 386 | OPJ_FLOAT32* OPJ_RESTRICT c0, | 
|---|
| 387 | OPJ_FLOAT32* OPJ_RESTRICT c1, | 
|---|
| 388 | OPJ_FLOAT32* OPJ_RESTRICT c2, | 
|---|
| 389 | OPJ_SIZE_T n) | 
|---|
| 390 | { | 
|---|
| 391 | OPJ_UINT32 i; | 
|---|
| 392 | #ifdef __SSE__ | 
|---|
| 393 | __m128 vrv, vgu, vgv, vbu; | 
|---|
| 394 | vrv = _mm_set1_ps(1.402f); | 
|---|
| 395 | vgu = _mm_set1_ps(0.34413f); | 
|---|
| 396 | vgv = _mm_set1_ps(0.71414f); | 
|---|
| 397 | vbu = _mm_set1_ps(1.772f); | 
|---|
| 398 | for (i = 0; i < (n >> 3); ++i) { | 
|---|
| 399 | __m128 vy, vu, vv; | 
|---|
| 400 | __m128 vr, vg, vb; | 
|---|
| 401 |  | 
|---|
| 402 | vy = _mm_load_ps(c0); | 
|---|
| 403 | vu = _mm_load_ps(c1); | 
|---|
| 404 | vv = _mm_load_ps(c2); | 
|---|
| 405 | vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); | 
|---|
| 406 | vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); | 
|---|
| 407 | vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); | 
|---|
| 408 | _mm_store_ps(c0, vr); | 
|---|
| 409 | _mm_store_ps(c1, vg); | 
|---|
| 410 | _mm_store_ps(c2, vb); | 
|---|
| 411 | c0 += 4; | 
|---|
| 412 | c1 += 4; | 
|---|
| 413 | c2 += 4; | 
|---|
| 414 |  | 
|---|
| 415 | vy = _mm_load_ps(c0); | 
|---|
| 416 | vu = _mm_load_ps(c1); | 
|---|
| 417 | vv = _mm_load_ps(c2); | 
|---|
| 418 | vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); | 
|---|
| 419 | vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); | 
|---|
| 420 | vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); | 
|---|
| 421 | _mm_store_ps(c0, vr); | 
|---|
| 422 | _mm_store_ps(c1, vg); | 
|---|
| 423 | _mm_store_ps(c2, vb); | 
|---|
| 424 | c0 += 4; | 
|---|
| 425 | c1 += 4; | 
|---|
| 426 | c2 += 4; | 
|---|
| 427 | } | 
|---|
| 428 | n &= 7; | 
|---|
| 429 | #endif | 
|---|
| 430 | for (i = 0; i < n; ++i) { | 
|---|
| 431 | OPJ_FLOAT32 y = c0[i]; | 
|---|
| 432 | OPJ_FLOAT32 u = c1[i]; | 
|---|
| 433 | OPJ_FLOAT32 v = c2[i]; | 
|---|
| 434 | OPJ_FLOAT32 r = y + (v * 1.402f); | 
|---|
| 435 | OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); | 
|---|
| 436 | OPJ_FLOAT32 b = y + (u * 1.772f); | 
|---|
| 437 | c0[i] = r; | 
|---|
| 438 | c1[i] = g; | 
|---|
| 439 | c2[i] = b; | 
|---|
| 440 | } | 
|---|
| 441 | } | 
|---|
| 442 |  | 
|---|
| 443 | /* <summary> */ | 
|---|
| 444 | /* Get norm of basis function of irreversible MCT. */ | 
|---|
| 445 | /* </summary> */ | 
|---|
| 446 | OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno) | 
|---|
| 447 | { | 
|---|
| 448 | return opj_mct_norms_real[compno]; | 
|---|
| 449 | } | 
|---|
| 450 |  | 
|---|
| 451 |  | 
|---|
| 452 | OPJ_BOOL opj_mct_encode_custom( | 
|---|
| 453 | OPJ_BYTE * pCodingdata, | 
|---|
| 454 | OPJ_SIZE_T n, | 
|---|
| 455 | OPJ_BYTE ** pData, | 
|---|
| 456 | OPJ_UINT32 pNbComp, | 
|---|
| 457 | OPJ_UINT32 isSigned) | 
|---|
| 458 | { | 
|---|
| 459 | OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata; | 
|---|
| 460 | OPJ_SIZE_T i; | 
|---|
| 461 | OPJ_UINT32 j; | 
|---|
| 462 | OPJ_UINT32 k; | 
|---|
| 463 | OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp; | 
|---|
| 464 | OPJ_INT32 * lCurrentData = 00; | 
|---|
| 465 | OPJ_INT32 * lCurrentMatrix = 00; | 
|---|
| 466 | OPJ_INT32 ** lData = (OPJ_INT32 **) pData; | 
|---|
| 467 | OPJ_UINT32 lMultiplicator = 1 << 13; | 
|---|
| 468 | OPJ_INT32 * lMctPtr; | 
|---|
| 469 |  | 
|---|
| 470 | OPJ_ARG_NOT_USED(isSigned); | 
|---|
| 471 |  | 
|---|
| 472 | lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof( | 
|---|
| 473 | OPJ_INT32)); | 
|---|
| 474 | if (! lCurrentData) { | 
|---|
| 475 | return OPJ_FALSE; | 
|---|
| 476 | } | 
|---|
| 477 |  | 
|---|
| 478 | lCurrentMatrix = lCurrentData + pNbComp; | 
|---|
| 479 |  | 
|---|
| 480 | for (i = 0; i < lNbMatCoeff; ++i) { | 
|---|
| 481 | lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator); | 
|---|
| 482 | } | 
|---|
| 483 |  | 
|---|
| 484 | for (i = 0; i < n; ++i)  { | 
|---|
| 485 | lMctPtr = lCurrentMatrix; | 
|---|
| 486 | for (j = 0; j < pNbComp; ++j) { | 
|---|
| 487 | lCurrentData[j] = (*(lData[j])); | 
|---|
| 488 | } | 
|---|
| 489 |  | 
|---|
| 490 | for (j = 0; j < pNbComp; ++j) { | 
|---|
| 491 | *(lData[j]) = 0; | 
|---|
| 492 | for (k = 0; k < pNbComp; ++k) { | 
|---|
| 493 | *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]); | 
|---|
| 494 | ++lMctPtr; | 
|---|
| 495 | } | 
|---|
| 496 |  | 
|---|
| 497 | ++lData[j]; | 
|---|
| 498 | } | 
|---|
| 499 | } | 
|---|
| 500 |  | 
|---|
| 501 | opj_free(lCurrentData); | 
|---|
| 502 |  | 
|---|
| 503 | return OPJ_TRUE; | 
|---|
| 504 | } | 
|---|
| 505 |  | 
|---|
| 506 | OPJ_BOOL opj_mct_decode_custom( | 
|---|
| 507 | OPJ_BYTE * pDecodingData, | 
|---|
| 508 | OPJ_SIZE_T n, | 
|---|
| 509 | OPJ_BYTE ** pData, | 
|---|
| 510 | OPJ_UINT32 pNbComp, | 
|---|
| 511 | OPJ_UINT32 isSigned) | 
|---|
| 512 | { | 
|---|
| 513 | OPJ_FLOAT32 * lMct; | 
|---|
| 514 | OPJ_SIZE_T i; | 
|---|
| 515 | OPJ_UINT32 j; | 
|---|
| 516 | OPJ_UINT32 k; | 
|---|
| 517 |  | 
|---|
| 518 | OPJ_FLOAT32 * lCurrentData = 00; | 
|---|
| 519 | OPJ_FLOAT32 * lCurrentResult = 00; | 
|---|
| 520 | OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData; | 
|---|
| 521 |  | 
|---|
| 522 | OPJ_ARG_NOT_USED(isSigned); | 
|---|
| 523 |  | 
|---|
| 524 | lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32)); | 
|---|
| 525 | if (! lCurrentData) { | 
|---|
| 526 | return OPJ_FALSE; | 
|---|
| 527 | } | 
|---|
| 528 | lCurrentResult = lCurrentData + pNbComp; | 
|---|
| 529 |  | 
|---|
| 530 | for (i = 0; i < n; ++i) { | 
|---|
| 531 | lMct = (OPJ_FLOAT32 *) pDecodingData; | 
|---|
| 532 | for (j = 0; j < pNbComp; ++j) { | 
|---|
| 533 | lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j])); | 
|---|
| 534 | } | 
|---|
| 535 | for (j = 0; j < pNbComp; ++j) { | 
|---|
| 536 | lCurrentResult[j] = 0; | 
|---|
| 537 | for (k = 0; k < pNbComp; ++k) { | 
|---|
| 538 | lCurrentResult[j] += *(lMct++) * lCurrentData[k]; | 
|---|
| 539 | } | 
|---|
| 540 | *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]); | 
|---|
| 541 | } | 
|---|
| 542 | } | 
|---|
| 543 | opj_free(lCurrentData); | 
|---|
| 544 | return OPJ_TRUE; | 
|---|
| 545 | } | 
|---|
| 546 |  | 
|---|
| 547 | void opj_calculate_norms(OPJ_FLOAT64 * pNorms, | 
|---|
| 548 | OPJ_UINT32 pNbComps, | 
|---|
| 549 | OPJ_FLOAT32 * pMatrix) | 
|---|
| 550 | { | 
|---|
| 551 | OPJ_UINT32 i, j, lIndex; | 
|---|
| 552 | OPJ_FLOAT32 lCurrentValue; | 
|---|
| 553 | OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms; | 
|---|
| 554 | OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix; | 
|---|
| 555 |  | 
|---|
| 556 | for (i = 0; i < pNbComps; ++i) { | 
|---|
| 557 | lNorms[i] = 0; | 
|---|
| 558 | lIndex = i; | 
|---|
| 559 |  | 
|---|
| 560 | for (j = 0; j < pNbComps; ++j) { | 
|---|
| 561 | lCurrentValue = lMatrix[lIndex]; | 
|---|
| 562 | lIndex += pNbComps; | 
|---|
| 563 | lNorms[i] += lCurrentValue * lCurrentValue; | 
|---|
| 564 | } | 
|---|
| 565 | lNorms[i] = sqrt(lNorms[i]); | 
|---|
| 566 | } | 
|---|
| 567 | } | 
|---|
| 568 |  | 
|---|