| 1 | /* |
| 2 | * The copyright in this software is being made available under the 2-clauses |
| 3 | * BSD License, included below. This software may be subject to other third |
| 4 | * party and contributor rights, including patent rights, and no such rights |
| 5 | * are granted under this license. |
| 6 | * |
| 7 | * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium |
| 8 | * Copyright (c) 2002-2014, Professor Benoit Macq |
| 9 | * Copyright (c) 2001-2003, David Janssens |
| 10 | * Copyright (c) 2002-2003, Yannick Verschueren |
| 11 | * Copyright (c) 2003-2007, Francois-Olivier Devaux |
| 12 | * Copyright (c) 2003-2014, Antonin Descampe |
| 13 | * Copyright (c) 2005, Herve Drolon, FreeImage Team |
| 14 | * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR |
| 15 | * Copyright (c) 2012, CS Systemes d'Information, France |
| 16 | * All rights reserved. |
| 17 | * |
| 18 | * Redistribution and use in source and binary forms, with or without |
| 19 | * modification, are permitted provided that the following conditions |
| 20 | * are met: |
| 21 | * 1. Redistributions of source code must retain the above copyright |
| 22 | * notice, this list of conditions and the following disclaimer. |
| 23 | * 2. Redistributions in binary form must reproduce the above copyright |
| 24 | * notice, this list of conditions and the following disclaimer in the |
| 25 | * documentation and/or other materials provided with the distribution. |
| 26 | * |
| 27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' |
| 28 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 29 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 30 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 31 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 32 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 33 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 34 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 35 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 36 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 37 | * POSSIBILITY OF SUCH DAMAGE. |
| 38 | */ |
| 39 | |
| 40 | #ifdef __SSE__ |
| 41 | #include <xmmintrin.h> |
| 42 | #endif |
| 43 | #ifdef __SSE2__ |
| 44 | #include <emmintrin.h> |
| 45 | #endif |
| 46 | #ifdef __SSE4_1__ |
| 47 | #include <smmintrin.h> |
| 48 | #endif |
| 49 | |
| 50 | #include "opj_includes.h" |
| 51 | |
/* <summary> */
/* This table contains the norms of the basis function of the reversible MCT. */
/* </summary> */
/* One norm per component, indexed by compno in opj_mct_getnorm(). */
static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 };
| 56 | |
/* <summary> */
/* This table contains the norms of the basis function of the irreversible MCT. */
/* </summary> */
/* One norm per component, indexed by compno in opj_mct_getnorm_real(). */
static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 };
| 61 | |
| 62 | const OPJ_FLOAT64 * opj_mct_get_mct_norms() |
| 63 | { |
| 64 | return opj_mct_norms; |
| 65 | } |
| 66 | |
| 67 | const OPJ_FLOAT64 * opj_mct_get_mct_norms_real() |
| 68 | { |
| 69 | return opj_mct_norms_real; |
| 70 | } |
| 71 | |
| 72 | /* <summary> */ |
| 73 | /* Forward reversible MCT. */ |
| 74 | /* </summary> */ |
| 75 | #ifdef __SSE2__ |
/**
 * Forward reversible MCT (RCT), SSE2 path.
 *
 * Transforms n samples in place: (r,g,b) in c0/c1/c2 become
 * y = (r + 2g + b) >> 2, u = b - g, v = r - g.
 * The three buffers must be 16-byte aligned (asserted below) because the
 * vector loop uses aligned loads/stores; a scalar tail handles n % 4.
 */
void opj_mct_encode(
    OPJ_INT32* OPJ_RESTRICT c0,
    OPJ_INT32* OPJ_RESTRICT c1,
    OPJ_INT32* OPJ_RESTRICT c2,
    OPJ_SIZE_T n)
{
    OPJ_SIZE_T i;
    const OPJ_SIZE_T len = n;
    /* buffer are aligned on 16 bytes */
    assert(((size_t)c0 & 0xf) == 0);
    assert(((size_t)c1 & 0xf) == 0);
    assert(((size_t)c2 & 0xf) == 0);

    /* main loop: 4 samples per iteration */
    for (i = 0; i < (len & ~3U); i += 4) {
        __m128i y, u, v;
        __m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
        __m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
        __m128i b = _mm_load_si128((const __m128i *) & (c2[i]));
        /* y = (r + 2g + b) >> 2, built as ((g + g) + b + r) >> 2 */
        y = _mm_add_epi32(g, g);
        y = _mm_add_epi32(y, b);
        y = _mm_add_epi32(y, r);
        y = _mm_srai_epi32(y, 2);   /* arithmetic shift: floor division on negatives too */
        u = _mm_sub_epi32(b, g);
        v = _mm_sub_epi32(r, g);
        _mm_store_si128((__m128i *) & (c0[i]), y);
        _mm_store_si128((__m128i *) & (c1[i]), u);
        _mm_store_si128((__m128i *) & (c2[i]), v);
    }

    /* scalar tail for the remaining 0..3 samples */
    for (; i < len; ++i) {
        OPJ_INT32 r = c0[i];
        OPJ_INT32 g = c1[i];
        OPJ_INT32 b = c2[i];
        OPJ_INT32 y = (r + (g * 2) + b) >> 2;
        OPJ_INT32 u = b - g;
        OPJ_INT32 v = r - g;
        c0[i] = y;
        c1[i] = u;
        c2[i] = v;
    }
}
| 117 | #else |
| 118 | void opj_mct_encode( |
| 119 | OPJ_INT32* OPJ_RESTRICT c0, |
| 120 | OPJ_INT32* OPJ_RESTRICT c1, |
| 121 | OPJ_INT32* OPJ_RESTRICT c2, |
| 122 | OPJ_SIZE_T n) |
| 123 | { |
| 124 | OPJ_SIZE_T i; |
| 125 | const OPJ_SIZE_T len = n; |
| 126 | |
| 127 | for (i = 0; i < len; ++i) { |
| 128 | OPJ_INT32 r = c0[i]; |
| 129 | OPJ_INT32 g = c1[i]; |
| 130 | OPJ_INT32 b = c2[i]; |
| 131 | OPJ_INT32 y = (r + (g * 2) + b) >> 2; |
| 132 | OPJ_INT32 u = b - g; |
| 133 | OPJ_INT32 v = r - g; |
| 134 | c0[i] = y; |
| 135 | c1[i] = u; |
| 136 | c2[i] = v; |
| 137 | } |
| 138 | } |
| 139 | #endif |
| 140 | |
| 141 | /* <summary> */ |
| 142 | /* Inverse reversible MCT. */ |
| 143 | /* </summary> */ |
| 144 | #ifdef __SSE2__ |
/**
 * Inverse reversible MCT (RCT), SSE2 path.
 *
 * Transforms n samples in place: (y,u,v) in c0/c1/c2 become
 * g = y - ((u + v) >> 2), r = v + g, b = u + g (exact inverse of
 * opj_mct_encode). Uses aligned loads/stores, so the buffers are
 * assumed 16-byte aligned like in opj_mct_encode; a scalar tail
 * handles n % 4.
 */
void opj_mct_decode(
    OPJ_INT32* OPJ_RESTRICT c0,
    OPJ_INT32* OPJ_RESTRICT c1,
    OPJ_INT32* OPJ_RESTRICT c2,
    OPJ_SIZE_T n)
{
    OPJ_SIZE_T i;
    const OPJ_SIZE_T len = n;

    /* main loop: 4 samples per iteration */
    for (i = 0; i < (len & ~3U); i += 4) {
        __m128i r, g, b;
        __m128i y = _mm_load_si128((const __m128i *) & (c0[i]));
        __m128i u = _mm_load_si128((const __m128i *) & (c1[i]));
        __m128i v = _mm_load_si128((const __m128i *) & (c2[i]));
        /* g = y - ((u + v) >> 2) */
        g = y;
        g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2));
        r = _mm_add_epi32(v, g);
        b = _mm_add_epi32(u, g);
        _mm_store_si128((__m128i *) & (c0[i]), r);
        _mm_store_si128((__m128i *) & (c1[i]), g);
        _mm_store_si128((__m128i *) & (c2[i]), b);
    }
    /* scalar tail for the remaining 0..3 samples */
    for (; i < len; ++i) {
        OPJ_INT32 y = c0[i];
        OPJ_INT32 u = c1[i];
        OPJ_INT32 v = c2[i];
        OPJ_INT32 g = y - ((u + v) >> 2);
        OPJ_INT32 r = v + g;
        OPJ_INT32 b = u + g;
        c0[i] = r;
        c1[i] = g;
        c2[i] = b;
    }
}
| 179 | #else |
| 180 | void opj_mct_decode( |
| 181 | OPJ_INT32* OPJ_RESTRICT c0, |
| 182 | OPJ_INT32* OPJ_RESTRICT c1, |
| 183 | OPJ_INT32* OPJ_RESTRICT c2, |
| 184 | OPJ_SIZE_T n) |
| 185 | { |
| 186 | OPJ_UINT32 i; |
| 187 | for (i = 0; i < n; ++i) { |
| 188 | OPJ_INT32 y = c0[i]; |
| 189 | OPJ_INT32 u = c1[i]; |
| 190 | OPJ_INT32 v = c2[i]; |
| 191 | OPJ_INT32 g = y - ((u + v) >> 2); |
| 192 | OPJ_INT32 r = v + g; |
| 193 | OPJ_INT32 b = u + g; |
| 194 | c0[i] = r; |
| 195 | c1[i] = g; |
| 196 | c2[i] = b; |
| 197 | } |
| 198 | } |
| 199 | #endif |
| 200 | |
| 201 | /* <summary> */ |
| 202 | /* Get norm of basis function of reversible MCT. */ |
| 203 | /* </summary> */ |
/* Returns the norm of the reversible-MCT basis function for component
 * compno. Precondition: compno < 3 (size of opj_mct_norms); no bounds
 * check is performed. */
OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno)
{
    return opj_mct_norms[compno];
}
| 208 | |
| 209 | /* <summary> */ |
| 210 | /* Forward irreversible MCT. */ |
| 211 | /* </summary> */ |
| 212 | #ifdef __SSE4_1__ |
/**
 * Forward irreversible MCT (ICT), SSE4.1 path.
 *
 * Transforms n samples in place using the same 13-bit fixed-point
 * coefficients as the scalar opj_int_fix_mul() tail below (e.g.
 * 2449/8192, 4809/8192, 934/8192 for Y). Uses aligned loads/stores,
 * so the buffers are assumed 16-byte aligned as in opj_mct_encode.
 *
 * The recurring lo/hi pattern implements a rounded 13-bit fixed-point
 * multiply on four 32-bit lanes:
 *   - _mm_mul_epi32 produces 64-bit products of the even lanes (0,2);
 *     the odd lanes (1,3) are brought into even position by a shuffle
 *     and multiplied the same way;
 *   - "mulround" (4096 = 0.5 in 1.13 fixed point) is added to each
 *     64-bit product for rounding;
 *   - the products are shifted back by 13 (the odd-lane results are
 *     left-shifted by 32-13 so their low 32 bits land in the high
 *     halves) and the even/odd halves are recombined with
 *     _mm_blend_epi16(..., 0xCC).
 */
void opj_mct_encode_real(
    OPJ_INT32* OPJ_RESTRICT c0,
    OPJ_INT32* OPJ_RESTRICT c1,
    OPJ_INT32* OPJ_RESTRICT c2,
    OPJ_SIZE_T n)
{
    OPJ_SIZE_T i;
    const OPJ_SIZE_T len = n;

    /* 13-bit fixed-point ICT coefficients (same values as the scalar tail) */
    const __m128i ry = _mm_set1_epi32(2449);
    const __m128i gy = _mm_set1_epi32(4809);
    const __m128i by = _mm_set1_epi32(934);
    const __m128i ru = _mm_set1_epi32(1382);
    const __m128i gu = _mm_set1_epi32(2714);
    /* coefficients of value 4096 (== 1 << 12) are handled with shifts instead */
    /* const __m128i bu = _mm_set1_epi32(4096); */
    /* const __m128i rv = _mm_set1_epi32(4096); */
    const __m128i gv = _mm_set1_epi32(3430);
    const __m128i bv = _mm_set1_epi32(666);
    /* two 64-bit lanes holding 4096: the rounding term (0.5 in 1.13) */
    const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096),
                             _MM_SHUFFLE(1, 0, 1, 0));

    /* main loop: 4 samples per iteration */
    for (i = 0; i < (len & ~3U); i += 4) {
        __m128i lo, hi;
        __m128i y, u, v;
        __m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
        __m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
        __m128i b = _mm_load_si128((const __m128i *) & (c2[i]));

        /* y = fix_mul(r, 2449) + fix_mul(g, 4809) + fix_mul(b, 934) */
        lo = r;
        hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, ry);
        hi = _mm_mul_epi32(hi, ry);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        y = _mm_blend_epi16(lo, hi, 0xCC);

        lo = g;
        hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, gy);
        hi = _mm_mul_epi32(hi, gy);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));

        lo = b;
        hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, by);
        hi = _mm_mul_epi32(hi, by);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
        _mm_store_si128((__m128i *) & (c0[i]), y);

        /* u = fix_mul(b, 4096) - fix_mul(r, 1382) - fix_mul(g, 2714);
         * the x4096 multiply is a sign-extended left shift by 12 */
        /*lo = b;
        hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, mulround);
        hi = _mm_mul_epi32(hi, mulround);*/
        lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0)));
        hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1)));
        lo = _mm_slli_epi64(lo, 12);
        hi = _mm_slli_epi64(hi, 12);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        u = _mm_blend_epi16(lo, hi, 0xCC);

        lo = r;
        hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, ru);
        hi = _mm_mul_epi32(hi, ru);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));

        lo = g;
        hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, gu);
        hi = _mm_mul_epi32(hi, gu);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
        _mm_store_si128((__m128i *) & (c1[i]), u);

        /* v = fix_mul(r, 4096) - fix_mul(g, 3430) - fix_mul(b, 666) */
        /*lo = r;
        hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, mulround);
        hi = _mm_mul_epi32(hi, mulround);*/
        lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0)));
        hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1)));
        lo = _mm_slli_epi64(lo, 12);
        hi = _mm_slli_epi64(hi, 12);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        v = _mm_blend_epi16(lo, hi, 0xCC);

        lo = g;
        hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, gv);
        hi = _mm_mul_epi32(hi, gv);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));

        lo = b;
        hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
        lo = _mm_mul_epi32(lo, bv);
        hi = _mm_mul_epi32(hi, bv);
        lo = _mm_add_epi64(lo, mulround);
        hi = _mm_add_epi64(hi, mulround);
        lo = _mm_srli_epi64(lo, 13);
        hi = _mm_slli_epi64(hi, 32 - 13);
        v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
        _mm_store_si128((__m128i *) & (c2[i]), v);
    }
    /* scalar tail for the remaining 0..3 samples */
    for (; i < len; ++i) {
        OPJ_INT32 r = c0[i];
        OPJ_INT32 g = c1[i];
        OPJ_INT32 b = c2[i];
        OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g,
                      4809) + opj_int_fix_mul(b, 934);
        OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g,
                      2714) + opj_int_fix_mul(b, 4096);
        OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g,
                      3430) - opj_int_fix_mul(b, 666);
        c0[i] = y;
        c1[i] = u;
        c2[i] = v;
    }
}
| 357 | #else |
| 358 | void opj_mct_encode_real( |
| 359 | OPJ_INT32* OPJ_RESTRICT c0, |
| 360 | OPJ_INT32* OPJ_RESTRICT c1, |
| 361 | OPJ_INT32* OPJ_RESTRICT c2, |
| 362 | OPJ_SIZE_T n) |
| 363 | { |
| 364 | OPJ_UINT32 i; |
| 365 | for (i = 0; i < n; ++i) { |
| 366 | OPJ_INT32 r = c0[i]; |
| 367 | OPJ_INT32 g = c1[i]; |
| 368 | OPJ_INT32 b = c2[i]; |
| 369 | OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, |
| 370 | 4809) + opj_int_fix_mul(b, 934); |
| 371 | OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, |
| 372 | 2714) + opj_int_fix_mul(b, 4096); |
| 373 | OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, |
| 374 | 3430) - opj_int_fix_mul(b, 666); |
| 375 | c0[i] = y; |
| 376 | c1[i] = u; |
| 377 | c2[i] = v; |
| 378 | } |
| 379 | } |
| 380 | #endif |
| 381 | |
| 382 | /* <summary> */ |
| 383 | /* Inverse irreversible MCT. */ |
| 384 | /* </summary> */ |
| 385 | void opj_mct_decode_real( |
| 386 | OPJ_FLOAT32* OPJ_RESTRICT c0, |
| 387 | OPJ_FLOAT32* OPJ_RESTRICT c1, |
| 388 | OPJ_FLOAT32* OPJ_RESTRICT c2, |
| 389 | OPJ_SIZE_T n) |
| 390 | { |
| 391 | OPJ_UINT32 i; |
| 392 | #ifdef __SSE__ |
| 393 | __m128 vrv, vgu, vgv, vbu; |
| 394 | vrv = _mm_set1_ps(1.402f); |
| 395 | vgu = _mm_set1_ps(0.34413f); |
| 396 | vgv = _mm_set1_ps(0.71414f); |
| 397 | vbu = _mm_set1_ps(1.772f); |
| 398 | for (i = 0; i < (n >> 3); ++i) { |
| 399 | __m128 vy, vu, vv; |
| 400 | __m128 vr, vg, vb; |
| 401 | |
| 402 | vy = _mm_load_ps(c0); |
| 403 | vu = _mm_load_ps(c1); |
| 404 | vv = _mm_load_ps(c2); |
| 405 | vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); |
| 406 | vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); |
| 407 | vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); |
| 408 | _mm_store_ps(c0, vr); |
| 409 | _mm_store_ps(c1, vg); |
| 410 | _mm_store_ps(c2, vb); |
| 411 | c0 += 4; |
| 412 | c1 += 4; |
| 413 | c2 += 4; |
| 414 | |
| 415 | vy = _mm_load_ps(c0); |
| 416 | vu = _mm_load_ps(c1); |
| 417 | vv = _mm_load_ps(c2); |
| 418 | vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); |
| 419 | vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); |
| 420 | vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); |
| 421 | _mm_store_ps(c0, vr); |
| 422 | _mm_store_ps(c1, vg); |
| 423 | _mm_store_ps(c2, vb); |
| 424 | c0 += 4; |
| 425 | c1 += 4; |
| 426 | c2 += 4; |
| 427 | } |
| 428 | n &= 7; |
| 429 | #endif |
| 430 | for (i = 0; i < n; ++i) { |
| 431 | OPJ_FLOAT32 y = c0[i]; |
| 432 | OPJ_FLOAT32 u = c1[i]; |
| 433 | OPJ_FLOAT32 v = c2[i]; |
| 434 | OPJ_FLOAT32 r = y + (v * 1.402f); |
| 435 | OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); |
| 436 | OPJ_FLOAT32 b = y + (u * 1.772f); |
| 437 | c0[i] = r; |
| 438 | c1[i] = g; |
| 439 | c2[i] = b; |
| 440 | } |
| 441 | } |
| 442 | |
| 443 | /* <summary> */ |
| 444 | /* Get norm of basis function of irreversible MCT. */ |
| 445 | /* </summary> */ |
/* Returns the norm of the irreversible-MCT basis function for component
 * compno. Precondition: compno < 3 (size of opj_mct_norms_real); no
 * bounds check is performed. */
OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno)
{
    return opj_mct_norms_real[compno];
}
| 450 | |
| 451 | |
| 452 | OPJ_BOOL opj_mct_encode_custom( |
| 453 | OPJ_BYTE * pCodingdata, |
| 454 | OPJ_SIZE_T n, |
| 455 | OPJ_BYTE ** pData, |
| 456 | OPJ_UINT32 pNbComp, |
| 457 | OPJ_UINT32 isSigned) |
| 458 | { |
| 459 | OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata; |
| 460 | OPJ_SIZE_T i; |
| 461 | OPJ_UINT32 j; |
| 462 | OPJ_UINT32 k; |
| 463 | OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp; |
| 464 | OPJ_INT32 * lCurrentData = 00; |
| 465 | OPJ_INT32 * lCurrentMatrix = 00; |
| 466 | OPJ_INT32 ** lData = (OPJ_INT32 **) pData; |
| 467 | OPJ_UINT32 lMultiplicator = 1 << 13; |
| 468 | OPJ_INT32 * lMctPtr; |
| 469 | |
| 470 | OPJ_ARG_NOT_USED(isSigned); |
| 471 | |
| 472 | lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof( |
| 473 | OPJ_INT32)); |
| 474 | if (! lCurrentData) { |
| 475 | return OPJ_FALSE; |
| 476 | } |
| 477 | |
| 478 | lCurrentMatrix = lCurrentData + pNbComp; |
| 479 | |
| 480 | for (i = 0; i < lNbMatCoeff; ++i) { |
| 481 | lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator); |
| 482 | } |
| 483 | |
| 484 | for (i = 0; i < n; ++i) { |
| 485 | lMctPtr = lCurrentMatrix; |
| 486 | for (j = 0; j < pNbComp; ++j) { |
| 487 | lCurrentData[j] = (*(lData[j])); |
| 488 | } |
| 489 | |
| 490 | for (j = 0; j < pNbComp; ++j) { |
| 491 | *(lData[j]) = 0; |
| 492 | for (k = 0; k < pNbComp; ++k) { |
| 493 | *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]); |
| 494 | ++lMctPtr; |
| 495 | } |
| 496 | |
| 497 | ++lData[j]; |
| 498 | } |
| 499 | } |
| 500 | |
| 501 | opj_free(lCurrentData); |
| 502 | |
| 503 | return OPJ_TRUE; |
| 504 | } |
| 505 | |
| 506 | OPJ_BOOL opj_mct_decode_custom( |
| 507 | OPJ_BYTE * pDecodingData, |
| 508 | OPJ_SIZE_T n, |
| 509 | OPJ_BYTE ** pData, |
| 510 | OPJ_UINT32 pNbComp, |
| 511 | OPJ_UINT32 isSigned) |
| 512 | { |
| 513 | OPJ_FLOAT32 * lMct; |
| 514 | OPJ_SIZE_T i; |
| 515 | OPJ_UINT32 j; |
| 516 | OPJ_UINT32 k; |
| 517 | |
| 518 | OPJ_FLOAT32 * lCurrentData = 00; |
| 519 | OPJ_FLOAT32 * lCurrentResult = 00; |
| 520 | OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData; |
| 521 | |
| 522 | OPJ_ARG_NOT_USED(isSigned); |
| 523 | |
| 524 | lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32)); |
| 525 | if (! lCurrentData) { |
| 526 | return OPJ_FALSE; |
| 527 | } |
| 528 | lCurrentResult = lCurrentData + pNbComp; |
| 529 | |
| 530 | for (i = 0; i < n; ++i) { |
| 531 | lMct = (OPJ_FLOAT32 *) pDecodingData; |
| 532 | for (j = 0; j < pNbComp; ++j) { |
| 533 | lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j])); |
| 534 | } |
| 535 | for (j = 0; j < pNbComp; ++j) { |
| 536 | lCurrentResult[j] = 0; |
| 537 | for (k = 0; k < pNbComp; ++k) { |
| 538 | lCurrentResult[j] += *(lMct++) * lCurrentData[k]; |
| 539 | } |
| 540 | *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]); |
| 541 | } |
| 542 | } |
| 543 | opj_free(lCurrentData); |
| 544 | return OPJ_TRUE; |
| 545 | } |
| 546 | |
| 547 | void opj_calculate_norms(OPJ_FLOAT64 * pNorms, |
| 548 | OPJ_UINT32 pNbComps, |
| 549 | OPJ_FLOAT32 * pMatrix) |
| 550 | { |
| 551 | OPJ_UINT32 i, j, lIndex; |
| 552 | OPJ_FLOAT32 lCurrentValue; |
| 553 | OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms; |
| 554 | OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix; |
| 555 | |
| 556 | for (i = 0; i < pNbComps; ++i) { |
| 557 | lNorms[i] = 0; |
| 558 | lIndex = i; |
| 559 | |
| 560 | for (j = 0; j < pNbComps; ++j) { |
| 561 | lCurrentValue = lMatrix[lIndex]; |
| 562 | lIndex += pNbComps; |
| 563 | lNorms[i] += lCurrentValue * lCurrentValue; |
| 564 | } |
| 565 | lNorms[i] = sqrt(lNorms[i]); |
| 566 | } |
| 567 | } |
| 568 | |