dwt.c source code [MuPDF/thirdparty/openjpeg/src/lib/openjp2/dwt.c]

1	/*
2	* The copyright in this software is being made available under the 2-clauses
3	* BSD License, included below. This software may be subject to other third
4	* party and contributor rights, including patent rights, and no such rights
5	* are granted under this license.
6	*
7	* Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8	* Copyright (c) 2002-2014, Professor Benoit Macq
9	* Copyright (c) 2001-2003, David Janssens
10	* Copyright (c) 2002-2003, Yannick Verschueren
11	* Copyright (c) 2003-2007, Francois-Olivier Devaux
12	* Copyright (c) 2003-2014, Antonin Descampe
13	* Copyright (c) 2005, Herve Drolon, FreeImage Team
14	* Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15	* Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16	* Copyright (c) 2017, IntoPIX SA <support@intopix.com>
17	* All rights reserved.
18	*
19	* Redistribution and use in source and binary forms, with or without
20	* modification, are permitted provided that the following conditions
21	* are met:
22	* 1. Redistributions of source code must retain the above copyright
23	* notice, this list of conditions and the following disclaimer.
24	* 2. Redistributions in binary form must reproduce the above copyright
25	* notice, this list of conditions and the following disclaimer in the
26	* documentation and/or other materials provided with the distribution.
27	*
28	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
29	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38	* POSSIBILITY OF SUCH DAMAGE.
39	*/
40
41	#include <assert.h>
42
43	#define OPJ_SKIP_POISON
44	#include "opj_includes.h"
45
46	#ifdef __SSE__
47	#include <xmmintrin.h>
48	#endif
49	#ifdef __SSE2__
50	#include <emmintrin.h>
51	#endif
52	#ifdef __SSSE3__
53	#include <tmmintrin.h>
54	#endif
55	#ifdef __AVX2__
56	#include <immintrin.h>
57	#endif
58
59	#if defined(__GNUC__)
60	#pragma GCC poison malloc calloc realloc free
61	#endif
62
/** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
/*@{*/
65
/* Accessors for the interleaved work buffer of a local `v` (opj_dwt_t*):
 * OPJ_WS(i) is the i-th even-position sample, OPJ_WD(i) the i-th
 * odd-position sample. */
#define OPJ_WS(i) v->mem[(i)*2]
#define OPJ_WD(i) v->mem[(1+(i)*2)]

#ifdef __AVX2__
/** Number of int32 values in a AVX2 register */
#define VREG_INT_COUNT       8
#else
/** Number of int32 values in a SSE2 register */
#define VREG_INT_COUNT       4
#endif

/** Number of columns that we can process in parallel in the vertical pass */
#define PARALLEL_COLS_53     (2*VREG_INT_COUNT)
79
/** @name Local data structures */
/*@{*/
82
83	typedef struct dwt_local {
84	OPJ_INT32* mem;
85	OPJ_INT32 dn; / number of elements in high pass band /
86	OPJ_INT32 sn; / number of elements in low pass band /
87	OPJ_INT32 cas; / 0 = start on even coord, 1 = start on odd coord /
88	} opj_dwt_t;
89
90	typedef union {
91	OPJ_FLOAT32 f[`4`];
92	} opj_v4_t;
93
94	typedef struct v4dwt_local {
95	opj_v4_t* wavelet ;
96	OPJ_INT32 dn ; / number of elements in high pass band /
97	OPJ_INT32 sn ; / number of elements in low pass band /
98	OPJ_INT32 cas ; / 0 = start on even coord, 1 = start on odd coord /
99	OPJ_UINT32 win_l_x0; / start coord in low pass band /
100	OPJ_UINT32 win_l_x1; / end coord in low pass band /
101	OPJ_UINT32 win_h_x0; / start coord in high pass band /
102	OPJ_UINT32 win_h_x1; / end coord in high pass band /
103	} opj_v4dwt_t ;
104
105	static const OPJ_FLOAT32 opj_dwt_alpha = `1.586134342f`; / 12994 /
106	static const OPJ_FLOAT32 opj_dwt_beta = `0.052980118f`; / 434 /
107	static const OPJ_FLOAT32 opj_dwt_gamma = -`0.882911075f`; / -7233 /
108	static const OPJ_FLOAT32 opj_dwt_delta = -`0.443506852f`; / -3633 /
109
110	static const OPJ_FLOAT32 opj_K = `1.230174105f`; / 10078 /
111	static const OPJ_FLOAT32 opj_c13318 = `1.625732422f`;
112
/*@}*/
114
115	/**
116	Virtual function type for wavelet transform in 1-D
117	*/
118	typedef void (DWT1DFN)(const* opj_dwt_t* v);
119
/** @name Local static functions */
/*@{*/
122
123	/**
124	Forward lazy transform (horizontal)
125	*/
126	static void opj_dwt_deinterleave_h(OPJ_INT32 a, OPJ_INT32 b, OPJ_INT32 dn,
127	OPJ_INT32 sn, OPJ_INT32 cas);
128	/**
129	Forward lazy transform (vertical)
130	*/
131	static void opj_dwt_deinterleave_v(OPJ_INT32 a, OPJ_INT32 b, OPJ_INT32 dn,
132	OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
133	/**
134	Forward 5-3 wavelet transform in 1-D
135	*/
136	static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
137	OPJ_INT32 cas);
138	/**
139	Forward 9-7 wavelet transform in 1-D
140	*/
141	static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
142	OPJ_INT32 cas);
143	/**
144	Explicit calculation of the Quantization Stepsizes
145	*/
146	static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
147	opj_stepsize_t *bandno_stepsize);
148	/**
149	Inverse wavelet transform in 2-D.
150	*/
151	static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
152	opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
153
154	static OPJ_BOOL opj_dwt_decode_partial_tile(
155	opj_tcd_tilecomp_t* tilec,
156	OPJ_UINT32 numres);
157
158	static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
159	void (p_function)(OPJ_INT32 , OPJ_INT32, OPJ_INT32, OPJ_INT32));
160
161	static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
162	OPJ_UINT32 i);
163
164	/ <summary> /
165	/ Inverse 9-7 wavelet transform in 1-D. /
166	/ </summary> /
167	static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
168
169	static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
170	OPJ_FLOAT32* OPJ_RESTRICT a,
171	OPJ_UINT32 width,
172	OPJ_UINT32 remaining_height);
173
174	static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
175	OPJ_FLOAT32* OPJ_RESTRICT a,
176	OPJ_UINT32 width,
177	OPJ_UINT32 nb_elts_read);
178
179	#ifdef __SSE__
180	static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
181	OPJ_UINT32 start,
182	OPJ_UINT32 end,
183	const __m128 c);
184
185	static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
186	OPJ_UINT32 start,
187	OPJ_UINT32 end,
188	OPJ_UINT32 m, __m128 c);
189
190	#else
191	static void opj_v4dwt_decode_step1(opj_v4_t* w,
192	OPJ_UINT32 start,
193	OPJ_UINT32 end,
194	const OPJ_FLOAT32 c);
195
196	static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
197	OPJ_UINT32 start,
198	OPJ_UINT32 end,
199	OPJ_UINT32 m,
200	OPJ_FLOAT32 c);
201
202	#endif
203
/*@}*/

/*@}*/
207
/* Accessors on a local interleaved array `a`, using locals `sn` and `dn`.
 * OPJ_S(i)/OPJ_D(i): i-th even-/odd-position sample, unchecked.
 * OPJ_S_(i)/OPJ_D_(i): same, with the index clamped to [0, sn-1] / [0, dn-1].
 * Note: the argument is evaluated more than once. */
#define OPJ_S(i) a[(i)*2]
#define OPJ_D(i) a[(1+(i)*2)]
#define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
#define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
/* new */
/* Variants used by the cas == 1 code paths: the clamping bounds are swapped
 * (OPJ_SS_ clamps against dn, OPJ_DD_ against sn). */
#define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
#define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
215
216	/ <summary> /
217	/ This table contains the norms of the 5-3 wavelets for different bands. /
218	/ </summary> /
219	/ FIXME! the array should really be extended up to 33 resolution levels /
220	/ See https://github.com/uclouvain/openjpeg/issues/493 /
221	static const OPJ_FLOAT64 opj_dwt_norms[`4`][`10`] = {
222	{`1.000`, `1.500`, `2.750`, `5.375`, `10.68`, `21.34`, `42.67`, `85.33`, `170.7`, `341.3`},
223	{`1.038`, `1.592`, `2.919`, `5.703`, `11.33`, `22.64`, `45.25`, `90.48`, `180.9`},
224	{`1.038`, `1.592`, `2.919`, `5.703`, `11.33`, `22.64`, `45.25`, `90.48`, `180.9`},
225	{`.7186`, `.9218`, `1.586`, `3.043`, `6.019`, `12.01`, `24.00`, `47.97`, `95.93`}
226	};
227
228	/ <summary> /
229	/ This table contains the norms of the 9-7 wavelets for different bands. /
230	/ </summary> /
231	/ FIXME! the array should really be extended up to 33 resolution levels /
232	/ See https://github.com/uclouvain/openjpeg/issues/493 /
233	static const OPJ_FLOAT64 opj_dwt_norms_real[`4`][`10`] = {
234	{`1.000`, `1.965`, `4.177`, `8.403`, `16.90`, `33.84`, `67.69`, `135.3`, `270.6`, `540.9`},
235	{`2.022`, `3.989`, `8.355`, `17.04`, `34.27`, `68.63`, `137.3`, `274.6`, `549.0`},
236	{`2.022`, `3.989`, `8.355`, `17.04`, `34.27`, `68.63`, `137.3`, `274.6`, `549.0`},
237	{`2.080`, `3.865`, `8.307`, `17.18`, `34.71`, `69.59`, `139.3`, `278.6`, `557.2`}
238	};
239
240	/*
241	==========================================================
242	local functions
243	==========================================================
244	*/
245
246	/ <summary> /
247	/ Forward lazy transform (horizontal). /
248	/ </summary> /
249	static void opj_dwt_deinterleave_h(OPJ_INT32 a, OPJ_INT32 b, OPJ_INT32 dn,
250	OPJ_INT32 sn, OPJ_INT32 cas)
251	{
252	OPJ_INT32 i;
253	OPJ_INT32 * l_dest = b;
254	OPJ_INT32 * l_src = a + cas;
255
256	for (i = `0`; i < sn; ++i) {
257	l_dest++ = l_src;
258	l_src += `2`;
259	}
260
261	l_dest = b + sn;
262	l_src = a + `1` - cas;
263
264	for (i = `0`; i < dn; ++i) {
265	l_dest++ = l_src;
266	l_src += `2`;
267	}
268	}
269
270	/ <summary> /
271	/ Forward lazy transform (vertical). /
272	/ </summary> /
273	static void opj_dwt_deinterleave_v(OPJ_INT32 a, OPJ_INT32 b, OPJ_INT32 dn,
274	OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
275	{
276	OPJ_INT32 i = sn;
277	OPJ_INT32 * l_dest = b;
278	OPJ_INT32 * l_src = a + cas;
279
280	while (i--) {
281	l_dest = l_src;
282	l_dest += x;
283	l_src += `2`;
284	} / b[ix]=a[2i+cas]; /
285
286	l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x;
287	l_src = a + `1` - cas;
288
289	i = dn;
290	while (i--) {
291	l_dest = l_src;
292	l_dest += x;
293	l_src += `2`;
294	} /b[(sn+i)x]=a[(2i+1-cas)];/
295	}
296
297	#ifdef STANDARD_SLOW_VERSION
298	/ <summary> /
299	/ Inverse lazy transform (horizontal). /
300	/ </summary> /
301	static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
302	{
303	OPJ_INT32 *ai = a;
304	OPJ_INT32 *bi = h->mem + h->cas;
305	OPJ_INT32 i = h->sn;
306	while (i--) {
307	bi = (ai++);
308	bi += `2`;
309	}
310	ai = a + h->sn;
311	bi = h->mem + `1` - h->cas;
312	i = h->dn ;
313	while (i--) {
314	bi = (ai++);
315	bi += `2`;
316	}
317	}
318
319	/ <summary> /
320	/ Inverse lazy transform (vertical). /
321	/ </summary> /
322	static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
323	{
324	OPJ_INT32 *ai = a;
325	OPJ_INT32 *bi = v->mem + v->cas;
326	OPJ_INT32 i = v->sn;
327	while (i--) {
328	bi = ai;
329	bi += `2`;
330	ai += x;
331	}
332	ai = a + (v->sn * (OPJ_SIZE_T)x);
333	bi = v->mem + `1` - v->cas;
334	i = v->dn ;
335	while (i--) {
336	bi = ai;
337	bi += `2`;
338	ai += x;
339	}
340	}
341
342	#endif /* STANDARD_SLOW_VERSION */
343
344	/ <summary> /
345	/ Forward 5-3 wavelet transform in 1-D. /
346	/ </summary> /
347	static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
348	OPJ_INT32 cas)
349	{
350	OPJ_INT32 i;
351
352	if (!cas) {
353	if ((dn > `0`) \|\| (sn > `1`)) { / NEW : CASE ONE ELEMENT /
354	for (i = `0`; i < dn; i++) {
355	OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + `1`)) >> `1`;
356	}
357	for (i = `0`; i < sn; i++) {
358	OPJ_S(i) += (OPJ_D_(i - `1`) + OPJ_D_(i) + `2`) >> `2`;
359	}
360	}
361	} else {
362	if (!sn && dn == `1`) { / NEW : CASE ONE ELEMENT /
363	OPJ_S(`0`) *= `2`;
364	} else {
365	for (i = `0`; i < dn; i++) {
366	OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - `1`)) >> `1`;
367	}
368	for (i = `0`; i < sn; i++) {
369	OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + `1`) + `2`) >> `2`;
370	}
371	}
372	}
373	}
374
375	#ifdef STANDARD_SLOW_VERSION
376	/ <summary> /
377	/ Inverse 5-3 wavelet transform in 1-D. /
378	/ </summary> /
379	static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
380	OPJ_INT32 cas)
381	{
382	OPJ_INT32 i;
383
384	if (!cas) {
385	if ((dn > `0`) \|\| (sn > `1`)) { / NEW : CASE ONE ELEMENT /
386	for (i = `0`; i < sn; i++) {
387	OPJ_S(i) -= (OPJ_D_(i - `1`) + OPJ_D_(i) + `2`) >> `2`;
388	}
389	for (i = `0`; i < dn; i++) {
390	OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + `1`)) >> `1`;
391	}
392	}
393	} else {
394	if (!sn && dn == `1`) { / NEW : CASE ONE ELEMENT /
395	OPJ_S(`0`) /= `2`;
396	} else {
397	for (i = `0`; i < sn; i++) {
398	OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + `1`) + `2`) >> `2`;
399	}
400	for (i = `0`; i < dn; i++) {
401	OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - `1`)) >> `1`;
402	}
403	}
404	}
405	}
406
407	static void opj_dwt_decode_1(const opj_dwt_t *v)
408	{
409	opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
410	}
411
412	#endif /* STANDARD_SLOW_VERSION */
413
414	#if !defined(STANDARD_SLOW_VERSION)
415	static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
416	const OPJ_INT32 sn,
417	const OPJ_INT32 len,
418	OPJ_INT32* tiledp)
419	{
420	OPJ_INT32 i, j;
421	const OPJ_INT32* in_even = &tiledp[`0`];
422	const OPJ_INT32* in_odd = &tiledp[sn];
423
424	#ifdef TWO_PASS_VERSION
425	/ For documentation purpose: performs lifting in two iterations, /
426	/ but without explicit interleaving /
427
428	assert(len > `1`);
429
430	/ Even /
431	tmp[`0`] = in_even[`0`] - ((in_odd[`0`] + `1`) >> `1`);
432	for (i = `2`, j = `0`; i <= len - `2`; i += `2`, j++) {
433	tmp[i] = in_even[j + `1`] - ((in_odd[j] + in_odd[j + `1`] + `2`) >> `2`);
434	}
435	if (len & `1`) { / if len is odd /
436	tmp[len - `1`] = in_even[(len - `1`) / `2`] - ((in_odd[(len - `2`) / `2`] + `1`) >> `1`);
437	}
438
439	/ Odd /
440	for (i = `1`, j = `0`; i < len - `1`; i += `2`, j++) {
441	tmp[i] = in_odd[j] + ((tmp[i - `1`] + tmp[i + `1`]) >> `1`);
442	}
443	if (!(len & `1`)) { / if len is even /
444	tmp[len - `1`] = in_odd[(len - `1`) / `2`] + tmp[len - `2`];
445	}
446	#else
447	OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
448
449	assert(len > `1`);
450
451	/ Improved version of the TWO_PASS_VERSION: /
452	/ Performs lifting in one single iteration. Saves memory /
453	/ accesses and explicit interleaving. /
454	s1n = in_even[`0`];
455	d1n = in_odd[`0`];
456	s0n = s1n - ((d1n + `1`) >> `1`);
457
458	for (i = `0`, j = `1`; i < (len - `3`); i += `2`, j++) {
459	d1c = d1n;
460	s0c = s0n;
461
462	s1n = in_even[j];
463	d1n = in_odd[j];
464
465	s0n = s1n - ((d1c + d1n + `2`) >> `2`);
466
467	tmp[i ] = s0c;
468	tmp[i + `1`] = d1c + ((s0c + s0n) >> `1`);
469	}
470
471	tmp[i] = s0n;
472
473	if (len & `1`) {
474	tmp[len - `1`] = in_even[(len - `1`) / `2`] - ((d1n + `1`) >> `1`);
475	tmp[len - `2`] = d1n + ((s0n + tmp[len - `1`]) >> `1`);
476	} else {
477	tmp[len - `1`] = d1n + s0n;
478	}
479	#endif
480	memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
481	}
482
483	static void opj_idwt53_h_cas1(OPJ_INT32* tmp,
484	const OPJ_INT32 sn,
485	const OPJ_INT32 len,
486	OPJ_INT32* tiledp)
487	{
488	OPJ_INT32 i, j;
489	const OPJ_INT32* in_even = &tiledp[sn];
490	const OPJ_INT32* in_odd = &tiledp[`0`];
491
492	#ifdef TWO_PASS_VERSION
493	/ For documentation purpose: performs lifting in two iterations, /
494	/ but without explicit interleaving /
495
496	assert(len > `2`);
497
498	/ Odd /
499	for (i = `1`, j = `0`; i < len - `1`; i += `2`, j++) {
500	tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + `1`] + `2`) >> `2`);
501	}
502	if (!(len & `1`)) {
503	tmp[len - `1`] = in_odd[len / `2` - `1`] - ((in_even[len / `2` - `1`] + `1`) >> `1`);
504	}
505
506	/ Even /
507	tmp[`0`] = in_even[`0`] + tmp[`1`];
508	for (i = `2`, j = `1`; i < len - `1`; i += `2`, j++) {
509	tmp[i] = in_even[j] + ((tmp[i + `1`] + tmp[i - `1`]) >> `1`);
510	}
511	if (len & `1`) {
512	tmp[len - `1`] = in_even[len / `2`] + tmp[len - `2`];
513	}
514	#else
515	OPJ_INT32 s1, s2, dc, dn;
516
517	assert(len > `2`);
518
519	/ Improved version of the TWO_PASS_VERSION: /
520	/ Performs lifting in one single iteration. Saves memory /
521	/ accesses and explicit interleaving. /
522
523	s1 = in_even[`1`];
524	dc = in_odd[`0`] - ((in_even[`0`] + s1 + `2`) >> `2`);
525	tmp[`0`] = in_even[`0`] + dc;
526
527	for (i = `1`, j = `1`; i < (len - `2` - !(len & `1`)); i += `2`, j++) {
528
529	s2 = in_even[j + `1`];
530
531	dn = in_odd[j] - ((s1 + s2 + `2`) >> `2`);
532	tmp[i ] = dc;
533	tmp[i + `1`] = s1 + ((dn + dc) >> `1`);
534
535	dc = dn;
536	s1 = s2;
537	}
538
539	tmp[i] = dc;
540
541	if (!(len & `1`)) {
542	dn = in_odd[len / `2` - `1`] - ((s1 + `1`) >> `1`);
543	tmp[len - `2`] = s1 + ((dn + dc) >> `1`);
544	tmp[len - `1`] = dn;
545	} else {
546	tmp[len - `1`] = s1 + dc;
547	}
548	#endif
549	memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
550	}
551
552
553	#endif /* !defined(STANDARD_SLOW_VERSION) */
554
555	/ <summary> /
556	/ Inverse 5-3 wavelet transform in 1-D for one row. /
557	/ </summary> /
558	/ Performs interleave, inverse wavelet transform and copy back to buffer /
559	static void opj_idwt53_h(const opj_dwt_t *dwt,
560	OPJ_INT32* tiledp)
561	{
562	#ifdef STANDARD_SLOW_VERSION
563	/ For documentation purpose /
564	opj_dwt_interleave_h(dwt, tiledp);
565	opj_dwt_decode_1(dwt);
566	memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
567	#else
568	const OPJ_INT32 sn = dwt->sn;
569	const OPJ_INT32 len = sn + dwt->dn;
570	if (dwt->cas == `0`) { / Left-most sample is on even coordinate /
571	if (len > `1`) {
572	opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
573	} else {
574	/ Unmodified value /
575	}
576	} else { / Left-most sample is on odd coordinate /
577	if (len == `1`) {
578	tiledp[`0`] /= `2`;
579	} else if (len == `2`) {
580	OPJ_INT32* out = dwt->mem;
581	const OPJ_INT32* in_even = &tiledp[sn];
582	const OPJ_INT32* in_odd = &tiledp[`0`];
583	out[`1`] = in_odd[`0`] - ((in_even[`0`] + `1`) >> `1`);
584	out[`0`] = in_even[`0`] + out[`1`];
585	memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
586	} else if (len > `2`) {
587	opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
588	}
589	}
590	#endif
591	}
592
593	#if (defined(__SSE2__) \|\| defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
594
/* Conveniency macros to improve the readabilty of the formulas */
/* Each macro maps to the 256-bit (AVX2) or 128-bit (SSE2) integer intrinsic
 * of the same role; lanes are 32-bit signed integers. The selection uses
 * #ifdef so that it always agrees with the choice of VREG_INT_COUNT. */
#ifdef __AVX2__
#define VREG        __m256i
#define LOAD_CST(x) _mm256_set1_epi32(x)
#define LOAD(x)     _mm256_load_si256((const VREG*)(x))
#define LOADU(x)    _mm256_loadu_si256((const VREG*)(x))
#define STORE(x,y)  _mm256_store_si256((VREG*)(x),(y))
#define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
#define ADD(x,y)    _mm256_add_epi32((x),(y))
#define SUB(x,y)    _mm256_sub_epi32((x),(y))
#define SAR(x,y)    _mm256_srai_epi32((x),(y))
#else
#define VREG        __m128i
#define LOAD_CST(x) _mm_set1_epi32(x)
#define LOAD(x)     _mm_load_si128((const VREG*)(x))
#define LOADU(x)    _mm_loadu_si128((const VREG*)(x))
#define STORE(x,y)  _mm_store_si128((VREG*)(x),(y))
#define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
#define ADD(x,y)    _mm_add_epi32((x),(y))
#define SUB(x,y)    _mm_sub_epi32((x),(y))
#define SAR(x,y)    _mm_srai_epi32((x),(y))
#endif
#define ADD3(x,y,z) ADD(ADD(x,y),z)
618
619	static
620	void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
621	const OPJ_INT32* tmp,
622	OPJ_INT32 len,
623	OPJ_SIZE_T stride)
624	{
625	OPJ_INT32 i;
626	for (i = `0`; i < len; ++i) {
627	/ A memcpy(&tiledp_col[i * stride + 0],*
628	&tmp[PARALLEL_COLS_53 i + 0],*
629	PARALLEL_COLS_53 sizeof(OPJ_INT32))*
630	would do but would be a tiny bit slower.
631	We can take here advantage of our knowledge of alignment /*
632	STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + `0`],
633	LOAD(&tmp[PARALLEL_COLS_53 * i + `0`]));
634	STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
635	LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
636	}
637	}
638
639	/* Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or*
640	* 16 in AVX2, when top-most pixel is on even coordinate */
641	static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
642	OPJ_INT32* tmp,
643	const OPJ_INT32 sn,
644	const OPJ_INT32 len,
645	OPJ_INT32* tiledp_col,
646	const OPJ_SIZE_T stride)
647	{
648	const OPJ_INT32* in_even = &tiledp_col[`0`];
649	const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
650
651	OPJ_INT32 i;
652	OPJ_SIZE_T j;
653	VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
654	VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
655	const VREG two = LOAD_CST(`2`);
656
657	assert(len > `1`);
658	#if __AVX2__
659	assert(PARALLEL_COLS_53 == `16`);
660	assert(VREG_INT_COUNT == `8`);
661	#else
662	assert(PARALLEL_COLS_53 == `8`);
663	assert(VREG_INT_COUNT == `4`);
664	#endif
665
666	/ Note: loads of input even/odd values must be done in a unaligned /
667	/ fashion. But stores in tmp can be done with aligned store, since /
668	/ the temporary buffer is properly aligned /
669	assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == `0`);
670
671	s1n_0 = LOADU(in_even + `0`);
672	s1n_1 = LOADU(in_even + VREG_INT_COUNT);
673	d1n_0 = LOADU(in_odd);
674	d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
675
676	/ s0n = s1n - ((d1n + 1) >> 1); <==> /
677	/ s0n = s1n - ((d1n + d1n + 2) >> 2); /
678	s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), `2`));
679	s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), `2`));
680
681	for (i = `0`, j = `1`; i < (len - `3`); i += `2`, j++) {
682	d1c_0 = d1n_0;
683	s0c_0 = s0n_0;
684	d1c_1 = d1n_1;
685	s0c_1 = s0n_1;
686
687	s1n_0 = LOADU(in_even + j * stride);
688	s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
689	d1n_0 = LOADU(in_odd + j * stride);
690	d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
691
692	/s0n = s1n - ((d1c + d1n + 2) >> 2);/
693	s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), `2`));
694	s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), `2`));
695
696	STORE(tmp + PARALLEL_COLS_53 * (i + `0`), s0c_0);
697	STORE(tmp + PARALLEL_COLS_53 * (i + `0`) + VREG_INT_COUNT, s0c_1);
698
699	/ d1c + ((s0c + s0n) >> 1) /
700	STORE(tmp + PARALLEL_COLS_53 * (i + `1`) + `0`,
701	ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), `1`)));
702	STORE(tmp + PARALLEL_COLS_53 * (i + `1`) + VREG_INT_COUNT,
703	ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), `1`)));
704	}
705
706	STORE(tmp + PARALLEL_COLS_53 * (i + `0`) + `0`, s0n_0);
707	STORE(tmp + PARALLEL_COLS_53 * (i + `0`) + VREG_INT_COUNT, s0n_1);
708
709	if (len & `1`) {
710	VREG tmp_len_minus_1;
711	s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - `1`) / `2`) * stride);
712	/ tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); /
713	tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), `2`));
714	STORE(tmp + PARALLEL_COLS_53 * (len - `1`), tmp_len_minus_1);
715	/ d1n + ((s0n + tmp_len_minus_1) >> 1) /
716	STORE(tmp + PARALLEL_COLS_53 * (len - `2`),
717	ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), `1`)));
718
719	s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - `1`) / `2`) * stride + VREG_INT_COUNT);
720	/ tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); /
721	tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), `2`));
722	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + VREG_INT_COUNT,
723	tmp_len_minus_1);
724	/ d1n + ((s0n + tmp_len_minus_1) >> 1) /
725	STORE(tmp + PARALLEL_COLS_53 * (len - `2`) + VREG_INT_COUNT,
726	ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), `1`)));
727
728
729	} else {
730	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + `0`,
731	ADD(d1n_0, s0n_0));
732	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + VREG_INT_COUNT,
733	ADD(d1n_1, s0n_1));
734	}
735
736	opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
737	}
738
739
740	/* Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or*
741	* 16 in AVX2, when top-most pixel is on odd coordinate */
742	static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
743	OPJ_INT32* tmp,
744	const OPJ_INT32 sn,
745	const OPJ_INT32 len,
746	OPJ_INT32* tiledp_col,
747	const OPJ_SIZE_T stride)
748	{
749	OPJ_INT32 i;
750	OPJ_SIZE_T j;
751
752	VREG s1_0, s2_0, dc_0, dn_0;
753	VREG s1_1, s2_1, dc_1, dn_1;
754	const VREG two = LOAD_CST(`2`);
755
756	const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
757	const OPJ_INT32* in_odd = &tiledp_col[`0`];
758
759	assert(len > `2`);
760	#if __AVX2__
761	assert(PARALLEL_COLS_53 == `16`);
762	assert(VREG_INT_COUNT == `8`);
763	#else
764	assert(PARALLEL_COLS_53 == `8`);
765	assert(VREG_INT_COUNT == `4`);
766	#endif
767
768	/ Note: loads of input even/odd values must be done in a unaligned /
769	/ fashion. But stores in tmp can be done with aligned store, since /
770	/ the temporary buffer is properly aligned /
771	assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == `0`);
772
773	s1_0 = LOADU(in_even + stride);
774	/ in_odd[0] - ((in_even[0] + s1 + 2) >> 2); /
775	dc_0 = SUB(LOADU(in_odd + `0`),
776	SAR(ADD3(LOADU(in_even + `0`), s1_0, two), `2`));
777	STORE(tmp + PARALLEL_COLS_53 * `0`, ADD(LOADU(in_even + `0`), dc_0));
778
779	s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
780	/ in_odd[0] - ((in_even[0] + s1 + 2) >> 2); /
781	dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
782	SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), `2`));
783	STORE(tmp + PARALLEL_COLS_53 * `0` + VREG_INT_COUNT,
784	ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
785
786	for (i = `1`, j = `1`; i < (len - `2` - !(len & `1`)); i += `2`, j++) {
787
788	s2_0 = LOADU(in_even + (j + `1`) * stride);
789	s2_1 = LOADU(in_even + (j + `1`) * stride + VREG_INT_COUNT);
790
791	/ dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); /
792	dn_0 = SUB(LOADU(in_odd + j * stride),
793	SAR(ADD3(s1_0, s2_0, two), `2`));
794	dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
795	SAR(ADD3(s1_1, s2_1, two), `2`));
796
797	STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
798	STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
799
800	/ tmp[i + 1] = s1 + ((dn + dc) >> 1); /
801	STORE(tmp + PARALLEL_COLS_53 * (i + `1`) + `0`,
802	ADD(s1_0, SAR(ADD(dn_0, dc_0), `1`)));
803	STORE(tmp + PARALLEL_COLS_53 * (i + `1`) + VREG_INT_COUNT,
804	ADD(s1_1, SAR(ADD(dn_1, dc_1), `1`)));
805
806	dc_0 = dn_0;
807	s1_0 = s2_0;
808	dc_1 = dn_1;
809	s1_1 = s2_1;
810	}
811	STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
812	STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
813
814	if (!(len & `1`)) {
815	/dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); /
816	dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / `2` - `1`) * stride),
817	SAR(ADD3(s1_0, s1_0, two), `2`));
818	dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / `2` - `1`) * stride + VREG_INT_COUNT),
819	SAR(ADD3(s1_1, s1_1, two), `2`));
820
821	/ tmp[len - 2] = s1 + ((dn + dc) >> 1); /
822	STORE(tmp + PARALLEL_COLS_53 * (len - `2`) + `0`,
823	ADD(s1_0, SAR(ADD(dn_0, dc_0), `1`)));
824	STORE(tmp + PARALLEL_COLS_53 * (len - `2`) + VREG_INT_COUNT,
825	ADD(s1_1, SAR(ADD(dn_1, dc_1), `1`)));
826
827	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + `0`, dn_0);
828	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + VREG_INT_COUNT, dn_1);
829	} else {
830	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + `0`, ADD(s1_0, dc_0));
831	STORE(tmp + PARALLEL_COLS_53 * (len - `1`) + VREG_INT_COUNT,
832	ADD(s1_1, dc_1));
833	}
834
835	opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
836	}
837
838	#undef VREG
839	#undef LOAD_CST
840	#undef LOADU
841	#undef LOAD
842	#undef STORE
843	#undef STOREU
844	#undef ADD
845	#undef ADD3
846	#undef SUB
847	#undef SAR
848
849	#endif /* (defined(__SSE2__) \|\| defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
850
851	#if !defined(STANDARD_SLOW_VERSION)
852	/* Vertical inverse 5x3 wavelet transform for one column, when top-most*
853	* pixel is on even coordinate */
854	static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
855	const OPJ_INT32 sn,
856	const OPJ_INT32 len,
857	OPJ_INT32* tiledp_col,
858	const OPJ_SIZE_T stride)
859	{
860	OPJ_INT32 i, j;
861	OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
862
863	assert(len > `1`);
864
865	/ Performs lifting in one single iteration. Saves memory /
866	/ accesses and explicit interleaving. /
867
868	s1n = tiledp_col[`0`];
869	d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
870	s0n = s1n - ((d1n + `1`) >> `1`);
871
872	for (i = `0`, j = `0`; i < (len - `3`); i += `2`, j++) {
873	d1c = d1n;
874	s0c = s0n;
875
876	s1n = tiledp_col[(OPJ_SIZE_T)(j + `1`) * stride];
877	d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + `1`) * stride];
878
879	s0n = s1n - ((d1c + d1n + `2`) >> `2`);
880
881	tmp[i ] = s0c;
882	tmp[i + `1`] = d1c + ((s0c + s0n) >> `1`);
883	}
884
885	tmp[i] = s0n;
886
887	if (len & `1`) {
888	tmp[len - `1`] =
889	tiledp_col[(OPJ_SIZE_T)((len - `1`) / `2`) * stride] -
890	((d1n + `1`) >> `1`);
891	tmp[len - `2`] = d1n + ((s0n + tmp[len - `1`]) >> `1`);
892	} else {
893	tmp[len - `1`] = d1n + s0n;
894	}
895
896	for (i = `0`; i < len; ++i) {
897	tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
898	}
899	}
900
901
902	/* Vertical inverse 5x3 wavelet transform for one column, when top-most*
903	* pixel is on odd coordinate */
904	static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
905	const OPJ_INT32 sn,
906	const OPJ_INT32 len,
907	OPJ_INT32* tiledp_col,
908	const OPJ_SIZE_T stride)
909	{
910	OPJ_INT32 i, j;
911	OPJ_INT32 s1, s2, dc, dn;
912	const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
913	const OPJ_INT32* in_odd = &tiledp_col[`0`];
914
915	assert(len > `2`);
916
917	/ Performs lifting in one single iteration. Saves memory /
918	/ accesses and explicit interleaving. /
919
920	s1 = in_even[stride];
921	dc = in_odd[`0`] - ((in_even[`0`] + s1 + `2`) >> `2`);
922	tmp[`0`] = in_even[`0`] + dc;
923	for (i = `1`, j = `1`; i < (len - `2` - !(len & `1`)); i += `2`, j++) {
924
925	s2 = in_even[(OPJ_SIZE_T)(j + `1`) * stride];
926
927	dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + `2`) >> `2`);
928	tmp[i ] = dc;
929	tmp[i + `1`] = s1 + ((dn + dc) >> `1`);
930
931	dc = dn;
932	s1 = s2;
933	}
934	tmp[i] = dc;
935	if (!(len & `1`)) {
936	dn = in_odd[(OPJ_SIZE_T)(len / `2` - `1`) * stride] - ((s1 + `1`) >> `1`);
937	tmp[len - `2`] = s1 + ((dn + dc) >> `1`);
938	tmp[len - `1`] = dn;
939	} else {
940	tmp[len - `1`] = s1 + dc;
941	}
942
943	for (i = `0`; i < len; ++i) {
944	tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
945	}
946	}
947	#endif /* !defined(STANDARD_SLOW_VERSION) */
948
949	/ <summary> /
950	/ Inverse vertical 5-3 wavelet transform in 1-D for several columns. /
951	/ </summary> /
952	/ Performs interleave, inverse wavelet transform and copy back to buffer /
953	static void opj_idwt53_v(const opj_dwt_t *dwt,
954	OPJ_INT32* tiledp_col,
955	OPJ_SIZE_T stride,
956	OPJ_INT32 nb_cols)
957	{
958	#ifdef STANDARD_SLOW_VERSION
959	/ For documentation purpose /
960	OPJ_INT32 k, c;
961	for (c = `0`; c < nb_cols; c ++) {
962	opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
963	opj_dwt_decode_1(dwt);
964	for (k = `0`; k < dwt->sn + dwt->dn; ++k) {
965	tiledp_col[c + k * stride] = dwt->mem[k];
966	}
967	}
968	#else
969	const OPJ_INT32 sn = dwt->sn;
970	const OPJ_INT32 len = sn + dwt->dn;
971	if (dwt->cas == `0`) {
972	/ If len == 1, unmodified value /
973
974	#if (defined(__SSE2__) \|\| defined(__AVX2__))
975	if (len > `1` && nb_cols == PARALLEL_COLS_53) {
976	/ Same as below general case, except that thanks to SSE2/AVX2 /
977	/ we can efficently process 8/16 columns in parallel /
978	opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
979	return;
980	}
981	#endif
982	if (len > `1`) {
983	OPJ_INT32 c;
984	for (c = `0`; c < nb_cols; c++, tiledp_col++) {
985	opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
986	}
987	return;
988	}
989	} else {
990	if (len == `1`) {
991	OPJ_INT32 c;
992	for (c = `0`; c < nb_cols; c++, tiledp_col++) {
993	tiledp_col[`0`] /= `2`;
994	}
995	return;
996	}
997
998	if (len == `2`) {
999	OPJ_INT32 c;
1000	OPJ_INT32* out = dwt->mem;
1001	for (c = `0`; c < nb_cols; c++, tiledp_col++) {
1002	OPJ_INT32 i;
1003	const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
1004	const OPJ_INT32* in_odd = &tiledp_col[`0`];
1005
1006	out[`1`] = in_odd[`0`] - ((in_even[`0`] + `1`) >> `1`);
1007	out[`0`] = in_even[`0`] + out[`1`];
1008
1009	for (i = `0`; i < len; ++i) {
1010	tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
1011	}
1012	}
1013
1014	return;
1015	}
1016
1017	#if (defined(__SSE2__) \|\| defined(__AVX2__))
1018	if (len > `2` && nb_cols == PARALLEL_COLS_53) {
1019	/ Same as below general case, except that thanks to SSE2/AVX2 /
1020	/ we can efficently process 8/16 columns in parallel /
1021	opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1022	return;
1023	}
1024	#endif
1025	if (len > `2`) {
1026	OPJ_INT32 c;
1027	for (c = `0`; c < nb_cols; c++, tiledp_col++) {
1028	opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
1029	}
1030	return;
1031	}
1032	}
1033	#endif
1034	}
1035
1036
1037	/ <summary> /
1038	/ Forward 9-7 wavelet transform in 1-D. /
1039	/ </summary> /
1040	static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
1041	OPJ_INT32 cas)
1042	{
1043	OPJ_INT32 i;
1044	if (!cas) {
1045	if ((dn > `0`) \|\| (sn > `1`)) { / NEW : CASE ONE ELEMENT /
1046	for (i = `0`; i < dn; i++) {
1047	OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + `1`), `12993`);
1048	}
1049	for (i = `0`; i < sn; i++) {
1050	OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - `1`) + OPJ_D_(i), `434`);
1051	}
1052	for (i = `0`; i < dn; i++) {
1053	OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + `1`), `7233`);
1054	}
1055	for (i = `0`; i < sn; i++) {
1056	OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - `1`) + OPJ_D_(i), `3633`);
1057	}
1058	for (i = `0`; i < dn; i++) {
1059	OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), `5038`); /5038 /
1060	}
1061	for (i = `0`; i < sn; i++) {
1062	OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), `6659`); /6660 /
1063	}
1064	}
1065	} else {
1066	if ((sn > `0`) \|\| (dn > `1`)) { / NEW : CASE ONE ELEMENT /
1067	for (i = `0`; i < dn; i++) {
1068	OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - `1`), `12993`);
1069	}
1070	for (i = `0`; i < sn; i++) {
1071	OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + `1`), `434`);
1072	}
1073	for (i = `0`; i < dn; i++) {
1074	OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - `1`), `7233`);
1075	}
1076	for (i = `0`; i < sn; i++) {
1077	OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + `1`), `3633`);
1078	}
1079	for (i = `0`; i < dn; i++) {
1080	OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), `5038`); /5038 /
1081	}
1082	for (i = `0`; i < sn; i++) {
1083	OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), `6659`); /6660 /
1084	}
1085	}
1086	}
1087	}
1088
1089	static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1090	opj_stepsize_t *bandno_stepsize)
1091	{
1092	OPJ_INT32 p, n;
1093	p = opj_int_floorlog2(stepsize) - `13`;
1094	n = `11` - opj_int_floorlog2(stepsize);
1095	bandno_stepsize->mant = (n < `0` ? stepsize >> -n : stepsize << n) & `0x7ff`;
1096	bandno_stepsize->expn = numbps - p;
1097	}
1098
1099	/*
1100	==========================================================
1101	DWT interface
1102	==========================================================
1103	*/
1104
1105
1106	/ <summary> /
1107	/ Forward 5-3 wavelet transform in 2-D. /
1108	/ </summary> /
1109	static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
1110	void (p_function)(OPJ_INT32 , OPJ_INT32, OPJ_INT32, OPJ_INT32))
1111	{
1112	OPJ_INT32 i, j, k;
1113	OPJ_INT32 *a = `00`;
1114	OPJ_INT32 *aj = `00`;
1115	OPJ_INT32 *bj = `00`;
1116	OPJ_INT32 w, l;
1117
1118	OPJ_INT32 rw; / width of the resolution level computed /
1119	OPJ_INT32 rh; / height of the resolution level computed /
1120	OPJ_SIZE_T l_data_size;
1121
1122	opj_tcd_resolution_t * l_cur_res = `0`;
1123	opj_tcd_resolution_t * l_last_res = `0`;
1124
1125	w = tilec->x1 - tilec->x0;
1126	l = (OPJ_INT32)tilec->numresolutions - `1`;
1127	a = tilec->data;
1128
1129	l_cur_res = tilec->resolutions + l;
1130	l_last_res = l_cur_res - `1`;
1131
1132	l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1133	/ overflow check /
1134	if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
1135	/ FIXME event manager error callback /
1136	return OPJ_FALSE;
1137	}
1138	l_data_size = sizeof*(OPJ_INT32);
1139	bj = (OPJ_INT32*)opj_malloc(l_data_size);
1140	/ l_data_size is equal to 0 when numresolutions == 1 but bj is not used /
1141	/ in that case, so do not error out /
1142	if (l_data_size != `0` && ! bj) {
1143	return OPJ_FALSE;
1144	}
1145	i = l;
1146
1147	while (i--) {
1148	OPJ_INT32 rw1; / width of the resolution level once lower than computed one /
1149	OPJ_INT32 rh1; / height of the resolution level once lower than computed one /
1150	OPJ_INT32 cas_col; / 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering /
1151	OPJ_INT32 cas_row; / 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering /
1152	OPJ_INT32 dn, sn;
1153
1154	rw = l_cur_res->x1 - l_cur_res->x0;
1155	rh = l_cur_res->y1 - l_cur_res->y0;
1156	rw1 = l_last_res->x1 - l_last_res->x0;
1157	rh1 = l_last_res->y1 - l_last_res->y0;
1158
1159	cas_row = l_cur_res->x0 & `1`;
1160	cas_col = l_cur_res->y0 & `1`;
1161
1162	sn = rh1;
1163	dn = rh - rh1;
1164	for (j = `0`; j < rw; ++j) {
1165	aj = a + j;
1166	for (k = `0`; k < rh; ++k) {
1167	bj[k] = aj[k * w];
1168	}
1169
1170	(*p_function)(bj, dn, sn, cas_col);
1171
1172	opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
1173	}
1174
1175	sn = rw1;
1176	dn = rw - rw1;
1177
1178	for (j = `0`; j < rh; j++) {
1179	aj = a + j * w;
1180	for (k = `0`; k < rw; k++) {
1181	bj[k] = aj[k];
1182	}
1183	(*p_function)(bj, dn, sn, cas_row);
1184	opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
1185	}
1186
1187	l_cur_res = l_last_res;
1188
1189	--l_last_res;
1190	}
1191
1192	opj_free(bj);
1193	return OPJ_TRUE;
1194	}
1195
1196	/ Forward 5-3 wavelet transform in 2-D. /
1197	/ </summary> /
1198	OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
1199	{
1200	return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
1201	}
1202
1203	/ <summary> /
1204	/ Inverse 5-3 wavelet transform in 2-D. /
1205	/ </summary> /
1206	OPJ_BOOL opj_dwt_decode(opj_tcd_t p_tcd, opj_tcd_tilecomp_t tilec,
1207	OPJ_UINT32 numres)
1208	{
1209	if (p_tcd->whole_tile_decoding) {
1210	return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1211	} else {
1212	return opj_dwt_decode_partial_tile(tilec, numres);
1213	}
1214	}
1215
1216
1217	/ <summary> /
1218	/ Get gain of 5-3 wavelet transform. /
1219	/ </summary> /
1220	OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient)
1221	{
1222	if (orient == `0`) {
1223	return `0`;
1224	}
1225	if (orient == `1` \|\| orient == `2`) {
1226	return `1`;
1227	}
1228	return `2`;
1229	}
1230
1231	/ <summary> /
1232	/ Get norm of 5-3 wavelet. /
1233	/ </summary> /
1234	OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1235	{
1236	/ FIXME ! This is just a band-aid to avoid a buffer overflow /
1237	/ but the array should really be extended up to 33 resolution levels /
1238	/ See https://github.com/uclouvain/openjpeg/issues/493 /
1239	if (orient == `0` && level >= `10`) {
1240	level = `9`;
1241	} else if (orient > `0` && level >= `9`) {
1242	level = `8`;
1243	}
1244	return opj_dwt_norms[orient][level];
1245	}
1246
1247	/ <summary> /
1248	/ Forward 9-7 wavelet transform in 2-D. /
1249	/ </summary> /
1250	OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
1251	{
1252	return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
1253	}
1254
1255	/ <summary> /
1256	/ Get gain of 9-7 wavelet transform. /
1257	/ </summary> /
1258	OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient)
1259	{
1260	(void)orient;
1261	return `0`;
1262	}
1263
1264	/ <summary> /
1265	/ Get norm of 9-7 wavelet. /
1266	/ </summary> /
1267	OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1268	{
1269	/ FIXME ! This is just a band-aid to avoid a buffer overflow /
1270	/ but the array should really be extended up to 33 resolution levels /
1271	/ See https://github.com/uclouvain/openjpeg/issues/493 /
1272	if (orient == `0` && level >= `10`) {
1273	level = `9`;
1274	} else if (orient > `0` && level >= `9`) {
1275	level = `8`;
1276	}
1277	return opj_dwt_norms_real[orient][level];
1278	}
1279
1280	void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1281	{
1282	OPJ_UINT32 numbands, bandno;
1283	numbands = `3` * tccp->numresolutions - `2`;
1284	for (bandno = `0`; bandno < numbands; bandno++) {
1285	OPJ_FLOAT64 stepsize;
1286	OPJ_UINT32 resno, level, orient, gain;
1287
1288	resno = (bandno == `0`) ? `0` : ((bandno - `1`) / `3` + `1`);
1289	orient = (bandno == `0`) ? `0` : ((bandno - `1`) % `3` + `1`);
1290	level = tccp->numresolutions - `1` - resno;
1291	gain = (tccp->qmfbid == `0`) ? `0` : ((orient == `0`) ? `0` : (((orient == `1`) \|\|
1292	(orient == `2`)) ? `1` : `2`));
1293	if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1294	stepsize = `1.0`;
1295	} else {
1296	OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
1297	stepsize = (`1` << (gain)) / norm;
1298	}
1299	opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * `8192.0`),
1300	(OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1301	}
1302	}
1303
1304	/ <summary> /
1305	/ Determine maximum computed resolution level for inverse wavelet transform /
1306	/ </summary> /
1307	static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1308	OPJ_UINT32 i)
1309	{
1310	OPJ_UINT32 mr = `0`;
1311	OPJ_UINT32 w;
1312	while (--i) {
1313	++r;
1314	if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1315	mr = w ;
1316	}
1317	if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
1318	mr = w ;
1319	}
1320	}
1321	return mr ;
1322	}
1323
1324	typedef struct {
1325	opj_dwt_t h;
1326	OPJ_UINT32 rw;
1327	OPJ_UINT32 w;
1328	OPJ_INT32 * OPJ_RESTRICT tiledp;
1329	OPJ_UINT32 min_j;
1330	OPJ_UINT32 max_j;
1331	} opj_dwd_decode_h_job_t;
1332
1333	static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
1334	{
1335	OPJ_UINT32 j;
1336	opj_dwd_decode_h_job_t* job;
1337	(void)tls;
1338
1339	job = (opj_dwd_decode_h_job_t*)user_data;
1340	for (j = job->min_j; j < job->max_j; j++) {
1341	opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
1342	}
1343
1344	opj_aligned_free(job->h.mem);
1345	opj_free(job);
1346	}
1347
1348	typedef struct {
1349	opj_dwt_t v;
1350	OPJ_UINT32 rh;
1351	OPJ_UINT32 w;
1352	OPJ_INT32 * OPJ_RESTRICT tiledp;
1353	OPJ_UINT32 min_j;
1354	OPJ_UINT32 max_j;
1355	} opj_dwd_decode_v_job_t;
1356
1357	static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
1358	{
1359	OPJ_UINT32 j;
1360	opj_dwd_decode_v_job_t* job;
1361	(void)tls;
1362
1363	job = (opj_dwd_decode_v_job_t*)user_data;
1364	for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
1365	j += PARALLEL_COLS_53) {
1366	opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1367	PARALLEL_COLS_53);
1368	}
1369	if (j < job->max_j)
1370	opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1371	(OPJ_INT32)(job->max_j - j));
1372
1373	opj_aligned_free(job->v.mem);
1374	opj_free(job);
1375	}
1376
1377
1378	/ <summary> /
1379	/ Inverse wavelet transform in 2-D. /
1380	/ </summary> /
1381	static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
1382	opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
1383	{
1384	opj_dwt_t h;
1385	opj_dwt_t v;
1386
1387	opj_tcd_resolution_t* tr = tilec->resolutions;
1388
1389	OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1390	tr->x0); / width of the resolution level computed /
1391	OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1392	tr->y0); / height of the resolution level computed /
1393
1394	OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
1395	`1`].x1 -
1396	tilec->resolutions[tilec->minimum_num_resolutions - `1`].x0);
1397	OPJ_SIZE_T h_mem_size;
1398	int num_threads;
1399
1400	if (numres == `1U`) {
1401	return OPJ_TRUE;
1402	}
1403	num_threads = opj_thread_pool_get_thread_count(tp);
1404	h_mem_size = opj_dwt_max_resolution(tr, numres);
1405	/ overflow check /
1406	if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
1407	/ FIXME event manager error callback /
1408	return OPJ_FALSE;
1409	}
1410	/ We need PARALLEL_COLS_53 times the height of the array, /
1411	/ since for the vertical pass /
1412	/ we process PARALLEL_COLS_53 columns at a time /
1413	h_mem_size = PARALLEL_COLS_53 sizeof(OPJ_INT32);
1414	h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1415	if (! h.mem) {
1416	/ FIXME event manager error callback /
1417	return OPJ_FALSE;
1418	}
1419
1420	v.mem = h.mem;
1421
1422	while (--numres) {
1423	OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1424	OPJ_UINT32 j;
1425
1426	++tr;
1427	h.sn = (OPJ_INT32)rw;
1428	v.sn = (OPJ_INT32)rh;
1429
1430	rw = (OPJ_UINT32)(tr->x1 - tr->x0);
1431	rh = (OPJ_UINT32)(tr->y1 - tr->y0);
1432
1433	h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1434	h.cas = tr->x0 % `2`;
1435
1436	if (num_threads <= `1` \|\| rh <= `1`) {
1437	for (j = `0`; j < rh; ++j) {
1438	opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
1439	}
1440	} else {
1441	OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1442	OPJ_UINT32 step_j;
1443
1444	if (rh < num_jobs) {
1445	num_jobs = rh;
1446	}
1447	step_j = (rh / num_jobs);
1448
1449	for (j = `0`; j < num_jobs; j++) {
1450	opj_dwd_decode_h_job_t* job;
1451
1452	job = (opj_dwd_decode_h_job_t) opj_malloc(sizeof*(opj_dwd_decode_h_job_t));
1453	if (!job) {
1454	/ It would be nice to fallback to single thread case, but /
1455	/ unfortunately some jobs may be launched and have modified /
1456	/ tiledp, so it is not practical to recover from that error /
1457	/ FIXME event manager error callback /
1458	opj_thread_pool_wait_completion(tp, `0`);
1459	opj_aligned_free(h.mem);
1460	return OPJ_FALSE;
1461	}
1462	job->h = h;
1463	job->rw = rw;
1464	job->w = w;
1465	job->tiledp = tiledp;
1466	job->min_j = j * step_j;
1467	job->max_j = (j + `1U`) * step_j; / this can overflow /
1468	if (j == (num_jobs - `1U`)) { / this will take care of the overflow /
1469	job->max_j = rh;
1470	}
1471	job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1472	if (!job->h.mem) {
1473	/ FIXME event manager error callback /
1474	opj_thread_pool_wait_completion(tp, `0`);
1475	opj_free(job);
1476	opj_aligned_free(h.mem);
1477	return OPJ_FALSE;
1478	}
1479	opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
1480	}
1481	opj_thread_pool_wait_completion(tp, `0`);
1482	}
1483
1484	v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1485	v.cas = tr->y0 % `2`;
1486
1487	if (num_threads <= `1` \|\| rw <= `1`) {
1488	for (j = `0`; j + PARALLEL_COLS_53 <= rw;
1489	j += PARALLEL_COLS_53) {
1490	opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
1491	}
1492	if (j < rw) {
1493	opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
1494	}
1495	} else {
1496	OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1497	OPJ_UINT32 step_j;
1498
1499	if (rw < num_jobs) {
1500	num_jobs = rw;
1501	}
1502	step_j = (rw / num_jobs);
1503
1504	for (j = `0`; j < num_jobs; j++) {
1505	opj_dwd_decode_v_job_t* job;
1506
1507	job = (opj_dwd_decode_v_job_t) opj_malloc(sizeof*(opj_dwd_decode_v_job_t));
1508	if (!job) {
1509	/ It would be nice to fallback to single thread case, but /
1510	/ unfortunately some jobs may be launched and have modified /
1511	/ tiledp, so it is not practical to recover from that error /
1512	/ FIXME event manager error callback /
1513	opj_thread_pool_wait_completion(tp, `0`);
1514	opj_aligned_free(v.mem);
1515	return OPJ_FALSE;
1516	}
1517	job->v = v;
1518	job->rh = rh;
1519	job->w = w;
1520	job->tiledp = tiledp;
1521	job->min_j = j * step_j;
1522	job->max_j = (j + `1U`) * step_j; / this can overflow /
1523	if (j == (num_jobs - `1U`)) { / this will take care of the overflow /
1524	job->max_j = rw;
1525	}
1526	job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1527	if (!job->v.mem) {
1528	/ FIXME event manager error callback /
1529	opj_thread_pool_wait_completion(tp, `0`);
1530	opj_free(job);
1531	opj_aligned_free(v.mem);
1532	return OPJ_FALSE;
1533	}
1534	opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
1535	}
1536	opj_thread_pool_wait_completion(tp, `0`);
1537	}
1538	}
1539	opj_aligned_free(h.mem);
1540	return OPJ_TRUE;
1541	}
1542
1543	static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
1544	OPJ_INT32 cas,
1545	opj_sparse_array_int32_t* sa,
1546	OPJ_UINT32 sa_line,
1547	OPJ_UINT32 sn,
1548	OPJ_UINT32 win_l_x0,
1549	OPJ_UINT32 win_l_x1,
1550	OPJ_UINT32 win_h_x0,
1551	OPJ_UINT32 win_h_x1)
1552	{
1553	OPJ_BOOL ret;
1554	ret = opj_sparse_array_int32_read(sa,
1555	win_l_x0, sa_line,
1556	win_l_x1, sa_line + `1`,
1557	dest + cas + `2` * win_l_x0,
1558	`2`, `0`, OPJ_TRUE);
1559	assert(ret);
1560	ret = opj_sparse_array_int32_read(sa,
1561	sn + win_h_x0, sa_line,
1562	sn + win_h_x1, sa_line + `1`,
1563	dest + `1` - cas + `2` * win_h_x0,
1564	`2`, `0`, OPJ_TRUE);
1565	assert(ret);
1566	OPJ_UNUSED(ret);
1567	}
1568
1569
1570	static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
1571	OPJ_INT32 cas,
1572	opj_sparse_array_int32_t* sa,
1573	OPJ_UINT32 sa_col,
1574	OPJ_UINT32 nb_cols,
1575	OPJ_UINT32 sn,
1576	OPJ_UINT32 win_l_y0,
1577	OPJ_UINT32 win_l_y1,
1578	OPJ_UINT32 win_h_y0,
1579	OPJ_UINT32 win_h_y1)
1580	{
1581	OPJ_BOOL ret;
1582	ret = opj_sparse_array_int32_read(sa,
1583	sa_col, win_l_y0,
1584	sa_col + nb_cols, win_l_y1,
1585	dest + cas * `4` + `2` * `4` * win_l_y0,
1586	`1`, `2` * `4`, OPJ_TRUE);
1587	assert(ret);
1588	ret = opj_sparse_array_int32_read(sa,
1589	sa_col, sn + win_h_y0,
1590	sa_col + nb_cols, sn + win_h_y1,
1591	dest + (`1` - cas) * `4` + `2` * `4` * win_h_y0,
1592	`1`, `2` * `4`, OPJ_TRUE);
1593	assert(ret);
1594	OPJ_UNUSED(ret);
1595	}
1596
1597	static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
1598	OPJ_INT32 cas,
1599	OPJ_INT32 win_l_x0,
1600	OPJ_INT32 win_l_x1,
1601	OPJ_INT32 win_h_x0,
1602	OPJ_INT32 win_h_x1)
1603	{
1604	OPJ_INT32 i;
1605
1606	if (!cas) {
1607	if ((dn > `0`) \|\| (sn > `1`)) { / NEW : CASE ONE ELEMENT /
1608
1609	/ Naive version is :*
1610	for (i = win_l_x0; i < i_max; i++) {
1611	OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1612	}
1613	for (i = win_h_x0; i < win_h_x1; i++) {
1614	OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1615	}
1616	but the compiler doesn't manage to unroll it to avoid bound
1617	checking in OPJ_S_ and OPJ_D_ macros
1618	*/
1619
1620	i = win_l_x0;
1621	if (i < win_l_x1) {
1622	OPJ_INT32 i_max;
1623
1624	/ Left-most case /
1625	OPJ_S(i) -= (OPJ_D_(i - `1`) + OPJ_D_(i) + `2`) >> `2`;
1626	i ++;
1627
1628	i_max = win_l_x1;
1629	if (i_max > dn) {
1630	i_max = dn;
1631	}
1632	for (; i < i_max; i++) {
1633	/ No bound checking /
1634	OPJ_S(i) -= (OPJ_D(i - `1`) + OPJ_D(i) + `2`) >> `2`;
1635	}
1636	for (; i < win_l_x1; i++) {
1637	/ Right-most case /
1638	OPJ_S(i) -= (OPJ_D_(i - `1`) + OPJ_D_(i) + `2`) >> `2`;
1639	}
1640	}
1641
1642	i = win_h_x0;
1643	if (i < win_h_x1) {
1644	OPJ_INT32 i_max = win_h_x1;
1645	if (i_max >= sn) {
1646	i_max = sn - `1`;
1647	}
1648	for (; i < i_max; i++) {
1649	/ No bound checking /
1650	OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + `1`)) >> `1`;
1651	}
1652	for (; i < win_h_x1; i++) {
1653	/ Right-most case /
1654	OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + `1`)) >> `1`;
1655	}
1656	}
1657	}
1658	} else {
1659	if (!sn && dn == `1`) { / NEW : CASE ONE ELEMENT /
1660	OPJ_S(`0`) /= `2`;
1661	} else {
1662	for (i = win_l_x0; i < win_l_x1; i++) {
1663	OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + `1`) + `2`) >> `2`;
1664	}
1665	for (i = win_h_x0; i < win_h_x1; i++) {
1666	OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - `1`)) >> `1`;
1667	}
1668	}
1669	}
1670	}
1671
/* Accessors for a buffer `a` in which 4 columns are interleaved: sample i of */
/* the S family occupies a[i*8 .. i*8+3], sample i of the D family occupies */
/* a[i*8+4 .. i*8+7], and `off` (0..3) selects the column. The double- */
/* underscore variants clamp i to [0, sn-1] or [0, dn-1]. All of them expect */
/* `a`, `sn` and `dn` to be in scope at the point of use. */
#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+(off)]
#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+(off)]
#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
1678
1679	static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
1680	OPJ_UINT32 nb_cols,
1681	OPJ_INT32 dn, OPJ_INT32 sn,
1682	OPJ_INT32 cas,
1683	OPJ_INT32 win_l_x0,
1684	OPJ_INT32 win_l_x1,
1685	OPJ_INT32 win_h_x0,
1686	OPJ_INT32 win_h_x1)
1687	{
1688	OPJ_INT32 i;
1689	OPJ_UINT32 off;
1690
1691	(void)nb_cols;
1692
1693	if (!cas) {
1694	if ((dn > `0`) \|\| (sn > `1`)) { / NEW : CASE ONE ELEMENT /
1695
1696	/ Naive version is :*
1697	for (i = win_l_x0; i < i_max; i++) {
1698	OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1699	}
1700	for (i = win_h_x0; i < win_h_x1; i++) {
1701	OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1702	}
1703	but the compiler doesn't manage to unroll it to avoid bound
1704	checking in OPJ_S_ and OPJ_D_ macros
1705	*/
1706
1707	i = win_l_x0;
1708	if (i < win_l_x1) {
1709	OPJ_INT32 i_max;
1710
1711	/ Left-most case /
1712	for (off = `0`; off < `4`; off++) {
1713	OPJ_S_off(i, off) -= (OPJ_D__off(i - `1`, off) + OPJ_D__off(i, off) + `2`) >> `2`;
1714	}
1715	i ++;
1716
1717	i_max = win_l_x1;
1718	if (i_max > dn) {
1719	i_max = dn;
1720	}
1721
1722	#ifdef __SSE2__
1723	if (i + `1` < i_max) {
1724	const __m128i two = _mm_set1_epi32(`2`);
1725	__m128i Dm1 = _mm_load_si128((__m128i * const)(a + `4` + (i - `1`) * `8`));
1726	for (; i + `1` < i_max; i += `2`) {
1727	/ No bound checking /
1728	__m128i S = _mm_load_si128((__m128i * const)(a + i * `8`));
1729	__m128i D = _mm_load_si128((__m128i * const)(a + `4` + i * `8`));
1730	__m128i S1 = _mm_load_si128((__m128i * const)(a + (i + `1`) * `8`));
1731	__m128i D1 = _mm_load_si128((__m128i * const)(a + `4` + (i + `1`) * `8`));
1732	S = _mm_sub_epi32(S,
1733	_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), `2`));
1734	S1 = _mm_sub_epi32(S1,
1735	_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), `2`));
1736	_mm_store_si128((__m128i)(a + i `8`), S);
1737	_mm_store_si128((__m128i)(a + (i + `1`) `8`), S1);
1738	Dm1 = D1;
1739	}
1740	}
1741	#endif
1742
1743	for (; i < i_max; i++) {
1744	/ No bound checking /
1745	for (off = `0`; off < `4`; off++) {
1746	OPJ_S_off(i, off) -= (OPJ_D_off(i - `1`, off) + OPJ_D_off(i, off) + `2`) >> `2`;
1747	}
1748	}
1749	for (; i < win_l_x1; i++) {
1750	/ Right-most case /
1751	for (off = `0`; off < `4`; off++) {
1752	OPJ_S_off(i, off) -= (OPJ_D__off(i - `1`, off) + OPJ_D__off(i, off) + `2`) >> `2`;
1753	}
1754	}
1755	}
1756
1757	i = win_h_x0;
1758	if (i < win_h_x1) {
1759	OPJ_INT32 i_max = win_h_x1;
1760	if (i_max >= sn) {
1761	i_max = sn - `1`;
1762	}
1763
1764	#ifdef __SSE2__
1765	if (i + `1` < i_max) {
1766	__m128i S = _mm_load_si128((__m128i * const)(a + i * `8`));
1767	for (; i + `1` < i_max; i += `2`) {
1768	/ No bound checking /
1769	__m128i D = _mm_load_si128((__m128i * const)(a + `4` + i * `8`));
1770	__m128i S1 = _mm_load_si128((__m128i * const)(a + (i + `1`) * `8`));
1771	__m128i D1 = _mm_load_si128((__m128i * const)(a + `4` + (i + `1`) * `8`));
1772	__m128i S2 = _mm_load_si128((__m128i * const)(a + (i + `2`) * `8`));
1773	D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), `1`));
1774	D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), `1`));
1775	_mm_store_si128((__m128i)(a + `4` + i `8`), D);
1776	_mm_store_si128((__m128i)(a + `4` + (i + `1`) `8`), D1);
1777	S = S2;
1778	}
1779	}
1780	#endif
1781
1782	for (; i < i_max; i++) {
1783	/ No bound checking /
1784	for (off = `0`; off < `4`; off++) {
1785	OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + `1`, off)) >> `1`;
1786	}
1787	}
1788	for (; i < win_h_x1; i++) {
1789	/ Right-most case /
1790	for (off = `0`; off < `4`; off++) {
1791	OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + `1`, off)) >> `1`;
1792	}
1793	}
1794	}
1795	}
1796	} else {
1797	if (!sn && dn == `1`) { / NEW : CASE ONE ELEMENT /
1798	for (off = `0`; off < `4`; off++) {
1799	OPJ_S_off(`0`, off) /= `2`;
1800	}
1801	} else {
1802	for (i = win_l_x0; i < win_l_x1; i++) {
1803	for (off = `0`; off < `4`; off++) {
1804	OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + `1`, off) + `2`) >> `2`;
1805	}
1806	}
1807	for (i = win_h_x0; i < win_h_x1; i++) {
1808	for (off = `0`; off < `4`; off++) {
1809	OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - `1`, off)) >> `1`;
1810	}
1811	}
1812	}
1813	}
1814	}
1815
1816	static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
1817	OPJ_UINT32 resno,
1818	OPJ_UINT32 bandno,
1819	OPJ_UINT32 tcx0,
1820	OPJ_UINT32 tcy0,
1821	OPJ_UINT32 tcx1,
1822	OPJ_UINT32 tcy1,
1823	OPJ_UINT32* tbx0,
1824	OPJ_UINT32* tby0,
1825	OPJ_UINT32* tbx1,
1826	OPJ_UINT32* tby1)
1827	{
1828	/ Compute number of decomposition for this band. See table F-1 /
1829	OPJ_UINT32 nb = (resno == `0`) ?
1830	tilec->numresolutions - `1` :
1831	tilec->numresolutions - resno;
1832	/ Map above tile-based coordinates to sub-band-based coordinates per /
1833	/ equation B-15 of the standard /
1834	OPJ_UINT32 x0b = bandno & `1`;
1835	OPJ_UINT32 y0b = bandno >> `1`;
1836	if (tbx0) {
1837	*tbx0 = (nb == `0`) ? tcx0 :
1838	(tcx0 <= (`1U` << (nb - `1`)) * x0b) ? `0` :
1839	opj_uint_ceildivpow2(tcx0 - (`1U` << (nb - `1`)) * x0b, nb);
1840	}
1841	if (tby0) {
1842	*tby0 = (nb == `0`) ? tcy0 :
1843	(tcy0 <= (`1U` << (nb - `1`)) * y0b) ? `0` :
1844	opj_uint_ceildivpow2(tcy0 - (`1U` << (nb - `1`)) * y0b, nb);
1845	}
1846	if (tbx1) {
1847	*tbx1 = (nb == `0`) ? tcx1 :
1848	(tcx1 <= (`1U` << (nb - `1`)) * x0b) ? `0` :
1849	opj_uint_ceildivpow2(tcx1 - (`1U` << (nb - `1`)) * x0b, nb);
1850	}
1851	if (tby1) {
1852	*tby1 = (nb == `0`) ? tcy1 :
1853	(tcy1 <= (`1U` << (nb - `1`)) * y0b) ? `0` :
1854	opj_uint_ceildivpow2(tcy1 - (`1U` << (nb - `1`)) * y0b, nb);
1855	}
1856	}
1857
1858	static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
1859	OPJ_UINT32 max_size,
1860	OPJ_UINT32* start,
1861	OPJ_UINT32* end)
1862	{
1863	start = opj_uint_subs(start, filter_width);
1864	end = opj_uint_adds(end, filter_width);
1865	end = opj_uint_min(end, max_size);
1866	}
1867
1868
1869	static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
1870	opj_tcd_tilecomp_t* tilec,
1871	OPJ_UINT32 numres)
1872	{
1873	opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - `1`]);
1874	OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
1875	OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
1876	OPJ_UINT32 resno, bandno, precno, cblkno;
1877	opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
1878	w, h, opj_uint_min(w, `64`), opj_uint_min(h, `64`));
1879	if (sa == NULL) {
1880	return NULL;
1881	}
1882
1883	for (resno = `0`; resno < numres; ++resno) {
1884	opj_tcd_resolution_t* res = &tilec->resolutions[resno];
1885
1886	for (bandno = `0`; bandno < res->numbands; ++bandno) {
1887	opj_tcd_band_t* band = &res->bands[bandno];
1888
1889	for (precno = `0`; precno < res->pw * res->ph; ++precno) {
1890	opj_tcd_precinct_t* precinct = &band->precincts[precno];
1891	for (cblkno = `0`; cblkno < precinct->cw * precinct->ch; ++cblkno) {
1892	opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
1893	if (cblk->decoded_data != NULL) {
1894	OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
1895	OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
1896	OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
1897	OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
1898
1899	if (band->bandno & `1`) {
1900	opj_tcd_resolution_t* pres = &tilec->resolutions[resno - `1`];
1901	x += (OPJ_UINT32)(pres->x1 - pres->x0);
1902	}
1903	if (band->bandno & `2`) {
1904	opj_tcd_resolution_t* pres = &tilec->resolutions[resno - `1`];
1905	y += (OPJ_UINT32)(pres->y1 - pres->y0);
1906	}
1907
1908	if (!opj_sparse_array_int32_write(sa, x, y,
1909	x + cblk_w, y + cblk_h,
1910	cblk->decoded_data,
1911	`1`, cblk_w, OPJ_TRUE)) {
1912	opj_sparse_array_int32_free(sa);
1913	return NULL;
1914	}
1915	}
1916	}
1917	}
1918	}
1919	}
1920
1921	return sa;
1922	}
1923
1924
1925	static OPJ_BOOL opj_dwt_decode_partial_tile(
1926	opj_tcd_tilecomp_t* tilec,
1927	OPJ_UINT32 numres)
1928	{
1929	opj_sparse_array_int32_t* sa;
1930	opj_dwt_t h;
1931	opj_dwt_t v;
1932	OPJ_UINT32 resno;
1933	/ This value matches the maximum left/right extension given in tables /
1934	/ F.2 and F.3 of the standard. /
1935	const OPJ_UINT32 filter_width = `2U`;
1936
1937	opj_tcd_resolution_t* tr = tilec->resolutions;
1938	opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - `1`]);
1939
1940	OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1941	tr->x0); / width of the resolution level computed /
1942	OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1943	tr->y0); / height of the resolution level computed /
1944
1945	OPJ_SIZE_T h_mem_size;
1946
1947	/ Compute the intersection of the area of interest, expressed in tile coordinates /
1948	/ with the tile coordinates /
1949	OPJ_UINT32 win_tcx0 = tilec->win_x0;
1950	OPJ_UINT32 win_tcy0 = tilec->win_y0;
1951	OPJ_UINT32 win_tcx1 = tilec->win_x1;
1952	OPJ_UINT32 win_tcy1 = tilec->win_y1;
1953
1954	if (tr_max->x0 == tr_max->x1 \|\| tr_max->y0 == tr_max->y1) {
1955	return OPJ_TRUE;
1956	}
1957
1958	sa = opj_dwt_init_sparse_array(tilec, numres);
1959	if (sa == NULL) {
1960	return OPJ_FALSE;
1961	}
1962
1963	if (numres == `1U`) {
1964	OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
1965	tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
1966	tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
1967	tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
1968	tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
1969	tilec->data_win,
1970	`1`, tr_max->win_x1 - tr_max->win_x0,
1971	OPJ_TRUE);
1972	assert(ret);
1973	OPJ_UNUSED(ret);
1974	opj_sparse_array_int32_free(sa);
1975	return OPJ_TRUE;
1976	}
1977	h_mem_size = opj_dwt_max_resolution(tr, numres);
1978	/ overflow check /
1979	/ in vertical pass, we process 4 columns at a time /
1980	if (h_mem_size > (SIZE_MAX / (`4` * sizeof(OPJ_INT32)))) {
1981	/ FIXME event manager error callback /
1982	opj_sparse_array_int32_free(sa);
1983	return OPJ_FALSE;
1984	}
1985
1986	h_mem_size = `4` sizeof(OPJ_INT32);
1987	h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1988	if (! h.mem) {
1989	/ FIXME event manager error callback /
1990	opj_sparse_array_int32_free(sa);
1991	return OPJ_FALSE;
1992	}
1993
1994	v.mem = h.mem;
1995
1996	for (resno = `1`; resno < numres; resno ++) {
1997	OPJ_UINT32 i, j;
1998	/ Window of interest subband-based coordinates /
1999	OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2000	OPJ_UINT32 win_hl_x0, win_hl_x1;
2001	OPJ_UINT32 win_lh_y0, win_lh_y1;
2002	/ Window of interest tile-resolution-based coordinates /
2003	OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2004	/ Tile-resolution subband-based coordinates /
2005	OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2006
2007	++tr;
2008
2009	h.sn = (OPJ_INT32)rw;
2010	v.sn = (OPJ_INT32)rh;
2011
2012	rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2013	rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2014
2015	h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2016	h.cas = tr->x0 % `2`;
2017
2018	v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2019	v.cas = tr->y0 % `2`;
2020
2021	/ Get the subband coordinates for the window of interest /
2022	/ LL band /
2023	opj_dwt_get_band_coordinates(tilec, resno, `0`,
2024	win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2025	&win_ll_x0, &win_ll_y0,
2026	&win_ll_x1, &win_ll_y1);
2027
2028	/ HL band /
2029	opj_dwt_get_band_coordinates(tilec, resno, `1`,
2030	win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2031	&win_hl_x0, NULL, &win_hl_x1, NULL);
2032
2033	/ LH band /
2034	opj_dwt_get_band_coordinates(tilec, resno, `2`,
2035	win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2036	NULL, &win_lh_y0, NULL, &win_lh_y1);
2037
2038	/ Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH /
2039	tr_ll_x0 = (OPJ_UINT32)tr->bands[`1`].x0;
2040	tr_ll_y0 = (OPJ_UINT32)tr->bands[`0`].y0;
2041	tr_hl_x0 = (OPJ_UINT32)tr->bands[`0`].x0;
2042	tr_lh_y0 = (OPJ_UINT32)tr->bands[`1`].y0;
2043
2044	/ Substract the origin of the bands for this tile, to the subwindow /
2045	/ of interest band coordinates, so as to get them relative to the /
2046	/ tile /
2047	win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2048	win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2049	win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2050	win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2051	win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2052	win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2053	win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2054	win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2055
2056	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2057	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2058
2059	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2060	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2061
2062	/ Compute the tile-resolution-based coordinates for the window of interest /
2063	if (h.cas == `0`) {
2064	win_tr_x0 = opj_uint_min(`2` * win_ll_x0, `2` * win_hl_x0 + `1`);
2065	win_tr_x1 = opj_uint_min(opj_uint_max(`2` * win_ll_x1, `2` * win_hl_x1 + `1`), rw);
2066	} else {
2067	win_tr_x0 = opj_uint_min(`2` * win_hl_x0, `2` * win_ll_x0 + `1`);
2068	win_tr_x1 = opj_uint_min(opj_uint_max(`2` * win_hl_x1, `2` * win_ll_x1 + `1`), rw);
2069	}
2070
2071	if (v.cas == `0`) {
2072	win_tr_y0 = opj_uint_min(`2` * win_ll_y0, `2` * win_lh_y0 + `1`);
2073	win_tr_y1 = opj_uint_min(opj_uint_max(`2` * win_ll_y1, `2` * win_lh_y1 + `1`), rh);
2074	} else {
2075	win_tr_y0 = opj_uint_min(`2` * win_lh_y0, `2` * win_ll_y0 + `1`);
2076	win_tr_y1 = opj_uint_min(opj_uint_max(`2` * win_lh_y1, `2` * win_ll_y1 + `1`), rh);
2077	}
2078
2079	for (j = `0`; j < rh; ++j) {
2080	if ((j >= win_ll_y0 && j < win_ll_y1) \|\|
2081	(j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2082
2083	/ Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: /
2084	/ signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' /
2085	/ on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 /
2086	/ This is less extreme than memsetting the whole buffer to 0 /
2087	/ although we could potentially do better with better handling of edge conditions /
2088	if (win_tr_x1 >= `1` && win_tr_x1 < rw) {
2089	h.mem[win_tr_x1 - `1`] = `0`;
2090	}
2091	if (win_tr_x1 < rw) {
2092	h.mem[win_tr_x1] = `0`;
2093	}
2094
2095	opj_dwt_interleave_partial_h(h.mem,
2096	h.cas,
2097	sa,
2098	j,
2099	(OPJ_UINT32)h.sn,
2100	win_ll_x0,
2101	win_ll_x1,
2102	win_hl_x0,
2103	win_hl_x1);
2104	opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas,
2105	(OPJ_INT32)win_ll_x0,
2106	(OPJ_INT32)win_ll_x1,
2107	(OPJ_INT32)win_hl_x0,
2108	(OPJ_INT32)win_hl_x1);
2109	if (!opj_sparse_array_int32_write(sa,
2110	win_tr_x0, j,
2111	win_tr_x1, j + `1`,
2112	h.mem + win_tr_x0,
2113	`1`, `0`, OPJ_TRUE)) {
2114	/ FIXME event manager error callback /
2115	opj_sparse_array_int32_free(sa);
2116	opj_aligned_free(h.mem);
2117	return OPJ_FALSE;
2118	}
2119	}
2120	}
2121
2122	for (i = win_tr_x0; i < win_tr_x1;) {
2123	OPJ_UINT32 nb_cols = opj_uint_min(`4U`, win_tr_x1 - i);
2124	opj_dwt_interleave_partial_v(v.mem,
2125	v.cas,
2126	sa,
2127	i,
2128	nb_cols,
2129	(OPJ_UINT32)v.sn,
2130	win_ll_y0,
2131	win_ll_y1,
2132	win_lh_y0,
2133	win_lh_y1);
2134	opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2135	(OPJ_INT32)win_ll_y0,
2136	(OPJ_INT32)win_ll_y1,
2137	(OPJ_INT32)win_lh_y0,
2138	(OPJ_INT32)win_lh_y1);
2139	if (!opj_sparse_array_int32_write(sa,
2140	i, win_tr_y0,
2141	i + nb_cols, win_tr_y1,
2142	v.mem + `4` * win_tr_y0,
2143	`1`, `4`, OPJ_TRUE)) {
2144	/ FIXME event manager error callback /
2145	opj_sparse_array_int32_free(sa);
2146	opj_aligned_free(h.mem);
2147	return OPJ_FALSE;
2148	}
2149
2150	i += nb_cols;
2151	}
2152	}
2153	opj_aligned_free(h.mem);
2154
2155	{
2156	OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2157	tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2158	tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2159	tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2160	tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2161	tilec->data_win,
2162	`1`, tr_max->win_x1 - tr_max->win_x0,
2163	OPJ_TRUE);
2164	assert(ret);
2165	OPJ_UNUSED(ret);
2166	}
2167	opj_sparse_array_int32_free(sa);
2168	return OPJ_TRUE;
2169	}
2170
2171	static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
2172	OPJ_FLOAT32* OPJ_RESTRICT a,
2173	OPJ_UINT32 width,
2174	OPJ_UINT32 remaining_height)
2175	{
2176	OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2177	OPJ_UINT32 i, k;
2178	OPJ_UINT32 x0 = dwt->win_l_x0;
2179	OPJ_UINT32 x1 = dwt->win_l_x1;
2180
2181	for (k = `0`; k < `2`; ++k) {
2182	if (remaining_height >= `4` && ((OPJ_SIZE_T) a & `0x0f`) == `0` &&
2183	((OPJ_SIZE_T) bi & `0x0f`) == `0` && (width & `0x0f`) == `0`) {
2184	/ Fast code path /
2185	for (i = x0; i < x1; ++i) {
2186	OPJ_UINT32 j = i;
2187	bi[i * `8` ] = a[j];
2188	j += width;
2189	bi[i * `8` + `1`] = a[j];
2190	j += width;
2191	bi[i * `8` + `2`] = a[j];
2192	j += width;
2193	bi[i * `8` + `3`] = a[j];
2194	}
2195	} else {
2196	/ Slow code path /
2197	for (i = x0; i < x1; ++i) {
2198	OPJ_UINT32 j = i;
2199	bi[i * `8` ] = a[j];
2200	j += width;
2201	if (remaining_height == `1`) {
2202	continue;
2203	}
2204	bi[i * `8` + `1`] = a[j];
2205	j += width;
2206	if (remaining_height == `2`) {
2207	continue;
2208	}
2209	bi[i * `8` + `2`] = a[j];
2210	j += width;
2211	if (remaining_height == `3`) {
2212	continue;
2213	}
2214	bi[i * `8` + `3`] = a[j]; / This one/
2215	}
2216	}
2217
2218	bi = (OPJ_FLOAT32*)(dwt->wavelet + `1` - dwt->cas);
2219	a += dwt->sn;
2220	x0 = dwt->win_h_x0;
2221	x1 = dwt->win_h_x1;
2222	}
2223	}
2224
2225	static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt,
2226	opj_sparse_array_int32_t* sa,
2227	OPJ_UINT32 sa_line,
2228	OPJ_UINT32 remaining_height)
2229	{
2230	OPJ_UINT32 i;
2231	for (i = `0`; i < remaining_height; i++) {
2232	OPJ_BOOL ret;
2233	ret = opj_sparse_array_int32_read(sa,
2234	dwt->win_l_x0, sa_line + i,
2235	dwt->win_l_x1, sa_line + i + `1`,
2236	/ Nasty cast from float* to int32* /
2237	(OPJ_INT32)(dwt->wavelet + dwt->cas + `2` dwt->win_l_x0) + i,
2238	`8`, `0`, OPJ_TRUE);
2239	assert(ret);
2240	ret = opj_sparse_array_int32_read(sa,
2241	(OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2242	(OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + `1`,
2243	/ Nasty cast from float* to int32* /
2244	(OPJ_INT32)(dwt->wavelet + `1` - dwt->cas + `2` dwt->win_h_x0) + i,
2245	`8`, `0`, OPJ_TRUE);
2246	assert(ret);
2247	OPJ_UNUSED(ret);
2248	}
2249	}
2250
2251	static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
2252	OPJ_FLOAT32* OPJ_RESTRICT a,
2253	OPJ_UINT32 width,
2254	OPJ_UINT32 nb_elts_read)
2255	{
2256	opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2257	OPJ_UINT32 i;
2258
2259	for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2260	memcpy(&bi[i * `2`], &a[i * (OPJ_SIZE_T)width],
2261	(OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2262	}
2263
2264	a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2265	bi = dwt->wavelet + `1` - dwt->cas;
2266
2267	for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2268	memcpy(&bi[i * `2`], &a[i * (OPJ_SIZE_T)width],
2269	(OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2270	}
2271	}
2272
2273	static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
2274	opj_sparse_array_int32_t* sa,
2275	OPJ_UINT32 sa_col,
2276	OPJ_UINT32 nb_elts_read)
2277	{
2278	OPJ_BOOL ret;
2279	ret = opj_sparse_array_int32_read(sa,
2280	sa_col, dwt->win_l_x0,
2281	sa_col + nb_elts_read, dwt->win_l_x1,
2282	(OPJ_INT32)(dwt->wavelet + dwt->cas + `2` dwt->win_l_x0),
2283	`1`, `8`, OPJ_TRUE);
2284	assert(ret);
2285	ret = opj_sparse_array_int32_read(sa,
2286	sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
2287	sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
2288	(OPJ_INT32)(dwt->wavelet + `1` - dwt->cas + `2` dwt->win_h_x0),
2289	`1`, `8`, OPJ_TRUE);
2290	assert(ret);
2291	OPJ_UNUSED(ret);
2292	}
2293
2294	#ifdef __SSE__
2295
2296	static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
2297	OPJ_UINT32 start,
2298	OPJ_UINT32 end,
2299	const __m128 c)
2300	{
2301	__m128* OPJ_RESTRICT vw = (__m128*) w;
2302	OPJ_UINT32 i;
2303	/ 4x unrolled loop /
2304	vw += `2` * start;
2305	for (i = start; i + `3` < end; i += `4`, vw += `8`) {
2306	__m128 xmm0 = _mm_mul_ps(vw[`0`], c);
2307	__m128 xmm2 = _mm_mul_ps(vw[`2`], c);
2308	__m128 xmm4 = _mm_mul_ps(vw[`4`], c);
2309	__m128 xmm6 = _mm_mul_ps(vw[`6`], c);
2310	vw[`0`] = xmm0;
2311	vw[`2`] = xmm2;
2312	vw[`4`] = xmm4;
2313	vw[`6`] = xmm6;
2314	}
2315	for (; i < end; ++i, vw += `2`) {
2316	vw[`0`] = _mm_mul_ps(vw[`0`], c);
2317	}
2318	}
2319
2320	static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
2321	OPJ_UINT32 start,
2322	OPJ_UINT32 end,
2323	OPJ_UINT32 m,
2324	__m128 c)
2325	{
2326	__m128* OPJ_RESTRICT vl = (__m128*) l;
2327	__m128* OPJ_RESTRICT vw = (__m128*) w;
2328	OPJ_UINT32 i;
2329	OPJ_UINT32 imax = opj_uint_min(end, m);
2330	__m128 tmp1, tmp2, tmp3;
2331	if (start == `0`) {
2332	tmp1 = vl[`0`];
2333	} else {
2334	vw += start * `2`;
2335	tmp1 = vw[-`3`];
2336	}
2337
2338	i = start;
2339
2340	/ 4x loop unrolling /
2341	for (; i + `3` < imax; i += `4`) {
2342	__m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
2343	tmp2 = vw[-`1`];
2344	tmp3 = vw[ `0`];
2345	tmp4 = vw[ `1`];
2346	tmp5 = vw[ `2`];
2347	tmp6 = vw[ `3`];
2348	tmp7 = vw[ `4`];
2349	tmp8 = vw[ `5`];
2350	tmp9 = vw[ `6`];
2351	vw[-`1`] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
2352	vw[ `1`] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
2353	vw[ `3`] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
2354	vw[ `5`] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
2355	tmp1 = tmp9;
2356	vw += `8`;
2357	}
2358
2359	for (; i < imax; ++i) {
2360	tmp2 = vw[-`1`];
2361	tmp3 = vw[ `0`];
2362	vw[-`1`] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
2363	tmp1 = tmp3;
2364	vw += `2`;
2365	}
2366	if (m < end) {
2367	assert(m + `1` == end);
2368	c = _mm_add_ps(c, c);
2369	c = _mm_mul_ps(c, vw[-`2`]);
2370	vw[-`1`] = _mm_add_ps(vw[-`1`], c);
2371	}
2372	}
2373
2374	#else
2375
2376	static void opj_v4dwt_decode_step1(opj_v4_t* w,
2377	OPJ_UINT32 start,
2378	OPJ_UINT32 end,
2379	const OPJ_FLOAT32 c)
2380	{
2381	OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
2382	OPJ_UINT32 i;
2383	for (i = start; i < end; ++i) {
2384	OPJ_FLOAT32 tmp1 = fw[i * `8` ];
2385	OPJ_FLOAT32 tmp2 = fw[i * `8` + `1`];
2386	OPJ_FLOAT32 tmp3 = fw[i * `8` + `2`];
2387	OPJ_FLOAT32 tmp4 = fw[i * `8` + `3`];
2388	fw[i * `8` ] = tmp1 * c;
2389	fw[i * `8` + `1`] = tmp2 * c;
2390	fw[i * `8` + `2`] = tmp3 * c;
2391	fw[i * `8` + `3`] = tmp4 * c;
2392	}
2393	}
2394
2395	static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
2396	OPJ_UINT32 start,
2397	OPJ_UINT32 end,
2398	OPJ_UINT32 m,
2399	OPJ_FLOAT32 c)
2400	{
2401	OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
2402	OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
2403	OPJ_UINT32 i;
2404	OPJ_UINT32 imax = opj_uint_min(end, m);
2405	if (start > `0`) {
2406	fw += `8` * start;
2407	fl = fw - `8`;
2408	}
2409	for (i = start; i < imax; ++i) {
2410	OPJ_FLOAT32 tmp1_1 = fl[`0`];
2411	OPJ_FLOAT32 tmp1_2 = fl[`1`];
2412	OPJ_FLOAT32 tmp1_3 = fl[`2`];
2413	OPJ_FLOAT32 tmp1_4 = fl[`3`];
2414	OPJ_FLOAT32 tmp2_1 = fw[-`4`];
2415	OPJ_FLOAT32 tmp2_2 = fw[-`3`];
2416	OPJ_FLOAT32 tmp2_3 = fw[-`2`];
2417	OPJ_FLOAT32 tmp2_4 = fw[-`1`];
2418	OPJ_FLOAT32 tmp3_1 = fw[`0`];
2419	OPJ_FLOAT32 tmp3_2 = fw[`1`];
2420	OPJ_FLOAT32 tmp3_3 = fw[`2`];
2421	OPJ_FLOAT32 tmp3_4 = fw[`3`];
2422	fw[-`4`] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
2423	fw[-`3`] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
2424	fw[-`2`] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
2425	fw[-`1`] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
2426	fl = fw;
2427	fw += `8`;
2428	}
2429	if (m < end) {
2430	assert(m + `1` == end);
2431	c += c;
2432	fw[-`4`] = fw[-`4`] + fl[`0`] * c;
2433	fw[-`3`] = fw[-`3`] + fl[`1`] * c;
2434	fw[-`2`] = fw[-`2`] + fl[`2`] * c;
2435	fw[-`1`] = fw[-`1`] + fl[`3`] * c;
2436	}
2437	}
2438
2439	#endif
2440
2441	/ <summary> /
2442	/ Inverse 9-7 wavelet transform in 1-D. /
2443	/ </summary> /
2444	static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
2445	{
2446	OPJ_INT32 a, b;
2447	if (dwt->cas == `0`) {
2448	if (!((dwt->dn > `0`) \|\| (dwt->sn > `1`))) {
2449	return;
2450	}
2451	a = `0`;
2452	b = `1`;
2453	} else {
2454	if (!((dwt->sn > `0`) \|\| (dwt->dn > `1`))) {
2455	return;
2456	}
2457	a = `1`;
2458	b = `0`;
2459	}
2460	#ifdef __SSE__
2461	opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2462	_mm_set1_ps(opj_K));
2463	opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2464	_mm_set1_ps(opj_c13318));
2465	opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + `1`,
2466	dwt->win_l_x0, dwt->win_l_x1,
2467	(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2468	_mm_set1_ps(opj_dwt_delta));
2469	opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + `1`,
2470	dwt->win_h_x0, dwt->win_h_x1,
2471	(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2472	_mm_set1_ps(opj_dwt_gamma));
2473	opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + `1`,
2474	dwt->win_l_x0, dwt->win_l_x1,
2475	(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2476	_mm_set1_ps(opj_dwt_beta));
2477	opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + `1`,
2478	dwt->win_h_x0, dwt->win_h_x1,
2479	(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2480	_mm_set1_ps(opj_dwt_alpha));
2481	#else
2482	opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2483	opj_K);
2484	opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2485	opj_c13318);
2486	opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + `1`,
2487	dwt->win_l_x0, dwt->win_l_x1,
2488	(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2489	opj_dwt_delta);
2490	opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + `1`,
2491	dwt->win_h_x0, dwt->win_h_x1,
2492	(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2493	opj_dwt_gamma);
2494	opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + `1`,
2495	dwt->win_l_x0, dwt->win_l_x1,
2496	(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2497	opj_dwt_beta);
2498	opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + `1`,
2499	dwt->win_h_x0, dwt->win_h_x1,
2500	(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2501	opj_dwt_alpha);
2502	#endif
2503	}
2504
2505
2506	/ <summary> /
2507	/ Inverse 9-7 wavelet transform in 2-D. /
2508	/ </summary> /
2509	static
2510	OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2511	OPJ_UINT32 numres)
2512	{
2513	opj_v4dwt_t h;
2514	opj_v4dwt_t v;
2515
2516	opj_tcd_resolution_t* res = tilec->resolutions;
2517
2518	OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
2519	res->x0); / width of the resolution level computed /
2520	OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
2521	res->y0); / height of the resolution level computed /
2522
2523	OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2524	`1`].x1 -
2525	tilec->resolutions[tilec->minimum_num_resolutions - `1`].x0);
2526
2527	OPJ_SIZE_T l_data_size;
2528
2529	l_data_size = opj_dwt_max_resolution(res, numres);
2530	/ overflow check /
2531	if (l_data_size > (SIZE_MAX - `5U`)) {
2532	/ FIXME event manager error callback /
2533	return OPJ_FALSE;
2534	}
2535	l_data_size += `5U`;
2536	/ overflow check /
2537	if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
2538	/ FIXME event manager error callback /
2539	return OPJ_FALSE;
2540	}
2541	h.wavelet = (opj_v4_t) opj_aligned_malloc(l_data_size sizeof(opj_v4_t));
2542	if (!h.wavelet) {
2543	/ FIXME event manager error callback /
2544	return OPJ_FALSE;
2545	}
2546	v.wavelet = h.wavelet;
2547
2548	while (--numres) {
2549	OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
2550	OPJ_UINT32 j;
2551
2552	h.sn = (OPJ_INT32)rw;
2553	v.sn = (OPJ_INT32)rh;
2554
2555	++res;
2556
2557	rw = (OPJ_UINT32)(res->x1 -
2558	res->x0); / width of the resolution level computed /
2559	rh = (OPJ_UINT32)(res->y1 -
2560	res->y0); / height of the resolution level computed /
2561
2562	h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2563	h.cas = res->x0 % `2`;
2564
2565	h.win_l_x0 = `0`;
2566	h.win_l_x1 = (OPJ_UINT32)h.sn;
2567	h.win_h_x0 = `0`;
2568	h.win_h_x1 = (OPJ_UINT32)h.dn;
2569	for (j = `0`; j + `3` < rh; j += `4`) {
2570	OPJ_UINT32 k;
2571	opj_v4dwt_interleave_h(&h, aj, w, rh - j);
2572	opj_v4dwt_decode(&h);
2573
2574	for (k = `0`; k < rw; k++) {
2575	aj[k ] = h.wavelet[k].f[`0`];
2576	aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[`1`];
2577	aj[k + (OPJ_SIZE_T)w * `2`] = h.wavelet[k].f[`2`];
2578	aj[k + (OPJ_SIZE_T)w * `3`] = h.wavelet[k].f[`3`];
2579	}
2580
2581	aj += w * `4`;
2582	}
2583
2584	if (j < rh) {
2585	OPJ_UINT32 k;
2586	opj_v4dwt_interleave_h(&h, aj, w, rh - j);
2587	opj_v4dwt_decode(&h);
2588	for (k = `0`; k < rw; k++) {
2589	switch (rh - j) {
2590	case `3`:
2591	aj[k + (OPJ_SIZE_T)w * `2`] = h.wavelet[k].f[`2`];
2592	/ FALLTHRU /
2593	case `2`:
2594	aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[`1`];
2595	/ FALLTHRU /
2596	case `1`:
2597	aj[k] = h.wavelet[k].f[`0`];
2598	}
2599	}
2600	}
2601
2602	v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2603	v.cas = res->y0 % `2`;
2604	v.win_l_x0 = `0`;
2605	v.win_l_x1 = (OPJ_UINT32)v.sn;
2606	v.win_h_x0 = `0`;
2607	v.win_h_x1 = (OPJ_UINT32)v.dn;
2608
2609	aj = (OPJ_FLOAT32*) tilec->data;
2610	for (j = rw; j > `3`; j -= `4`) {
2611	OPJ_UINT32 k;
2612
2613	opj_v4dwt_interleave_v(&v, aj, w, `4`);
2614	opj_v4dwt_decode(&v);
2615
2616	for (k = `0`; k < rh; ++k) {
2617	memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], `4` * sizeof(OPJ_FLOAT32));
2618	}
2619	aj += `4`;
2620	}
2621
2622	if (rw & `0x03`) {
2623	OPJ_UINT32 k;
2624
2625	j = rw & `0x03`;
2626
2627	opj_v4dwt_interleave_v(&v, aj, w, j);
2628	opj_v4dwt_decode(&v);
2629
2630	for (k = `0`; k < rh; ++k) {
2631	memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
2632	(OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
2633	}
2634	}
2635	}
2636
2637	opj_aligned_free(h.wavelet);
2638	return OPJ_TRUE;
2639	}
2640
2641	static
2642	OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2643	OPJ_UINT32 numres)
2644	{
2645	opj_sparse_array_int32_t* sa;
2646	opj_v4dwt_t h;
2647	opj_v4dwt_t v;
2648	OPJ_UINT32 resno;
2649	/ This value matches the maximum left/right extension given in tables /
2650	/ F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() /
2651	/ we currently use 3. /
2652	const OPJ_UINT32 filter_width = `4U`;
2653
2654	opj_tcd_resolution_t* tr = tilec->resolutions;
2655	opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - `1`]);
2656
2657	OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2658	tr->x0); / width of the resolution level computed /
2659	OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2660	tr->y0); / height of the resolution level computed /
2661
2662	OPJ_SIZE_T l_data_size;
2663
2664	/ Compute the intersection of the area of interest, expressed in tile coordinates /
2665	/ with the tile coordinates /
2666	OPJ_UINT32 win_tcx0 = tilec->win_x0;
2667	OPJ_UINT32 win_tcy0 = tilec->win_y0;
2668	OPJ_UINT32 win_tcx1 = tilec->win_x1;
2669	OPJ_UINT32 win_tcy1 = tilec->win_y1;
2670
2671	if (tr_max->x0 == tr_max->x1 \|\| tr_max->y0 == tr_max->y1) {
2672	return OPJ_TRUE;
2673	}
2674
2675	sa = opj_dwt_init_sparse_array(tilec, numres);
2676	if (sa == NULL) {
2677	return OPJ_FALSE;
2678	}
2679
2680	if (numres == `1U`) {
2681	OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2682	tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2683	tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2684	tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2685	tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2686	tilec->data_win,
2687	`1`, tr_max->win_x1 - tr_max->win_x0,
2688	OPJ_TRUE);
2689	assert(ret);
2690	OPJ_UNUSED(ret);
2691	opj_sparse_array_int32_free(sa);
2692	return OPJ_TRUE;
2693	}
2694
2695	l_data_size = opj_dwt_max_resolution(tr, numres);
2696	/ overflow check /
2697	if (l_data_size > (SIZE_MAX - `5U`)) {
2698	/ FIXME event manager error callback /
2699	return OPJ_FALSE;
2700	}
2701	l_data_size += `5U`;
2702	/ overflow check /
2703	if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
2704	/ FIXME event manager error callback /
2705	return OPJ_FALSE;
2706	}
2707	h.wavelet = (opj_v4_t) opj_aligned_malloc(l_data_size sizeof(opj_v4_t));
2708	if (!h.wavelet) {
2709	/ FIXME event manager error callback /
2710	return OPJ_FALSE;
2711	}
2712	v.wavelet = h.wavelet;
2713
2714	for (resno = `1`; resno < numres; resno ++) {
2715	OPJ_UINT32 j;
2716	/ Window of interest subband-based coordinates /
2717	OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2718	OPJ_UINT32 win_hl_x0, win_hl_x1;
2719	OPJ_UINT32 win_lh_y0, win_lh_y1;
2720	/ Window of interest tile-resolution-based coordinates /
2721	OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2722	/ Tile-resolution subband-based coordinates /
2723	OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2724
2725	++tr;
2726
2727	h.sn = (OPJ_INT32)rw;
2728	v.sn = (OPJ_INT32)rh;
2729
2730	rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2731	rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2732
2733	h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2734	h.cas = tr->x0 % `2`;
2735
2736	v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2737	v.cas = tr->y0 % `2`;
2738
2739	/ Get the subband coordinates for the window of interest /
2740	/ LL band /
2741	opj_dwt_get_band_coordinates(tilec, resno, `0`,
2742	win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2743	&win_ll_x0, &win_ll_y0,
2744	&win_ll_x1, &win_ll_y1);
2745
2746	/ HL band /
2747	opj_dwt_get_band_coordinates(tilec, resno, `1`,
2748	win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2749	&win_hl_x0, NULL, &win_hl_x1, NULL);
2750
2751	/ LH band /
2752	opj_dwt_get_band_coordinates(tilec, resno, `2`,
2753	win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2754	NULL, &win_lh_y0, NULL, &win_lh_y1);
2755
2756	/ Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH /
2757	tr_ll_x0 = (OPJ_UINT32)tr->bands[`1`].x0;
2758	tr_ll_y0 = (OPJ_UINT32)tr->bands[`0`].y0;
2759	tr_hl_x0 = (OPJ_UINT32)tr->bands[`0`].x0;
2760	tr_lh_y0 = (OPJ_UINT32)tr->bands[`1`].y0;
2761
2762	/ Substract the origin of the bands for this tile, to the subwindow /
2763	/ of interest band coordinates, so as to get them relative to the /
2764	/ tile /
2765	win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2766	win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2767	win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2768	win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2769	win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2770	win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2771	win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2772	win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2773
2774	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2775	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2776
2777	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2778	opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2779
2780	/ Compute the tile-resolution-based coordinates for the window of interest /
2781	if (h.cas == `0`) {
2782	win_tr_x0 = opj_uint_min(`2` * win_ll_x0, `2` * win_hl_x0 + `1`);
2783	win_tr_x1 = opj_uint_min(opj_uint_max(`2` * win_ll_x1, `2` * win_hl_x1 + `1`), rw);
2784	} else {
2785	win_tr_x0 = opj_uint_min(`2` * win_hl_x0, `2` * win_ll_x0 + `1`);
2786	win_tr_x1 = opj_uint_min(opj_uint_max(`2` * win_hl_x1, `2` * win_ll_x1 + `1`), rw);
2787	}
2788
2789	if (v.cas == `0`) {
2790	win_tr_y0 = opj_uint_min(`2` * win_ll_y0, `2` * win_lh_y0 + `1`);
2791	win_tr_y1 = opj_uint_min(opj_uint_max(`2` * win_ll_y1, `2` * win_lh_y1 + `1`), rh);
2792	} else {
2793	win_tr_y0 = opj_uint_min(`2` * win_lh_y0, `2` * win_ll_y0 + `1`);
2794	win_tr_y1 = opj_uint_min(opj_uint_max(`2` * win_lh_y1, `2` * win_ll_y1 + `1`), rh);
2795	}
2796
2797	h.win_l_x0 = win_ll_x0;
2798	h.win_l_x1 = win_ll_x1;
2799	h.win_h_x0 = win_hl_x0;
2800	h.win_h_x1 = win_hl_x1;
2801	for (j = `0`; j + `3` < rh; j += `4`) {
2802	if ((j + `3` >= win_ll_y0 && j < win_ll_y1) \|\|
2803	(j + `3` >= win_lh_y0 + (OPJ_UINT32)v.sn &&
2804	j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2805	opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(`4U`, rh - j));
2806	opj_v4dwt_decode(&h);
2807	if (!opj_sparse_array_int32_write(sa,
2808	win_tr_x0, j,
2809	win_tr_x1, j + `4`,
2810	(OPJ_INT32*)&h.wavelet[win_tr_x0].f[`0`],
2811	`4`, `1`, OPJ_TRUE)) {
2812	/ FIXME event manager error callback /
2813	opj_sparse_array_int32_free(sa);
2814	opj_aligned_free(h.wavelet);
2815	return OPJ_FALSE;
2816	}
2817	}
2818	}
2819
2820	if (j < rh &&
2821	((j + `3` >= win_ll_y0 && j < win_ll_y1) \|\|
2822	(j + `3` >= win_lh_y0 + (OPJ_UINT32)v.sn &&
2823	j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
2824	opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j);
2825	opj_v4dwt_decode(&h);
2826	if (!opj_sparse_array_int32_write(sa,
2827	win_tr_x0, j,
2828	win_tr_x1, rh,
2829	(OPJ_INT32*)&h.wavelet[win_tr_x0].f[`0`],
2830	`4`, `1`, OPJ_TRUE)) {
2831	/ FIXME event manager error callback /
2832	opj_sparse_array_int32_free(sa);
2833	opj_aligned_free(h.wavelet);
2834	return OPJ_FALSE;
2835	}
2836	}
2837
2838	v.win_l_x0 = win_ll_y0;
2839	v.win_l_x1 = win_ll_y1;
2840	v.win_h_x0 = win_lh_y0;
2841	v.win_h_x1 = win_lh_y1;
2842	for (j = win_tr_x0; j < win_tr_x1; j += `4`) {
2843	OPJ_UINT32 nb_elts = opj_uint_min(`4U`, win_tr_x1 - j);
2844
2845	opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts);
2846	opj_v4dwt_decode(&v);
2847
2848	if (!opj_sparse_array_int32_write(sa,
2849	j, win_tr_y0,
2850	j + nb_elts, win_tr_y1,
2851	(OPJ_INT32*)&h.wavelet[win_tr_y0].f[`0`],
2852	`1`, `4`, OPJ_TRUE)) {
2853	/ FIXME event manager error callback /
2854	opj_sparse_array_int32_free(sa);
2855	opj_aligned_free(h.wavelet);
2856	return OPJ_FALSE;
2857	}
2858	}
2859	}
2860
2861	{
2862	OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2863	tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2864	tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2865	tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2866	tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2867	tilec->data_win,
2868	`1`, tr_max->win_x1 - tr_max->win_x0,
2869	OPJ_TRUE);
2870	assert(ret);
2871	OPJ_UNUSED(ret);
2872	}
2873	opj_sparse_array_int32_free(sa);
2874
2875	opj_aligned_free(h.wavelet);
2876	return OPJ_TRUE;
2877	}
2878
2879
2880	OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
2881	opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2882	OPJ_UINT32 numres)
2883	{
2884	if (p_tcd->whole_tile_decoding) {
2885	return opj_dwt_decode_tile_97(tilec, numres);
2886	} else {
2887	return opj_dwt_decode_partial_97(tilec, numres);
2888	}
2889	}
2890

Browse the source code of MuPDF/thirdparty/openjpeg/src/lib/openjp2/dwt.c