1 | /* |
2 | * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. Oracle designates this |
8 | * particular file as subject to the "Classpath" exception as provided |
9 | * by Oracle in the LICENSE file that accompanied this code. |
10 | * |
11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
14 | * version 2 for more details (a copy is included in the LICENSE file that |
15 | * accompanied this code). |
16 | * |
17 | * You should have received a copy of the GNU General Public License version |
18 | * 2 along with this work; if not, write to the Free Software Foundation, |
19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
20 | * |
21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
22 | * or visit www.oracle.com if you need additional information or have any |
23 | * questions. |
24 | */ |
25 | |
26 | |
27 | /* |
28 | * FUNCTION |
29 | * Internal functions for mlib_ImageConv* on D64/F32 type and |
30 | * MLIB_EDGE_DST_NO_WRITE mask |
31 | * |
32 | */ |
33 | |
34 | #include "mlib_image.h" |
35 | #include "mlib_ImageConv.h" |
36 | |
37 | /***************************************************************/ |
/*
  The TYPE_DOUBLE define below selects which variant of the functions is built:
  the MLIB_DOUBLE one (file mlib_ImageConv_D64nw.c) when it is defined, or
  the MLIB_FLOAT one (file mlib_ImageConv_F32nw.c) when it is not.
*/
42 | |
/* When defined, the mlib_d64 branch below is compiled; otherwise the mlib_f32 one. */
#define TYPE_DOUBLE

/***************************************************************/
#ifdef TYPE_DOUBLE

/* Builds the entry-point name for a kernel tag, e.g. CONV_FUNC(MxN) -> mlib_convMxNnw_d64. */
#define CONV_FUNC(KERN) mlib_conv##KERN##nw_d64

/* Element type used for pixels, kernel taps and accumulators in this file. */
#define DTYPE mlib_d64

#else

/* Same as above for the single-precision build: CONV_FUNC(MxN) -> mlib_convMxNnw_f32. */
#define CONV_FUNC(KERN) mlib_conv##KERN##nw_f32

#define DTYPE mlib_f32

#endif /* TYPE_DOUBLE */
59 | |
60 | /***************************************************************/ |
/*
 * Declares and initializes the image-geometry locals used by the convolution
 * routines. Expects identifiers `src` and `dst` (image pointers) to be in
 * scope at the point of expansion.
 *
 *   hgt, wid          - height and width taken from src
 *   sll, dll          - src / dst line strides, converted from the value
 *                       returned by mlib_ImageGetStride() to units of `type`
 *   adr_src, adr_dst  - data pointers of src / dst
 *   chan1             - channel count taken from src
 *
 * The expansion ends without a trailing semicolon; the user supplies it.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  mlib_s32 hgt = mlib_ImageGetHeight(src);                      \
  mlib_s32 wid = mlib_ImageGetWidth(src);                       \
  mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(type);       \
  mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(type);       \
  type*    adr_src = mlib_ImageGetData(src);                    \
  type*    adr_dst = mlib_ImageGetData(dst);                    \
  mlib_s32 chan1 = mlib_ImageGetChannels(src)
69 | |
70 | /***************************************************************/ |
/*
 * Declares the common working set of a convolution routine: everything from
 * GET_SRC_DST_PARAMETERS(type) plus
 *
 *   sl, dl  - current source / destination line pointers (uninitialized)
 *   dp      - current destination pixel pointer, initialized to NULL
 *   i       - loop index, initialized to 0
 *   j, c    - loop indices (uninitialized)
 *
 * The expansion ends without a trailing semicolon; the user supplies it.
 */
#define DEF_VARS(type)                                          \
  GET_SRC_DST_PARAMETERS(type);                                 \
  type     *sl;                                                 \
  type     *dl, *dp = NULL;                                     \
  mlib_s32 i = 0, j, c
76 | |
77 | /***************************************************************/ |
/* Number of DTYPE elements in the on-stack accumulator of mlib_ImageConv1xN. */
#define BUFF_SIZE  1600

/*
 * Byte budget used by mlib_ImageConv1xN to choose how many rows it processes
 * per strip: (CACHE_SIZE / sizeof(DTYPE)) / source line stride.
 */
#define CACHE_SIZE (64*1024)
81 | |
82 | static mlib_status mlib_ImageConv1xN(mlib_image *dst, |
83 | const mlib_image *src, |
84 | const DTYPE *k, |
85 | mlib_s32 n, |
86 | mlib_s32 dn, |
87 | mlib_s32 cmask) |
88 | { |
89 | DTYPE buff[BUFF_SIZE], *pbuff = buff; |
90 | const DTYPE *pk; |
91 | DTYPE k0, k1, k2, k3; |
92 | DTYPE p0, p1, p2, p3, p4; |
93 | DTYPE *sp, *sl_c, *dl_c, *sl0; |
94 | DEF_VARS(DTYPE); |
95 | mlib_s32 off, kh; |
96 | mlib_s32 l, hsize, max_hsize; |
97 | |
98 | hgt -= (n - 1); |
99 | adr_dst += dn*dll; |
100 | |
101 | max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll; |
102 | |
103 | if (!max_hsize) max_hsize = 1; |
104 | |
105 | if (max_hsize > BUFF_SIZE) { |
106 | pbuff = mlib_malloc(sizeof(DTYPE)*max_hsize); |
107 | } |
108 | |
109 | sl_c = adr_src; |
110 | dl_c = adr_dst; |
111 | |
112 | for (l = 0; l < hgt; l += hsize) { |
113 | hsize = hgt - l; |
114 | |
115 | if (hsize > max_hsize) hsize = max_hsize; |
116 | |
117 | for (c = 0; c < chan1; c++) { |
118 | if (!(cmask & (1 << (chan1 - 1 - c)))) continue; |
119 | |
120 | sl = sl_c + c; |
121 | dl = dl_c + c; |
122 | |
123 | #ifdef __SUNPRO_C |
124 | #pragma pipeloop(0) |
125 | #endif /* __SUNPRO_C */ |
126 | for (j = 0; j < hsize; j++) pbuff[j] = 0.0; |
127 | |
128 | for (i = 0; i < wid; i++) { |
129 | sl0 = sl; |
130 | |
131 | for (off = 0; off < (n - 4); off += 4) { |
132 | pk = k + off; |
133 | sp = sl0; |
134 | |
135 | k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; |
136 | p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; |
137 | sp += 3*sll; |
138 | |
139 | #ifdef __SUNPRO_C |
140 | #pragma pipeloop(0) |
141 | #endif /* __SUNPRO_C */ |
142 | for (j = 0; j < hsize; j += 2) { |
143 | p0 = p2; p1 = p3; p2 = p4; |
144 | p3 = sp[0]; |
145 | p4 = sp[sll]; |
146 | |
147 | pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; |
148 | pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; |
149 | |
150 | sp += 2*sll; |
151 | } |
152 | |
153 | sl0 += 4*sll; |
154 | } |
155 | |
156 | pk = k + off; |
157 | sp = sl0; |
158 | |
159 | k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; |
160 | p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; |
161 | |
162 | dp = dl; |
163 | kh = n - off; |
164 | |
165 | if (kh == 4) { |
166 | sp += 3*sll; |
167 | |
168 | #ifdef __SUNPRO_C |
169 | #pragma pipeloop(0) |
170 | #endif /* __SUNPRO_C */ |
171 | for (j = 0; j <= (hsize - 2); j += 2) { |
172 | p0 = p2; p1 = p3; p2 = p4; |
173 | p3 = sp[0]; |
174 | p4 = sp[sll]; |
175 | |
176 | dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]; |
177 | dp[dll] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]; |
178 | |
179 | pbuff[j] = 0; |
180 | pbuff[j + 1] = 0; |
181 | |
182 | sp += 2*sll; |
183 | dp += 2*dll; |
184 | } |
185 | |
186 | if (j < hsize) { |
187 | p0 = p2; p1 = p3; p2 = p4; |
188 | p3 = sp[0]; |
189 | |
190 | dp[0] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]; |
191 | |
192 | pbuff[j] = 0; |
193 | } |
194 | |
195 | } else if (kh == 3) { |
196 | sp += 2*sll; |
197 | |
198 | #ifdef __SUNPRO_C |
199 | #pragma pipeloop(0) |
200 | #endif /* __SUNPRO_C */ |
201 | for (j = 0; j <= (hsize - 2); j += 2) { |
202 | p0 = p2; p1 = p3; |
203 | p2 = sp[0]; |
204 | p3 = sp[sll]; |
205 | |
206 | dp[0 ] = p0*k0 + p1*k1 + p2*k2 + pbuff[j]; |
207 | dp[dll] = p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]; |
208 | |
209 | pbuff[j] = 0; |
210 | pbuff[j + 1] = 0; |
211 | |
212 | sp += 2*sll; |
213 | dp += 2*dll; |
214 | } |
215 | |
216 | if (j < hsize) { |
217 | p0 = p2; p1 = p3; |
218 | p2 = sp[0]; |
219 | |
220 | dp[0] = p0*k0 + p1*k1 + p2*k2 + pbuff[j]; |
221 | |
222 | pbuff[j] = 0; |
223 | } |
224 | |
225 | } else if (kh == 2) { |
226 | sp += sll; |
227 | |
228 | #ifdef __SUNPRO_C |
229 | #pragma pipeloop(0) |
230 | #endif /* __SUNPRO_C */ |
231 | for (j = 0; j <= (hsize - 2); j += 2) { |
232 | p0 = p2; |
233 | p1 = sp[0]; |
234 | p2 = sp[sll]; |
235 | |
236 | dp[0 ] = p0*k0 + p1*k1 + pbuff[j]; |
237 | dp[dll] = p1*k0 + p2*k1 + pbuff[j + 1]; |
238 | |
239 | pbuff[j] = 0; |
240 | pbuff[j + 1] = 0; |
241 | |
242 | sp += 2*sll; |
243 | dp += 2*dll; |
244 | } |
245 | |
246 | if (j < hsize) { |
247 | p0 = p2; |
248 | p1 = sp[0]; |
249 | |
250 | dp[0] = p0*k0 + p1*k1 + pbuff[j]; |
251 | |
252 | pbuff[j] = 0; |
253 | } |
254 | |
255 | } else /* if (kh == 1) */ { |
256 | #ifdef __SUNPRO_C |
257 | #pragma pipeloop(0) |
258 | #endif /* __SUNPRO_C */ |
259 | for (j = 0; j < hsize; j++) { |
260 | p0 = sp[0]; |
261 | |
262 | dp[0] = p0*k0 + pbuff[j]; |
263 | |
264 | pbuff[j] = 0; |
265 | |
266 | sp += sll; |
267 | dp += dll; |
268 | } |
269 | } |
270 | |
271 | sl += chan1; |
272 | dl += chan1; |
273 | } |
274 | } |
275 | |
276 | sl_c += max_hsize*sll; |
277 | dl_c += max_hsize*dll; |
278 | } |
279 | |
280 | if (pbuff != buff) mlib_free(pbuff); |
281 | |
282 | return MLIB_SUCCESS; |
283 | } |
284 | |
285 | /***************************************************************/ |
/* Largest number of horizontal kernel taps CONV_FUNC(MxN) handles in one pass over a row. */
#define MAX_KER   7
/* Kernel sizes (n*m) up to this many coefficients use the on-stack copy in the mlib_f32 build. */
#define MAX_NM    81
288 | |
289 | mlib_status CONV_FUNC(MxN)(mlib_image *dst, |
290 | const mlib_image *src, |
291 | const mlib_d64 *ker, |
292 | mlib_s32 m, |
293 | mlib_s32 n, |
294 | mlib_s32 dm, |
295 | mlib_s32 dn, |
296 | mlib_s32 cmask) |
297 | { |
298 | DTYPE k0, k1, k2, k3, k4, k5, k6, *sp; |
299 | DTYPE p0, p1, p2, p3, p4, p5, p6, p7; |
300 | mlib_s32 l, off, kw; |
301 | DEF_VARS(DTYPE); |
302 | mlib_s32 chan2 = chan1 + chan1; |
303 | mlib_s32 chan3 = chan1 + chan2; |
304 | |
305 | #ifdef TYPE_DOUBLE |
306 | const mlib_d64 *k = ker; |
307 | #else |
308 | mlib_f32 k_arr[MAX_NM], *k = k_arr; |
309 | |
310 | if (n*m > MAX_NM) { |
311 | k = mlib_malloc(n*m*sizeof(mlib_f32)); |
312 | |
313 | if (k == NULL) return MLIB_FAILURE; |
314 | } |
315 | |
316 | for (i = 0; i < n*m; i++) k[i] = (mlib_f32)ker[i]; |
317 | #endif /* TYPE_DOUBLE */ |
318 | |
319 | if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask); |
320 | |
321 | wid -= (m - 1); |
322 | hgt -= (n - 1); |
323 | adr_dst += dn*dll + dm*chan1; |
324 | |
325 | for (c = 0; c < chan1; c++) { |
326 | if (!(cmask & (1 << (chan1 - 1 - c)))) continue; |
327 | |
328 | sl = adr_src + c; |
329 | dl = adr_dst + c; |
330 | |
331 | for (j = 0; j < hgt; j++) { |
332 | const DTYPE *pk = k; |
333 | |
334 | for (l = 0; l < n; l++) { |
335 | DTYPE *sp0 = sl + l*sll; |
336 | |
337 | for (off = 0; off < m; off += kw, pk += kw, sp0 += chan1) { |
338 | kw = m - off; |
339 | |
340 | if (kw > 2*MAX_KER) kw = MAX_KER; else |
341 | if (kw > MAX_KER) kw = kw/2; |
342 | |
343 | p2 = sp0[0]; p3 = sp0[chan1]; p4 = sp0[chan2]; |
344 | sp0 += chan3; |
345 | p5 = sp0[0]; p6 = sp0[chan1]; p7 = sp0[chan2]; |
346 | |
347 | k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; |
348 | k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; |
349 | |
350 | dp = dl; |
351 | |
352 | if (kw == 7) { |
353 | sp = sp0 += chan3; |
354 | |
355 | if (pk == k) { |
356 | #ifdef __SUNPRO_C |
357 | #pragma pipeloop(0) |
358 | #endif /* __SUNPRO_C */ |
359 | for (i = 0; i <= (wid - 2); i += 2) { |
360 | p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; |
361 | |
362 | p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1]; |
363 | |
364 | dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; |
365 | dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; |
366 | |
367 | sp += chan2; |
368 | dp += chan2; |
369 | } |
370 | |
371 | } else { |
372 | #ifdef __SUNPRO_C |
373 | #pragma pipeloop(0) |
374 | #endif /* __SUNPRO_C */ |
375 | for (i = 0; i <= (wid - 2); i += 2) { |
376 | p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; |
377 | |
378 | p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1]; |
379 | |
380 | dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; |
381 | dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; |
382 | |
383 | sp += chan2; |
384 | dp += chan2; |
385 | } |
386 | } |
387 | |
388 | } else if (kw == 6) { |
389 | sp = sp0 += chan2; |
390 | |
391 | if (pk == k) { |
392 | #ifdef __SUNPRO_C |
393 | #pragma pipeloop(0) |
394 | #endif /* __SUNPRO_C */ |
395 | for (i = 0; i <= (wid - 2); i += 2) { |
396 | p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; |
397 | |
398 | p5 = sp[0]; p6 = sp[chan1]; |
399 | |
400 | dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; |
401 | dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; |
402 | |
403 | sp += chan2; |
404 | dp += chan2; |
405 | } |
406 | |
407 | } else { |
408 | #ifdef __SUNPRO_C |
409 | #pragma pipeloop(0) |
410 | #endif /* __SUNPRO_C */ |
411 | for (i = 0; i <= (wid - 2); i += 2) { |
412 | p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; |
413 | |
414 | p5 = sp[0]; p6 = sp[chan1]; |
415 | |
416 | dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; |
417 | dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; |
418 | |
419 | sp += chan2; |
420 | dp += chan2; |
421 | } |
422 | } |
423 | |
424 | } else if (kw == 5) { |
425 | sp = sp0 += chan1; |
426 | |
427 | if (pk == k) { |
428 | #ifdef __SUNPRO_C |
429 | #pragma pipeloop(0) |
430 | #endif /* __SUNPRO_C */ |
431 | for (i = 0; i <= (wid - 2); i += 2) { |
432 | p0 = p2; p1 = p3; p2 = p4; p3 = p5; |
433 | |
434 | p4 = sp[0]; p5 = sp[chan1]; |
435 | |
436 | dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; |
437 | dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; |
438 | |
439 | sp += chan2; |
440 | dp += chan2; |
441 | } |
442 | |
443 | } else { |
444 | #ifdef __SUNPRO_C |
445 | #pragma pipeloop(0) |
446 | #endif /* __SUNPRO_C */ |
447 | for (i = 0; i <= (wid - 2); i += 2) { |
448 | p0 = p2; p1 = p3; p2 = p4; p3 = p5; |
449 | |
450 | p4 = sp[0]; p5 = sp[chan1]; |
451 | |
452 | dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; |
453 | dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; |
454 | |
455 | sp += chan2; |
456 | dp += chan2; |
457 | } |
458 | } |
459 | |
460 | } else if (kw == 4) { |
461 | |
462 | sp = sp0; |
463 | |
464 | if (pk == k) { |
465 | #ifdef __SUNPRO_C |
466 | #pragma pipeloop(0) |
467 | #endif /* __SUNPRO_C */ |
468 | for (i = 0; i <= (wid - 2); i += 2) { |
469 | p0 = p2; p1 = p3; p2 = p4; |
470 | |
471 | p3 = sp[0]; p4 = sp[chan1]; |
472 | |
473 | dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3; |
474 | dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3; |
475 | |
476 | sp += chan2; |
477 | dp += chan2; |
478 | } |
479 | |
480 | } else { |
481 | #ifdef __SUNPRO_C |
482 | #pragma pipeloop(0) |
483 | #endif /* __SUNPRO_C */ |
484 | for (i = 0; i <= (wid - 2); i += 2) { |
485 | p0 = p2; p1 = p3; p2 = p4; |
486 | |
487 | p3 = sp[0]; p4 = sp[chan1]; |
488 | |
489 | dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; |
490 | dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; |
491 | |
492 | sp += chan2; |
493 | dp += chan2; |
494 | } |
495 | } |
496 | |
497 | } else if (kw == 3) { |
498 | sp = sp0 -= chan1; |
499 | |
500 | if (pk == k) { |
501 | #ifdef __SUNPRO_C |
502 | #pragma pipeloop(0) |
503 | #endif /* __SUNPRO_C */ |
504 | for (i = 0; i <= (wid - 2); i += 2) { |
505 | p0 = p2; p1 = p3; |
506 | |
507 | p2 = sp[0]; p3 = sp[chan1]; |
508 | |
509 | dp[0 ] = p0*k0 + p1*k1 + p2*k2; |
510 | dp[chan1] = p1*k0 + p2*k1 + p3*k2; |
511 | |
512 | sp += chan2; |
513 | dp += chan2; |
514 | } |
515 | |
516 | } else { |
517 | #ifdef __SUNPRO_C |
518 | #pragma pipeloop(0) |
519 | #endif /* __SUNPRO_C */ |
520 | for (i = 0; i <= (wid - 2); i += 2) { |
521 | p0 = p2; p1 = p3; |
522 | |
523 | p2 = sp[0]; p3 = sp[chan1]; |
524 | |
525 | dp[0 ] += p0*k0 + p1*k1 + p2*k2; |
526 | dp[chan1] += p1*k0 + p2*k1 + p3*k2; |
527 | |
528 | sp += chan2; |
529 | dp += chan2; |
530 | } |
531 | } |
532 | |
533 | } else { /* kw == 2 */ |
534 | sp = sp0 -= chan2; |
535 | |
536 | if (pk == k) { |
537 | #ifdef __SUNPRO_C |
538 | #pragma pipeloop(0) |
539 | #endif /* __SUNPRO_C */ |
540 | for (i = 0; i <= (wid - 2); i += 2) { |
541 | p0 = p2; |
542 | |
543 | p1 = sp[0]; p2 = sp[chan1]; |
544 | |
545 | dp[0 ] = p0*k0 + p1*k1; |
546 | dp[chan1] = p1*k0 + p2*k1; |
547 | |
548 | sp += chan2; |
549 | dp += chan2; |
550 | } |
551 | |
552 | } else { |
553 | #ifdef __SUNPRO_C |
554 | #pragma pipeloop(0) |
555 | #endif /* __SUNPRO_C */ |
556 | for (i = 0; i <= (wid - 2); i += 2) { |
557 | p0 = p2; |
558 | |
559 | p1 = sp[0]; p2 = sp[chan1]; |
560 | |
561 | dp[0 ] += p0*k0 + p1*k1; |
562 | dp[chan1] += p1*k0 + p2*k1; |
563 | |
564 | sp += chan2; |
565 | dp += chan2; |
566 | } |
567 | } |
568 | } |
569 | } |
570 | } |
571 | |
572 | /* last pixels */ |
573 | |
574 | if (wid & 1) { |
575 | DTYPE *sp0 = sl + i*chan1, s = 0; |
576 | const DTYPE *pk = k; |
577 | mlib_s32 x; |
578 | |
579 | for (l = 0; l < n; l++) { |
580 | DTYPE *sp = sp0 + l*sll; |
581 | |
582 | for (x = 0; x < m; x++) s += sp[x*chan1] * (*pk++); |
583 | } |
584 | |
585 | dp[0] = s; |
586 | } |
587 | |
588 | /* next line */ |
589 | sl += sll; |
590 | dl += dll; |
591 | } |
592 | } |
593 | |
594 | #ifndef TYPE_DOUBLE |
595 | |
596 | if (k != k_arr) mlib_free(k); |
597 | #endif /* TYPE_DOUBLE */ |
598 | |
599 | return MLIB_SUCCESS; |
600 | } |
601 | |
602 | /***************************************************************/ |
603 | |