1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_BUILD_OPT_AVX2 |
9 | |
10 | #include "./blpixelconverter_p.h" |
11 | #include "./blsimd_p.h" |
12 | #include "./blsupport_p.h" |
13 | |
14 | // ============================================================================ |
15 | // [BLPixelConverter - PRGB32 <- XRGB32 (AVX2)] |
16 | // ============================================================================ |
17 | |
18 | static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_avx2( |
19 | const BLPixelConverterCore* self, |
20 | uint8_t* dstData, intptr_t dstStride, |
21 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
22 | |
23 | using namespace SIMD; |
24 | if (!options) |
25 | options = &blPixelConverterDefaultOptions; |
26 | |
27 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
28 | size_t gap = options->gap; |
29 | |
30 | dstStride -= (w * 4) + gap; |
31 | srcStride -= (w * 4); |
32 | |
33 | I256 fillMask = vseti256u32(d.fillMask); |
34 | I256 predicate = vdupli128(vloadi128u(d.simdData)); |
35 | |
36 | for (uint32_t y = h; y != 0; y--) { |
37 | uint32_t i = w; |
38 | |
39 | while (i >= 32) { |
40 | I256 p0, p1, p2, p3; |
41 | |
42 | p0 = vloadi256u(srcData + 0); |
43 | p1 = vloadi256u(srcData + 32); |
44 | p2 = vloadi256u(srcData + 64); |
45 | p3 = vloadi256u(srcData + 96); |
46 | |
47 | p0 = vpshufb(p0, predicate); |
48 | p1 = vpshufb(p1, predicate); |
49 | p2 = vpshufb(p2, predicate); |
50 | p3 = vpshufb(p3, predicate); |
51 | |
52 | p0 = vor(p0, fillMask); |
53 | p1 = vor(p1, fillMask); |
54 | p2 = vor(p2, fillMask); |
55 | p3 = vor(p3, fillMask); |
56 | |
57 | vstorei256u(dstData + 0, p0); |
58 | vstorei256u(dstData + 32, p1); |
59 | vstorei256u(dstData + 64, p2); |
60 | vstorei256u(dstData + 96, p3); |
61 | |
62 | dstData += 128; |
63 | srcData += 128; |
64 | i -= 32; |
65 | } |
66 | |
67 | while (i >= 8) { |
68 | I256 p0; |
69 | |
70 | p0 = vloadi256u(srcData); |
71 | p0 = vpshufb(p0, predicate); |
72 | p0 = vor(p0, fillMask); |
73 | vstorei256u(dstData, p0); |
74 | |
75 | dstData += 32; |
76 | srcData += 32; |
77 | i -= 8; |
78 | } |
79 | |
80 | while (i) { |
81 | I128 p0; |
82 | |
83 | p0 = vloadi128_32(srcData); |
84 | p0 = vpshufb(p0, vcast<I128>(predicate)); |
85 | p0 = vor(p0, vcast<I128>(fillMask)); |
86 | vstorei32(dstData, p0); |
87 | |
88 | dstData += 4; |
89 | srcData += 4; |
90 | i--; |
91 | } |
92 | |
93 | dstData = blPixelConverterFillGap(dstData, gap); |
94 | dstData += dstStride; |
95 | srcData += srcStride; |
96 | } |
97 | |
98 | return BL_SUCCESS; |
99 | } |
100 | |
101 | // ============================================================================ |
102 | // [BLPixelConverter - PRGB32 <- ARGB32 (AVX2)] |
103 | // ============================================================================ |
104 | |
105 | static BLResult BL_CDECL bl_convert_prgb32_from_argb32_avx2( |
106 | const BLPixelConverterCore* self, |
107 | uint8_t* dstData, intptr_t dstStride, |
108 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
109 | |
110 | using namespace SIMD; |
111 | if (!options) |
112 | options = &blPixelConverterDefaultOptions; |
113 | |
114 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
115 | size_t gap = options->gap; |
116 | |
117 | dstStride -= (w * 4) + gap; |
118 | srcStride -= (w * 4); |
119 | |
120 | I256 a255 = vseti256u64(0x00FF000000000000u); |
121 | I256 fillMask = vseti256u32(d.fillMask); |
122 | I256 predicate = vdupli128(vloadi128u(d.simdData)); |
123 | |
124 | for (uint32_t y = h; y != 0; y--) { |
125 | uint32_t i = w; |
126 | |
127 | while (i >= 16) { |
128 | I256 p0, p1, p2, p3; |
129 | |
130 | p0 = vloadi256u(srcData + 0); |
131 | p2 = vloadi256u(srcData + 32); |
132 | |
133 | I256 zero = vzeroi256(); |
134 | p0 = vpshufb(p0, predicate); |
135 | p2 = vpshufb(p2, predicate); |
136 | |
137 | p1 = vunpackhi8(p0, zero); |
138 | p0 = vunpackli8(p0, zero); |
139 | p3 = vunpackhi8(p2, zero); |
140 | p2 = vunpackli8(p2, zero); |
141 | |
142 | p0 = vmuli16(vor(p0, a255), vswizi16<3, 3, 3, 3>(p0)); |
143 | p1 = vmuli16(vor(p1, a255), vswizi16<3, 3, 3, 3>(p1)); |
144 | p2 = vmuli16(vor(p2, a255), vswizi16<3, 3, 3, 3>(p2)); |
145 | p3 = vmuli16(vor(p3, a255), vswizi16<3, 3, 3, 3>(p3)); |
146 | |
147 | p0 = vdiv255u16(p0); |
148 | p1 = vdiv255u16(p1); |
149 | p2 = vdiv255u16(p2); |
150 | p3 = vdiv255u16(p3); |
151 | |
152 | p0 = vpacki16u8(p0, p1); |
153 | p2 = vpacki16u8(p2, p3); |
154 | |
155 | p0 = vor(p0, fillMask); |
156 | p2 = vor(p2, fillMask); |
157 | |
158 | vstorei256u(dstData + 0, p0); |
159 | vstorei256u(dstData + 32, p2); |
160 | |
161 | dstData += 64; |
162 | srcData += 64; |
163 | i -= 16; |
164 | } |
165 | |
166 | while (i >= 4) { |
167 | I128 p0, p1; |
168 | I128 a0, a1; |
169 | |
170 | p0 = vloadi128u(srcData); |
171 | I128 zero = vzeroi128(); |
172 | p0 = vpshufb(p0, vcast<I128>(predicate)); |
173 | |
174 | p1 = vunpackhi8(p0, zero); |
175 | p0 = vunpackli8(p0, zero); |
176 | |
177 | a1 = vswizi16<3, 3, 3, 3>(p1); |
178 | p1 = vor(p1, vcast<I128>(a255)); |
179 | |
180 | a0 = vswizi16<3, 3, 3, 3>(p0); |
181 | p0 = vor(p0, vcast<I128>(a255)); |
182 | |
183 | p1 = vdiv255u16(vmuli16(p1, a1)); |
184 | p0 = vdiv255u16(vmuli16(p0, a0)); |
185 | p0 = vpacki16u8(p0, p1); |
186 | p0 = vor(p0, vcast<I128>(fillMask)); |
187 | vstorei128u(dstData, p0); |
188 | |
189 | dstData += 16; |
190 | srcData += 16; |
191 | i -= 4; |
192 | } |
193 | |
194 | while (i) { |
195 | I128 p0; |
196 | I128 a0; |
197 | |
198 | p0 = vloadi128_32(srcData); |
199 | I128 zero = vzeroi128(); |
200 | |
201 | p0 = vpshufb(p0, vcast<I128>(predicate)); |
202 | p0 = vunpackli8(p0, zero); |
203 | a0 = vswizi16<3, 3, 3, 3>(p0); |
204 | p0 = vor(p0, vcast<I128>(a255)); |
205 | p0 = vdiv255u16(vmuli16(p0, a0)); |
206 | p0 = vpacki16u8(p0, p0); |
207 | p0 = vor(p0, vcast<I128>(fillMask)); |
208 | vstorei32(dstData, p0); |
209 | |
210 | dstData += 4; |
211 | srcData += 4; |
212 | i--; |
213 | } |
214 | |
215 | dstData = blPixelConverterFillGap(dstData, gap); |
216 | dstData += dstStride; |
217 | srcData += srcStride; |
218 | } |
219 | |
220 | return BL_SUCCESS; |
221 | } |
222 | |
223 | // ============================================================================ |
224 | // [BLPixelConverter - Init (AVX2)] |
225 | // ============================================================================ |
226 | |
227 | static BL_INLINE uint32_t blPixelConverterMakePshufbPredicate32(const BLPixelConverterData::NativeFromExternal& d) noexcept { |
228 | uint32_t rIndex = uint32_t(d.shifts[0]) >> 3; |
229 | uint32_t gIndex = uint32_t(d.shifts[1]) >> 3; |
230 | uint32_t bIndex = uint32_t(d.shifts[2]) >> 3; |
231 | uint32_t aIndex = uint32_t(d.shifts[3]) >> 3; |
232 | |
233 | return (rIndex << 16) | (gIndex << 8) | (bIndex << 0) | (aIndex << 24); |
234 | } |
235 | |
236 | bool blPixelConverterInitNativeFromXRGB_AVX2(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept { |
237 | BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
238 | const BLFormatInfo& dstInfo = blPixelConverterFormatInfo[dstFormat]; |
239 | |
240 | if (srcInfo.depth == 32) { |
241 | // Only BYTE aligned components (8888 or X888 formats). |
242 | if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED)) |
243 | return false; |
244 | |
245 | bool isARGB = (srcInfo.flags & BL_FORMAT_FLAG_ALPHA) != 0; |
246 | bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != 0; |
247 | |
248 | switch (dstFormat) { |
249 | case BL_FORMAT_XRGB32: |
250 | case BL_FORMAT_PRGB32: |
251 | d.simdData[0] = blPixelConverterMakePshufbPredicate32(d); |
252 | d.simdData[1] = d.simdData[0] + 0x04040404u; |
253 | d.simdData[2] = d.simdData[0] + 0x08080808u; |
254 | d.simdData[3] = d.simdData[0] + 0x0C0C0C0Cu; |
255 | |
256 | self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_avx2 |
257 | : bl_convert_prgb32_from_xrgb32_avx2; |
258 | return true; |
259 | |
260 | default: |
261 | return false; |
262 | } |
263 | } |
264 | |
265 | return false; |
266 | } |
267 | |
268 | #endif |
269 | |