1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #ifdef BL_BUILD_OPT_SSE2 |
9 | |
10 | #include "./blpixelconverter_p.h" |
11 | #include "./blsimd_p.h" |
12 | #include "./blsupport_p.h" |
13 | |
14 | // ============================================================================ |
15 | // [BLPixelConverter - PRGB32 <- XRGB32 (SSE2)] |
16 | // ============================================================================ |
17 | |
18 | static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_sse2( |
19 | const BLPixelConverterCore* self, |
20 | uint8_t* dstData, intptr_t dstStride, |
21 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
22 | |
23 | using namespace SIMD; |
24 | if (!options) |
25 | options = &blPixelConverterDefaultOptions; |
26 | |
27 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
28 | size_t gap = options->gap; |
29 | |
30 | dstStride -= (w * 4) + gap; |
31 | srcStride -= (w * 4); |
32 | |
33 | I128 fillMask = vseti128u32(d.fillMask); |
34 | |
35 | for (uint32_t y = h; y != 0; y--) { |
36 | uint32_t i = w; |
37 | |
38 | while (i >= 16) { |
39 | I128 p0, p1, p2, p3; |
40 | |
41 | p0 = vloadi128u(srcData + 0); |
42 | p1 = vloadi128u(srcData + 16); |
43 | p2 = vloadi128u(srcData + 32); |
44 | p3 = vloadi128u(srcData + 48); |
45 | |
46 | p0 = vor(p0, fillMask); |
47 | p1 = vor(p1, fillMask); |
48 | p2 = vor(p2, fillMask); |
49 | p3 = vor(p3, fillMask); |
50 | |
51 | vstorei128u(dstData + 0, p0); |
52 | vstorei128u(dstData + 16, p1); |
53 | vstorei128u(dstData + 32, p2); |
54 | vstorei128u(dstData + 48, p3); |
55 | |
56 | dstData += 64; |
57 | srcData += 64; |
58 | i -= 16; |
59 | } |
60 | |
61 | while (i >= 4) { |
62 | I128 p0; |
63 | |
64 | p0 = vloadi128u(srcData); |
65 | p0 = vor(p0, fillMask); |
66 | vstorei128u(dstData, p0); |
67 | |
68 | dstData += 16; |
69 | srcData += 16; |
70 | i -= 4; |
71 | } |
72 | |
73 | while (i) { |
74 | I128 p0; |
75 | |
76 | p0 = vloadi128_32(srcData); |
77 | p0 = vor(p0, fillMask); |
78 | vstorei32(dstData, p0); |
79 | |
80 | dstData += 4; |
81 | srcData += 4; |
82 | i--; |
83 | } |
84 | |
85 | dstData = blPixelConverterFillGap(dstData, gap); |
86 | dstData += dstStride; |
87 | srcData += srcStride; |
88 | } |
89 | |
90 | return BL_SUCCESS; |
91 | } |
92 | |
93 | // ============================================================================ |
94 | // [BLPixelConverter - PRGB32 <- ARGB32 (SSE2)] |
95 | // ============================================================================ |
96 | |
97 | static BLResult BL_CDECL bl_convert_prgb32_from_argb32_sse2( |
98 | const BLPixelConverterCore* self, |
99 | uint8_t* dstData, intptr_t dstStride, |
100 | const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept { |
101 | |
102 | using namespace SIMD; |
103 | if (!options) |
104 | options = &blPixelConverterDefaultOptions; |
105 | |
106 | const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
107 | size_t gap = options->gap; |
108 | |
109 | dstStride -= (w * 4) + gap; |
110 | srcStride -= (w * 4); |
111 | |
112 | I128 zero = vzeroi128(); |
113 | I128 a255 = vseti128u64(0x00FF000000000000u); |
114 | I128 fillMask = vseti128u32(d.fillMask); |
115 | |
116 | for (uint32_t y = h; y != 0; y--) { |
117 | uint32_t i = w; |
118 | |
119 | while (i >= 4) { |
120 | I128 p0, p1; |
121 | I128 a0, a1; |
122 | |
123 | p0 = vloadi128u(srcData); |
124 | |
125 | p1 = vunpackhi8(p0, zero); |
126 | p0 = vunpackli8(p0, zero); |
127 | |
128 | a1 = vswizi16<3, 3, 3, 3>(p1); |
129 | p1 = vor(p1, a255); |
130 | |
131 | a0 = vswizi16<3, 3, 3, 3>(p0); |
132 | p0 = vor(p0, a255); |
133 | |
134 | p1 = vdiv255u16(vmuli16(p1, a1)); |
135 | p0 = vdiv255u16(vmuli16(p0, a0)); |
136 | p0 = vpacki16u8(p0, p1); |
137 | p0 = vor(p0, fillMask); |
138 | vstorei128u(dstData, p0); |
139 | |
140 | dstData += 16; |
141 | srcData += 16; |
142 | i -= 4; |
143 | } |
144 | |
145 | while (i) { |
146 | I128 p0; |
147 | I128 a0; |
148 | |
149 | p0 = vloadi128_32(srcData); |
150 | p0 = vunpackli8(p0, zero); |
151 | a0 = vswizi16<3, 3, 3, 3>(p0); |
152 | p0 = vor(p0, a255); |
153 | p0 = vdiv255u16(vmuli16(p0, a0)); |
154 | p0 = vpacki16u8(p0, p0); |
155 | p0 = vor(p0, fillMask); |
156 | vstorei32(dstData, p0); |
157 | |
158 | dstData += 4; |
159 | srcData += 4; |
160 | i--; |
161 | } |
162 | |
163 | dstData = blPixelConverterFillGap(dstData, gap); |
164 | dstData += dstStride; |
165 | srcData += srcStride; |
166 | } |
167 | |
168 | return BL_SUCCESS; |
169 | } |
170 | |
171 | // ============================================================================ |
172 | // [BLPixelConverter - Init (SSE2)] |
173 | // ============================================================================ |
174 | |
175 | bool blPixelConverterInitNativeFromXRGB_SSE2(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept { |
176 | BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal; |
177 | if (srcInfo.depth == 32) { |
178 | // Only BYTE aligned components (8888 or X888 formats). |
179 | if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED)) |
180 | return false; |
181 | |
182 | // Only PRGB32, ARGB32, or XRGB32 formats. See SSSE3 implementation, which |
183 | // uses PSHUFB instruction and implements optimized conversion between all |
184 | // possible byte-aligned formats. |
185 | if (d.shifts[1] != 16 || d.shifts[2] != 8 || d.shifts[3] != 0) |
186 | return false; |
187 | |
188 | bool isARGB = d.shifts[0] == 24; |
189 | bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != 0; |
190 | |
191 | switch (dstFormat) { |
192 | case BL_FORMAT_XRGB32: |
193 | case BL_FORMAT_PRGB32: |
194 | self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_sse2 |
195 | : bl_convert_prgb32_from_xrgb32_sse2; |
196 | return true; |
197 | |
198 | default: |
199 | break; |
200 | } |
201 | } |
202 | |
203 | return false; |
204 | } |
205 | |
206 | #endif |
207 | |