1// [Blend2D]
2// 2D Vector Graphics Powered by a JIT Compiler.
3//
4// [License]
5// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#ifdef BL_BUILD_OPT_SSE2
9
10#include "./blpixelconverter_p.h"
11#include "./blsimd_p.h"
12#include "./blsupport_p.h"
13
14// ============================================================================
15// [BLPixelConverter - PRGB32 <- XRGB32 (SSE2)]
16// ============================================================================
17
18static BLResult BL_CDECL bl_convert_prgb32_from_xrgb32_sse2(
19 const BLPixelConverterCore* self,
20 uint8_t* dstData, intptr_t dstStride,
21 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
22
23 using namespace SIMD;
24 if (!options)
25 options = &blPixelConverterDefaultOptions;
26
27 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
28 size_t gap = options->gap;
29
30 dstStride -= (w * 4) + gap;
31 srcStride -= (w * 4);
32
33 I128 fillMask = vseti128u32(d.fillMask);
34
35 for (uint32_t y = h; y != 0; y--) {
36 uint32_t i = w;
37
38 while (i >= 16) {
39 I128 p0, p1, p2, p3;
40
41 p0 = vloadi128u(srcData + 0);
42 p1 = vloadi128u(srcData + 16);
43 p2 = vloadi128u(srcData + 32);
44 p3 = vloadi128u(srcData + 48);
45
46 p0 = vor(p0, fillMask);
47 p1 = vor(p1, fillMask);
48 p2 = vor(p2, fillMask);
49 p3 = vor(p3, fillMask);
50
51 vstorei128u(dstData + 0, p0);
52 vstorei128u(dstData + 16, p1);
53 vstorei128u(dstData + 32, p2);
54 vstorei128u(dstData + 48, p3);
55
56 dstData += 64;
57 srcData += 64;
58 i -= 16;
59 }
60
61 while (i >= 4) {
62 I128 p0;
63
64 p0 = vloadi128u(srcData);
65 p0 = vor(p0, fillMask);
66 vstorei128u(dstData, p0);
67
68 dstData += 16;
69 srcData += 16;
70 i -= 4;
71 }
72
73 while (i) {
74 I128 p0;
75
76 p0 = vloadi128_32(srcData);
77 p0 = vor(p0, fillMask);
78 vstorei32(dstData, p0);
79
80 dstData += 4;
81 srcData += 4;
82 i--;
83 }
84
85 dstData = blPixelConverterFillGap(dstData, gap);
86 dstData += dstStride;
87 srcData += srcStride;
88 }
89
90 return BL_SUCCESS;
91}
92
93// ============================================================================
94// [BLPixelConverter - PRGB32 <- ARGB32 (SSE2)]
95// ============================================================================
96
97static BLResult BL_CDECL bl_convert_prgb32_from_argb32_sse2(
98 const BLPixelConverterCore* self,
99 uint8_t* dstData, intptr_t dstStride,
100 const uint8_t* srcData, intptr_t srcStride, uint32_t w, uint32_t h, const BLPixelConverterOptions* options) noexcept {
101
102 using namespace SIMD;
103 if (!options)
104 options = &blPixelConverterDefaultOptions;
105
106 const BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
107 size_t gap = options->gap;
108
109 dstStride -= (w * 4) + gap;
110 srcStride -= (w * 4);
111
112 I128 zero = vzeroi128();
113 I128 a255 = vseti128u64(0x00FF000000000000u);
114 I128 fillMask = vseti128u32(d.fillMask);
115
116 for (uint32_t y = h; y != 0; y--) {
117 uint32_t i = w;
118
119 while (i >= 4) {
120 I128 p0, p1;
121 I128 a0, a1;
122
123 p0 = vloadi128u(srcData);
124
125 p1 = vunpackhi8(p0, zero);
126 p0 = vunpackli8(p0, zero);
127
128 a1 = vswizi16<3, 3, 3, 3>(p1);
129 p1 = vor(p1, a255);
130
131 a0 = vswizi16<3, 3, 3, 3>(p0);
132 p0 = vor(p0, a255);
133
134 p1 = vdiv255u16(vmuli16(p1, a1));
135 p0 = vdiv255u16(vmuli16(p0, a0));
136 p0 = vpacki16u8(p0, p1);
137 p0 = vor(p0, fillMask);
138 vstorei128u(dstData, p0);
139
140 dstData += 16;
141 srcData += 16;
142 i -= 4;
143 }
144
145 while (i) {
146 I128 p0;
147 I128 a0;
148
149 p0 = vloadi128_32(srcData);
150 p0 = vunpackli8(p0, zero);
151 a0 = vswizi16<3, 3, 3, 3>(p0);
152 p0 = vor(p0, a255);
153 p0 = vdiv255u16(vmuli16(p0, a0));
154 p0 = vpacki16u8(p0, p0);
155 p0 = vor(p0, fillMask);
156 vstorei32(dstData, p0);
157
158 dstData += 4;
159 srcData += 4;
160 i--;
161 }
162
163 dstData = blPixelConverterFillGap(dstData, gap);
164 dstData += dstStride;
165 srcData += srcStride;
166 }
167
168 return BL_SUCCESS;
169}
170
171// ============================================================================
172// [BLPixelConverter - Init (SSE2)]
173// ============================================================================
174
175bool blPixelConverterInitNativeFromXRGB_SSE2(BLPixelConverterCore* self, uint32_t dstFormat, const BLFormatInfo& srcInfo) noexcept {
176 BLPixelConverterData::NativeFromExternal& d = blPixelConverterGetData(self)->nativeFromExternal;
177 if (srcInfo.depth == 32) {
178 // Only BYTE aligned components (8888 or X888 formats).
179 if (!(srcInfo.flags & BL_FORMAT_FLAG_BYTE_ALIGNED))
180 return false;
181
182 // Only PRGB32, ARGB32, or XRGB32 formats. See SSSE3 implementation, which
183 // uses PSHUFB instruction and implements optimized conversion between all
184 // possible byte-aligned formats.
185 if (d.shifts[1] != 16 || d.shifts[2] != 8 || d.shifts[3] != 0)
186 return false;
187
188 bool isARGB = d.shifts[0] == 24;
189 bool isPremultiplied = (srcInfo.flags & BL_FORMAT_FLAG_PREMULTIPLIED) != 0;
190
191 switch (dstFormat) {
192 case BL_FORMAT_XRGB32:
193 case BL_FORMAT_PRGB32:
194 self->convertFunc = (isARGB && !isPremultiplied) ? bl_convert_prgb32_from_argb32_sse2
195 : bl_convert_prgb32_from_xrgb32_sse2;
196 return true;
197
198 default:
199 break;
200 }
201 }
202
203 return false;
204}
205
206#endif
207