/*
 * Copyright (c) 2021 - 2023 the ThorVG project. All rights reserved.

 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:

 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.

 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifdef THORVG_AVX_VECTOR_SUPPORT

#include <immintrin.h>

#define N_32BITS_IN_128REG 4
#define N_32BITS_IN_256REG 8

static inline __m128i ALPHA_BLEND(__m128i c, __m128i a)
{
    //1. set the masks for the A/G and R/B channels
    auto AG = _mm_set1_epi32(0xff00ff00);
    auto RB = _mm_set1_epi32(0x00ff00ff);

    //2. mask the alpha vector - originally the quartet [a, a, a, a]
    auto aAG = _mm_and_si128(a, AG);
    auto aRB = _mm_and_si128(a, RB);

    //3. calculate the alpha blending of the 2nd and 4th channels (R and B)
    //- mask the color vector
    //- multiply it by the masked alpha vector
    //- add the correction to compensate for dividing by 256 (bit shift) instead of by 255
    //- shift the bits right by 8, corresponding to the division by 256
    auto even = _mm_and_si128(c, RB);
    even = _mm_mullo_epi16(even, aRB);
    even = _mm_add_epi16(even, RB);
    even = _mm_srli_epi16(even, 8);

    //4. calculate the alpha blending of the 1st and 3rd channels (A and G):
    //- mask the color vector
    //- multiply it by the corresponding masked alpha vector, storing the high 16 bits of each product
    //- add the correction to compensate for dividing by 256 instead of by 255
    //- mask out the low 8 bits - the quotient already sits in the high byte of each 16-bit lane, so no shift is needed
    auto odd = _mm_and_si128(c, AG);
    odd = _mm_mulhi_epu16(odd, aAG);
    odd = _mm_add_epi16(odd, RB);
    odd = _mm_and_si128(odd, AG);

    //5. the final result
    return _mm_or_si128(odd, even);
}
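

//For reference, a minimal scalar sketch of the same per-pixel math, assuming
//the 0xAARRGGBB premultiplied layout implied above. This helper and its name
//are illustrative only - the engine's real scalar ALPHA_BLEND overload is
//defined elsewhere in the raster code - and it merely mirrors the SIMD steps:
static inline uint32_t ALPHA_BLEND_SCALAR_SKETCH(uint32_t c, uint32_t a)
{
    //R and B channels: multiply, add the +255 correction, shift right by 8
    auto rb = (((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff;
    //A and G channels: the same math - the quotient already sits in the high bytes
    auto ag = (((c >> 8) & 0x00ff00ff) * a + 0x00ff00ff) & 0xff00ff00;
    return ag | rb;
}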


static void avxRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
{
    //1. calculate how many iterations we need to cover the length
    uint32_t iterations = len / N_32BITS_IN_256REG;
    uint32_t avxFilled = iterations * N_32BITS_IN_256REG;

    //2. set the beginning of the array
    dst += offset;

    //3. fill the octets - eight 32-bit values per 256-bit store
    for (uint32_t i = 0; i < iterations; ++i, dst += N_32BITS_IN_256REG) {
        _mm256_storeu_si256((__m256i*)dst, _mm256_set1_epi32(val));
    }

    //4. fill the leftovers (the loop above has already advanced dst past the avx-filled part)
    int32_t leftovers = len - avxFilled;
    while (leftovers--) *dst++ = val;
}
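

//Usage sketch with hypothetical values: fill 100 ARGB8888 pixels of a
//scanline buffer, starting 25 pixels in, with a solid color:
//
//    avxRasterPixel32(scanline, 0xff336699, 25, 100);
//
//Note that _mm256_storeu_si256 tolerates unaligned addresses, so unlike the
//128-bit paths below, no alignment preamble is needed here.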


static bool avxRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
    if (surface->channelSize != sizeof(uint32_t)) {
        TVGERR("SW_ENGINE", "Unsupported Channel Size = %d", surface->channelSize);
        return false;
    }

    auto color = surface->join(r, g, b, a);
    auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
    auto h = static_cast<uint32_t>(region.max.y - region.min.y);
    auto w = static_cast<uint32_t>(region.max.x - region.min.x);

    uint32_t ialpha = 255 - a;

    auto avxColor = _mm_set1_epi32(color);
    auto avxIalpha = _mm_set1_epi8(ialpha);

    for (uint32_t y = 0; y < h; ++y) {
        auto dst = &buffer[y * surface->stride];

        //1. fill the unaligned leading pixels (128-bit stores require a 16-byte alignment)
        auto notAligned = ((uintptr_t)dst & 0xf) / 4;
        if (notAligned) {
            notAligned = (N_32BITS_IN_128REG - notAligned > w ? w : N_32BITS_IN_128REG - notAligned);
            for (uint32_t x = 0; x < notAligned; ++x, ++dst) {
                *dst = color + ALPHA_BLEND(*dst, ialpha);
            }
        }

        //2. fill the aligned memory - N_32BITS_IN_128REG pixels processed at once
        uint32_t iterations = (w - notAligned) / N_32BITS_IN_128REG;
        uint32_t avxFilled = iterations * N_32BITS_IN_128REG;
        auto avxDst = (__m128i*)dst;
        for (uint32_t x = 0; x < iterations; ++x, ++avxDst) {
            *avxDst = _mm_add_epi32(avxColor, ALPHA_BLEND(*avxDst, avxIalpha));
        }

        //3. fill the remaining pixels
        int32_t leftovers = w - notAligned - avxFilled;
        dst += avxFilled;
        while (leftovers--) {
            *dst = color + ALPHA_BLEND(*dst, ialpha);
            dst++;
        }
    }
    return true;
}
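

//A short worked example of the per-pixel math above: with a = 128 (so
//ialpha = 127) and a dst channel value of 200, the blended channel becomes
//src + (200 * 127 + 255) / 256 = src + 100, i.e. the premultiplied
//source-over "src + dst * (255 - a) / 255" with the +255/>>8 trick
//approximating the division by 255.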


static bool avxRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
    if (surface->channelSize != sizeof(uint32_t)) {
        TVGERR("SW_ENGINE", "Unsupported Channel Size = %d", surface->channelSize);
        return false;
    }

    auto color = surface->join(r, g, b, a);
    auto span = rle->spans;
    uint32_t src;

    for (uint32_t i = 0; i < rle->size; ++i) {
        auto dst = &surface->buf32[span->y * surface->stride + span->x];

        if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
        else src = color;

        auto ialpha = IA(src);

        //1. fill the unaligned leading pixels (128-bit stores require a 16-byte alignment)
        auto notAligned = ((uintptr_t)dst & 0xf) / 4;
        if (notAligned) {
            notAligned = (N_32BITS_IN_128REG - notAligned > span->len ? span->len : N_32BITS_IN_128REG - notAligned);
            for (uint32_t x = 0; x < notAligned; ++x, ++dst) {
                *dst = src + ALPHA_BLEND(*dst, ialpha);
            }
        }

        //2. fill the aligned memory using avx - N_32BITS_IN_128REG pixels processed at once
        //To avoid unnecessary AVX variable declarations, first check whether there is at least one full iteration
        uint32_t iterations = (span->len - notAligned) / N_32BITS_IN_128REG;
        uint32_t avxFilled = 0;
        if (iterations > 0) {
            auto avxSrc = _mm_set1_epi32(src);
            auto avxIalpha = _mm_set1_epi8(ialpha);

            avxFilled = iterations * N_32BITS_IN_128REG;
            auto avxDst = (__m128i*)dst;
            for (uint32_t x = 0; x < iterations; ++x, ++avxDst) {
                *avxDst = _mm_add_epi32(avxSrc, ALPHA_BLEND(*avxDst, avxIalpha));
            }
        }

        //3. fill the remaining pixels
        int32_t leftovers = span->len - notAligned - avxFilled;
        dst += avxFilled;
        while (leftovers--) {
            *dst = src + ALPHA_BLEND(*dst, ialpha);
            dst++;
        }

        ++span;
    }
    return true;
}
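

//Coverage pre-scaling sketch with hypothetical numbers: for
//color = 0xff0000ff (premultiplied opaque blue) and a span with
//coverage = 128, src = ALPHA_BLEND(color, 128) yields 0x80000080 (assuming
//the scalar overload uses the same +255/>>8 math as the SIMD version), and
//ialpha = IA(src) = 255 - 0x80 = 127, so each pixel in the span receives a
//half-covered source-over of the blue.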


#endif