1 | /* |
2 | Simple DirectMedia Layer |
3 | Copyright (C) 1997-2021 Sam Lantinga <slouken@libsdl.org> |
4 | |
5 | This software is provided 'as-is', without any express or implied |
6 | warranty. In no event will the authors be held liable for any damages |
7 | arising from the use of this software. |
8 | |
9 | Permission is granted to anyone to use this software for any purpose, |
10 | including commercial applications, and to alter it and redistribute it |
11 | freely, subject to the following restrictions: |
12 | |
13 | 1. The origin of this software must not be misrepresented; you must not |
14 | claim that you wrote the original software. If you use this software |
15 | in a product, an acknowledgment in the product documentation would be |
16 | appreciated but is not required. |
17 | 2. Altered source versions must be plainly marked as such, and must not be |
18 | misrepresented as being the original software. |
19 | 3. This notice may not be removed or altered from any source distribution. |
20 | */ |
21 | #include "../SDL_internal.h" |
22 | |
23 | #include "SDL_audio.h" |
24 | #include "SDL_audio_c.h" |
25 | #include "SDL_cpuinfo.h" |
26 | |
/* Note which SIMD intrinsic families the compiler has enabled for this build. */
#if defined(__ARM_NEON)
#define HAVE_NEON_INTRINSICS 1
#endif

#if defined(__SSE2__)
#define HAVE_SSE2_INTRINSICS 1
#endif

/* Targets on which a SIMD code path is always available don't need the
   scalar converters compiled in; each case below notes the guarantee. */
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
/* Mac OS X/Intel guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
/* ARMv8+ promise NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
/* All Apple ARMv7 chips promise NEON support. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#endif

/* Every other target keeps the scalar fallbacks (this is zero only if the
   platform is guaranteed to use a SIMD codepath here). */
#if !defined(NEED_SCALAR_CONVERTER_FALLBACKS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
49 | |
50 | /* Function pointers set to a CPU-specific implementation. */ |
51 | SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL; |
52 | SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL; |
53 | SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL; |
54 | SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL; |
55 | SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL; |
56 | SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL; |
57 | SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL; |
58 | SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL; |
59 | SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL; |
60 | SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL; |
61 | |
62 | |
/* Reciprocal scale factors used when converting integer samples to float:
   multiplying by one of these stands in for dividing by the named value. */
#define DIVBY128     7.8125e-3f                 /* 1 / 128 */
#define DIVBY32768   3.0517578125e-5f           /* 1 / 32768 */
#define DIVBY8388607 1.1920930376163766e-7f     /* 1 / 8388607 */
66 | |
67 | |
68 | #if NEED_SCALAR_CONVERTER_FALLBACKS |
69 | static void SDLCALL |
70 | SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
71 | { |
72 | const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; |
73 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; |
74 | int i; |
75 | |
76 | LOG_DEBUG_CONVERT("AUDIO_S8" , "AUDIO_F32" ); |
77 | |
78 | for (i = cvt->len_cvt; i; --i, --src, --dst) { |
79 | *dst = ((float) *src) * DIVBY128; |
80 | } |
81 | |
82 | cvt->len_cvt *= 4; |
83 | if (cvt->filters[++cvt->filter_index]) { |
84 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
85 | } |
86 | } |
87 | |
88 | static void SDLCALL |
89 | SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
90 | { |
91 | const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; |
92 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; |
93 | int i; |
94 | |
95 | LOG_DEBUG_CONVERT("AUDIO_U8" , "AUDIO_F32" ); |
96 | |
97 | for (i = cvt->len_cvt; i; --i, --src, --dst) { |
98 | *dst = (((float) *src) * DIVBY128) - 1.0f; |
99 | } |
100 | |
101 | cvt->len_cvt *= 4; |
102 | if (cvt->filters[++cvt->filter_index]) { |
103 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
104 | } |
105 | } |
106 | |
107 | static void SDLCALL |
108 | SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
109 | { |
110 | const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; |
111 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; |
112 | int i; |
113 | |
114 | LOG_DEBUG_CONVERT("AUDIO_S16" , "AUDIO_F32" ); |
115 | |
116 | for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) { |
117 | *dst = ((float) *src) * DIVBY32768; |
118 | } |
119 | |
120 | cvt->len_cvt *= 2; |
121 | if (cvt->filters[++cvt->filter_index]) { |
122 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
123 | } |
124 | } |
125 | |
126 | static void SDLCALL |
127 | SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
128 | { |
129 | const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; |
130 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; |
131 | int i; |
132 | |
133 | LOG_DEBUG_CONVERT("AUDIO_U16" , "AUDIO_F32" ); |
134 | |
135 | for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) { |
136 | *dst = (((float) *src) * DIVBY32768) - 1.0f; |
137 | } |
138 | |
139 | cvt->len_cvt *= 2; |
140 | if (cvt->filters[++cvt->filter_index]) { |
141 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
142 | } |
143 | } |
144 | |
145 | static void SDLCALL |
146 | SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
147 | { |
148 | const Sint32 *src = (const Sint32 *) cvt->buf; |
149 | float *dst = (float *) cvt->buf; |
150 | int i; |
151 | |
152 | LOG_DEBUG_CONVERT("AUDIO_S32" , "AUDIO_F32" ); |
153 | |
154 | for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) { |
155 | *dst = ((float) (*src>>8)) * DIVBY8388607; |
156 | } |
157 | |
158 | if (cvt->filters[++cvt->filter_index]) { |
159 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
160 | } |
161 | } |
162 | |
163 | static void SDLCALL |
164 | SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
165 | { |
166 | const float *src = (const float *) cvt->buf; |
167 | Sint8 *dst = (Sint8 *) cvt->buf; |
168 | int i; |
169 | |
170 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_S8" ); |
171 | |
172 | for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { |
173 | const float sample = *src; |
174 | if (sample >= 1.0f) { |
175 | *dst = 127; |
176 | } else if (sample <= -1.0f) { |
177 | *dst = -128; |
178 | } else { |
179 | *dst = (Sint8)(sample * 127.0f); |
180 | } |
181 | } |
182 | |
183 | cvt->len_cvt /= 4; |
184 | if (cvt->filters[++cvt->filter_index]) { |
185 | cvt->filters[cvt->filter_index](cvt, AUDIO_S8); |
186 | } |
187 | } |
188 | |
189 | static void SDLCALL |
190 | SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
191 | { |
192 | const float *src = (const float *) cvt->buf; |
193 | Uint8 *dst = (Uint8 *) cvt->buf; |
194 | int i; |
195 | |
196 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_U8" ); |
197 | |
198 | for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { |
199 | const float sample = *src; |
200 | if (sample >= 1.0f) { |
201 | *dst = 255; |
202 | } else if (sample <= -1.0f) { |
203 | *dst = 0; |
204 | } else { |
205 | *dst = (Uint8)((sample + 1.0f) * 127.0f); |
206 | } |
207 | } |
208 | |
209 | cvt->len_cvt /= 4; |
210 | if (cvt->filters[++cvt->filter_index]) { |
211 | cvt->filters[cvt->filter_index](cvt, AUDIO_U8); |
212 | } |
213 | } |
214 | |
215 | static void SDLCALL |
216 | SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
217 | { |
218 | const float *src = (const float *) cvt->buf; |
219 | Sint16 *dst = (Sint16 *) cvt->buf; |
220 | int i; |
221 | |
222 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_S16" ); |
223 | |
224 | for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { |
225 | const float sample = *src; |
226 | if (sample >= 1.0f) { |
227 | *dst = 32767; |
228 | } else if (sample <= -1.0f) { |
229 | *dst = -32768; |
230 | } else { |
231 | *dst = (Sint16)(sample * 32767.0f); |
232 | } |
233 | } |
234 | |
235 | cvt->len_cvt /= 2; |
236 | if (cvt->filters[++cvt->filter_index]) { |
237 | cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); |
238 | } |
239 | } |
240 | |
241 | static void SDLCALL |
242 | SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
243 | { |
244 | const float *src = (const float *) cvt->buf; |
245 | Uint16 *dst = (Uint16 *) cvt->buf; |
246 | int i; |
247 | |
248 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_U16" ); |
249 | |
250 | for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { |
251 | const float sample = *src; |
252 | if (sample >= 1.0f) { |
253 | *dst = 65535; |
254 | } else if (sample <= -1.0f) { |
255 | *dst = 0; |
256 | } else { |
257 | *dst = (Uint16)((sample + 1.0f) * 32767.0f); |
258 | } |
259 | } |
260 | |
261 | cvt->len_cvt /= 2; |
262 | if (cvt->filters[++cvt->filter_index]) { |
263 | cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); |
264 | } |
265 | } |
266 | |
267 | static void SDLCALL |
268 | SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
269 | { |
270 | const float *src = (const float *) cvt->buf; |
271 | Sint32 *dst = (Sint32 *) cvt->buf; |
272 | int i; |
273 | |
274 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_S32" ); |
275 | |
276 | for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { |
277 | const float sample = *src; |
278 | if (sample >= 1.0f) { |
279 | *dst = 2147483647; |
280 | } else if (sample <= -1.0f) { |
281 | *dst = (Sint32) -2147483648LL; |
282 | } else { |
283 | *dst = ((Sint32)(sample * 8388607.0f)) << 8; |
284 | } |
285 | } |
286 | |
287 | if (cvt->filters[++cvt->filter_index]) { |
288 | cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); |
289 | } |
290 | } |
291 | #endif |
292 | |
293 | |
294 | #if HAVE_SSE2_INTRINSICS |
295 | static void SDLCALL |
296 | SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
297 | { |
298 | const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; |
299 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; |
300 | int i; |
301 | |
302 | LOG_DEBUG_CONVERT("AUDIO_S8" , "AUDIO_F32 (using SSE2)" ); |
303 | |
304 | /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ |
305 | for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { |
306 | *dst = ((float) *src) * DIVBY128; |
307 | } |
308 | |
309 | src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ |
310 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
311 | |
312 | /* Make sure src is aligned too. */ |
313 | if ((((size_t) src) & 15) == 0) { |
314 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
315 | const __m128i *mmsrc = (const __m128i *) src; |
316 | const __m128i zero = _mm_setzero_si128(); |
317 | const __m128 divby128 = _mm_set1_ps(DIVBY128); |
318 | while (i >= 16) { /* 16 * 8-bit */ |
319 | const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */ |
320 | /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */ |
321 | const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8); |
322 | /* right-shift-sign-extend gets us sint16 with the other set of values. */ |
323 | const __m128i shorts2 = _mm_srai_epi16(bytes, 8); |
324 | /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */ |
325 | const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128); |
326 | const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128); |
327 | const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128); |
328 | const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128); |
329 | /* Interleave back into correct order, store. */ |
330 | _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); |
331 | _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); |
332 | _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); |
333 | _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); |
334 | i -= 16; mmsrc--; dst -= 16; |
335 | } |
336 | |
337 | src = (const Sint8 *) mmsrc; |
338 | } |
339 | |
340 | src += 15; dst += 15; /* adjust for any scalar finishing. */ |
341 | |
342 | /* Finish off any leftovers with scalar operations. */ |
343 | while (i) { |
344 | *dst = ((float) *src) * DIVBY128; |
345 | i--; src--; dst--; |
346 | } |
347 | |
348 | cvt->len_cvt *= 4; |
349 | if (cvt->filters[++cvt->filter_index]) { |
350 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
351 | } |
352 | } |
353 | |
354 | static void SDLCALL |
355 | SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
356 | { |
357 | const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; |
358 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; |
359 | int i; |
360 | |
361 | LOG_DEBUG_CONVERT("AUDIO_U8" , "AUDIO_F32 (using SSE2)" ); |
362 | |
363 | /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ |
364 | for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { |
365 | *dst = (((float) *src) * DIVBY128) - 1.0f; |
366 | } |
367 | |
368 | src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ |
369 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
370 | |
371 | /* Make sure src is aligned too. */ |
372 | if ((((size_t) src) & 15) == 0) { |
373 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
374 | const __m128i *mmsrc = (const __m128i *) src; |
375 | const __m128i zero = _mm_setzero_si128(); |
376 | const __m128 divby128 = _mm_set1_ps(DIVBY128); |
377 | const __m128 minus1 = _mm_set1_ps(-1.0f); |
378 | while (i >= 16) { /* 16 * 8-bit */ |
379 | const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */ |
380 | /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */ |
381 | const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8); |
382 | /* right-shift-zero-extend gets us uint16 with the other set of values. */ |
383 | const __m128i shorts2 = _mm_srli_epi16(bytes, 8); |
384 | /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */ |
385 | /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */ |
386 | const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1); |
387 | const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1); |
388 | const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1); |
389 | const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1); |
390 | /* Interleave back into correct order, store. */ |
391 | _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); |
392 | _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); |
393 | _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); |
394 | _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); |
395 | i -= 16; mmsrc--; dst -= 16; |
396 | } |
397 | |
398 | src = (const Uint8 *) mmsrc; |
399 | } |
400 | |
401 | src += 15; dst += 15; /* adjust for any scalar finishing. */ |
402 | |
403 | /* Finish off any leftovers with scalar operations. */ |
404 | while (i) { |
405 | *dst = (((float) *src) * DIVBY128) - 1.0f; |
406 | i--; src--; dst--; |
407 | } |
408 | |
409 | cvt->len_cvt *= 4; |
410 | if (cvt->filters[++cvt->filter_index]) { |
411 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
412 | } |
413 | } |
414 | |
415 | static void SDLCALL |
416 | SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
417 | { |
418 | const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; |
419 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; |
420 | int i; |
421 | |
422 | LOG_DEBUG_CONVERT("AUDIO_S16" , "AUDIO_F32 (using SSE2)" ); |
423 | |
424 | /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ |
425 | for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { |
426 | *dst = ((float) *src) * DIVBY32768; |
427 | } |
428 | |
429 | src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */ |
430 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
431 | |
432 | /* Make sure src is aligned too. */ |
433 | if ((((size_t) src) & 15) == 0) { |
434 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
435 | const __m128 divby32768 = _mm_set1_ps(DIVBY32768); |
436 | while (i >= 8) { /* 8 * 16-bit */ |
437 | const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ |
438 | /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */ |
439 | const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16); |
440 | /* right-shift-sign-extend gets us sint32 with the other set of values. */ |
441 | const __m128i b = _mm_srai_epi32(ints, 16); |
442 | /* Interleave these back into the right order, convert to float, multiply, store. */ |
443 | _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768)); |
444 | _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768)); |
445 | i -= 8; src -= 8; dst -= 8; |
446 | } |
447 | } |
448 | |
449 | src += 7; dst += 7; /* adjust for any scalar finishing. */ |
450 | |
451 | /* Finish off any leftovers with scalar operations. */ |
452 | while (i) { |
453 | *dst = ((float) *src) * DIVBY32768; |
454 | i--; src--; dst--; |
455 | } |
456 | |
457 | cvt->len_cvt *= 2; |
458 | if (cvt->filters[++cvt->filter_index]) { |
459 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
460 | } |
461 | } |
462 | |
463 | static void SDLCALL |
464 | SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
465 | { |
466 | const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; |
467 | float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; |
468 | int i; |
469 | |
470 | LOG_DEBUG_CONVERT("AUDIO_U16" , "AUDIO_F32 (using SSE2)" ); |
471 | |
472 | /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ |
473 | for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { |
474 | *dst = (((float) *src) * DIVBY32768) - 1.0f; |
475 | } |
476 | |
477 | src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */ |
478 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
479 | |
480 | /* Make sure src is aligned too. */ |
481 | if ((((size_t) src) & 15) == 0) { |
482 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
483 | const __m128 divby32768 = _mm_set1_ps(DIVBY32768); |
484 | const __m128 minus1 = _mm_set1_ps(-1.0f); |
485 | while (i >= 8) { /* 8 * 16-bit */ |
486 | const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ |
487 | /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */ |
488 | const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16); |
489 | /* right-shift-sign-extend gets us sint32 with the other set of values. */ |
490 | const __m128i b = _mm_srli_epi32(ints, 16); |
491 | /* Interleave these back into the right order, convert to float, multiply, store. */ |
492 | _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1)); |
493 | _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1)); |
494 | i -= 8; src -= 8; dst -= 8; |
495 | } |
496 | } |
497 | |
498 | src += 7; dst += 7; /* adjust for any scalar finishing. */ |
499 | |
500 | /* Finish off any leftovers with scalar operations. */ |
501 | while (i) { |
502 | *dst = (((float) *src) * DIVBY32768) - 1.0f; |
503 | i--; src--; dst--; |
504 | } |
505 | |
506 | cvt->len_cvt *= 2; |
507 | if (cvt->filters[++cvt->filter_index]) { |
508 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
509 | } |
510 | } |
511 | |
512 | static void SDLCALL |
513 | SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
514 | { |
515 | const Sint32 *src = (const Sint32 *) cvt->buf; |
516 | float *dst = (float *) cvt->buf; |
517 | int i; |
518 | |
519 | LOG_DEBUG_CONVERT("AUDIO_S32" , "AUDIO_F32 (using SSE2)" ); |
520 | |
521 | /* Get dst aligned to 16 bytes */ |
522 | for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) { |
523 | *dst = ((float) (*src>>8)) * DIVBY8388607; |
524 | } |
525 | |
526 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
527 | |
528 | /* Make sure src is aligned too. */ |
529 | if ((((size_t) src) & 15) == 0) { |
530 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
531 | const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607); |
532 | const __m128i *mmsrc = (const __m128i *) src; |
533 | while (i >= 4) { /* 4 * sint32 */ |
534 | /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ |
535 | _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607)); |
536 | i -= 4; mmsrc++; dst += 4; |
537 | } |
538 | src = (const Sint32 *) mmsrc; |
539 | } |
540 | |
541 | /* Finish off any leftovers with scalar operations. */ |
542 | while (i) { |
543 | *dst = ((float) (*src>>8)) * DIVBY8388607; |
544 | i--; src++; dst++; |
545 | } |
546 | |
547 | if (cvt->filters[++cvt->filter_index]) { |
548 | cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); |
549 | } |
550 | } |
551 | |
552 | static void SDLCALL |
553 | SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
554 | { |
555 | const float *src = (const float *) cvt->buf; |
556 | Sint8 *dst = (Sint8 *) cvt->buf; |
557 | int i; |
558 | |
559 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_S8 (using SSE2)" ); |
560 | |
561 | /* Get dst aligned to 16 bytes */ |
562 | for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { |
563 | const float sample = *src; |
564 | if (sample >= 1.0f) { |
565 | *dst = 127; |
566 | } else if (sample <= -1.0f) { |
567 | *dst = -128; |
568 | } else { |
569 | *dst = (Sint8)(sample * 127.0f); |
570 | } |
571 | } |
572 | |
573 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
574 | |
575 | /* Make sure src is aligned too. */ |
576 | if ((((size_t) src) & 15) == 0) { |
577 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
578 | const __m128 one = _mm_set1_ps(1.0f); |
579 | const __m128 negone = _mm_set1_ps(-1.0f); |
580 | const __m128 mulby127 = _mm_set1_ps(127.0f); |
581 | __m128i *mmdst = (__m128i *) dst; |
582 | while (i >= 16) { /* 16 * float32 */ |
583 | const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
584 | const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
585 | const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
586 | const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
587 | _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */ |
588 | i -= 16; src += 16; mmdst++; |
589 | } |
590 | dst = (Sint8 *) mmdst; |
591 | } |
592 | |
593 | /* Finish off any leftovers with scalar operations. */ |
594 | while (i) { |
595 | const float sample = *src; |
596 | if (sample >= 1.0f) { |
597 | *dst = 127; |
598 | } else if (sample <= -1.0f) { |
599 | *dst = -128; |
600 | } else { |
601 | *dst = (Sint8)(sample * 127.0f); |
602 | } |
603 | i--; src++; dst++; |
604 | } |
605 | |
606 | cvt->len_cvt /= 4; |
607 | if (cvt->filters[++cvt->filter_index]) { |
608 | cvt->filters[cvt->filter_index](cvt, AUDIO_S8); |
609 | } |
610 | } |
611 | |
612 | static void SDLCALL |
613 | SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
614 | { |
615 | const float *src = (const float *) cvt->buf; |
616 | Uint8 *dst = cvt->buf; |
617 | int i; |
618 | |
619 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_U8 (using SSE2)" ); |
620 | |
621 | /* Get dst aligned to 16 bytes */ |
622 | for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { |
623 | const float sample = *src; |
624 | if (sample >= 1.0f) { |
625 | *dst = 255; |
626 | } else if (sample <= -1.0f) { |
627 | *dst = 0; |
628 | } else { |
629 | *dst = (Uint8)((sample + 1.0f) * 127.0f); |
630 | } |
631 | } |
632 | |
633 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
634 | |
635 | /* Make sure src is aligned too. */ |
636 | if ((((size_t) src) & 15) == 0) { |
637 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
638 | const __m128 one = _mm_set1_ps(1.0f); |
639 | const __m128 negone = _mm_set1_ps(-1.0f); |
640 | const __m128 mulby127 = _mm_set1_ps(127.0f); |
641 | __m128i *mmdst = (__m128i *) dst; |
642 | while (i >= 16) { /* 16 * float32 */ |
643 | const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
644 | const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
645 | const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
646 | const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ |
647 | _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */ |
648 | i -= 16; src += 16; mmdst++; |
649 | } |
650 | dst = (Uint8 *) mmdst; |
651 | } |
652 | |
653 | /* Finish off any leftovers with scalar operations. */ |
654 | while (i) { |
655 | const float sample = *src; |
656 | if (sample >= 1.0f) { |
657 | *dst = 255; |
658 | } else if (sample <= -1.0f) { |
659 | *dst = 0; |
660 | } else { |
661 | *dst = (Uint8)((sample + 1.0f) * 127.0f); |
662 | } |
663 | i--; src++; dst++; |
664 | } |
665 | |
666 | cvt->len_cvt /= 4; |
667 | if (cvt->filters[++cvt->filter_index]) { |
668 | cvt->filters[cvt->filter_index](cvt, AUDIO_U8); |
669 | } |
670 | } |
671 | |
672 | static void SDLCALL |
673 | SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
674 | { |
675 | const float *src = (const float *) cvt->buf; |
676 | Sint16 *dst = (Sint16 *) cvt->buf; |
677 | int i; |
678 | |
679 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_S16 (using SSE2)" ); |
680 | |
681 | /* Get dst aligned to 16 bytes */ |
682 | for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { |
683 | const float sample = *src; |
684 | if (sample >= 1.0f) { |
685 | *dst = 32767; |
686 | } else if (sample <= -1.0f) { |
687 | *dst = -32768; |
688 | } else { |
689 | *dst = (Sint16)(sample * 32767.0f); |
690 | } |
691 | } |
692 | |
693 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
694 | |
695 | /* Make sure src is aligned too. */ |
696 | if ((((size_t) src) & 15) == 0) { |
697 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
698 | const __m128 one = _mm_set1_ps(1.0f); |
699 | const __m128 negone = _mm_set1_ps(-1.0f); |
700 | const __m128 mulby32767 = _mm_set1_ps(32767.0f); |
701 | __m128i *mmdst = (__m128i *) dst; |
702 | while (i >= 8) { /* 8 * float32 */ |
703 | const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ |
704 | const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ |
705 | _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2)); /* pack to sint16, store out. */ |
706 | i -= 8; src += 8; mmdst++; |
707 | } |
708 | dst = (Sint16 *) mmdst; |
709 | } |
710 | |
711 | /* Finish off any leftovers with scalar operations. */ |
712 | while (i) { |
713 | const float sample = *src; |
714 | if (sample >= 1.0f) { |
715 | *dst = 32767; |
716 | } else if (sample <= -1.0f) { |
717 | *dst = -32768; |
718 | } else { |
719 | *dst = (Sint16)(sample * 32767.0f); |
720 | } |
721 | i--; src++; dst++; |
722 | } |
723 | |
724 | cvt->len_cvt /= 2; |
725 | if (cvt->filters[++cvt->filter_index]) { |
726 | cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); |
727 | } |
728 | } |
729 | |
730 | static void SDLCALL |
731 | SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
732 | { |
733 | const float *src = (const float *) cvt->buf; |
734 | Uint16 *dst = (Uint16 *) cvt->buf; |
735 | int i; |
736 | |
737 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_U16 (using SSE2)" ); |
738 | |
739 | /* Get dst aligned to 16 bytes */ |
740 | for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { |
741 | const float sample = *src; |
742 | if (sample >= 1.0f) { |
743 | *dst = 65535; |
744 | } else if (sample <= -1.0f) { |
745 | *dst = 0; |
746 | } else { |
747 | *dst = (Uint16)((sample + 1.0f) * 32767.0f); |
748 | } |
749 | } |
750 | |
751 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
752 | |
753 | /* Make sure src is aligned too. */ |
754 | if ((((size_t) src) & 15) == 0) { |
755 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
756 | /* This calculates differently than the scalar path because SSE2 can't |
757 | pack int32 data down to unsigned int16. _mm_packs_epi32 does signed |
758 | saturation, so that would corrupt our data. _mm_packus_epi32 exists, |
759 | but not before SSE 4.1. So we convert from float to sint16, packing |
760 | that down with legit signed saturation, and then xor the top bit |
761 | against 1. This results in the correct unsigned 16-bit value, even |
762 | though it looks like dark magic. */ |
763 | const __m128 mulby32767 = _mm_set1_ps(32767.0f); |
764 | const __m128i topbit = _mm_set1_epi16(-32768); |
765 | const __m128 one = _mm_set1_ps(1.0f); |
766 | const __m128 negone = _mm_set1_ps(-1.0f); |
767 | __m128i *mmdst = (__m128i *) dst; |
768 | while (i >= 8) { /* 8 * float32 */ |
769 | const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ |
770 | const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ |
771 | _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit)); /* pack to sint16, xor top bit, store out. */ |
772 | i -= 8; src += 8; mmdst++; |
773 | } |
774 | dst = (Uint16 *) mmdst; |
775 | } |
776 | |
777 | /* Finish off any leftovers with scalar operations. */ |
778 | while (i) { |
779 | const float sample = *src; |
780 | if (sample >= 1.0f) { |
781 | *dst = 65535; |
782 | } else if (sample <= -1.0f) { |
783 | *dst = 0; |
784 | } else { |
785 | *dst = (Uint16)((sample + 1.0f) * 32767.0f); |
786 | } |
787 | i--; src++; dst++; |
788 | } |
789 | |
790 | cvt->len_cvt /= 2; |
791 | if (cvt->filters[++cvt->filter_index]) { |
792 | cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); |
793 | } |
794 | } |
795 | |
796 | static void SDLCALL |
797 | SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) |
798 | { |
799 | const float *src = (const float *) cvt->buf; |
800 | Sint32 *dst = (Sint32 *) cvt->buf; |
801 | int i; |
802 | |
803 | LOG_DEBUG_CONVERT("AUDIO_F32" , "AUDIO_S32 (using SSE2)" ); |
804 | |
805 | /* Get dst aligned to 16 bytes */ |
806 | for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { |
807 | const float sample = *src; |
808 | if (sample >= 1.0f) { |
809 | *dst = 2147483647; |
810 | } else if (sample <= -1.0f) { |
811 | *dst = (Sint32) -2147483648LL; |
812 | } else { |
813 | *dst = ((Sint32)(sample * 8388607.0f)) << 8; |
814 | } |
815 | } |
816 | |
817 | SDL_assert(!i || ((((size_t) dst) & 15) == 0)); |
818 | SDL_assert(!i || ((((size_t) src) & 15) == 0)); |
819 | |
820 | { |
821 | /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ |
822 | const __m128 one = _mm_set1_ps(1.0f); |
823 | const __m128 negone = _mm_set1_ps(-1.0f); |
824 | const __m128 mulby8388607 = _mm_set1_ps(8388607.0f); |
825 | __m128i *mmdst = (__m128i *) dst; |
826 | while (i >= 4) { /* 4 * float32 */ |
827 | _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8)); /* load 4 floats, clamp, convert to sint32 */ |
828 | i -= 4; src += 4; mmdst++; |
829 | } |
830 | dst = (Sint32 *) mmdst; |
831 | } |
832 | |
833 | /* Finish off any leftovers with scalar operations. */ |
834 | while (i) { |
835 | const float sample = *src; |
836 | if (sample >= 1.0f) { |
837 | *dst = 2147483647; |
838 | } else if (sample <= -1.0f) { |
839 | *dst = (Sint32) -2147483648LL; |
840 | } else { |
841 | *dst = ((Sint32)(sample * 8388607.0f)) << 8; |
842 | } |
843 | i--; src++; dst++; |
844 | } |
845 | |
846 | if (cvt->filters[++cvt->filter_index]) { |
847 | cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); |
848 | } |
849 | } |
850 | #endif |
851 | |
852 | |
853 | #if HAVE_NEON_INTRINSICS |
/*
 * Expand the signed 8-bit samples in cvt->buf to float32, in place, using
 * NEON for 16-sample blocks whose input and output are both 16-byte aligned.
 *
 * Every sample is converted to float and multiplied by DIVBY128. The output
 * is four times as large as the input, so both cursors start on the last
 * sample and move toward the start of the buffer.
 *
 * cvt->len_cvt is multiplied by 4, then the next entry in cvt->filters, if
 * non-NULL, is called with AUDIO_F32SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors begin on the final sample of their view of the buffer. */
    const Sint8 *in = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int remaining = cvt->len_cvt;

    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");

    /* Scalar-convert until the 16-float block that ends at `out` starts on a
       16-byte boundary. */
    while (remaining && (((size_t) (out - 15)) & 15)) {
        *out = ((float) *in) * DIVBY128;
        --remaining;
        --in;
        --out;
    }

    SDL_assert(!remaining || ((((size_t) (out - 15)) & 15) == 0));

    /* Only use NEON when the matching 16-byte input block is aligned as well. */
    if ((((size_t) (in - 15)) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY128);
        while (remaining >= 16) {  /* 16 * 8-bit */
            const int8_t *block_in = (const int8_t *) (in - 15);
            float *block_out = out - 15;
            /* Read all 16 input bytes before any of the output is stored. */
            const int8x16_t bytes = vld1q_s8(block_in);
            const int16x8_t upper16 = vmovl_s8(vget_high_s8(bytes));
            const int16x8_t lower16 = vmovl_s8(vget_low_s8(bytes));
            /* Widen each group of four to int32, convert to float, scale, store. */
            vst1q_f32(block_out, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower16))), scale));
            vst1q_f32(block_out + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower16))), scale));
            vst1q_f32(block_out + 8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(upper16))), scale));
            vst1q_f32(block_out + 12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(upper16))), scale));
            remaining -= 16;
            in -= 16;
            out -= 16;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        *out = ((float) *in) * DIVBY128;
        --remaining;
        --in;
        --out;
    }

    cvt->len_cvt *= 4;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
904 | |
/*
 * Expand the unsigned 8-bit samples in cvt->buf to float32, in place, using
 * NEON for 16-sample blocks whose input and output are both 16-byte aligned.
 *
 * Every sample is converted to float, multiplied by DIVBY128 and then has
 * 1.0f subtracted. The output is four times as large as the input, so both
 * cursors start on the last sample and move toward the start of the buffer.
 *
 * cvt->len_cvt is multiplied by 4, then the next entry in cvt->filters, if
 * non-NULL, is called with AUDIO_F32SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors begin on the final sample of their view of the buffer. */
    const Uint8 *in = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int remaining = cvt->len_cvt;

    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");

    /* Scalar-convert until the 16-float block that ends at `out` starts on a
       16-byte boundary. */
    while (remaining && (((size_t) (out - 15)) & 15)) {
        *out = (((float) *in) * DIVBY128) - 1.0f;
        --remaining;
        --in;
        --out;
    }

    SDL_assert(!remaining || ((((size_t) (out - 15)) & 15) == 0));

    /* Only use NEON when the matching 16-byte input block is aligned as well. */
    if ((((size_t) (in - 15)) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY128);
        const float32x4_t bias = vdupq_n_f32(-1.0f);
        while (remaining >= 16) {  /* 16 * 8-bit */
            const uint8_t *block_in = (const uint8_t *) (in - 15);
            float *block_out = out - 15;
            /* Read all 16 input bytes before any of the output is stored. */
            const uint8x16_t bytes = vld1q_u8(block_in);
            const uint16x8_t upper16 = vmovl_u8(vget_high_u8(bytes));
            const uint16x8_t lower16 = vmovl_u8(vget_low_u8(bytes));
            /* Widen each group of four to uint32, convert to float, then
               compute bias + value * scale and store. */
            vst1q_f32(block_out, vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lower16))), scale));
            vst1q_f32(block_out + 4, vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lower16))), scale));
            vst1q_f32(block_out + 8, vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_low_u16(upper16))), scale));
            vst1q_f32(block_out + 12, vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_high_u16(upper16))), scale));
            remaining -= 16;
            in -= 16;
            out -= 16;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        *out = (((float) *in) * DIVBY128) - 1.0f;
        --remaining;
        --in;
        --out;
    }

    cvt->len_cvt *= 4;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
956 | |
/*
 * Expand the signed 16-bit samples in cvt->buf to float32, in place, using
 * NEON for 8-sample blocks whose input and output are both 16-byte aligned.
 *
 * Every sample is converted to float and multiplied by DIVBY32768. The
 * output is twice as large as the input, so both cursors start on the last
 * sample and move toward the start of the buffer.
 *
 * cvt->len_cvt is doubled, then the next entry in cvt->filters, if non-NULL,
 * is called with AUDIO_F32SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors begin on the final sample of their view of the buffer. */
    const Sint16 *in = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    int remaining = cvt->len_cvt / sizeof (Sint16);

    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");

    /* Scalar-convert until the 8-float block that ends at `out` starts on a
       16-byte boundary. */
    while (remaining && (((size_t) (out - 7)) & 15)) {
        *out = ((float) *in) * DIVBY32768;
        --remaining;
        --in;
        --out;
    }

    SDL_assert(!remaining || ((((size_t) (out - 7)) & 15) == 0));

    /* Only use NEON when the matching 16-byte input block is aligned as well. */
    if ((((size_t) (in - 7)) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY32768);
        while (remaining >= 8) {  /* 8 * 16-bit */
            float *block_out = out - 7;
            /* Read all 8 input samples before any of the output is stored. */
            const int16x8_t samples = vld1q_s16((int16_t const *) (in - 7));
            /* Widen each half to int32, convert to float, scale, store. */
            vst1q_f32(block_out, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(samples))), scale));
            vst1q_f32(block_out + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(samples))), scale));
            remaining -= 8;
            in -= 8;
            out -= 8;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        *out = ((float) *in) * DIVBY32768;
        --remaining;
        --in;
        --out;
    }

    cvt->len_cvt *= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1000 | |
/*
 * Expand the unsigned 16-bit samples in cvt->buf to float32, in place, using
 * NEON for 8-sample blocks whose input and output are both 16-byte aligned.
 *
 * Every sample is converted to float, multiplied by DIVBY32768 and then has
 * 1.0f subtracted. The output is twice as large as the input, so both cursors
 * start on the last sample and move toward the start of the buffer.
 *
 * cvt->len_cvt is doubled, then the next entry in cvt->filters, if non-NULL,
 * is called with AUDIO_F32SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors begin on the final sample of their view of the buffer. */
    const Uint16 *in = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    int remaining = cvt->len_cvt / sizeof (Sint16);

    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");

    /* Scalar-convert until the 8-float block that ends at `out` starts on a
       16-byte boundary. */
    while (remaining && (((size_t) (out - 7)) & 15)) {
        *out = (((float) *in) * DIVBY32768) - 1.0f;
        --remaining;
        --in;
        --out;
    }

    SDL_assert(!remaining || ((((size_t) (out - 7)) & 15) == 0));

    /* Only use NEON when the matching 16-byte input block is aligned as well. */
    if ((((size_t) (in - 7)) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY32768);
        const float32x4_t bias = vdupq_n_f32(-1.0f);
        while (remaining >= 8) {  /* 8 * 16-bit */
            float *block_out = out - 7;
            /* Read all 8 input samples before any of the output is stored. */
            const uint16x8_t samples = vld1q_u16((uint16_t const *) (in - 7));
            /* Widen each half to uint32, convert to float, then compute
               bias + value * scale and store. */
            vst1q_f32(block_out, vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_low_u16(samples))), scale));
            vst1q_f32(block_out + 4, vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_high_u16(samples))), scale));
            remaining -= 8;
            in -= 8;
            out -= 8;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        *out = (((float) *in) * DIVBY32768) - 1.0f;
        --remaining;
        --in;
        --out;
    }

    cvt->len_cvt *= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1045 | |
/*
 * Convert the signed 32-bit samples in cvt->buf to float32, in place, using
 * NEON for 4-sample blocks once the buffer cursor is 16-byte aligned.
 *
 * Each sample is shifted right by 8 bits, converted to float and multiplied
 * by DIVBY8388607. Input and output samples have the same size, so the
 * buffer is walked front to back and cvt->len_cvt is left unchanged.
 *
 * Afterwards the next entry in cvt->filters, if non-NULL, is called with
 * AUDIO_F32SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint32 *in = (const Sint32 *) cvt->buf;
    float *out = (float *) cvt->buf;
    int remaining = cvt->len_cvt / sizeof (Sint32);

    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");

    /* Scalar-convert until the output cursor sits on a 16-byte boundary. */
    while (remaining && (((size_t) out) & 15)) {
        *out = ((float) (*in>>8)) * DIVBY8388607;
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only use NEON when the input cursor is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY8388607);
        while (remaining >= 4) {  /* 4 * sint32 */
            /* Drop the low 8 bits so the integer fits in a float32. Small
               precision loss, but much faster. */
            const int32x4_t wide = vld1q_s32((const int32_t *) in);
            const int32x4_t narrowed = vshrq_n_s32(wide, 8);
            vst1q_f32(out, vmulq_f32(vcvtq_f32_s32(narrowed), scale));
            remaining -= 4;
            in += 4;
            out += 4;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        *out = ((float) (*in>>8)) * DIVBY8388607;
        --remaining;
        ++in;
        ++out;
    }

    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1085 | |
/*
 * Reduce the float32 samples in cvt->buf to signed 8-bit samples, in place,
 * using NEON for 16-sample blocks whose input and output are both 16-byte
 * aligned.
 *
 * In the scalar paths a sample >= 1.0f becomes 127, a sample <= -1.0f becomes
 * -128, and anything else is multiplied by 127.0f and cast to Sint8. The NEON
 * path clamps to [-1.0f, 1.0f], multiplies by 127.0f, converts to int32 and
 * narrows twice. The output is smaller than the input, so the buffer is
 * walked front to back.
 *
 * cvt->len_cvt is divided by 4, then the next entry in cvt->filters, if
 * non-NULL, is called with AUDIO_S8. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Sint8 *out = (Sint8 *) cvt->buf;
    int remaining = cvt->len_cvt / sizeof (float);

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");

    /* Scalar-convert until the output cursor sits on a 16-byte boundary. */
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 127 : ((sample <= -1.0f) ? -128 : (Sint8)(sample * 127.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only use NEON when the input cursor is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(127.0f);
        while (remaining >= 16) {  /* 16 * float32 */
            /* Load and clamp all 16 floats before anything is stored. */
            const float32x4_t f0 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in)), one);
            const float32x4_t f1 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 4)), one);
            const float32x4_t f2 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 8)), one);
            const float32x4_t f3 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 12)), one);
            /* Scale, convert to sint32, narrow to sint16. */
            const int16x4_t q0 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(f0, scale)));
            const int16x4_t q1 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(f1, scale)));
            const int16x4_t q2 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(f2, scale)));
            const int16x4_t q3 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(f3, scale)));
            /* Pair up, narrow to sint8, pair up again and store 16 bytes. */
            const int8x8_t lower8 = vmovn_s16(vcombine_s16(q0, q1));
            const int8x8_t upper8 = vmovn_s16(vcombine_s16(q2, q3));
            vst1q_s8((int8_t *) out, vcombine_s8(lower8, upper8));
            remaining -= 16;
            in += 16;
            out += 16;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 127 : ((sample <= -1.0f) ? -128 : (Sint8)(sample * 127.0f));
        --remaining;
        ++in;
        ++out;
    }

    cvt->len_cvt /= 4;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
    }
}
1147 | |
/*
 * Reduce the float32 samples in cvt->buf to unsigned 8-bit samples, in place,
 * using NEON for 16-sample blocks whose input and output are both 16-byte
 * aligned.
 *
 * In the scalar paths a sample >= 1.0f becomes 255, a sample <= -1.0f becomes
 * 0, and anything else is computed as (sample + 1.0f) * 127.0f cast to Uint8.
 * The NEON path clamps to [-1.0f, 1.0f], adds 1.0f, multiplies by 127.0f,
 * converts to uint32 and narrows twice. The output is smaller than the input,
 * so the buffer is walked front to back.
 *
 * cvt->len_cvt is divided by 4, then the next entry in cvt->filters, if
 * non-NULL, is called with AUDIO_U8. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Uint8 *out = (Uint8 *) cvt->buf;
    int remaining = cvt->len_cvt / sizeof (float);

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");

    /* Scalar-convert until the output cursor sits on a 16-byte boundary. */
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 255 : ((sample <= -1.0f) ? 0 : (Uint8)((sample + 1.0f) * 127.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only use NEON when the input cursor is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(127.0f);
        while (remaining >= 16) {  /* 16 * float32 */
            /* Load and clamp all 16 floats before anything is stored. */
            const float32x4_t f0 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in)), one);
            const float32x4_t f1 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 4)), one);
            const float32x4_t f2 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 8)), one);
            const float32x4_t f3 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 12)), one);
            /* Shift into the non-negative range, scale, convert to uint32,
               narrow to uint16. */
            const uint16x4_t q0 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(f0, one), scale)));
            const uint16x4_t q1 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(f1, one), scale)));
            const uint16x4_t q2 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(f2, one), scale)));
            const uint16x4_t q3 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(f3, one), scale)));
            /* Pair up, narrow to uint8, pair up again and store 16 bytes. */
            const uint8x8_t lower8 = vmovn_u16(vcombine_u16(q0, q1));
            const uint8x8_t upper8 = vmovn_u16(vcombine_u16(q2, q3));
            vst1q_u8((uint8_t *) out, vcombine_u8(lower8, upper8));
            remaining -= 16;
            in += 16;
            out += 16;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 255 : ((sample <= -1.0f) ? 0 : (Uint8)((sample + 1.0f) * 127.0f));
        --remaining;
        ++in;
        ++out;
    }

    cvt->len_cvt /= 4;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
    }
}
1210 | |
/*
 * Reduce the float32 samples in cvt->buf to signed 16-bit samples, in place,
 * using NEON for 8-sample blocks whose input and output are both 16-byte
 * aligned.
 *
 * In the scalar paths a sample >= 1.0f becomes 32767, a sample <= -1.0f
 * becomes -32768, and anything else is multiplied by 32767.0f and cast to
 * Sint16. The NEON path clamps to [-1.0f, 1.0f], multiplies by 32767.0f,
 * converts to int32 and narrows. The output is smaller than the input, so the
 * buffer is walked front to back.
 *
 * cvt->len_cvt is halved, then the next entry in cvt->filters, if non-NULL,
 * is called with AUDIO_S16SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Sint16 *out = (Sint16 *) cvt->buf;
    int remaining = cvt->len_cvt / sizeof (float);

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");

    /* Scalar-convert until the output cursor sits on a 16-byte boundary. */
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 32767 : ((sample <= -1.0f) ? -32768 : (Sint16)(sample * 32767.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only use NEON when the input cursor is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(32767.0f);
        while (remaining >= 8) {  /* 8 * float32 */
            /* Load and clamp all 8 floats before anything is stored. */
            const float32x4_t f0 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in)), one);
            const float32x4_t f1 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 4)), one);
            /* Scale, convert to sint32, narrow to sint16. */
            const int16x4_t lower16 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(f0, scale)));
            const int16x4_t upper16 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(f1, scale)));
            vst1q_s16((int16_t *) out, vcombine_s16(lower16, upper16));
            remaining -= 8;
            in += 8;
            out += 8;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 32767 : ((sample <= -1.0f) ? -32768 : (Sint16)(sample * 32767.0f));
        --remaining;
        ++in;
        ++out;
    }

    cvt->len_cvt /= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
    }
}
1268 | |
/*
 * Reduce the float32 samples in cvt->buf to unsigned 16-bit samples, in
 * place, using NEON for 8-sample blocks whose input and output are both
 * 16-byte aligned.
 *
 * In the scalar paths a sample >= 1.0f becomes 65535, a sample <= -1.0f
 * becomes 0, and anything else is computed as (sample + 1.0f) * 32767.0f cast
 * to Uint16. The NEON path clamps to [-1.0f, 1.0f], adds 1.0f, multiplies by
 * 32767.0f, converts to uint32 and narrows. The output is smaller than the
 * input, so the buffer is walked front to back.
 *
 * cvt->len_cvt is halved, then the next entry in cvt->filters, if non-NULL,
 * is called with AUDIO_U16SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Uint16 *out = (Uint16 *) cvt->buf;
    int remaining = cvt->len_cvt / sizeof (float);

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");

    /* Scalar-convert until the output cursor sits on a 16-byte boundary. */
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 65535 : ((sample <= -1.0f) ? 0 : (Uint16)((sample + 1.0f) * 32767.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only use NEON when the input cursor is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(32767.0f);
        while (remaining >= 8) {  /* 8 * float32 */
            /* Load and clamp all 8 floats before anything is stored. */
            const float32x4_t f0 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in)), one);
            const float32x4_t f1 = vminq_f32(vmaxq_f32(negone, vld1q_f32(in + 4)), one);
            /* Shift into the non-negative range, scale, convert to uint32,
               narrow to uint16. */
            const uint16x4_t lower16 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(f0, one), scale)));
            const uint16x4_t upper16 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(f1, one), scale)));
            vst1q_u16((uint16_t *) out, vcombine_u16(lower16, upper16));
            remaining -= 8;
            in += 8;
            out += 8;
        }
    }

    /* Whatever is left goes through the scalar path. */
    while (remaining) {
        const float sample = *in;
        *out = (sample >= 1.0f) ? 65535 : ((sample <= -1.0f) ? 0 : (Uint16)((sample + 1.0f) * 32767.0f));
        --remaining;
        ++in;
        ++out;
    }

    cvt->len_cvt /= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
    }
}
1326 | |
/*
 * Convert float32 samples to signed 32-bit samples, in place, using NEON.
 *
 * cvt->buf is read as float and rewritten as Sint32 at the same offsets, so
 * cvt->len_cvt is left unchanged. Scalar code handles the samples before the
 * first 16-byte boundary and the tail of fewer than four samples; NEON
 * handles everything in between, four samples per iteration.
 *
 * In the scalar paths a sample >= 1.0f becomes 2147483647, a sample <= -1.0f
 * becomes -2147483648, and anything else is scaled by 8388607.0f and then
 * moved up by 8 bits. The NEON path clamps to [-1.0f, 1.0f] before scaling
 * and shifting.
 *
 * Afterwards the next entry in cvt->filters, if non-NULL, is called with
 * AUDIO_S32SYS. The `format` argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Sint32 *dst = (Sint32 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 2147483647;
        } else if (sample <= -1.0f) {
            *dst = (-2147483647) - 1;
        } else {
            /* Multiply instead of "<< 8": left-shifting a negative signed
               value is undefined behavior in C. The scaled value has a
               magnitude below 2^23, so the product always fits in 32 bits. */
            *dst = ((Sint32)(sample * 8388607.0f)) * 256;
        }
    }

    /* src and dst walk the same buffer with the same element size, so they
       reach a 16-byte boundary together. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    SDL_assert(!i || ((((size_t) src) & 15) == 0));

    {
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
        int32_t *mmdst = (int32_t *) dst;
        while (i >= 4) {   /* 4 * float32 */
            /* load 4 floats, clamp, scale, convert to sint32, shift up 8 bits, store. */
            vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
            i -= 4; src += 4; mmdst += 4;
        }
        dst = (Sint32 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 2147483647;
        } else if (sample <= -1.0f) {
            *dst = (-2147483647) - 1;
        } else {
            /* See above: "* 256" avoids shifting a negative value. */
            *dst = ((Sint32)(sample * 8388607.0f)) * 256;
        }
        i--; src++; dst++;
    }

    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
    }
}
1381 | #endif |
1382 | |
1383 | |
1384 | |
1385 | void SDL_ChooseAudioConverters(void) |
1386 | { |
1387 | static SDL_bool converters_chosen = SDL_FALSE; |
1388 | |
1389 | if (converters_chosen) { |
1390 | return; |
1391 | } |
1392 | |
1393 | #define SET_CONVERTER_FUNCS(fntype) \ |
1394 | SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \ |
1395 | SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \ |
1396 | SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \ |
1397 | SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \ |
1398 | SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \ |
1399 | SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \ |
1400 | SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \ |
1401 | SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \ |
1402 | SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \ |
1403 | SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \ |
1404 | converters_chosen = SDL_TRUE |
1405 | |
1406 | #if HAVE_SSE2_INTRINSICS |
1407 | if (SDL_HasSSE2()) { |
1408 | SET_CONVERTER_FUNCS(SSE2); |
1409 | return; |
1410 | } |
1411 | #endif |
1412 | |
1413 | #if HAVE_NEON_INTRINSICS |
1414 | if (SDL_HasNEON()) { |
1415 | SET_CONVERTER_FUNCS(NEON); |
1416 | return; |
1417 | } |
1418 | #endif |
1419 | |
1420 | #if NEED_SCALAR_CONVERTER_FALLBACKS |
1421 | SET_CONVERTER_FUNCS(Scalar); |
1422 | #endif |
1423 | |
1424 | #undef SET_CONVERTER_FUNCS |
1425 | |
1426 | SDL_assert(converters_chosen == SDL_TRUE); |
1427 | } |
1428 | |
1429 | /* vi: set ts=4 sw=4 expandtab: */ |
1430 | |