1/*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2021 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20*/
21#include "../SDL_internal.h"
22
23#include "SDL_audio.h"
24#include "SDL_audio_c.h"
25#include "SDL_cpuinfo.h"
26
/* Record which SIMD intrinsics the compiler's target flags make available. */
#if defined(__ARM_NEON)
#define HAVE_NEON_INTRINSICS 1
#endif

#if defined(__SSE2__)
#define HAVE_SSE2_INTRINSICS 1
#endif

/* Targets that are guaranteed to take a SIMD codepath do not need the scalar
   converters compiled in:
     - x86_64 guarantees SSE2, and so does Mac OS X on Intel.
     - ARMv8+ promises NEON, and so do all Apple ARMv7 chips. */
#if HAVE_SSE2_INTRINSICS && (defined(__x86_64__) || __MACOSX__)
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif HAVE_NEON_INTRINSICS && defined(__ARM_ARCH) && ((__ARM_ARCH >= 8) || (defined(__APPLE__) && (__ARM_ARCH >= 7)))
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#endif

/* Every other target keeps the scalar fallbacks. */
#if !defined(NEED_SCALAR_CONVERTER_FALLBACKS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
49
50/* Function pointers set to a CPU-specific implementation. */
51SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
52SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
53SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
54SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
55SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
56SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
57SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
58SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
59SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
60SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
61
62
/* Reciprocal scale factors: multiplying an integer sample by one of these
   stands in for dividing it by 128, 32768 or 8388607 respectively. */
#define DIVBY128     0.0078125f                  /* 1 / 128 */
#define DIVBY32768   0.000030517578125f          /* 1 / 32768 */
#define DIVBY8388607 0.00000011920930376163766f  /* 1 / 8388607 */
66
67
68#if NEED_SCALAR_CONVERTER_FALLBACKS
69static void SDLCALL
70SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
71{
72 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
73 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
74 int i;
75
76 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
77
78 for (i = cvt->len_cvt; i; --i, --src, --dst) {
79 *dst = ((float) *src) * DIVBY128;
80 }
81
82 cvt->len_cvt *= 4;
83 if (cvt->filters[++cvt->filter_index]) {
84 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
85 }
86}
87
88static void SDLCALL
89SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
90{
91 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
92 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
93 int i;
94
95 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
96
97 for (i = cvt->len_cvt; i; --i, --src, --dst) {
98 *dst = (((float) *src) * DIVBY128) - 1.0f;
99 }
100
101 cvt->len_cvt *= 4;
102 if (cvt->filters[++cvt->filter_index]) {
103 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
104 }
105}
106
107static void SDLCALL
108SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
109{
110 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
111 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
112 int i;
113
114 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
115
116 for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
117 *dst = ((float) *src) * DIVBY32768;
118 }
119
120 cvt->len_cvt *= 2;
121 if (cvt->filters[++cvt->filter_index]) {
122 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
123 }
124}
125
126static void SDLCALL
127SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
128{
129 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
130 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
131 int i;
132
133 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
134
135 for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
136 *dst = (((float) *src) * DIVBY32768) - 1.0f;
137 }
138
139 cvt->len_cvt *= 2;
140 if (cvt->filters[++cvt->filter_index]) {
141 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
142 }
143}
144
145static void SDLCALL
146SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
147{
148 const Sint32 *src = (const Sint32 *) cvt->buf;
149 float *dst = (float *) cvt->buf;
150 int i;
151
152 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
153
154 for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
155 *dst = ((float) (*src>>8)) * DIVBY8388607;
156 }
157
158 if (cvt->filters[++cvt->filter_index]) {
159 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
160 }
161}
162
163static void SDLCALL
164SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
165{
166 const float *src = (const float *) cvt->buf;
167 Sint8 *dst = (Sint8 *) cvt->buf;
168 int i;
169
170 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
171
172 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
173 const float sample = *src;
174 if (sample >= 1.0f) {
175 *dst = 127;
176 } else if (sample <= -1.0f) {
177 *dst = -128;
178 } else {
179 *dst = (Sint8)(sample * 127.0f);
180 }
181 }
182
183 cvt->len_cvt /= 4;
184 if (cvt->filters[++cvt->filter_index]) {
185 cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
186 }
187}
188
189static void SDLCALL
190SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
191{
192 const float *src = (const float *) cvt->buf;
193 Uint8 *dst = (Uint8 *) cvt->buf;
194 int i;
195
196 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
197
198 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
199 const float sample = *src;
200 if (sample >= 1.0f) {
201 *dst = 255;
202 } else if (sample <= -1.0f) {
203 *dst = 0;
204 } else {
205 *dst = (Uint8)((sample + 1.0f) * 127.0f);
206 }
207 }
208
209 cvt->len_cvt /= 4;
210 if (cvt->filters[++cvt->filter_index]) {
211 cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
212 }
213}
214
215static void SDLCALL
216SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
217{
218 const float *src = (const float *) cvt->buf;
219 Sint16 *dst = (Sint16 *) cvt->buf;
220 int i;
221
222 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
223
224 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
225 const float sample = *src;
226 if (sample >= 1.0f) {
227 *dst = 32767;
228 } else if (sample <= -1.0f) {
229 *dst = -32768;
230 } else {
231 *dst = (Sint16)(sample * 32767.0f);
232 }
233 }
234
235 cvt->len_cvt /= 2;
236 if (cvt->filters[++cvt->filter_index]) {
237 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
238 }
239}
240
241static void SDLCALL
242SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
243{
244 const float *src = (const float *) cvt->buf;
245 Uint16 *dst = (Uint16 *) cvt->buf;
246 int i;
247
248 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
249
250 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
251 const float sample = *src;
252 if (sample >= 1.0f) {
253 *dst = 65535;
254 } else if (sample <= -1.0f) {
255 *dst = 0;
256 } else {
257 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
258 }
259 }
260
261 cvt->len_cvt /= 2;
262 if (cvt->filters[++cvt->filter_index]) {
263 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
264 }
265}
266
267static void SDLCALL
268SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
269{
270 const float *src = (const float *) cvt->buf;
271 Sint32 *dst = (Sint32 *) cvt->buf;
272 int i;
273
274 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
275
276 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
277 const float sample = *src;
278 if (sample >= 1.0f) {
279 *dst = 2147483647;
280 } else if (sample <= -1.0f) {
281 *dst = (Sint32) -2147483648LL;
282 } else {
283 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
284 }
285 }
286
287 if (cvt->filters[++cvt->filter_index]) {
288 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
289 }
290}
291#endif
292
293
294#if HAVE_SSE2_INTRINSICS
295static void SDLCALL
296SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
297{
298 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
299 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
300 int i;
301
302 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
303
304 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
305 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
306 *dst = ((float) *src) * DIVBY128;
307 }
308
309 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
310 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
311
312 /* Make sure src is aligned too. */
313 if ((((size_t) src) & 15) == 0) {
314 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
315 const __m128i *mmsrc = (const __m128i *) src;
316 const __m128i zero = _mm_setzero_si128();
317 const __m128 divby128 = _mm_set1_ps(DIVBY128);
318 while (i >= 16) { /* 16 * 8-bit */
319 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */
320 /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
321 const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
322 /* right-shift-sign-extend gets us sint16 with the other set of values. */
323 const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
324 /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
325 const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
326 const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
327 const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
328 const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
329 /* Interleave back into correct order, store. */
330 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
331 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
332 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
333 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
334 i -= 16; mmsrc--; dst -= 16;
335 }
336
337 src = (const Sint8 *) mmsrc;
338 }
339
340 src += 15; dst += 15; /* adjust for any scalar finishing. */
341
342 /* Finish off any leftovers with scalar operations. */
343 while (i) {
344 *dst = ((float) *src) * DIVBY128;
345 i--; src--; dst--;
346 }
347
348 cvt->len_cvt *= 4;
349 if (cvt->filters[++cvt->filter_index]) {
350 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
351 }
352}
353
354static void SDLCALL
355SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
356{
357 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
358 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
359 int i;
360
361 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
362
363 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
364 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
365 *dst = (((float) *src) * DIVBY128) - 1.0f;
366 }
367
368 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
369 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
370
371 /* Make sure src is aligned too. */
372 if ((((size_t) src) & 15) == 0) {
373 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
374 const __m128i *mmsrc = (const __m128i *) src;
375 const __m128i zero = _mm_setzero_si128();
376 const __m128 divby128 = _mm_set1_ps(DIVBY128);
377 const __m128 minus1 = _mm_set1_ps(-1.0f);
378 while (i >= 16) { /* 16 * 8-bit */
379 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
380 /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
381 const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
382 /* right-shift-zero-extend gets us uint16 with the other set of values. */
383 const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
384 /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
385 /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
386 const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
387 const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
388 const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
389 const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
390 /* Interleave back into correct order, store. */
391 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
392 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
393 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
394 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
395 i -= 16; mmsrc--; dst -= 16;
396 }
397
398 src = (const Uint8 *) mmsrc;
399 }
400
401 src += 15; dst += 15; /* adjust for any scalar finishing. */
402
403 /* Finish off any leftovers with scalar operations. */
404 while (i) {
405 *dst = (((float) *src) * DIVBY128) - 1.0f;
406 i--; src--; dst--;
407 }
408
409 cvt->len_cvt *= 4;
410 if (cvt->filters[++cvt->filter_index]) {
411 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
412 }
413}
414
415static void SDLCALL
416SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
417{
418 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
419 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
420 int i;
421
422 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
423
424 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
425 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
426 *dst = ((float) *src) * DIVBY32768;
427 }
428
429 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
430 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
431
432 /* Make sure src is aligned too. */
433 if ((((size_t) src) & 15) == 0) {
434 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
435 const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
436 while (i >= 8) { /* 8 * 16-bit */
437 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
438 /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
439 const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
440 /* right-shift-sign-extend gets us sint32 with the other set of values. */
441 const __m128i b = _mm_srai_epi32(ints, 16);
442 /* Interleave these back into the right order, convert to float, multiply, store. */
443 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
444 _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
445 i -= 8; src -= 8; dst -= 8;
446 }
447 }
448
449 src += 7; dst += 7; /* adjust for any scalar finishing. */
450
451 /* Finish off any leftovers with scalar operations. */
452 while (i) {
453 *dst = ((float) *src) * DIVBY32768;
454 i--; src--; dst--;
455 }
456
457 cvt->len_cvt *= 2;
458 if (cvt->filters[++cvt->filter_index]) {
459 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
460 }
461}
462
463static void SDLCALL
464SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
465{
466 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
467 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
468 int i;
469
470 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
471
472 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
473 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
474 *dst = (((float) *src) * DIVBY32768) - 1.0f;
475 }
476
477 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
478 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
479
480 /* Make sure src is aligned too. */
481 if ((((size_t) src) & 15) == 0) {
482 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
483 const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
484 const __m128 minus1 = _mm_set1_ps(-1.0f);
485 while (i >= 8) { /* 8 * 16-bit */
486 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
487 /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
488 const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
489 /* right-shift-sign-extend gets us sint32 with the other set of values. */
490 const __m128i b = _mm_srli_epi32(ints, 16);
491 /* Interleave these back into the right order, convert to float, multiply, store. */
492 _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
493 _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
494 i -= 8; src -= 8; dst -= 8;
495 }
496 }
497
498 src += 7; dst += 7; /* adjust for any scalar finishing. */
499
500 /* Finish off any leftovers with scalar operations. */
501 while (i) {
502 *dst = (((float) *src) * DIVBY32768) - 1.0f;
503 i--; src--; dst--;
504 }
505
506 cvt->len_cvt *= 2;
507 if (cvt->filters[++cvt->filter_index]) {
508 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
509 }
510}
511
512static void SDLCALL
513SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
514{
515 const Sint32 *src = (const Sint32 *) cvt->buf;
516 float *dst = (float *) cvt->buf;
517 int i;
518
519 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
520
521 /* Get dst aligned to 16 bytes */
522 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
523 *dst = ((float) (*src>>8)) * DIVBY8388607;
524 }
525
526 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
527
528 /* Make sure src is aligned too. */
529 if ((((size_t) src) & 15) == 0) {
530 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
531 const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
532 const __m128i *mmsrc = (const __m128i *) src;
533 while (i >= 4) { /* 4 * sint32 */
534 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
535 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
536 i -= 4; mmsrc++; dst += 4;
537 }
538 src = (const Sint32 *) mmsrc;
539 }
540
541 /* Finish off any leftovers with scalar operations. */
542 while (i) {
543 *dst = ((float) (*src>>8)) * DIVBY8388607;
544 i--; src++; dst++;
545 }
546
547 if (cvt->filters[++cvt->filter_index]) {
548 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
549 }
550}
551
552static void SDLCALL
553SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
554{
555 const float *src = (const float *) cvt->buf;
556 Sint8 *dst = (Sint8 *) cvt->buf;
557 int i;
558
559 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
560
561 /* Get dst aligned to 16 bytes */
562 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
563 const float sample = *src;
564 if (sample >= 1.0f) {
565 *dst = 127;
566 } else if (sample <= -1.0f) {
567 *dst = -128;
568 } else {
569 *dst = (Sint8)(sample * 127.0f);
570 }
571 }
572
573 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
574
575 /* Make sure src is aligned too. */
576 if ((((size_t) src) & 15) == 0) {
577 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
578 const __m128 one = _mm_set1_ps(1.0f);
579 const __m128 negone = _mm_set1_ps(-1.0f);
580 const __m128 mulby127 = _mm_set1_ps(127.0f);
581 __m128i *mmdst = (__m128i *) dst;
582 while (i >= 16) { /* 16 * float32 */
583 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
584 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
585 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
586 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
587 _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */
588 i -= 16; src += 16; mmdst++;
589 }
590 dst = (Sint8 *) mmdst;
591 }
592
593 /* Finish off any leftovers with scalar operations. */
594 while (i) {
595 const float sample = *src;
596 if (sample >= 1.0f) {
597 *dst = 127;
598 } else if (sample <= -1.0f) {
599 *dst = -128;
600 } else {
601 *dst = (Sint8)(sample * 127.0f);
602 }
603 i--; src++; dst++;
604 }
605
606 cvt->len_cvt /= 4;
607 if (cvt->filters[++cvt->filter_index]) {
608 cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
609 }
610}
611
612static void SDLCALL
613SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
614{
615 const float *src = (const float *) cvt->buf;
616 Uint8 *dst = cvt->buf;
617 int i;
618
619 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
620
621 /* Get dst aligned to 16 bytes */
622 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
623 const float sample = *src;
624 if (sample >= 1.0f) {
625 *dst = 255;
626 } else if (sample <= -1.0f) {
627 *dst = 0;
628 } else {
629 *dst = (Uint8)((sample + 1.0f) * 127.0f);
630 }
631 }
632
633 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
634
635 /* Make sure src is aligned too. */
636 if ((((size_t) src) & 15) == 0) {
637 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
638 const __m128 one = _mm_set1_ps(1.0f);
639 const __m128 negone = _mm_set1_ps(-1.0f);
640 const __m128 mulby127 = _mm_set1_ps(127.0f);
641 __m128i *mmdst = (__m128i *) dst;
642 while (i >= 16) { /* 16 * float32 */
643 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
644 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
645 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
646 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
647 _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */
648 i -= 16; src += 16; mmdst++;
649 }
650 dst = (Uint8 *) mmdst;
651 }
652
653 /* Finish off any leftovers with scalar operations. */
654 while (i) {
655 const float sample = *src;
656 if (sample >= 1.0f) {
657 *dst = 255;
658 } else if (sample <= -1.0f) {
659 *dst = 0;
660 } else {
661 *dst = (Uint8)((sample + 1.0f) * 127.0f);
662 }
663 i--; src++; dst++;
664 }
665
666 cvt->len_cvt /= 4;
667 if (cvt->filters[++cvt->filter_index]) {
668 cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
669 }
670}
671
672static void SDLCALL
673SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
674{
675 const float *src = (const float *) cvt->buf;
676 Sint16 *dst = (Sint16 *) cvt->buf;
677 int i;
678
679 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
680
681 /* Get dst aligned to 16 bytes */
682 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
683 const float sample = *src;
684 if (sample >= 1.0f) {
685 *dst = 32767;
686 } else if (sample <= -1.0f) {
687 *dst = -32768;
688 } else {
689 *dst = (Sint16)(sample * 32767.0f);
690 }
691 }
692
693 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
694
695 /* Make sure src is aligned too. */
696 if ((((size_t) src) & 15) == 0) {
697 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
698 const __m128 one = _mm_set1_ps(1.0f);
699 const __m128 negone = _mm_set1_ps(-1.0f);
700 const __m128 mulby32767 = _mm_set1_ps(32767.0f);
701 __m128i *mmdst = (__m128i *) dst;
702 while (i >= 8) { /* 8 * float32 */
703 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
704 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
705 _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2)); /* pack to sint16, store out. */
706 i -= 8; src += 8; mmdst++;
707 }
708 dst = (Sint16 *) mmdst;
709 }
710
711 /* Finish off any leftovers with scalar operations. */
712 while (i) {
713 const float sample = *src;
714 if (sample >= 1.0f) {
715 *dst = 32767;
716 } else if (sample <= -1.0f) {
717 *dst = -32768;
718 } else {
719 *dst = (Sint16)(sample * 32767.0f);
720 }
721 i--; src++; dst++;
722 }
723
724 cvt->len_cvt /= 2;
725 if (cvt->filters[++cvt->filter_index]) {
726 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
727 }
728}
729
730static void SDLCALL
731SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
732{
733 const float *src = (const float *) cvt->buf;
734 Uint16 *dst = (Uint16 *) cvt->buf;
735 int i;
736
737 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
738
739 /* Get dst aligned to 16 bytes */
740 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
741 const float sample = *src;
742 if (sample >= 1.0f) {
743 *dst = 65535;
744 } else if (sample <= -1.0f) {
745 *dst = 0;
746 } else {
747 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
748 }
749 }
750
751 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
752
753 /* Make sure src is aligned too. */
754 if ((((size_t) src) & 15) == 0) {
755 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
756 /* This calculates differently than the scalar path because SSE2 can't
757 pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
758 saturation, so that would corrupt our data. _mm_packus_epi32 exists,
759 but not before SSE 4.1. So we convert from float to sint16, packing
760 that down with legit signed saturation, and then xor the top bit
761 against 1. This results in the correct unsigned 16-bit value, even
762 though it looks like dark magic. */
763 const __m128 mulby32767 = _mm_set1_ps(32767.0f);
764 const __m128i topbit = _mm_set1_epi16(-32768);
765 const __m128 one = _mm_set1_ps(1.0f);
766 const __m128 negone = _mm_set1_ps(-1.0f);
767 __m128i *mmdst = (__m128i *) dst;
768 while (i >= 8) { /* 8 * float32 */
769 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
770 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
771 _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit)); /* pack to sint16, xor top bit, store out. */
772 i -= 8; src += 8; mmdst++;
773 }
774 dst = (Uint16 *) mmdst;
775 }
776
777 /* Finish off any leftovers with scalar operations. */
778 while (i) {
779 const float sample = *src;
780 if (sample >= 1.0f) {
781 *dst = 65535;
782 } else if (sample <= -1.0f) {
783 *dst = 0;
784 } else {
785 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
786 }
787 i--; src++; dst++;
788 }
789
790 cvt->len_cvt /= 2;
791 if (cvt->filters[++cvt->filter_index]) {
792 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
793 }
794}
795
796static void SDLCALL
797SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
798{
799 const float *src = (const float *) cvt->buf;
800 Sint32 *dst = (Sint32 *) cvt->buf;
801 int i;
802
803 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
804
805 /* Get dst aligned to 16 bytes */
806 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
807 const float sample = *src;
808 if (sample >= 1.0f) {
809 *dst = 2147483647;
810 } else if (sample <= -1.0f) {
811 *dst = (Sint32) -2147483648LL;
812 } else {
813 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
814 }
815 }
816
817 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
818 SDL_assert(!i || ((((size_t) src) & 15) == 0));
819
820 {
821 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
822 const __m128 one = _mm_set1_ps(1.0f);
823 const __m128 negone = _mm_set1_ps(-1.0f);
824 const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
825 __m128i *mmdst = (__m128i *) dst;
826 while (i >= 4) { /* 4 * float32 */
827 _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8)); /* load 4 floats, clamp, convert to sint32 */
828 i -= 4; src += 4; mmdst++;
829 }
830 dst = (Sint32 *) mmdst;
831 }
832
833 /* Finish off any leftovers with scalar operations. */
834 while (i) {
835 const float sample = *src;
836 if (sample >= 1.0f) {
837 *dst = 2147483647;
838 } else if (sample <= -1.0f) {
839 *dst = (Sint32) -2147483648LL;
840 } else {
841 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
842 }
843 i--; src++; dst++;
844 }
845
846 if (cvt->filters[++cvt->filter_index]) {
847 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
848 }
849}
850#endif
851
852
853#if HAVE_NEON_INTRINSICS
854static void SDLCALL
855SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
856{
857 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
858 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
859 int i;
860
861 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
862
863 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
864 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
865 *dst = ((float) *src) * DIVBY128;
866 }
867
868 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
869 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
870
871 /* Make sure src is aligned too. */
872 if ((((size_t) src) & 15) == 0) {
873 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
874 const int8_t *mmsrc = (const int8_t *) src;
875 const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
876 while (i >= 16) { /* 16 * 8-bit */
877 const int8x16_t bytes = vld1q_s8(mmsrc); /* get 16 sint8 into a NEON register. */
878 const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes)); /* convert top 8 bytes to 8 int16 */
879 const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes)); /* convert bottom 8 bytes to 8 int16 */
880 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
881 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
882 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
883 vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
884 vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
885 i -= 16; mmsrc -= 16; dst -= 16;
886 }
887
888 src = (const Sint8 *) mmsrc;
889 }
890
891 src += 15; dst += 15; /* adjust for any scalar finishing. */
892
893 /* Finish off any leftovers with scalar operations. */
894 while (i) {
895 *dst = ((float) *src) * DIVBY128;
896 i--; src--; dst--;
897 }
898
899 cvt->len_cvt *= 4;
900 if (cvt->filters[++cvt->filter_index]) {
901 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
902 }
903}
904
905static void SDLCALL
906SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
907{
908 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
909 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
910 int i;
911
912 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
913
914 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
915 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
916 *dst = (((float) *src) * DIVBY128) - 1.0f;
917 }
918
919 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
920 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
921
922 /* Make sure src is aligned too. */
923 if ((((size_t) src) & 15) == 0) {
924 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
925 const uint8_t *mmsrc = (const uint8_t *) src;
926 const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
927 const float32x4_t negone = vdupq_n_f32(-1.0f);
928 while (i >= 16) { /* 16 * 8-bit */
929 const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
930 const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
931 const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
932 /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
933 vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
934 vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
935 vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
936 vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
937 i -= 16; mmsrc -= 16; dst -= 16;
938 }
939
940 src = (const Uint8 *) mmsrc;
941 }
942
943 src += 15; dst += 15; /* adjust for any scalar finishing. */
944
945 /* Finish off any leftovers with scalar operations. */
946 while (i) {
947 *dst = (((float) *src) * DIVBY128) - 1.0f;
948 i--; src--; dst--;
949 }
950
951 cvt->len_cvt *= 4;
952 if (cvt->filters[++cvt->filter_index]) {
953 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
954 }
955}
956
957static void SDLCALL
958SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
959{
960 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
961 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
962 int i;
963
964 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
965
966 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
967 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
968 *dst = ((float) *src) * DIVBY32768;
969 }
970
971 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
972 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
973
974 /* Make sure src is aligned too. */
975 if ((((size_t) src) & 15) == 0) {
976 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
977 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
978 while (i >= 8) { /* 8 * 16-bit */
979 const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
980 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
981 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
982 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
983 i -= 8; src -= 8; dst -= 8;
984 }
985 }
986
987 src += 7; dst += 7; /* adjust for any scalar finishing. */
988
989 /* Finish off any leftovers with scalar operations. */
990 while (i) {
991 *dst = ((float) *src) * DIVBY32768;
992 i--; src--; dst--;
993 }
994
995 cvt->len_cvt *= 2;
996 if (cvt->filters[++cvt->filter_index]) {
997 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
998 }
999}
1000
1001static void SDLCALL
1002SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1003{
1004 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
1005 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
1006 int i;
1007
1008 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
1009
1010 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
1011 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
1012 *dst = (((float) *src) * DIVBY32768) - 1.0f;
1013 }
1014
1015 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
1016 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1017
1018 /* Make sure src is aligned too. */
1019 if ((((size_t) src) & 15) == 0) {
1020 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1021 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
1022 const float32x4_t negone = vdupq_n_f32(-1.0f);
1023 while (i >= 8) { /* 8 * 16-bit */
1024 const uint16x8_t uints = vld1q_u16((uint16_t const *) src); /* get 8 uint16 into a NEON register. */
1025 /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
1026 vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
1027 vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
1028 i -= 8; src -= 8; dst -= 8;
1029 }
1030 }
1031
1032 src += 7; dst += 7; /* adjust for any scalar finishing. */
1033
1034 /* Finish off any leftovers with scalar operations. */
1035 while (i) {
1036 *dst = (((float) *src) * DIVBY32768) - 1.0f;
1037 i--; src--; dst--;
1038 }
1039
1040 cvt->len_cvt *= 2;
1041 if (cvt->filters[++cvt->filter_index]) {
1042 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
1043 }
1044}
1045
1046static void SDLCALL
1047SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1048{
1049 const Sint32 *src = (const Sint32 *) cvt->buf;
1050 float *dst = (float *) cvt->buf;
1051 int i;
1052
1053 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
1054
1055 /* Get dst aligned to 16 bytes */
1056 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1057 *dst = ((float) (*src>>8)) * DIVBY8388607;
1058 }
1059
1060 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1061
1062 /* Make sure src is aligned too. */
1063 if ((((size_t) src) & 15) == 0) {
1064 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1065 const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
1066 const int32_t *mmsrc = (const int32_t *) src;
1067 while (i >= 4) { /* 4 * sint32 */
1068 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
1069 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
1070 i -= 4; mmsrc += 4; dst += 4;
1071 }
1072 src = (const Sint32 *) mmsrc;
1073 }
1074
1075 /* Finish off any leftovers with scalar operations. */
1076 while (i) {
1077 *dst = ((float) (*src>>8)) * DIVBY8388607;
1078 i--; src++; dst++;
1079 }
1080
1081 if (cvt->filters[++cvt->filter_index]) {
1082 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
1083 }
1084}
1085
1086static void SDLCALL
1087SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1088{
1089 const float *src = (const float *) cvt->buf;
1090 Sint8 *dst = (Sint8 *) cvt->buf;
1091 int i;
1092
1093 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
1094
1095 /* Get dst aligned to 16 bytes */
1096 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1097 const float sample = *src;
1098 if (sample >= 1.0f) {
1099 *dst = 127;
1100 } else if (sample <= -1.0f) {
1101 *dst = -128;
1102 } else {
1103 *dst = (Sint8)(sample * 127.0f);
1104 }
1105 }
1106
1107 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1108
1109 /* Make sure src is aligned too. */
1110 if ((((size_t) src) & 15) == 0) {
1111 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1112 const float32x4_t one = vdupq_n_f32(1.0f);
1113 const float32x4_t negone = vdupq_n_f32(-1.0f);
1114 const float32x4_t mulby127 = vdupq_n_f32(127.0f);
1115 int8_t *mmdst = (int8_t *) dst;
1116 while (i >= 16) { /* 16 * float32 */
1117 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1118 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1119 const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1120 const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1121 const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
1122 const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
1123 vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi)); /* combine to int8x16_t, store out */
1124 i -= 16; src += 16; mmdst += 16;
1125 }
1126 dst = (Sint8 *) mmdst;
1127 }
1128
1129 /* Finish off any leftovers with scalar operations. */
1130 while (i) {
1131 const float sample = *src;
1132 if (sample >= 1.0f) {
1133 *dst = 127;
1134 } else if (sample <= -1.0f) {
1135 *dst = -128;
1136 } else {
1137 *dst = (Sint8)(sample * 127.0f);
1138 }
1139 i--; src++; dst++;
1140 }
1141
1142 cvt->len_cvt /= 4;
1143 if (cvt->filters[++cvt->filter_index]) {
1144 cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
1145 }
1146}
1147
1148static void SDLCALL
1149SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1150{
1151 const float *src = (const float *) cvt->buf;
1152 Uint8 *dst = (Uint8 *) cvt->buf;
1153 int i;
1154
1155 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
1156
1157 /* Get dst aligned to 16 bytes */
1158 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1159 const float sample = *src;
1160 if (sample >= 1.0f) {
1161 *dst = 255;
1162 } else if (sample <= -1.0f) {
1163 *dst = 0;
1164 } else {
1165 *dst = (Uint8)((sample + 1.0f) * 127.0f);
1166 }
1167 }
1168
1169 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1170
1171 /* Make sure src is aligned too. */
1172 if ((((size_t) src) & 15) == 0) {
1173 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1174 const float32x4_t one = vdupq_n_f32(1.0f);
1175 const float32x4_t negone = vdupq_n_f32(-1.0f);
1176 const float32x4_t mulby127 = vdupq_n_f32(127.0f);
1177 uint8_t *mmdst = (uint8_t *) dst;
1178 while (i >= 16) { /* 16 * float32 */
1179 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1180 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1181 const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1182 const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1183 const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
1184 const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
1185 vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi)); /* combine to uint8x16_t, store out */
1186 i -= 16; src += 16; mmdst += 16;
1187 }
1188
1189 dst = (Uint8 *) mmdst;
1190 }
1191
1192 /* Finish off any leftovers with scalar operations. */
1193 while (i) {
1194 const float sample = *src;
1195 if (sample >= 1.0f) {
1196 *dst = 255;
1197 } else if (sample <= -1.0f) {
1198 *dst = 0;
1199 } else {
1200 *dst = (Uint8)((sample + 1.0f) * 127.0f);
1201 }
1202 i--; src++; dst++;
1203 }
1204
1205 cvt->len_cvt /= 4;
1206 if (cvt->filters[++cvt->filter_index]) {
1207 cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
1208 }
1209}
1210
1211static void SDLCALL
1212SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1213{
1214 const float *src = (const float *) cvt->buf;
1215 Sint16 *dst = (Sint16 *) cvt->buf;
1216 int i;
1217
1218 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
1219
1220 /* Get dst aligned to 16 bytes */
1221 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1222 const float sample = *src;
1223 if (sample >= 1.0f) {
1224 *dst = 32767;
1225 } else if (sample <= -1.0f) {
1226 *dst = -32768;
1227 } else {
1228 *dst = (Sint16)(sample * 32767.0f);
1229 }
1230 }
1231
1232 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1233
1234 /* Make sure src is aligned too. */
1235 if ((((size_t) src) & 15) == 0) {
1236 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1237 const float32x4_t one = vdupq_n_f32(1.0f);
1238 const float32x4_t negone = vdupq_n_f32(-1.0f);
1239 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
1240 int16_t *mmdst = (int16_t *) dst;
1241 while (i >= 8) { /* 8 * float32 */
1242 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
1243 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
1244 vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, store out. */
1245 i -= 8; src += 8; mmdst += 8;
1246 }
1247 dst = (Sint16 *) mmdst;
1248 }
1249
1250 /* Finish off any leftovers with scalar operations. */
1251 while (i) {
1252 const float sample = *src;
1253 if (sample >= 1.0f) {
1254 *dst = 32767;
1255 } else if (sample <= -1.0f) {
1256 *dst = -32768;
1257 } else {
1258 *dst = (Sint16)(sample * 32767.0f);
1259 }
1260 i--; src++; dst++;
1261 }
1262
1263 cvt->len_cvt /= 2;
1264 if (cvt->filters[++cvt->filter_index]) {
1265 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
1266 }
1267}
1268
1269static void SDLCALL
1270SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1271{
1272 const float *src = (const float *) cvt->buf;
1273 Uint16 *dst = (Uint16 *) cvt->buf;
1274 int i;
1275
1276 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
1277
1278 /* Get dst aligned to 16 bytes */
1279 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1280 const float sample = *src;
1281 if (sample >= 1.0f) {
1282 *dst = 65535;
1283 } else if (sample <= -1.0f) {
1284 *dst = 0;
1285 } else {
1286 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
1287 }
1288 }
1289
1290 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1291
1292 /* Make sure src is aligned too. */
1293 if ((((size_t) src) & 15) == 0) {
1294 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1295 const float32x4_t one = vdupq_n_f32(1.0f);
1296 const float32x4_t negone = vdupq_n_f32(-1.0f);
1297 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
1298 uint16_t *mmdst = (uint16_t *) dst;
1299 while (i >= 8) { /* 8 * float32 */
1300 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */
1301 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */
1302 vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, store out. */
1303 i -= 8; src += 8; mmdst += 8;
1304 }
1305 dst = (Uint16 *) mmdst;
1306 }
1307
1308 /* Finish off any leftovers with scalar operations. */
1309 while (i) {
1310 const float sample = *src;
1311 if (sample >= 1.0f) {
1312 *dst = 65535;
1313 } else if (sample <= -1.0f) {
1314 *dst = 0;
1315 } else {
1316 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
1317 }
1318 i--; src++; dst++;
1319 }
1320
1321 cvt->len_cvt /= 2;
1322 if (cvt->filters[++cvt->filter_index]) {
1323 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
1324 }
1325}
1326
1327static void SDLCALL
1328SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1329{
1330 const float *src = (const float *) cvt->buf;
1331 Sint32 *dst = (Sint32 *) cvt->buf;
1332 int i;
1333
1334 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
1335
1336 /* Get dst aligned to 16 bytes */
1337 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1338 const float sample = *src;
1339 if (sample >= 1.0f) {
1340 *dst = 2147483647;
1341 } else if (sample <= -1.0f) {
1342 *dst = (-2147483647) - 1;
1343 } else {
1344 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
1345 }
1346 }
1347
1348 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1349 SDL_assert(!i || ((((size_t) src) & 15) == 0));
1350
1351 {
1352 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1353 const float32x4_t one = vdupq_n_f32(1.0f);
1354 const float32x4_t negone = vdupq_n_f32(-1.0f);
1355 const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
1356 int32_t *mmdst = (int32_t *) dst;
1357 while (i >= 4) { /* 4 * float32 */
1358 vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
1359 i -= 4; src += 4; mmdst += 4;
1360 }
1361 dst = (Sint32 *) mmdst;
1362 }
1363
1364 /* Finish off any leftovers with scalar operations. */
1365 while (i) {
1366 const float sample = *src;
1367 if (sample >= 1.0f) {
1368 *dst = 2147483647;
1369 } else if (sample <= -1.0f) {
1370 *dst = (-2147483647) - 1;
1371 } else {
1372 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
1373 }
1374 i--; src++; dst++;
1375 }
1376
1377 if (cvt->filters[++cvt->filter_index]) {
1378 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
1379 }
1380}
1381#endif
1382
1383
1384
1385void SDL_ChooseAudioConverters(void)
1386{
1387 static SDL_bool converters_chosen = SDL_FALSE;
1388
1389 if (converters_chosen) {
1390 return;
1391 }
1392
1393#define SET_CONVERTER_FUNCS(fntype) \
1394 SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
1395 SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
1396 SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
1397 SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
1398 SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
1399 SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
1400 SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
1401 SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
1402 SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
1403 SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
1404 converters_chosen = SDL_TRUE
1405
1406#if HAVE_SSE2_INTRINSICS
1407 if (SDL_HasSSE2()) {
1408 SET_CONVERTER_FUNCS(SSE2);
1409 return;
1410 }
1411#endif
1412
1413#if HAVE_NEON_INTRINSICS
1414 if (SDL_HasNEON()) {
1415 SET_CONVERTER_FUNCS(NEON);
1416 return;
1417 }
1418#endif
1419
1420#if NEED_SCALAR_CONVERTER_FALLBACKS
1421 SET_CONVERTER_FUNCS(Scalar);
1422#endif
1423
1424#undef SET_CONVERTER_FUNCS
1425
1426 SDL_assert(converters_chosen == SDL_TRUE);
1427}
1428
1429/* vi: set ts=4 sw=4 expandtab: */
1430