void Float32ToNativeInt32( const float *src, int *dst, unsigned int numToConvert )
{
    const float *src0 = src;
    int *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 4) {
        // vector -- requires 4+ samples
        ROUNDMODE_NEG_INF
        const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
        const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
        const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
        const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
        __m128 vf0;
        __m128i vi0;

#define F32TOLE32(x) \
        vf##x = _mm_mul_ps(vf##x, vscale); \
        vf##x = _mm_add_ps(vf##x, vround); \
        vf##x = _mm_max_ps(vf##x, vmin); \
        vf##x = _mm_min_ps(vf##x, vmax); \
        vi##x = _mm_cvtps_epi32(vf##x);

        int falign = (uintptr_t)src & 0xF;
        int ialign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vf0 = _mm_loadu_ps(src);
            F32TOLE32(0)
            _mm_storeu_si128((__m128i *)dst, vi0);
            // and advance such that the destination ints are aligned
            unsigned int n = (16 - ialign) / 4;
            src += n;   dst += n;   count -= n;

            falign = (uintptr_t)src & 0xF;
            if (falign != 0) {
                // unaligned loads, aligned stores
                while (count >= 4) {
                    vf0 = _mm_loadu_ps(src);
                    F32TOLE32(0)
                    _mm_store_si128((__m128i *)dst, vi0);
                    src += 4;   dst += 4;   count -= 4;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 4) {
            vf0 = _mm_load_ps(src);
            F32TOLE32(0)
            _mm_store_si128((__m128i *)dst, vi0);
            src += 4;   dst += 4;   count -= 4;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + numToConvert - 4;
            vf0 = _mm_loadu_ps(src);
            F32TOLE32(0)
            _mm_storeu_si128((__m128i *)dst, vi0);
        }
        RESTORE_ROUNDMODE
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
        ROUNDMODE_NEG_INF
        while (count-- > 0) {
            double f0 = *src++;
            f0 = f0 * scale + round;
            int i0 = FloatToInt(f0, min32, max32);
            *dst++ = i0;
        }
        RESTORE_ROUNDMODE
    }
}

void NativeInt32ToFloat32( const int *src, float *dst, unsigned int numToConvert )
{
    const int *src0 = src;
    float *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 4) {
        // vector -- requires 4+ samples
#define LEI32TOF32(x) \
        vf##x = _mm_cvtepi32_ps(vi##x); \
        vf##x = _mm_mul_ps(vf##x, vscale);

        const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
        __m128 vf0;
        __m128i vi0;

        int ialign = (uintptr_t)src & 0xF;
        int falign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vi0 = _mm_loadu_si128((__m128i const *)src);
            LEI32TOF32(0)
            _mm_storeu_ps(dst, vf0);
            // and advance such that the destination floats are aligned
            unsigned int n = (16 - falign) / 4;
            src += n;   dst += n;   count -= n;

            ialign = (uintptr_t)src & 0xF;
            if (ialign != 0) {
                // unaligned loads, aligned stores
                while (count >= 4) {
                    vi0 = _mm_loadu_si128((__m128i const *)src);
                    LEI32TOF32(0)
                    _mm_store_ps(dst, vf0);
                    src += 4;   dst += 4;   count -= 4;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 4) {
            vi0 = _mm_load_si128((__m128i const *)src);
            LEI32TOF32(0)
            _mm_store_ps(dst, vf0);
            src += 4;   dst += 4;   count -= 4;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + numToConvert - 4;
            vi0 = _mm_loadu_si128((__m128i const *)src);
            LEI32TOF32(0)
            _mm_storeu_ps(dst, vf0);
        }
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 1./2147483648.0f;
        while (count-- > 0) {
            int i = *src++;
            double f = (double)i * scale;
            *dst++ = f;
        }
    }
}
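// ===================================================================================================
// Usage sketch (illustrative, not part of the library): round-trip a small buffer through the two
// converters above. The function name and sample values are hypothetical. Floats in [-1.0, 1.0)
// map onto the full signed 32-bit range (scale 2^31); the reverse conversion multiplies by 2^-31.
static void ExampleInt32RoundTrip(void)
{
    float in[8] = { 0.0f, 0.5f, -0.5f, 0.25f, -1.0f, 0.9f, 0.1f, -0.1f };
    int ints[8];
    float out[8];

    Float32ToNativeInt32(in, ints, 8);      // e.g. 0.5f -> 0x40000000
    NativeInt32ToFloat32(ints, out, 8);     // recovers the inputs up to rounding in the low bits
}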
int alsa_set_hwparams(alsa_dev_t *dev, snd_pcm_t *handle,
                      snd_pcm_hw_params_t *params, snd_pcm_access_t access)
{
    unsigned int rrate;
    snd_pcm_uframes_t size;
    int err, dir;

    /* choose all parameters */
    err = snd_pcm_hw_params_any(handle, params);
    if (err < 0) {
        printf("Broken configuration for playback: no configurations available: %s\n", snd_strerror(err));
        return err;
    }
    /* set the interleaved read/write format */
    err = snd_pcm_hw_params_set_access(handle, params, access);
    if (err < 0) {
        printf("Access type not available for playback: %s\n", snd_strerror(err));
        return err;
    }
    /* set the sample format */
    err = snd_pcm_hw_params_set_format(handle, params, dev->format);
    if (err < 0) {
        printf("Sample format not available for playback: %s\n", snd_strerror(err));
        return err;
    }
    /* set the count of channels */
    err = snd_pcm_hw_params_set_channels(handle, params, dev->channels);
    if (err < 0) {
        printf("Channel count (%d) not available for playback: %s\n", dev->channels, snd_strerror(err));
        return err;
    }
    /* set the stream rate */
    rrate = dev->rate;
    err = snd_pcm_hw_params_set_rate_near(handle, params, &rrate, 0);
    if (err < 0) {
        printf("Rate %d Hz not available for playback: %s\n", dev->rate, snd_strerror(err));
        return err;
    }
    if (rrate != dev->rate) {
        printf("Rate doesn't match (requested %d Hz, got %d Hz)\n", dev->rate, rrate);
        return -EINVAL;
    }
    /* set the period size */
    err = snd_pcm_hw_params_set_period_size(handle, params, dev->period_size, 0);
    if (err < 0) {
        printf("Unable to set period size %d for playback: %s\n", (int)dev->period_size, snd_strerror(err));
        return err;
    }
    err = snd_pcm_hw_params_get_period_size(params, &size, &dir);
    if (err < 0) {
        printf("Unable to get period size for playback: %s\n", snd_strerror(err));
        return err;
    }
    if (dev->period_size != size) {
        printf("Period size doesn't match (requested %d, got %d)\n", (int)dev->period_size, (int)size);
        return -EINVAL;
    }
    /* set the buffer size */
    err = snd_pcm_hw_params_set_buffer_size(handle, params, dev->buffer_size);
    if (err < 0) {
        printf("Unable to set buffer size %d for playback: %s\n", (int)dev->buffer_size, snd_strerror(err));
        return err;
    }
    err = snd_pcm_hw_params_get_buffer_size(params, &size);
    if (err < 0) {
        printf("Unable to get buffer size for playback: %s\n", snd_strerror(err));
        return err;
    }
    if (size != (snd_pcm_uframes_t)dev->buffer_size) {
        printf("Buffer size doesn't match (requested %d, got %d)\n", (int)dev->buffer_size, (int)size);
        return -EINVAL;
    }
    /* write the parameters to device */
    err = snd_pcm_hw_params(handle, params);
    if (err < 0) {
        printf("Unable to set hw params for playback: %s\n", snd_strerror(err));
        return err;
    }
    return 0;
}

int alsa_set_swparams(alsa_dev_t *dev, snd_pcm_t *handle, snd_pcm_sw_params_t *swparams)
{
    int err;

    /* get the current swparams */
    err = snd_pcm_sw_params_current(handle, swparams);
    if (err < 0) {
        printf("Unable to determine current swparams for playback: %s\n", snd_strerror(err));
        return err;
    }
    /* allow the transfer when at least period_size samples can be processed */
    /* or disable this mechanism when period event is enabled (aka interrupt-like style processing) */
    err = snd_pcm_sw_params_set_avail_min(handle, swparams, dev->period_size);
    if (err < 0) {
        printf("Unable to set avail min for playback: %s\n", snd_strerror(err));
        return err;
    }
    /* enable period events */
    err = snd_pcm_sw_params_set_period_event(handle, swparams, 1);
    if (err < 0) {
        printf("Unable to set period event: %s\n", snd_strerror(err));
        return err;
    }
    /* write the parameters to the playback device */
    err = snd_pcm_sw_params(handle, swparams);
    if (err < 0) {
        printf("Unable to set sw params for playback: %s\n", snd_strerror(err));
        return err;
    }
    return 0;
}
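/* ===================================================================================================
 * Usage sketch (illustrative): how alsa_set_hwparams()/alsa_set_swparams() above are typically
 * driven. The device name "default" and the function name are assumptions for the example; only
 * the alsa_dev_t fields referenced above (format, channels, rate, period_size, buffer_size) are
 * relied upon. */
static int example_open_playback(alsa_dev_t *dev, snd_pcm_t **handlep)
{
    snd_pcm_t *handle;
    snd_pcm_hw_params_t *hwparams;
    snd_pcm_sw_params_t *swparams;
    int err;

    snd_pcm_hw_params_alloca(&hwparams);
    snd_pcm_sw_params_alloca(&swparams);

    err = snd_pcm_open(&handle, "default", SND_PCM_STREAM_PLAYBACK, 0);
    if (err < 0) {
        printf("Playback open error: %s\n", snd_strerror(err));
        return err;
    }
    if ((err = alsa_set_hwparams(dev, handle, hwparams, SND_PCM_ACCESS_RW_INTERLEAVED)) < 0 ||
        (err = alsa_set_swparams(dev, handle, swparams)) < 0) {
        snd_pcm_close(handle);
        return err;
    }
    *handlep = handle;
    return 0;
}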
// ===================================================================================================

void Float32ToNativeInt16_X86( const Float32 *src, SInt16 *dst, unsigned int numToConvert )
{
    const float *src0 = src;
    int16_t *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 8) {
        // vector -- requires 8+ samples
        ROUNDMODE_NEG_INF
        const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
        const __m128 vmin = (const __m128) { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
        const __m128 vmax = (const __m128) { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
        const __m128 vscale = (const __m128) { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
        __m128 vf0, vf1;
        __m128i vi0, vi1, vpack0;

#define F32TOLE16 \
        vf0 = _mm_mul_ps(vf0, vscale); \
        vf1 = _mm_mul_ps(vf1, vscale); \
        vf0 = _mm_add_ps(vf0, vround); \
        vf1 = _mm_add_ps(vf1, vround); \
        vf0 = _mm_max_ps(vf0, vmin); \
        vf1 = _mm_max_ps(vf1, vmin); \
        vf0 = _mm_min_ps(vf0, vmax); \
        vf1 = _mm_min_ps(vf1, vmax); \
        vi0 = _mm_cvtps_epi32(vf0); \
        vi1 = _mm_cvtps_epi32(vf1); \
        vpack0 = _mm_packs_epi32(vi0, vi1);

        int falign = (uintptr_t)src & 0xF;
        int ialign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vf0 = _mm_loadu_ps(src);
            vf1 = _mm_loadu_ps(src+4);
            F32TOLE16
            _mm_storeu_si128((__m128i *)dst, vpack0);
            // advance such that the destination ints are aligned
            unsigned int n = (16 - ialign) / 2;
            src += n;   dst += n;   count -= n;

            falign = (uintptr_t)src & 0xF;
            if (falign != 0) {
                // unaligned loads, aligned stores
                while (count >= 8) {
                    vf0 = _mm_loadu_ps(src);
                    vf1 = _mm_loadu_ps(src+4);
                    F32TOLE16
                    _mm_store_si128((__m128i *)dst, vpack0);
                    src += 8;   dst += 8;   count -= 8;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 8) {
            vf0 = _mm_load_ps(src);
            vf1 = _mm_load_ps(src+4);
            F32TOLE16
            _mm_store_si128((__m128i *)dst, vpack0);
            src += 8;   dst += 8;   count -= 8;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 8;
            dst = dst0 + numToConvert - 8;
            vf0 = _mm_loadu_ps(src);
            vf1 = _mm_loadu_ps(src+4);
            F32TOLE16
            _mm_storeu_si128((__m128i *)dst, vpack0);
        }
        RESTORE_ROUNDMODE
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 2147483648.0, round = 32768.0, max32 = 2147483648.0 - 1.0 - 32768.0, min32 = 0.;
        ROUNDMODE_NEG_INF
        while (count-- > 0) {
            double f0 = *src++;
            f0 = f0 * scale + round;
            SInt32 i0 = FloatToInt(f0, min32, max32);
            i0 >>= 16;
            *dst++ = i0;
        }
        RESTORE_ROUNDMODE
    }
}

// ===================================================================================================

void Float32ToSwapInt16_X86( const Float32 *src, SInt16 *dst, unsigned int numToConvert )
{
    const float *src0 = src;
    int16_t *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 8) {
        // vector -- requires 8+ samples
        ROUNDMODE_NEG_INF
        const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
        const __m128 vmin = (const __m128) { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
        const __m128 vmax = (const __m128) { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
        const __m128 vscale = (const __m128) { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
        __m128 vf0, vf1;
        __m128i vi0, vi1, vpack0;

#define F32TOBE16 \
        vf0 = _mm_mul_ps(vf0, vscale); \
        vf1 = _mm_mul_ps(vf1, vscale); \
        vf0 = _mm_add_ps(vf0, vround); \
        vf1 = _mm_add_ps(vf1, vround); \
        vf0 = _mm_max_ps(vf0, vmin); \
        vf1 = _mm_max_ps(vf1, vmin); \
        vf0 = _mm_min_ps(vf0, vmax); \
        vf1 = _mm_min_ps(vf1, vmax); \
        vi0 = _mm_cvtps_epi32(vf0); \
        vi1 = _mm_cvtps_epi32(vf1); \
        vpack0 = _mm_packs_epi32(vi0, vi1); \
        vpack0 = byteswap16(vpack0);

        int falign = (uintptr_t)src & 0xF;
        int ialign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vf0 = _mm_loadu_ps(src);
            vf1 = _mm_loadu_ps(src+4);
            F32TOBE16
            _mm_storeu_si128((__m128i *)dst, vpack0);
            // and advance such that the destination ints are aligned
            unsigned int n = (16 - ialign) / 2;
            src += n;   dst += n;   count -= n;

            falign = (uintptr_t)src & 0xF;
            if (falign != 0) {
                // unaligned loads, aligned stores
                while (count >= 8) {
                    vf0 = _mm_loadu_ps(src);
                    vf1 = _mm_loadu_ps(src+4);
                    F32TOBE16
                    _mm_store_si128((__m128i *)dst, vpack0);
                    src += 8;   dst += 8;   count -= 8;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 8) {
            vf0 = _mm_load_ps(src);
            vf1 = _mm_load_ps(src+4);
            F32TOBE16
            _mm_store_si128((__m128i *)dst, vpack0);
            src += 8;   dst += 8;   count -= 8;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 8;
            dst = dst0 + numToConvert - 8;
            vf0 = _mm_loadu_ps(src);
            vf1 = _mm_loadu_ps(src+4);
            F32TOBE16
            _mm_storeu_si128((__m128i *)dst, vpack0);
        }
        RESTORE_ROUNDMODE
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 2147483648.0, round = 32768.0, max32 = 2147483648.0 - 1.0 - 32768.0, min32 = 0.;
        ROUNDMODE_NEG_INF
        while (count-- > 0) {
            double f0 = *src++;
            f0 = f0 * scale + round;
            SInt32 i0 = FloatToInt(f0, min32, max32);
            i0 >>= 16;
            *dst++ = OSSwapInt16(i0);
        }
        RESTORE_ROUNDMODE
    }
}
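// ===================================================================================================
// Reference sketch (illustrative; the function name is hypothetical): the arithmetic the 16-bit
// paths above implement, one sample at a time. The vector code gets round-to-nearest by adding 0.5
// under round-toward-negative-infinity; plain lrint() (assumes <math.h>) rounds ties to even, so
// exact .5 inputs may differ by one code. Useful as a cross-check against the SSE paths.
static inline SInt16 Float32ToInt16_Reference(Float32 f)
{
    double scaled = (double)f * 32768.0;        // [-1.0, 1.0) -> [-32768.0, 32768.0)
    if (scaled > 32767.0) scaled = 32767.0;     // clamp, as vmin/vmax do above
    if (scaled < -32768.0) scaled = -32768.0;
    return (SInt16)lrint(scaled);
}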
// ===================================================================================================

void Float32ToNativeInt32_X86( const Float32 *src, SInt32 *dst, unsigned int numToConvert )
{
    const float *src0 = src;
    SInt32 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 4) {
        // vector -- requires 4+ samples
        ROUNDMODE_NEG_INF
        const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
        const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
        const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
        const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
        __m128 vf0;
        __m128i vi0;

#define F32TOLE32(x) \
        vf##x = _mm_mul_ps(vf##x, vscale); \
        vf##x = _mm_add_ps(vf##x, vround); \
        vf##x = _mm_max_ps(vf##x, vmin); \
        vf##x = _mm_min_ps(vf##x, vmax); \
        vi##x = _mm_cvtps_epi32(vf##x);

        int falign = (uintptr_t)src & 0xF;
        int ialign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vf0 = _mm_loadu_ps(src);
            F32TOLE32(0)
            _mm_storeu_si128((__m128i *)dst, vi0);
            // and advance such that the destination ints are aligned
            unsigned int n = (16 - ialign) / 4;
            src += n;   dst += n;   count -= n;

            falign = (uintptr_t)src & 0xF;
            if (falign != 0) {
                // unaligned loads, aligned stores
                while (count >= 4) {
                    vf0 = _mm_loadu_ps(src);
                    F32TOLE32(0)
                    _mm_store_si128((__m128i *)dst, vi0);
                    src += 4;   dst += 4;   count -= 4;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 4) {
            vf0 = _mm_load_ps(src);
            F32TOLE32(0)
            _mm_store_si128((__m128i *)dst, vi0);
            src += 4;   dst += 4;   count -= 4;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + numToConvert - 4;
            vf0 = _mm_loadu_ps(src);
            F32TOLE32(0)
            _mm_storeu_si128((__m128i *)dst, vi0);
        }
        RESTORE_ROUNDMODE
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
        ROUNDMODE_NEG_INF
        while (count-- > 0) {
            double f0 = *src++;
            f0 = f0 * scale + round;
            SInt32 i0 = FloatToInt(f0, min32, max32);
            *dst++ = i0;
        }
        RESTORE_ROUNDMODE
    }
}

// ===================================================================================================

void Float32ToSwapInt32_X86( const Float32 *src, SInt32 *dst, unsigned int numToConvert )
{
    const float *src0 = src;
    SInt32 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 4) {
        // vector -- requires 4+ samples
        ROUNDMODE_NEG_INF
        const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
        const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
        const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
        const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
        __m128 vf0;
        __m128i vi0;

#define F32TOBE32(x) \
        vf##x = _mm_mul_ps(vf##x, vscale); \
        vf##x = _mm_add_ps(vf##x, vround); \
        vf##x = _mm_max_ps(vf##x, vmin); \
        vf##x = _mm_min_ps(vf##x, vmax); \
        vi##x = _mm_cvtps_epi32(vf##x); \
        vi##x = byteswap32(vi##x);

        int falign = (uintptr_t)src & 0xF;
        int ialign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vf0 = _mm_loadu_ps(src);
            F32TOBE32(0)
            _mm_storeu_si128((__m128i *)dst, vi0);
            // and advance such that the destination ints are aligned
            unsigned int n = (16 - ialign) / 4;
            src += n;   dst += n;   count -= n;

            falign = (uintptr_t)src & 0xF;
            if (falign != 0) {
                // unaligned loads, aligned stores
                while (count >= 4) {
                    vf0 = _mm_loadu_ps(src);
                    F32TOBE32(0)
                    _mm_store_si128((__m128i *)dst, vi0);
                    src += 4;   dst += 4;   count -= 4;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 4) {
            vf0 = _mm_load_ps(src);
            F32TOBE32(0)
            _mm_store_si128((__m128i *)dst, vi0);
            src += 4;   dst += 4;   count -= 4;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + numToConvert - 4;
            vf0 = _mm_loadu_ps(src);
            F32TOBE32(0)
            _mm_storeu_si128((__m128i *)dst, vi0);
        }
        RESTORE_ROUNDMODE
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
        ROUNDMODE_NEG_INF
        while (count-- > 0) {
            double f0 = *src++;
            f0 = f0 * scale + round;
            SInt32 i0 = FloatToInt(f0, min32, max32);
            *dst++ = OSSwapInt32(i0);
        }
        RESTORE_ROUNDMODE
    }
}

// ===================================================================================================

// ~14 instructions
static inline __m128i Pack32ToLE24(__m128i val, __m128i mask)
{
    __m128i store;
    val = _mm_srli_si128(val, 1);
    store = _mm_and_si128(val, mask);

    val = _mm_srli_si128(val, 1);
    mask = _mm_slli_si128(mask, 3);
    store = _mm_or_si128(store, _mm_and_si128(val, mask));

    val = _mm_srli_si128(val, 1);
    mask = _mm_slli_si128(mask, 3);
    store = _mm_or_si128(store, _mm_and_si128(val, mask));

    val = _mm_srli_si128(val, 1);
    mask = _mm_slli_si128(mask, 3);
    store = _mm_or_si128(store, _mm_and_si128(val, mask));
    return store;
}

// marginally faster than scalar
void Float32ToNativeInt24_X86( const Float32 *src, UInt8 *dst, unsigned int numToConvert )
{
    const Float32 *src0 = src;
    UInt8 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 6) {
        // vector -- requires 6+ samples
        ROUNDMODE_NEG_INF
        const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
        const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
        const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
        const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
        // it is actually cheaper to copy and shift this mask on the fly than to have 4 of them
        __m128i mask = _mm_setr_epi32(0x00FFFFFF, 0, 0, 0);
        __m128i store;
        union {
            UInt32 i[4];
            __m128i v;
        } u;
        __m128 vf0;
        __m128i vi0;

        int falign = (uintptr_t)src & 0xF;
        if (falign != 0) {
            // do one unaligned conversion
            vf0 = _mm_loadu_ps(src);
            F32TOLE32(0)
            store = Pack32ToLE24(vi0, mask);
            _mm_storeu_si128((__m128i *)dst, store);
            // and advance such that the source floats are aligned
            unsigned int n = (16 - falign) / 4;
            src += n;
            dst += 3*n;     // bytes
            count -= n;
        }

        while (count >= 6) {
            vf0 = _mm_load_ps(src);
            F32TOLE32(0)
            store = Pack32ToLE24(vi0, mask);
            _mm_storeu_si128((__m128i *)dst, store);    // destination always unaligned
            src += 4;
            dst += 12;      // bytes
            count -= 4;
        }

        if (count >= 4) {
            vf0 = _mm_load_ps(src);
            F32TOLE32(0)
            u.v = Pack32ToLE24(vi0, mask);
            ((UInt32 *)dst)[0] = u.i[0];
            ((UInt32 *)dst)[1] = u.i[1];
            ((UInt32 *)dst)[2] = u.i[2];
            src += 4;
            dst += 12;      // bytes
            count -= 4;
        }

        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + 3*numToConvert - 12;
            vf0 = _mm_loadu_ps(src);
            F32TOLE32(0)
            u.v = Pack32ToLE24(vi0, mask);
            ((UInt32 *)dst)[0] = u.i[0];
            ((UInt32 *)dst)[1] = u.i[1];
            ((UInt32 *)dst)[2] = u.i[2];
        }
        RESTORE_ROUNDMODE
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
        ROUNDMODE_NEG_INF
        while (count-- > 0) {
            double f0 = *src++;
            f0 = f0 * scale + round;
            UInt32 i0 = FloatToInt(f0, min32, max32);
            dst[0] = (UInt8)(i0 >> 8);
            dst[1] = (UInt8)(i0 >> 16);
            dst[2] = (UInt8)(i0 >> 24);
            dst += 3;
        }
        RESTORE_ROUNDMODE
    }
}
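// ===================================================================================================
// Reference sketch (illustrative; hypothetical name): what Pack32ToLE24() computes, scalar-style.
// Each 32-bit sample keeps its top three bytes, stored little-endian as three packed bytes, so four
// samples fill the low 12 bytes of the 16-byte vector; the top 4 bytes are don't-cares.
static void Pack32ToLE24_Scalar(const UInt32 in[4], UInt8 out[12])
{
    int i;
    for (i = 0; i < 4; ++i) {
        UInt32 v = in[i] >> 8;              // drop the low byte, as the shift/mask dance does above
        out[3*i + 0] = (UInt8)v;
        out[3*i + 1] = (UInt8)(v >> 8);
        out[3*i + 2] = (UInt8)(v >> 16);
    }
}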
// ===================================================================================================
#pragma mark -
#pragma mark Int -> Float

void NativeInt16ToFloat32_X86( const SInt16 *src, Float32 *dst, unsigned int numToConvert )
{
    const SInt16 *src0 = src;
    Float32 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 8) {
        // vector -- requires 8+ samples

        // convert the 16-bit words to the high word of 32-bit values
#define LEI16TOF32(x, y) \
        vi##x = _mm_unpacklo_epi16(zero, vpack##x); \
        vi##y = _mm_unpackhi_epi16(zero, vpack##x); \
        vf##x = _mm_cvtepi32_ps(vi##x); \
        vf##y = _mm_cvtepi32_ps(vi##y); \
        vf##x = _mm_mul_ps(vf##x, vscale); \
        vf##y = _mm_mul_ps(vf##y, vscale);

        const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
        const __m128i zero = _mm_setzero_si128();
        __m128 vf0, vf1;
        __m128i vi0, vi1, vpack0;

        int ialign = (uintptr_t)src & 0xF;
        int falign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vpack0 = _mm_loadu_si128((__m128i const *)src);
            LEI16TOF32(0, 1)
            _mm_storeu_ps(dst, vf0);
            _mm_storeu_ps(dst+4, vf1);
            // and advance such that the destination floats are aligned
            unsigned int n = (16 - falign) / 4;
            src += n;   dst += n;   count -= n;

            ialign = (uintptr_t)src & 0xF;
            if (ialign != 0) {
                // unaligned loads, aligned stores
                while (count >= 8) {
                    vpack0 = _mm_loadu_si128((__m128i const *)src);
                    LEI16TOF32(0, 1)
                    _mm_store_ps(dst, vf0);
                    _mm_store_ps(dst+4, vf1);
                    src += 8;   dst += 8;   count -= 8;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 8) {
            vpack0 = _mm_load_si128((__m128i const *)src);
            LEI16TOF32(0, 1)
            _mm_store_ps(dst, vf0);
            _mm_store_ps(dst+4, vf1);
            src += 8;   dst += 8;   count -= 8;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 8;
            dst = dst0 + numToConvert - 8;
            vpack0 = _mm_loadu_si128((__m128i const *)src);
            LEI16TOF32(0, 1)
            _mm_storeu_ps(dst, vf0);
            _mm_storeu_ps(dst+4, vf1);
        }
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 1./32768.f;
        while (count-- > 0) {
            SInt16 i = *src++;
            double f = (double)i * scale;
            *dst++ = f;
        }
    }
}

// ===================================================================================================

void SwapInt16ToFloat32_X86( const SInt16 *src, Float32 *dst, unsigned int numToConvert )
{
    const SInt16 *src0 = src;
    Float32 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 8) {
        // vector -- requires 8+ samples

        // convert the 16-bit words to the high word of 32-bit values
#define BEI16TOF32 \
        vpack0 = byteswap16(vpack0); \
        vi0 = _mm_unpacklo_epi16(zero, vpack0); \
        vi1 = _mm_unpackhi_epi16(zero, vpack0); \
        vf0 = _mm_cvtepi32_ps(vi0); \
        vf1 = _mm_cvtepi32_ps(vi1); \
        vf0 = _mm_mul_ps(vf0, vscale); \
        vf1 = _mm_mul_ps(vf1, vscale);

        const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
        const __m128i zero = _mm_setzero_si128();
        __m128 vf0, vf1;
        __m128i vi0, vi1, vpack0;

        int ialign = (uintptr_t)src & 0xF;
        int falign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vpack0 = _mm_loadu_si128((__m128i const *)src);
            BEI16TOF32
            _mm_storeu_ps(dst, vf0);
            _mm_storeu_ps(dst+4, vf1);
            // and advance such that the destination floats are aligned
            unsigned int n = (16 - falign) / 4;
            src += n;   dst += n;   count -= n;

            ialign = (uintptr_t)src & 0xF;
            if (ialign != 0) {
                // unaligned loads, aligned stores
                while (count >= 8) {
                    vpack0 = _mm_loadu_si128((__m128i const *)src);
                    BEI16TOF32
                    _mm_store_ps(dst, vf0);
                    _mm_store_ps(dst+4, vf1);
                    src += 8;   dst += 8;   count -= 8;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 8) {
            vpack0 = _mm_load_si128((__m128i const *)src);
            BEI16TOF32
            _mm_store_ps(dst, vf0);
            _mm_store_ps(dst+4, vf1);
            src += 8;   dst += 8;   count -= 8;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 8;
            dst = dst0 + numToConvert - 8;
            vpack0 = _mm_loadu_si128((__m128i const *)src);
            BEI16TOF32
            _mm_storeu_ps(dst, vf0);
            _mm_storeu_ps(dst+4, vf1);
        }
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 1./32768.f;
        while (count-- > 0) {
            SInt16 i = *src++;
            i = OSSwapInt16(i);
            double f = (double)i * scale;
            *dst++ = f;
        }
    }
}

// ===================================================================================================

void NativeInt32ToFloat32_X86( const SInt32 *src, Float32 *dst, unsigned int numToConvert )
{
    const SInt32 *src0 = src;
    Float32 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 4) {
        // vector -- requires 4+ samples
#define LEI32TOF32(x) \
        vf##x = _mm_cvtepi32_ps(vi##x); \
        vf##x = _mm_mul_ps(vf##x, vscale);

        const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
        __m128 vf0;
        __m128i vi0;

        int ialign = (uintptr_t)src & 0xF;
        int falign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vi0 = _mm_loadu_si128((__m128i const *)src);
            LEI32TOF32(0)
            _mm_storeu_ps(dst, vf0);
            // and advance such that the destination floats are aligned
            unsigned int n = (16 - falign) / 4;
            src += n;   dst += n;   count -= n;

            ialign = (uintptr_t)src & 0xF;
            if (ialign != 0) {
                // unaligned loads, aligned stores
                while (count >= 4) {
                    vi0 = _mm_loadu_si128((__m128i const *)src);
                    LEI32TOF32(0)
                    _mm_store_ps(dst, vf0);
                    src += 4;   dst += 4;   count -= 4;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 4) {
            vi0 = _mm_load_si128((__m128i const *)src);
            LEI32TOF32(0)
            _mm_store_ps(dst, vf0);
            src += 4;   dst += 4;   count -= 4;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + numToConvert - 4;
            vi0 = _mm_loadu_si128((__m128i const *)src);
            LEI32TOF32(0)
            _mm_storeu_ps(dst, vf0);
        }
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 1./2147483648.0f;
        while (count-- > 0) {
            SInt32 i = *src++;
            double f = (double)i * scale;
            *dst++ = f;
        }
    }
}

// ===================================================================================================

void SwapInt32ToFloat32_X86( const SInt32 *src, Float32 *dst, unsigned int numToConvert )
{
    const SInt32 *src0 = src;
    Float32 *dst0 = dst;
    unsigned int count = numToConvert;

    if (count >= 4) {
        // vector -- requires 4+ samples
#define BEI32TOF32(x) \
        vi##x = byteswap32(vi##x); \
        vf##x = _mm_cvtepi32_ps(vi##x); \
        vf##x = _mm_mul_ps(vf##x, vscale);

        const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
        __m128 vf0;
        __m128i vi0;

        int ialign = (uintptr_t)src & 0xF;
        int falign = (uintptr_t)dst & 0xF;

        if (falign != 0 || ialign != 0) {
            // do one unaligned conversion
            vi0 = _mm_loadu_si128((__m128i const *)src);
            BEI32TOF32(0)
            _mm_storeu_ps(dst, vf0);
            // and advance such that the destination floats are aligned
            unsigned int n = (16 - falign) / 4;
            src += n;   dst += n;   count -= n;

            ialign = (uintptr_t)src & 0xF;
            if (ialign != 0) {
                // unaligned loads, aligned stores
                while (count >= 4) {
                    vi0 = _mm_loadu_si128((__m128i const *)src);
                    BEI32TOF32(0)
                    _mm_store_ps(dst, vf0);
                    src += 4;   dst += 4;   count -= 4;
                }
                goto VectorCleanup;
            }
        }

        // aligned loads, aligned stores
        while (count >= 4) {
            vi0 = _mm_load_si128((__m128i const *)src);
            BEI32TOF32(0)
            _mm_store_ps(dst, vf0);
            src += 4;   dst += 4;   count -= 4;
        }

VectorCleanup:
        if (count > 0) {
            // unaligned cleanup -- just do one unaligned vector at the end
            src = src0 + numToConvert - 4;
            dst = dst0 + numToConvert - 4;
            vi0 = _mm_loadu_si128((__m128i const *)src);
            BEI32TOF32(0)
            _mm_storeu_ps(dst, vf0);
        }
        return;
    }

    // scalar for small numbers of samples
    if (count > 0) {
        double scale = 1./2147483648.0f;
        while (count-- > 0) {
            SInt32 i = *src++;
            i = OSSwapInt32(i);
            double f = (double)i * scale;
            *dst++ = f;
        }
    }
}
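// ===================================================================================================
// The byteswap16()/byteswap32() helpers used above are defined elsewhere in this library. For
// reference, one plausible SSE2-only implementation is sketched below (an assumption, not
// necessarily the original definitions).
static inline __m128i byteswap16_sketch(__m128i v)
{
    // swap the two bytes within each 16-bit lane
    return _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
}

static inline __m128i byteswap32_sketch(__m128i v)
{
    // swap bytes within each 16-bit lane, then swap the 16-bit halves of each 32-bit lane
    v = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2, 3, 0, 1));
    v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(2, 3, 0, 1));
    return v;
}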