/*
 * Converts signed 16-bit samples to floats, eight samples per SSE2 step.
 *
 * out     - destination floats (unaligned stores are used)
 * in      - source samples (unaligned loads are used)
 * samples - number of samples to convert
 * gain    - multiplier applied to every converted sample
 *
 * Whatever does not fill a full group of eight is forwarded to
 * audio_convert_s16_to_float_C() with the advanced pointers.
 */
void audio_convert_s16_to_float_SSE2(float *out, const int16_t *in, size_t samples, float gain)
{
   /* Samples are unpacked into the upper 16 bits of each 32-bit lane
    * (i.e. multiplied by 0x10000), so that factor is part of the divisor. */
   const float fgain    = gain / (0x7fff * 0x10000);
   const __m128 factor  = _mm_set1_ps(fgain);
   size_t i             = 0;

   while (i + 8 <= samples)
   {
      const __m128i input = _mm_loadu_si128((const __m128i *)in);
      const __m128i lo    = _mm_unpacklo_epi16(_mm_setzero_si128(), input);
      const __m128i hi    = _mm_unpackhi_epi16(_mm_setzero_si128(), input);
      const __m128 out_lo = _mm_mul_ps(_mm_cvtepi32_ps(lo), factor);
      const __m128 out_hi = _mm_mul_ps(_mm_cvtepi32_ps(hi), factor);

      _mm_storeu_ps(out + 0, out_lo);
      _mm_storeu_ps(out + 4, out_hi);

      i   += 8;
      in  += 8;
      out += 8;
   }

   /* Scalar fallback for the remaining (fewer than eight) samples. */
   audio_convert_s16_to_float_C(out, in, samples - i, gain);
}
// Widens the eight 16-bit lanes of `src` to 32 bits by interleaving them with
// `zero`, then converts them to floats: lanes 0-3 go to src0, lanes 4-7 to src1.
// With an all-zero `zero` register this is a zero extension.
void	ProxyRwSse2 <SplFmt_INT16>::finish_read_flt (__m128 &src0, __m128 &src1, const __m128i &src, const __m128i &zero)
{
	// Both halves are widened before either output is written.
	const __m128i	lanes_lo = _mm_unpacklo_epi16 (src, zero);
	const __m128i	lanes_hi = _mm_unpackhi_epi16 (src, zero);

	src0 = _mm_cvtepi32_ps (lanes_lo);
	src1 = _mm_cvtepi32_ps (lanes_hi);
}
// Converts `size` signed 16-bit samples from `input` to floats in `output`,
// scaling by 1/32767. Groups of eight go through SSE2 unless
// CL_DISABLE_SSE2 is defined; the rest is handled by a scalar loop.
void SoundSSE::unpack_16bit_mono(short *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	// Number of samples covered by whole groups of eight.
	const int sse_size = (size / 8) * 8;

	const __m128i zero = _mm_setzero_si128();
	const __m128 scale = _mm_set1_ps(1.0f / 32767.0f);

	for (int pos = 0; pos < sse_size; pos += 8)
	{
		const __m128i packed = _mm_loadu_si128((__m128i*)(input + pos));

		// Put each sample in the upper half of a 32-bit lane, then shift it
		// back arithmetically: this sign-extends 16 -> 32 bits.
		const __m128i wide_lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, packed), 16);
		const __m128i wide_hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, packed), 16);

		const __m128 scaled_lo = _mm_mul_ps(_mm_cvtepi32_ps(wide_lo), scale);
		const __m128 scaled_hi = _mm_mul_ps(_mm_cvtepi32_ps(wide_hi), scale);

		_mm_storeu_ps(output + pos + 0, scaled_lo);
		_mm_storeu_ps(output + pos + 4, scaled_hi);
	}
#else
	const int sse_size = 0;
#endif

	// Scalar conversion of whatever the vector loop did not cover.
	for (int pos = sse_size; pos < size; pos++)
	{
		output[pos] = ((float) input[pos]) / 32767.0f;
	}
}
/*
 * Converts `samples` RGBA pixels from 16-bit unsigned components to float
 * components and returns `samples`.
 *
 * When both buffers are 16-byte aligned, pixels are converted two at a time
 * (eight components per step) and multiplied by u16_float; any components
 * left over, or everything if the buffers are not aligned, go through the
 * scalar loop, which multiplies by 1/65535.
 */
static long
conv_rgba16_rgbaF (const uint16_t *src, float *dst, long samples)
{
  long i = 0;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      /* Number of whole pixel pairs. */
      const long     pairs = samples / 2;
      const __m128i *s     = (const __m128i*) src;
            __v4sf  *d     = (__v4sf*) dst;

      while (i < pairs)
        {
          /* Interleave with zero: each 16-bit component becomes a
           * zero-extended 32-bit integer. */
          const __m128i wide_lo = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
          const __m128i wide_hi = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());

          const __m128  flt_lo = _mm_cvtepi32_ps (wide_lo);
          const __m128  flt_hi = _mm_cvtepi32_ps (wide_hi);

          const __v4sf pixel0 = flt_lo * u16_float;
          const __v4sf pixel1 = flt_hi * u16_float;

          d[2 * i + 0] = pixel0;
          d[2 * i + 1] = pixel1;

          i++;
        }
      _mm_empty();
    }

  /* i counted pixel pairs so far; turn it into a component index
   * (2 pixels * 4 components) and finish the remainder one by one. */
  for (i *= 2 * 4; i != 4 * samples; i++)
    dst[i] = src[i] * (1.f / 65535);

  return samples;
}
// Returns ValidSqrt(sqsum * windowArea - sum * sum) for four lanes, where
// `sum` and `sqsum` are the float-converted results of Sum32ii() on hid.p and
// hid.pq at the given offset.
SIMD_INLINE __m128 Norm32fi(const HidHaarCascade & hid, size_t offset)
{
    const __m128 windowArea = _mm_set1_ps(hid.windowArea);
    const __m128 sum = _mm_cvtepi32_ps(Sum32ii(hid.p, offset));
    const __m128 sqsum = _mm_cvtepi32_ps(Sum32ii(hid.pq, offset));

    const __m128 scaledSqsum = _mm_mul_ps(sqsum, windowArea);
    const __m128 squaredSum = _mm_mul_ps(sum, sum);

    return ValidSqrt(_mm_sub_ps(scaledSqsum, squaredSum));
}
// De-interleaves `size` signed 16-bit stereo samples (L R L R ...) from
// `input` into two float channels: even-indexed samples go to output[0],
// odd-indexed samples to output[1]. Every sample is scaled by 1/32767.
//
// Groups of eight input samples go through SSE2 unless CL_DISABLE_SSE2 is
// defined; the rest is handled by the scalar loop, which consumes samples in
// pairs (NOTE(review): this presumably requires `size` to be even - confirm
// against callers).
void SoundSSE::unpack_16bit_stereo(short *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	// Must match the 32767.0f divisor of the scalar loop below; otherwise the
	// vectorised part of the buffer would be scaled differently from its tail.
	__m128 constant1 = _mm_set1_ps(1.0f/32767.0f);

	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));

		// Sample in the upper 16 bits + arithmetic shift right = sign extension.
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);

		// samples0/samples1 hold L R L R; gather the even lanes (left) and
		// the odd lanes (right) into one register each.
		__m128 tmp0, tmp1;
		tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
		tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
		_mm_storeu_ps(output[0]+i/2, tmp0);
		_mm_storeu_ps(output[1]+i/2, tmp1);
	}
#else
	const int sse_size = 0;
#endif

	// unpack remaining
	for (int i = sse_size; i < size; i+=2)
	{
		output[0][i/2] = ((float) input[i]) / 32767.0f;
		output[1][i/2] = ((float) input[i+1]) / 32767.0f;
	}
}
static void SSE2_StereoMixToFloat(const int32 *pSrc, float *pOut1, float *pOut2, uint32 nCount, const float _i2fc) //---------------------------------------------------------------------------------------------------------------- { __m128 i2fc = _mm_load_ps1(&_i2fc); const __m128i *in = reinterpret_cast<const __m128i *>(pSrc); // We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE nCount = (nCount + 3) / 4; do { __m128i i1 = _mm_loadu_si128(in); // Load four integer values, LRLR __m128i i2 = _mm_loadu_si128(in + 1); // Load four integer values, LRLR in += 2; __m128 f1 = _mm_cvtepi32_ps(i1); // Convert to four floats, LRLR __m128 f2 = _mm_cvtepi32_ps(i2); // Convert to four floats, LRLR f1 = _mm_mul_ps(f1, i2fc); // Apply int->float factor f2 = _mm_mul_ps(f2, i2fc); // Apply int->float factor __m128 fl = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(2, 0, 2, 0)); // LRLR+LRLR => LLLL __m128 fr = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(3, 1, 3, 1)); // LRLR+LRLR => RRRR _mm_storeu_ps(pOut1, fl); // Store four float values, LLLL _mm_storeu_ps(pOut2, fr); // Store four float values, RRRR pOut1 += 4; pOut2 += 4; } while(--nCount); }
//---------------------------------------------------------------------------- void conv_Short1ToFloat1(void* dst, const void* s, s32 numSamples) { LSfloat* d = reinterpret_cast<LSfloat*>(dst); const LSshort* src = reinterpret_cast<const LSshort*>(s); s32 num = numSamples >> 3; //8個のshortをまとめて処理 s32 offset = num << 3; s32 rem = numSamples - offset; const __m128i izero = _mm_setzero_si128(); const __m128 fcoff = _mm_set1_ps(1.0f/32767.0f); const LSshort* p = src; LSfloat* q = d; for(s32 i=0; i<num; ++i){ //32bit浮動小数点r0, r1に変換 __m128i t0 = _mm_loadu_si128((const __m128i*)p); __m128i t1 = _mm_cmpgt_epi16(izero, t0); __m128 r0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(t0, t1)); __m128 r1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(t0, t1)); r0 = _mm_mul_ps(r0, fcoff); r1 = _mm_mul_ps(r1, fcoff); _mm_storeu_ps((q+0), r1); _mm_storeu_ps((q+4), r0); p += 8; q += 8; } for(s32 i=0; i<rem; ++i){ q[i] = toFloat(p[i]); } }
/*
 * Converts an 8-bit plane to floats without scaling, 16 pixels per step.
 *
 * Aligned loads and stores are used, and each row is processed in whole
 * groups of 16, so the last group may extend past `width`. src_stride is
 * added to the byte pointer and dst_stride to the float pointer after every
 * row. radius, kernel and buff are not used here.
 */
static void GF_FUNC_ALIGN VS_CC
convert_to_float_8bit(int radius, float *kernel, const uint8_t *srcp,
                      float *buff, float *dstp, int width, int height,
                      int src_stride, int dst_stride)
{
    const __m128i zero = _mm_setzero_si128();

    for (int row = 0; row < height; row++, srcp += src_stride, dstp += dst_stride) {
        for (int x = 0; x < width; x += 16) {
            const __m128i bytes = _mm_load_si128((__m128i *)(srcp + x));

            /* 8 -> 16 bits by interleaving with zero. */
            const __m128i words_lo = _mm_unpacklo_epi8(bytes, zero);
            const __m128i words_hi = _mm_unpackhi_epi8(bytes, zero);

            /* 16 -> 32 bits the same way, then to float. */
            const __m128 q0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(words_lo, zero));
            const __m128 q1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(words_lo, zero));
            const __m128 q2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(words_hi, zero));
            const __m128 q3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(words_hi, zero));

            float *out = dstp + x;
            _mm_store_ps(out,      q0);
            _mm_store_ps(out +  4, q1);
            _mm_store_ps(out +  8, q2);
            _mm_store_ps(out + 12, q3);
        }
    }
}
void ConvertInt8ToFloat(__m128i in_input, __m128 *out_output) { __m128i input_16, input_32; // convert first half to 16bit integer input_16 = _mm_unpacklo_epi8(in_input, ZERO); // convert first fourth to 32bit interger input_32 = _mm_unpacklo_epi16(input_16, ZERO); // convert first fourth to 32bit floating point value out_output[0] = _mm_cvtepi32_ps(input_32); // convert second fourth to 32bit integer input_32 = _mm_unpackhi_epi16(input_16, ZERO); // convert second fourth to 32bit floating point value out_output[1] = _mm_cvtepi32_ps(input_32); // convert second half to 16bit integer input_16 = _mm_unpackhi_epi8(in_input, ZERO); // convert third fourth to 32bit interger input_32 = _mm_unpacklo_epi16(input_16, ZERO); // convert third fourth to 32bit floating point value out_output[2] = _mm_cvtepi32_ps(input_32); // convert fourth fourth to 32bit integer input_32 = _mm_unpackhi_epi16(input_16, ZERO); // convert fourth fourth to 32bit floating point value out_output[3] = _mm_cvtepi32_ps(input_32); }
static inline void yuv_to_packed_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch, const int spitch, const int width, const int height, void* _buff) noexcept { const uint8_t* sr = srcp[0]; const uint8_t* sg = srcp[1]; const uint8_t* sb = srcp[2]; uint8_t* d = dstp[0]; float* buff = reinterpret_cast<float*>(_buff); const uint8_t *rlsb, *glsb, *blsb; if (STACK16) { rlsb = sr + height * spitch; glsb = sg + height * spitch; blsb = sb + height * spitch; } const __m128i zero = _mm_setzero_si128(); const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255)); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 4) { __m128i r, g, b; if (!STACK16) { r = _mm_cvtepu8_epi32(loadl(sr + x)); g = _mm_cvtepu8_epi32(loadl(sg + x)); b = _mm_cvtepu8_epi32(loadl(sb + x)); } else { r = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(rlsb + x), loadl(sr + x)), zero); g = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(glsb + x), loadl(sg + x)), zero); b = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(blsb + x), loadl(sb + x)), zero); } __m128i rg = _mm_unpacklo_epi32(r, g); __m128i ba = _mm_unpacklo_epi32(b, zero); __m128 rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba)); __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba)); _mm_store_ps(buff + 4 * x + 0, _mm_mul_ps(rgba0, rcp)); _mm_store_ps(buff + 4 * x + 4, _mm_mul_ps(rgba1, rcp)); rg = _mm_unpackhi_epi32(r, g); ba = _mm_unpackhi_epi32(b, zero); rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba)); rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba)); _mm_store_ps(buff + 4 * x + 8, _mm_mul_ps(rgba0, rcp)); _mm_store_ps(buff + 4 * x + 12, _mm_mul_ps(rgba1, rcp)); } convert_float_to_half(d, buff, width * 4); d += dpitch; sr += spitch; sg += spitch; sb += spitch; if (STACK16) { rlsb += spitch; glsb += spitch; blsb += spitch; } } }
// Returns a + b * (c - d) per 32-bit lane.
// The product is formed in single-precision floating point and converted
// back to integers, because plain SSE2 has no <4 x int32> multiply; callers
// are expected to be aware of that shortcut.
JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d)
{
    const __m128 b_as_float = _mm_cvtepi32_ps(b);
    const __m128 diff_as_float = _mm_cvtepi32_ps(_mm_sub_epi32(c, d));
    const __m128i product = _mm_cvtps_epi32(_mm_mul_ps(b_as_float, diff_as_float));
    return _mm_add_epi32(a, product);
}
// short -> int conversion with scaling: dst = saturate_cast<int>(src*scale + shift).
// sstep and dstep arrive in bytes and are converted to element counts first.
// When SSE2 is available, eight elements per step are converted in float
// arithmetic; the rest of each row uses the scalar expression.
template<> void
cvtScale_<short, int, float>( const short* src, size_t sstep, int* dst, size_t dstep, Size size, float scale, float shift )
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    while( size.height-- )
    {
        int x = 0;

        #if CV_SSE2
            if(USE_SSE2)//~5X
            {
                const __m128 scale128 = _mm_set1_ps (scale);
                const __m128 shift128 = _mm_set1_ps (shift);
                for(; x <= size.width - 8; x += 8 )
                {
                    const __m128i lo16 = _mm_loadl_epi64((const __m128i*)(src + x));
                    const __m128i hi16 = _mm_loadl_epi64((const __m128i*)(src + x + 4));

                    // Duplicate each short into both halves of a 32-bit lane,
                    // then shift right arithmetically: sign extension.
                    __m128 lof = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(lo16, lo16), 16));
                    __m128 hif = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(hi16, hi16), 16));

                    lof = _mm_add_ps(_mm_mul_ps(lof, scale128), shift128);
                    hif = _mm_add_ps(_mm_mul_ps(hif, scale128), shift128);

                    const __m128i lo32 = _mm_cvtps_epi32(lof);
                    const __m128i hi32 = _mm_cvtps_epi32(hif);

                    _mm_storeu_si128((__m128i*)(dst + x), lo32);
                    _mm_storeu_si128((__m128i*)(dst + x + 4), hi32);
                }
            }
        #endif

        //We will wait Haswell
        /*
        #if CV_AVX
            if(USE_AVX)//2X - bad variant
            {
                ////TODO:AVX implementation (optimization?) required
                __m256 scale256 = _mm256_set1_ps (scale);
                __m256 shift256 = _mm256_set1_ps (shift);
                for(; x <= size.width - 8; x += 8 )
                {
                    __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
                    __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
                    __m256i res = _mm256_cvtps_epi32(r0);
                    _mm256_storeu_si256 ((__m256i*)(dst+x), res);
                }
            }
        #endif*/

        // Scalar remainder of the row.
        for(; x < size.width; x++ )
            dst[x] = saturate_cast<int>(src[x]*scale + shift);

        src += sstep;
        dst += dstep;
    }
}
// Takes eight 16-bit samples from each of the top/left/right/bottom
// neighbours, forms the differences (r - l) and (b - t) as floats, and
// forwards them four lanes at a time to the float overload of
// HogDirectionHistograms: lanes 0-3 with column `col`, lanes 4-7 with
// column `col + 4`. Widening interleaves with K_ZERO.
template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128i & t, const __m128i & l, const __m128i & r, const __m128i & b, Buffer & buffer, size_t col)
{
    // Lower four lanes.
    const __m128 dxLo = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(r, K_ZERO), _mm_unpacklo_epi16(l, K_ZERO)));
    const __m128 dyLo = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(b, K_ZERO), _mm_unpacklo_epi16(t, K_ZERO)));
    HogDirectionHistograms<align>(dxLo, dyLo, buffer, col + 0);

    // Upper four lanes, computed only after the first call has completed.
    const __m128 dxHi = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(r, K_ZERO), _mm_unpackhi_epi16(l, K_ZERO)));
    const __m128 dyHi = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(b, K_ZERO), _mm_unpackhi_epi16(t, K_ZERO)));
    HogDirectionHistograms<align>(dxHi, dyHi, buffer, col + 4);
}
void process( struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const float divider = (float)UINT16_MAX; const __m128 dividers = _mm_set_ps1(divider); #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) shared(ovoid) #endif for(int j = 0; j < roi_out->height; j++) { const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width; float *out = ((float *)ovoid) + (size_t)j * roi_out->width; int i = 0; int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1)); // process unaligned pixels for ( ; i < alignment ; i++, out++, in++) *out = ((float)(*in)) / divider; // process aligned pixels with SSE for( ; i < roi_out->width - (8 - 1); i += 8, in += 8) { const __m128i input = _mm_load_si128((__m128i *)in); __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0)); __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0)); __m128 flo = _mm_cvtepi32_ps(ilo); __m128 fhi = _mm_cvtepi32_ps(ihi); flo = _mm_div_ps(flo, dividers); fhi = _mm_div_ps(fhi, dividers); _mm_stream_ps(out, flo); out += 4; _mm_stream_ps(out, fhi); out += 4; } // process the rest for( ; i < roi_out->width; i++, out++, in++) *out = ((float)(*in)) / divider; } _mm_sfence(); }
/*
 * Separable filter on an 8-bit plane.
 *
 * For each output row, the (2*radius + 1) source rows currently referenced
 * by the row-pointer window are combined vertically with the weights in
 * `kernel` (16 pixels per step, accumulated as floats into `buff`); the
 * horizontal pass is then delegated to proc_horizontal(), which writes to
 * dstp.
 *
 * The window starts mirrored around row 0 (rows radius..1, 0, 1..radius).
 * After each row it is shifted by one entry, and the newest entry moves one
 * row down until y reaches height - radius - 1, after which it moves one
 * row up instead.
 *
 * The window array has 17 slots, so radius must not exceed 8.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit(int radius, float *kernel, const uint8_t *srcp, float *buff,
          float *dstp, int width, int height, int src_stride, int dst_stride)
{
    int length = radius * 2 + 1;
    const uint8_t *rows[17];
    for (int k = -radius; k <= radius; k++) {
        rows[k + radius] = srcp + abs(k) * src_stride;
    }

    __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 acc[4];
            acc[0] = _mm_setzero_ps();
            acc[1] = _mm_setzero_ps();
            acc[2] = _mm_setzero_ps();
            acc[3] = _mm_setzero_ps();

            for (int tap = 0; tap < length; tap++) {
                __m128 px[4];

                /* 16 bytes -> four vectors of four floats. */
                __m128i bytes = _mm_load_si128((__m128i *)(rows[tap] + x));
                __m128i words_hi = _mm_unpackhi_epi8(bytes, zero);
                __m128i words_lo = _mm_unpacklo_epi8(bytes, zero);
                px[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(words_lo, zero));
                px[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(words_lo, zero));
                px[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(words_hi, zero));
                px[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(words_hi, zero));

                __m128 weight = _mm_set1_ps(kernel[tap]);
                for (int q = 0; q < 4; q++) {
                    acc[q] = _mm_add_ps(acc[q], _mm_mul_ps(weight, px[q]));
                }
            }

            _mm_store_ps(buff + x,      acc[0]);
            _mm_store_ps(buff + x +  4, acc[1]);
            _mm_store_ps(buff + x +  8, acc[2]);
            _mm_store_ps(buff + x + 12, acc[3]);
        }

        proc_horizontal(buff, radius, length, width, kernel, dstp);

        /* Slide the window: drop the oldest row, derive the newest one. */
        for (int k = 0; k < length - 1; k++) {
            rows[k] = rows[k + 1];
        }
        rows[length - 1] += (y < height - radius - 1 ? 1 : -1) * src_stride;
        dstp += dst_stride;
    }
}
void conv_Short1ToFloat2(void* dst, const void* s, s32 numSamples) { LSfloat* d = reinterpret_cast<LSfloat*>(dst); const LSshort* src = reinterpret_cast<const LSshort*>(s); s32 num = numSamples >> 3; //8個のshortをまとめて処理 s32 offset = num << 3; s32 rem = numSamples - offset; const __m128i izero = _mm_setzero_si128(); const __m128 fcoff = _mm_set1_ps(1.0f/32767.0f); const LSshort* p = src; LSfloat* q = d; for(s32 i=0; i<num; ++i){ __m128i t = _mm_loadu_si128((const __m128i*)p); __m128i s16_0 = _mm_unpackhi_epi16(t, t); __m128i s16_1 = _mm_unpacklo_epi16(t, t); __m128i t1 = _mm_cmpgt_epi16(izero, s16_0); __m128i t2 = _mm_cmpgt_epi16(izero, s16_1); __m128i s32_0 = _mm_unpackhi_epi16(s16_0, t1); __m128i s32_1 = _mm_unpacklo_epi16(s16_0, t1); __m128i s32_2 = _mm_unpackhi_epi16(s16_1, t2); __m128i s32_3 = _mm_unpacklo_epi16(s16_1, t2); //32bit浮動小数点に変換 __m128 f32_0 = _mm_mul_ps(_mm_cvtepi32_ps(s32_0), fcoff); __m128 f32_1 = _mm_mul_ps(_mm_cvtepi32_ps(s32_1), fcoff); __m128 f32_2 = _mm_mul_ps(_mm_cvtepi32_ps(s32_2), fcoff); __m128 f32_3 = _mm_mul_ps(_mm_cvtepi32_ps(s32_3), fcoff); _mm_storeu_ps((q+0), f32_3); _mm_storeu_ps((q+4), f32_2); _mm_storeu_ps((q+8), f32_1); _mm_storeu_ps((q+12), f32_0); p += 8; q += 16; } for(s32 i=0; i<rem; ++i){ s32 j = i<<1; q[j+0] = toFloat(p[i]); q[j+1] = toFloat(p[i]); } }
// Converts each of three source planes to a separate destination plane.
// Every sample is converted to float, scaled by 1/255 (or 1/65535 when
// STACK16 is set), written to the scratch buffer, and each finished row is
// handed to convert_float_to_half().
//
// With STACK16 the low bytes are read from a second block located
// `height * spitch` bytes after the plane start and combined with the high
// bytes into 16-bit values. Sixteen samples are handled per step, with
// aligned loads and stores.
static inline void
yuv_to_planar_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
                       const int spitch, const int width, const int height,
                       void* _buff) noexcept
{
    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));
    float* buff = reinterpret_cast<float*>(_buff);

    for (int plane = 0; plane < 3; ++plane) {
        const uint8_t* msb_row = srcp[plane];
        const uint8_t* lsb_row = msb_row + height * spitch;
        uint8_t* out_row = dstp[plane];

        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; x += 16) {
                const __m128i msbx = _mm_load_si128(reinterpret_cast<const __m128i*>(msb_row + x));

                // Eight 16-bit samples in each of the two registers.
                __m128i words_lo, words_hi;
                if (STACK16) {
                    const __m128i lsbx = _mm_load_si128(reinterpret_cast<const __m128i*>(lsb_row + x));
                    words_lo = _mm_unpacklo_epi8(lsbx, msbx);
                    words_hi = _mm_unpackhi_epi8(lsbx, msbx);
                } else {
                    words_lo = _mm_unpacklo_epi8(msbx, zero);
                    words_hi = _mm_unpackhi_epi8(msbx, zero);
                }

                float* out = buff + x;
                __m128 f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(words_lo, zero));
                _mm_store_ps(out + 0, _mm_mul_ps(rcp, f));
                f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(words_lo, zero));
                _mm_store_ps(out + 4, _mm_mul_ps(rcp, f));
                f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(words_hi, zero));
                _mm_store_ps(out + 8, _mm_mul_ps(rcp, f));
                f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(words_hi, zero));
                _mm_store_ps(out + 12, _mm_mul_ps(rcp, f));
            }

            convert_float_to_half(out_row, buff, width);

            msb_row += spitch;
            out_row += dpitch;
            if (STACK16) {
                lsb_row += spitch;
            }
        }
    }
}
static inline __m128 log2f4(__m128 x) { __m128i exp = _mm_load_si128((__m128i*)_exp_mask); __m128i mant = _mm_load_si128((__m128i*)_mantissa_mask); __m128 one = _mm_load_ps(_ones_ps); __m128i i = _mm_castps_si128(x); __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_load_si128((__m128i*)_one27))); __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one); __m128 p; /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */ #if LOG_POLY_DEGREE == 6 p = POLY5( m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5); #elif LOG_POLY_DEGREE == 5 p = POLY4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4); #elif LOG_POLY_DEGREE == 4 p = POLY3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3); #elif LOG_POLY_DEGREE == 3 p = POLY2(m, log_p2_0, log_p2_1, log_p2_2); #else #error #endif /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ p = _mm_mul_ps(p, _mm_sub_ps(m, one)); return _mm_add_ps(p, e); }
/** * Count the number of bits set within the intersection of two * fingerprints using the SSSE3 instruction set. * @warning 1) fp1 & fp2 must be aligned to 16 byte boundaries. * 2) Use (get_cpuid_flags() & bit_SSSE3) from cpuid.h to * test if the CPU supports the SSSE3 instructions. */ int chemfp_intersect_popcount_SSSE3(int size, const unsigned *fp1, const unsigned *fp2) { #if defined(GENERATE_SSSE3) /* 2^5 loop iters might overflow 8-bit counter, so cap it at 2^4 iters per chunk */ const int iters = 1 << 4; const int N = (size + 3) / 4; int i, count; __m128i count32 = _mm_setzero_si128(); for (i = 0; i + iters * 4 <= N; i += iters * 4) { count32 = _mm_add_epi32(count32, intersect_popcount_SSSE3_helper(&fp1[i], &fp2[i], iters)); } if (i < N) { count32 = _mm_add_epi32(count32, intersect_popcount_SSSE3_helper(&fp1[i], &fp2[i], (N - i + 3) / 4)); } /* Layout coming from PSADBW accumulation is 2*{0,32}: 0 S1 0 S0 */ count = _mm_cvt_ss2si(_mm_cvtepi32_ps(_mm_add_epi32( count32, _mm_shuffle_epi32(count32, _MM_SHUFFLE(2, 2, 2, 2))))); return count; #else UNUSED(size); UNUSED(fp1); UNUSED(fp2); return 0; #endif }
static inline __m128 sigmoid_positive_ps( __m128 xin ) { union { __m128i i; int32_t i32[4]; } i; __m128 ex; float *ex_elem = (float*) &ex; __m128 x1 = _mm_min_ps( xin, tens.ps ); x1 = _mm_mul_ps( x1, tens.ps ); i.i = _mm_cvttps_epi32( x1 ); ex_elem[0] = e[i.i32[0]]; ex_elem[1] = e[i.i32[1]]; ex_elem[2] = e[i.i32[2]]; ex_elem[3] = e[i.i32[3]]; x1 = _mm_sub_ps( x1, _mm_cvtepi32_ps( i.i ) ); x1 = _mm_add_ps( x1, tens.ps ); x1 = _mm_mul_ps( x1, ex ); x1 = _mm_add_ps( x1, ones.ps ); #ifdef __FAST_MATH__ return _mm_rcp_ps( x1 ); #else return _mm_div_ps( ones.ps, x1 ); #endif }
/*
 * Produces an initial estimate (judging by the name, for a Newton
 * iteration) as a quadratic in y:
 *
 *     c0 + (c1 * norm) * y + (c2 * norm^2) * y^2
 *
 * where y is the integer difference between the bit pattern of x and
 * FLT_ONE, converted to float, and norm = exponent * ln(2) / FLT_MANTISSA.
 */
static inline __v4sf
sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
{
  const double norm = exponent*M_LN2/FLT_MANTISSA;

  /* Reinterpret the floats as integers and measure the distance to 1.0. */
  const __v4si offset = (__v4si)x - splat4i(FLT_ONE);
  const __v4sf y = _mm_cvtepi32_ps((__m128i)offset);

  const __v4sf k0 = splat4f(c0);
  const __v4sf k1 = splat4f(c1*norm);
  const __v4sf k2 = splat4f(c2*norm*norm);

  return k0 + k1*y + k2*y*y;
}
static inline __m128 lanczos_sse(__m128 width, __m128 t) { /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be * correct */ __m128i a = _mm_cvtps_epi32(t); __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a)); // Compute the correct sign for sinf(pi.r) static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT))) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000}; static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1}; static const __m128 eps = {DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON}; static const __m128 pi = {M_PI, M_PI, M_PI, M_PI}; static const __m128 pi2 = {M_PI*M_PI, M_PI*M_PI, M_PI*M_PI, M_PI*M_PI}; __m128i isign = _mm_and_si128(*(__m128i*)ione, a); isign = _mm_slli_epi64(isign, 31); isign = _mm_or_si128(*(__m128i*)fone, isign); __m128 fsign = _mm_castsi128_ps(isign); __m128 num = _mm_mul_ps(width, fsign); num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r))); num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width))); num = _mm_add_ps(eps, num); __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t)); den = _mm_add_ps(eps, den); return _mm_div_ps(num, den); }
// Computes coef[i] = scale * dot(row_i, data) for every row of the fit
// matrix, where `data` is read as a flat array of 27 signed shorts and each
// matrix row holds 32 shorts (27 used, padded to FitMatrix::PADDEDCOLS).
//
// coef - receives FitMatrix::ROWS results
// data - 3x3x3 block of samples
//
// The matrix rows are read with aligned 16-byte loads, so the matrix storage
// must be 16-byte aligned.
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS], const signed short data[3][3][3]) const
{
    assert(FitMatrix::PADDEDCOLS == 32);

    const __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
    const short* localFitMatrix = fitMatrix();

    // Load data into four SSE Registers
    const signed short* dataFlat = &data[0][0][0]; // flat array of 27 signed shorts
    __m128i x[4];
    x[0] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(dataFlat + 0));
    x[1] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(dataFlat + 8));
    x[2] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(dataFlat + 16));
    // Only elements 24..26 exist. Build the last register from those three
    // values with zeros in lanes 3..7, instead of loading 16 bytes (which
    // would read 10 bytes past the end of the array) and masking afterwards.
    x[3] = _mm_set_epi16(0, 0, 0, 0, 0, dataFlat[26], dataFlat[25], dataFlat[24]);

    for(int i = 0; i < FitMatrix::ROWS; i++)
    {
        // Compute scalar product between the 32 shorts in x[0..3] and the
        // current matrix row: multiply-add pairs, then sum the partial sums.
        __m128i sum = _mm_madd_epi16(x[0], _mm_load_si128(reinterpret_cast<const __m128i*>(localFitMatrix + 0)));
        sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], _mm_load_si128(reinterpret_cast<const __m128i*>(localFitMatrix + 8))));
        sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], _mm_load_si128(reinterpret_cast<const __m128i*>(localFitMatrix + 16))));
        sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], _mm_load_si128(reinterpret_cast<const __m128i*>(localFitMatrix + 24))));

        // Two horizontal adds fold the four 32-bit partial sums into lane 0.
        sum = _mm_hadd_epi32(sum, sum);
        sum = _mm_hadd_epi32(sum, sum);

        _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
        localFitMatrix += 32;
    }
}
void bp_ray_trace_packet (const ray4_t *ray, vector_t *colors, simd4i_t srcprim_id, int depth, simd4_t fdepth) { unsigned int activeMask; intersect4_t isect4; ASSIGN (colors [0], background); ASSIGN (colors [1], background); ASSIGN (colors [2], background); ASSIGN (colors [3], background); if ((depth > curr_scene->settings.max_trace_level) | (simd4_extract_sign (simd4_float_lt (fdepth, simd4_from_float (curr_scene->settings.adc_bailout))) == 0xf)) { return; } isect4.prim_id = simd4i_minus_ones; bp_kd_tree_packet_find_nearest (curr_scene->kd_tree_root, ray, &isect4); activeMask = _mm_movemask_ps (simd4_float_eq (_mm_cvtepi32_ps (isect4.prim_id), simd4f_minus_ones)); /* If there was no intersection terminate early */ if (activeMask == 0xF) return; bp_shade_packet (curr_scene, &isect4, ray, colors, depth, fdepth, srcprim_id); }
static inline __m128 exp2f4(__m128 x) { __m128i ipart; __m128 fpart, expipart, expfpart; x = _mm_min_ps(x, _mm_load_ps(_one29_ps)); x = _mm_max_ps(x, _mm_load_ps(_minusone27_ps)); /* ipart = int(x - 0.5) */ ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_load_ps(_half_ps))); /* fpart = x - ipart */ fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); /* expipart = (float) (1 << ipart) */ expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_load_si128((__m128i*)_one27)), 23)); /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ #if EXP_POLY_DEGREE == 5 expfpart = POLY5(fpart, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, exp_p5_4, exp_p5_5); #elif EXP_POLY_DEGREE == 4 expfpart = POLY4(fpart, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, exp_p4_4); #elif EXP_POLY_DEGREE == 3 expfpart = POLY3(fpart, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3); #elif EXP_POLY_DEGREE == 2 expfpart = POLY2(fpart, exp_p2_0, exp_p2_1, exp_p2_2); #else #error #endif return _mm_mul_ps(expipart, expfpart); }
// Converts four signed 32-bit integers to four floats.
// The pattern comments inside the body look like directives for an external
// checking tool and are therefore kept character for character.
__m128 test_mm_cvtepi32_ps(__m128i A) {
  // DAG-LABEL: test_mm_cvtepi32_ps
  // DAG: call <4 x float> @llvm.x86.sse2.cvtdq2ps
  //
  // ASM-LABEL: test_mm_cvtepi32_ps
  // ASM: cvtdq2ps
  const __m128 converted = _mm_cvtepi32_ps(A);
  return converted;
}
void SoundSSE::unpack_8bit_stereo(unsigned char *input, int size, float *output[2]) { #ifndef CL_DISABLE_SSE2 int sse_size = (size/16)*16; __m128i zero = _mm_setzero_si128(); __m128 constant1 = _mm_set1_ps(1.0f/128.0f); __m128i constant2 = _mm_set1_epi16(128); for (int i = 0; i < sse_size; i+=16) { __m128i isamples = _mm_loadu_si128((__m128i*)(input+i)); __m128i isamples0 = _mm_sub_epi16(_mm_unpacklo_epi8(isamples, zero), constant2); __m128i isamples1 = _mm_sub_epi16(_mm_unpackhi_epi8(isamples, zero), constant2); __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples0), 16)); __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples0), 16)); __m128 samples2 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples1), 16)); __m128 samples3 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples1), 16)); samples0 = _mm_mul_ps(samples0, constant1); samples1 = _mm_mul_ps(samples1, constant1); samples2 = _mm_mul_ps(samples2, constant1); samples3 = _mm_mul_ps(samples3, constant1); __m128 tmp0, tmp1, tmp2, tmp3; tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0)); tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1)); tmp2 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(2,0,2,0)); tmp3 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(3,1,3,1)); _mm_storeu_ps(output[0]+i/2, tmp0); _mm_storeu_ps(output[1]+i/2, tmp1); _mm_storeu_ps(output[0]+i/2+4, tmp2); _mm_storeu_ps(output[1]+i/2+4, tmp3); } #else const int sse_size = 0; #endif // unpack remaining for (int i = sse_size; i < size; i+=2) { int value = input[i]; output[0][i/2] = ((float) (value - 128)) / 128.0f; value = input[i+1]; output[1][i/2] = ((float) (value - 128)) / 128.0f; } }
/* Calculate modulus using vectors.
 *
 * Returns, per lane, a - trunc(a / aDiv) * aDiv, i.e. the remainder of a
 * division that rounds toward zero; the result has the sign of `a`.
 *
 * The truncated quotient is used with its sign intact. Taking its absolute
 * value first (as an earlier revision did) subtracts a multiple with the
 * wrong sign whenever a / aDiv is negative: -5 mod 3 came out as -8 instead
 * of -2. Results for non-negative quotients are unchanged.
 *
 * The quotient passes through a 32-bit integer conversion, so |a / aDiv|
 * has to fit in that range.
 */
inline vec4 _mm2_mod_ps2(vec4 a, vec4 aDiv)
{
	vec4 c = a / aDiv;
	ivec4 i = _mm_cvttps_epi32(c);   /* truncate toward zero */
	vec4 cTrunc = _mm_cvtepi32_ps(i);
	vec4 base = cTrunc * aDiv;
	vec4 r = a - base;
	return r;
}
// Evaluates (p0 - p1) - (p2 - p3) on four consecutive 32-bit integers read at
// `offset` from each of the rectangle's four pointers (unaligned loads),
// converts the result to float and multiplies it by rect.weight.
SIMD_INLINE __m128 WeightedSum32f(const WeightedRect & rect, size_t offset)
{
    const __m128i v0 = _mm_loadu_si128((__m128i*)(rect.p0 + offset));
    const __m128i v1 = _mm_loadu_si128((__m128i*)(rect.p1 + offset));
    const __m128i v2 = _mm_loadu_si128((__m128i*)(rect.p2 + offset));
    const __m128i v3 = _mm_loadu_si128((__m128i*)(rect.p3 + offset));

    const __m128i first = _mm_sub_epi32(v0, v1);
    const __m128i second = _mm_sub_epi32(v2, v3);
    const __m128 rectSum = _mm_cvtepi32_ps(_mm_sub_epi32(first, second));

    return _mm_mul_ps(rectSum, _mm_set1_ps(rect.weight));
}