Example #1
void audio_convert_s16_to_float_SSE2(float *out,
      const int16_t *in, size_t samples, float gain)
{
   float fgain = gain / (0x7fff * 0x10000);
   __m128 factor = _mm_set1_ps(fgain);
   size_t i;
   for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
   {
      __m128i input = _mm_loadu_si128((const __m128i *)in);
      __m128i regs[2] = {
         _mm_unpacklo_epi16(_mm_setzero_si128(), input),
         _mm_unpackhi_epi16(_mm_setzero_si128(), input),
      };

      __m128 output[2] = {
         _mm_mul_ps(_mm_cvtepi32_ps(regs[0]), factor),
         _mm_mul_ps(_mm_cvtepi32_ps(regs[1]), factor),
      };

      _mm_storeu_ps(out + 0, output[0]);
      _mm_storeu_ps(out + 4, output[1]);
   }

   audio_convert_s16_to_float_C(out, in, samples - i, gain);
}
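A note on the constant: _mm_unpacklo_epi16(_mm_setzero_si128(), input) interleaves zeros below each sample, so every 32-bit lane holds sample * 0x10000 before the convert, and the pre-divided gain undoes both the 0x10000 shift and the 0x7fff full scale in one multiply. A minimal scalar model of one lane (the helper name is illustrative):

#include <stdint.h>

/* One lane of the SSE2 loop above: the sample sits in the upper 16 bits,
 * so the conversion sees sample * 0x10000; the folded constant cancels it. */
static float s16_lane_to_float(int16_t sample, float gain)
{
    int32_t widened = (int32_t)sample * 0x10000;  /* == unpacklo(zero, s) */
    float fgain = gain / (0x7fff * 0x10000);      /* same constant as above */
    return (float)widened * fgain;                /* ~ sample / 32767 * gain */
}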
Example #2
void	ProxyRwSse2 <SplFmt_INT16>::finish_read_flt (__m128 &src0, __m128 &src1, const __m128i &src, const __m128i &zero)
{
	const __m128i	src_03 = _mm_unpacklo_epi16 (src, zero);
	const __m128i	src_47 = _mm_unpackhi_epi16 (src, zero);
	src0 = _mm_cvtepi32_ps (src_03);
	src1 = _mm_cvtepi32_ps (src_47);
}
Example #3
void SoundSSE::unpack_16bit_mono(short *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/32767.0f);
	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		_mm_storeu_ps(output+i+0, samples0);
		_mm_storeu_ps(output+i+4, samples1);
	}
#else
	const int sse_size = 0;
#endif

	// unpack remaining
	for (int i = sse_size; i < size; i++)
	{
		output[i] = ((float) input[i]) / 32767.0f;
	}
}
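The unpacklo(zero, isamples) plus _mm_srai_epi32(..., 16) pair is the classic SSE2 idiom for sign-extending 16-bit lanes; a dedicated 16-to-32 sign extension (_mm_cvtepi16_epi32) only arrives with SSE4.1. A scalar model of one lane, as a sketch:

#include <stdint.h>

/* Scalar model: place s in bits 16..31, then shift arithmetically back,
 * which is plain sign extension of the 16-bit value. (The >> on a
 * negative int is an arithmetic shift on mainstream compilers, matching
 * _mm_srai_epi32.) With SSE4.1, _mm_cvtepi16_epi32 widens the low four
 * lanes directly instead. */
static int32_t sign_extend_model(int16_t s)
{
    int32_t widened = (int32_t)s * 0x10000;  /* == unpacklo(zero, s) */
    return widened >> 16;                    /* == srai by 16 */
}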
Example #4
static long
conv_rgba16_rgbaF (const uint16_t *src, float *dst, long samples)
{
  long i = 0;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      long           n  = (samples / 2) * 2;
      const __m128i *s  = (const __m128i*) src;
            __v4sf  *d  = (__v4sf*) dst;

      for (; i < n / 2; i++)
        {
          /* Expand shorts to ints by loading zero in the high bits */
          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());

          /* Convert to float */
          const __m128  u0 = _mm_cvtepi32_ps (t0);
          const __m128  u1 = _mm_cvtepi32_ps (t1);

          const __v4sf rgba0 = u0 * u16_float;
          const __v4sf rgba1 = u1 * u16_float;

          d[2 * i + 0] = rgba0;
          d[2 * i + 1] = rgba1;
        }
      _mm_empty();
    }

  for (i *= 2 * 4; i != 4 * samples; i++)
    dst[i] = src[i] * (1.f / 65535);

  return samples;
}
Example #5
 SIMD_INLINE __m128 Norm32fi(const HidHaarCascade & hid, size_t offset)
 {
     __m128 area = _mm_set1_ps(hid.windowArea);
     __m128 sum = _mm_cvtepi32_ps(Sum32ii(hid.p, offset));
     __m128 sqsum = _mm_cvtepi32_ps(Sum32ii(hid.pq, offset));
     return ValidSqrt(_mm_sub_ps(_mm_mul_ps(sqsum, area), _mm_mul_ps(sum, sum)));
 }
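The term under ValidSqrt is windowArea * Σx² - (Σx)², which for a window of N pixels equals N² · Var(x), so the result is N times the window's standard deviation, the usual single-pass normalization in Viola-Jones cascades. A scalar model (assuming windowArea is the pixel count):

#include <math.h>

/* Scalar model of the expression under ValidSqrt above:
 * n * sum(x^2) - sum(x)^2 == n^2 * variance(x), in one pass. */
static float norm_term(float n, float sum, float sqsum)
{
    return sqrtf(n * sqsum - sum * sum);
}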
Example #6
void SoundSSE::unpack_16bit_stereo(short *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/32768.0f);
	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
	
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
	
		__m128 tmp0, tmp1;
		tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
		tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
		_mm_storeu_ps(output[0]+i/2, tmp0);
		_mm_storeu_ps(output[1]+i/2, tmp1);
	}
#else
	const int sse_size = 0;
#endif
	// unpack remaining
	for (int i = sse_size; i < size; i+=2)
	{
		output[0][i/2] = ((float) input[i]) / 32768.0f;
		output[1][i/2] = ((float) input[i+1]) / 32768.0f;
	}
}
Example #7
static void SSE2_StereoMixToFloat(const int32 *pSrc, float *pOut1, float *pOut2, uint32 nCount, const float _i2fc)
//----------------------------------------------------------------------------------------------------------------
{
	__m128 i2fc = _mm_load_ps1(&_i2fc);
	const __m128i *in = reinterpret_cast<const __m128i *>(pSrc);

	// We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
	nCount = (nCount + 3) / 4;
	do
	{
		__m128i i1 = _mm_loadu_si128(in);		// Load four integer values, LRLR
		__m128i i2 = _mm_loadu_si128(in + 1);	// Load four integer values, LRLR
		in += 2;
		__m128 f1 = _mm_cvtepi32_ps(i1);		// Convert to four floats, LRLR
		__m128 f2 = _mm_cvtepi32_ps(i2);		// Convert to four floats, LRLR
		f1 = _mm_mul_ps(f1, i2fc);				// Apply int->float factor
		f2 = _mm_mul_ps(f2, i2fc);				// Apply int->float factor
		__m128 fl = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(2, 0, 2, 0));	// LRLR+LRLR => LLLL
		__m128 fr = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(3, 1, 3, 1));	// LRLR+LRLR => RRRR
		_mm_storeu_ps(pOut1, fl);				// Store four float values, LLLL
		_mm_storeu_ps(pOut2, fr);				// Store four float values, RRRR
		pOut1 += 4;
		pOut2 += 4;
	} while(--nCount);
}
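The two shuffles do the stereo deinterleave: _MM_SHUFFLE(2,0,2,0) gathers the even (left) lanes of f1 and f2, _MM_SHUFFLE(3,1,3,1) the odd (right) lanes. A scalar sketch of the same reordering (helper name is illustrative):

/* fl = {f1[0], f1[2], f2[0], f2[2]}  -> L0 L1 L2 L3
 * fr = {f1[1], f1[3], f2[1], f2[3]}  -> R0 R1 R2 R3 */
static void deinterleave8(const float lrlr[8], float left[4], float right[4])
{
    for (int n = 0; n < 4; ++n) {
        left[n]  = lrlr[2 * n];
        right[n] = lrlr[2 * n + 1];
    }
}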
Example #8
    //----------------------------------------------------------------------------
    void conv_Short1ToFloat1(void* dst, const void* s, s32 numSamples)
    {
        LSfloat* d = reinterpret_cast<LSfloat*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 3; // process 8 shorts at a time
        s32 offset = num << 3;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        const __m128 fcoff = _mm_set1_ps(1.0f/32767.0f);

        const LSshort* p = src;
        LSfloat* q = d;
        for(s32 i=0; i<num; ++i){
            // convert to 32-bit floats r0 and r1
            __m128i t0 = _mm_loadu_si128((const __m128i*)p);
            __m128i t1 = _mm_cmpgt_epi16(izero, t0);
            __m128 r0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(t0, t1));
            __m128 r1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(t0, t1));

            r0 = _mm_mul_ps(r0, fcoff);
            r1 = _mm_mul_ps(r1, fcoff);
            _mm_storeu_ps((q+0), r1);
            _mm_storeu_ps((q+4), r0);
            p += 8;
            q += 8;
        }

        for(s32 i=0; i<rem; ++i){
            q[i] = toFloat(p[i]);
        }
    }
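The _mm_cmpgt_epi16(izero, t0) mask is all-ones exactly in the negative lanes, i.e. the bits a sign extension must place in the new high half, so unpacking the data against it widens 16 to 32 bits with the correct sign on plain SSE2. Where SSE4.1 may be assumed, the same widening is one instruction per half; a sketch:

#include <smmintrin.h>  /* SSE4.1 */

/* Equivalent of the cmpgt/unpacklo/unpackhi sequence above. */
static inline void widen_s16x8(__m128i t0, __m128i *lo4, __m128i *hi4)
{
    *lo4 = _mm_cvtepi16_epi32(t0);                     /* lanes 0..3 */
    *hi4 = _mm_cvtepi16_epi32(_mm_srli_si128(t0, 8));  /* lanes 4..7 */
}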
Example #9
static void GF_FUNC_ALIGN VS_CC
convert_to_float_8bit(int radius, float *kernel, const uint8_t *srcp,
                      float *buff, float *dstp, int width, int height,
                      int src_stride, int dst_stride)
{
    __m128i zero = _mm_setzero_si128();
    
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128i xmm0 = _mm_load_si128((__m128i *)(srcp + x));
            __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero);
            xmm0 = _mm_unpacklo_epi8(xmm0, zero);
            __m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
            __m128 f1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
            __m128 f2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero));
            __m128 f3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero));
            
            _mm_store_ps(dstp + x , f0);
            _mm_store_ps(dstp + x + 4, f1);
            _mm_store_ps(dstp + x + 8, f2);
            _mm_store_ps(dstp + x + 12, f3);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
Example #10
	void ConvertInt8ToFloat(__m128i in_input,
		__m128 *out_output)
	{
		__m128i input_16, input_32;

		// convert first half to 16bit integer
		input_16 = _mm_unpacklo_epi8(in_input, ZERO);
		// convert first fourth to 32bit integer
		input_32 = _mm_unpacklo_epi16(input_16, ZERO);
		// convert first fourth to 32bit floating point value
		out_output[0] = _mm_cvtepi32_ps(input_32);

		// convert second fourth to 32bit integer
		input_32 = _mm_unpackhi_epi16(input_16, ZERO);
		// convert second fourth to 32bit floating point value
		out_output[1] = _mm_cvtepi32_ps(input_32);

		// convert second half to 16bit integer
		input_16 = _mm_unpackhi_epi8(in_input, ZERO);
		// convert third fourth to 32bit integer
		input_32 = _mm_unpacklo_epi16(input_16, ZERO);
		// convert third fourth to 32bit floating point value
		out_output[2] = _mm_cvtepi32_ps(input_32);

		// convert fourth fourth to 32bit integer
		input_32 = _mm_unpackhi_epi16(input_16, ZERO);
		// convert fourth fourth to 32bit floating point value
		out_output[3] = _mm_cvtepi32_ps(input_32);
	}
Example #11
static inline void
yuv_to_packed_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];

    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const uint8_t *rlsb, *glsb, *blsb;
    if (STACK16) {
        rlsb = sr + height * spitch;
        glsb = sg + height * spitch;
        blsb = sb + height * spitch;
    }

    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 4) {
            __m128i r, g, b;
            if (!STACK16) {
                r = _mm_cvtepu8_epi32(loadl(sr + x));
                g = _mm_cvtepu8_epi32(loadl(sg + x));
                b = _mm_cvtepu8_epi32(loadl(sb + x));
            } else {
                r = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(rlsb + x), loadl(sr + x)), zero);
                g = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(glsb + x), loadl(sg + x)), zero);
                b = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(blsb + x), loadl(sb + x)), zero);
            }
            __m128i rg = _mm_unpacklo_epi32(r, g);
            __m128i ba = _mm_unpacklo_epi32(b, zero);
            __m128 rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x + 0, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 4, _mm_mul_ps(rgba1, rcp));

            rg = _mm_unpackhi_epi32(r, g);
            ba = _mm_unpackhi_epi32(b, zero);
            rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x +  8, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 12, _mm_mul_ps(rgba1, rcp));
        }
        convert_float_to_half(d, buff, width * 4);
        d += dpitch;
        sr += spitch;
        sg += spitch;
        sb += spitch;
        if (STACK16) {
            rlsb += spitch;
            glsb += spitch;
            blsb += spitch;
        }
    }
}
Example #12
JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d)
{
    // The x86-64 baseline guarantees only SSE2, which has no <4 x int32> multiplication,
    // so we use floating-point instead, and assume caller knows about the hack.
    return _mm_add_epi32(a,
                         _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(b),
                                                    _mm_cvtepi32_ps(_mm_sub_epi32(c,d)))));
}
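The detour through floats stands in for the packed 32-bit multiply missing from baseline SSE2, and is exact only while the products fit in a float's 24-bit mantissa. If SSE4.1 may be assumed, _mm_mullo_epi32 does the multiply exactly; a sketch:

#include <smmintrin.h>  /* SSE4.1 */

/* a + b * (c - d), exact for all 32-bit results. */
static inline __m128i muladd_epi32_sse41(__m128i a, __m128i b, __m128i c, __m128i d)
{
    return _mm_add_epi32(a, _mm_mullo_epi32(b, _mm_sub_epi32(c, d)));
}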
Example #13
template<> void
cvtScale_<short, int, float>( const short* src, size_t sstep,
           int* dst, size_t dstep, Size size,
           float scale, float shift )
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;

         #if CV_SSE2
            if(USE_SSE2)//~5X
            {
                __m128 scale128 = _mm_set1_ps (scale);
                __m128 shift128 = _mm_set1_ps (shift);
                for(; x <= size.width - 8; x += 8 )
                {
                    __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                    __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
                    __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                    __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                    rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                    rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
                    r0 = _mm_cvtps_epi32(rf0);
                    r1 = _mm_cvtps_epi32(rf1);

                    _mm_storeu_si128((__m128i*)(dst + x), r0);
                    _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
                }
            }
        #endif

        //We will wait for Haswell
        /*
        #if CV_AVX
            if(USE_AVX)//2X - bad variant
            {
                ////TODO:AVX implementation (optimization?) required
                __m256 scale256 = _mm256_set1_ps (scale);
                __m256 shift256 = _mm256_set1_ps (shift);
                for(; x <= size.width - 8; x += 8 )
                {
                    __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
                    __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
                    __m256i res = _mm256_cvtps_epi32(r0);
                    _mm256_storeu_si256 ((__m256i*)(dst+x), res);
                }
            }
        #endif*/

        for(; x < size.width; x++ )
            dst[x] = saturate_cast<int>(src[x]*scale + shift);
    }
}
Example #14
 template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128i & t, const __m128i & l, const __m128i & r, const __m128i & b, Buffer & buffer, size_t col)
 {
     HogDirectionHistograms<align>(
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(r, K_ZERO), _mm_unpacklo_epi16(l, K_ZERO))), 
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(b, K_ZERO), _mm_unpacklo_epi16(t, K_ZERO))), 
         buffer, col + 0);
     HogDirectionHistograms<align>(
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(r, K_ZERO), _mm_unpackhi_epi16(l, K_ZERO))), 
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(b, K_ZERO), _mm_unpackhi_epi16(t, K_ZERO))), 
         buffer, col + 4);
 }
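Each __m128i here holds eight 16-bit pixels; unpacking against K_ZERO zero-extends them to 32 bits, and the subtractions form central differences dx = r - l and dy = b - t, which feed the gradient histograms as floats. A scalar model of one lane (helper name is illustrative):

#include <stdint.h>

/* Central differences for one pixel, widened from u16 exactly as the
 * unpack/sub/cvt sequence above does per lane. */
static void central_diff_lane(uint16_t r, uint16_t l, uint16_t b, uint16_t t,
                              float *dx, float *dy)
{
    *dx = (float)((int32_t)r - (int32_t)l);
    *dy = (float)((int32_t)b - (int32_t)t);
}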
Example #15
void
process(
  struct dt_iop_module_t *self,
  dt_dev_pixelpipe_iop_t *piece,
  const void *const ivoid,
  void *ovoid,
  const dt_iop_roi_t *const roi_in,
  const dt_iop_roi_t *const roi_out)
{
  const  float divider  = (float)UINT16_MAX;
  const __m128 dividers = _mm_set_ps1(divider);

#ifdef _OPENMP
  #pragma omp parallel for default(none) schedule(static) shared(ovoid)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width;

    int i = 0;
    int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1));

    // process unaligned pixels
    for ( ; i < alignment ; i++, out++, in++)
      *out = ((float)(*in)) / divider;

    // process aligned pixels with SSE
    for( ; i < roi_out->width - (8 - 1); i += 8, in += 8)
    {
      const __m128i input = _mm_load_si128((__m128i *)in);

      __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
      __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

      __m128 flo = _mm_cvtepi32_ps(ilo);
      __m128 fhi = _mm_cvtepi32_ps(ihi);

      flo = _mm_div_ps(flo, dividers);
      fhi = _mm_div_ps(fhi, dividers);

      _mm_stream_ps(out, flo);
      out += 4;
      _mm_stream_ps(out, fhi);
      out += 4;
    }

    // process the rest
    for( ; i < roi_out->width; i++, out++, in++)
      *out = ((float)(*in)) / divider;
  }
  _mm_sfence();
}
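Two details interact in this routine: _mm_load_si128 and _mm_stream_ps require 16-byte alignment, which the scalar prologue establishes (assuming each row's base address is itself suitably aligned), and the streaming stores are non-temporal, so the trailing _mm_sfence orders them before anything consumes ovoid. A sketch of the prologue arithmetic (helper name is illustrative):

/* Same arithmetic as the `alignment` line above: how many leading
 * elements to handle scalar-wise until the next multiple of 8. */
static int prologue_count(size_t row_start_index)
{
    return (int)((8 - (row_start_index & 7)) & 7);
}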
Example #16
static void GF_FUNC_ALIGN VS_CC
proc_8bit(int radius, float *kernel, const uint8_t *srcp, float *buff,
          float *dstp, int width, int height, int src_stride, int dst_stride)
{
    int length = radius * 2 + 1;
    const uint8_t *p[17];
    for (int i = -radius; i <= radius; i++) {
        p[i + radius] = srcp + abs(i) * src_stride;
    }

    __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 sum[4];
            sum[0] = _mm_setzero_ps();
            sum[1] = _mm_setzero_ps();
            sum[2] = _mm_setzero_ps();
            sum[3] = _mm_setzero_ps();
            
            for (int i = 0; i < length; i++) {
                __m128 f[4];
                __m128i xmm0 = _mm_load_si128((__m128i *)(p[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                f[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                f[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                f[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero));
                f[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero));
                __m128 k = _mm_set1_ps(kernel[i]);
                
                for (int j = 0; j < 4; j++) {
                    sum[j] = _mm_add_ps(sum[j], _mm_mul_ps(k, f[j]));
                }
            }
            _mm_store_ps(buff + x,      sum[0]);
            _mm_store_ps(buff + x +  4, sum[1]);
            _mm_store_ps(buff + x +  8, sum[2]);
            _mm_store_ps(buff + x + 12, sum[3]);
        }
        proc_horizontal(buff, radius, length, width, kernel, dstp);
        
        for (int i = 0; i < length - 1; i++) {
            p[i] = p[i + 1];
        }
        p[length - 1] += (y < height - radius - 1 ? 1 : -1) * src_stride;
        dstp += dst_stride;
    }
}
Example #17
    void conv_Short1ToFloat2(void* dst, const void* s, s32 numSamples)
    {
        LSfloat* d = reinterpret_cast<LSfloat*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 3; // process 8 shorts at a time
        s32 offset = num << 3;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        const __m128 fcoff = _mm_set1_ps(1.0f/32767.0f);

        const LSshort* p = src;
        LSfloat* q = d;
        for(s32 i=0; i<num; ++i){
            __m128i t = _mm_loadu_si128((const __m128i*)p);
            __m128i s16_0 = _mm_unpackhi_epi16(t, t);
            __m128i s16_1 = _mm_unpacklo_epi16(t, t);

            __m128i t1 = _mm_cmpgt_epi16(izero, s16_0);
            __m128i t2 = _mm_cmpgt_epi16(izero, s16_1);

            __m128i s32_0 = _mm_unpackhi_epi16(s16_0, t1);
            __m128i s32_1 = _mm_unpacklo_epi16(s16_0, t1);
            __m128i s32_2 = _mm_unpackhi_epi16(s16_1, t2);
            __m128i s32_3 = _mm_unpacklo_epi16(s16_1, t2);

            // convert to 32-bit floats
            __m128 f32_0 = _mm_mul_ps(_mm_cvtepi32_ps(s32_0), fcoff);
            __m128 f32_1 = _mm_mul_ps(_mm_cvtepi32_ps(s32_1), fcoff);
            __m128 f32_2 = _mm_mul_ps(_mm_cvtepi32_ps(s32_2), fcoff);
            __m128 f32_3 = _mm_mul_ps(_mm_cvtepi32_ps(s32_3), fcoff);

            _mm_storeu_ps((q+0), f32_3);
            _mm_storeu_ps((q+4), f32_2);
            _mm_storeu_ps((q+8), f32_1);
            _mm_storeu_ps((q+12), f32_0);

            p += 8;
            q += 16;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            q[j+0] = toFloat(p[i]);
            q[j+1] = toFloat(p[i]);
        }
    }
Example #18
static inline void
yuv_to_planar_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));

    float* buff = reinterpret_cast<float*>(_buff);

    for (int p = 0; p < 3; ++p) {
        const uint8_t* s = srcp[p];
        const uint8_t* lsb = s + height * spitch;
        uint8_t* d = dstp[p];

        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; x += 16) {
                __m128i msbx = _mm_load_si128(reinterpret_cast<const __m128i*>(s + x));
                __m128i d0, lsbx;
                if (!STACK16) {
                    d0 = _mm_unpacklo_epi8(msbx, zero);
                } else {
                    lsbx = _mm_load_si128(reinterpret_cast<const __m128i*>(lsb + x));
                    d0 = _mm_unpacklo_epi8(lsbx, msbx);
                }
                __m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(d0, zero));
                _mm_store_ps(buff + x + 0, _mm_mul_ps(rcp, f0));
                f0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(d0, zero));
                _mm_store_ps(buff + x + 4, _mm_mul_ps(rcp, f0));

                if (!STACK16) {
                    d0 = _mm_unpackhi_epi8(msbx, zero);
                } else {
                    d0 = _mm_unpackhi_epi8(lsbx, msbx);
                }
                f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(d0, zero));
                _mm_store_ps(buff + x + 8, _mm_mul_ps(rcp, f0));
                f0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(d0, zero));
                _mm_store_ps(buff + x + 12, _mm_mul_ps(rcp, f0));
            }
            convert_float_to_half(d, buff, width);
            s += spitch;
            d += dpitch;
            if (STACK16) {
                lsb += spitch;
            }
        }
    }
}
Example #19
static inline __m128
log2f4(__m128 x)
{
	__m128i exp = _mm_load_si128((__m128i*)_exp_mask);
	__m128i mant = _mm_load_si128((__m128i*)_mantissa_mask);
	__m128 one = _mm_load_ps(_ones_ps);
	__m128i i = _mm_castps_si128(x);
	__m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_load_si128((__m128i*)_one27)));
	__m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
	__m128 p;

	/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
#if LOG_POLY_DEGREE == 6
	p = POLY5( m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5);
#elif LOG_POLY_DEGREE == 5
	p = POLY4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4);
#elif LOG_POLY_DEGREE == 4
	p = POLY3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3);
#elif LOG_POLY_DEGREE == 3
	p = POLY2(m, log_p2_0, log_p2_1, log_p2_2);
#else
#error
#endif

	/* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
	p = _mm_mul_ps(p, _mm_sub_ps(m, one));

	return _mm_add_ps(p, e);
}
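The integer preamble is the usual IEEE-754 split: mask out the exponent field, shift it down and subtract the 127 bias to get e, then OR the mantissa with 1.0f's bit pattern to get m in [1, 2), so that log2(x) = e + log2(m). A scalar model of the split, assuming binary32 floats and positive normal inputs:

#include <stdint.h>
#include <string.h>

static void split_for_log2(float x, float *e, float *m)
{
    uint32_t i;
    memcpy(&i, &x, sizeof i);                            /* bit pattern of x */
    *e = (float)(int32_t)((i >> 23) & 0xffu) - 127.0f;   /* unbiased exponent */
    uint32_t mi = (i & 0x007fffffu) | 0x3f800000u;       /* mantissa | 1.0f */
    memcpy(m, &mi, sizeof *m);                           /* m in [1, 2) */
}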
Example #20
/**
 * Count the number of bits set within the intersection of two
 * fingerprints using the SSSE3 instruction set.
 * @warning  1) fp1 & fp2 must be aligned to 16 byte boundaries.
 *           2) Use (get_cpuid_flags() & bit_SSSE3) from cpuid.h to
 *              test if the CPU supports the SSSE3 instructions.
 */
int chemfp_intersect_popcount_SSSE3(int size, const unsigned *fp1, const unsigned *fp2) {
#if defined(GENERATE_SSSE3)
  /* 2^5 loop iters might overflow 8-bit counter,
     so cap it at 2^4 iters per chunk */
  const int iters = 1 << 4;
  const int N = (size + 3) / 4;
  int i, count;
  __m128i count32 = _mm_setzero_si128();
  for (i = 0; i + iters * 4 <= N; i += iters * 4) {
    count32 = _mm_add_epi32(count32, intersect_popcount_SSSE3_helper(&fp1[i], &fp2[i], iters));
  }
  if (i < N) {
    count32 = _mm_add_epi32(count32, intersect_popcount_SSSE3_helper(&fp1[i], &fp2[i], (N - i + 3) / 4));
  }
  /* Layout coming from PSADBW accumulation is 2*{0,32}: 0 S1 0 S0 */
  count = _mm_cvt_ss2si(_mm_cvtepi32_ps(_mm_add_epi32(
     count32, _mm_shuffle_epi32(count32, _MM_SHUFFLE(2, 2, 2, 2)))));
  return count;
#else
  UNUSED(size);
  UNUSED(fp1);
  UNUSED(fp2);
  return 0;
#endif
}
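One detail worth noting: the final line extracts lane 0 by converting it to float (_mm_cvtepi32_ps) and back (_mm_cvt_ss2si), which is exact only while the count stays below 2^24. Plain SSE2 can read the lane directly; a sketch (helper name is illustrative):

#include <emmintrin.h>

/* Horizontal add of the two PSADBW partial sums (lanes 0 and 2),
 * then read lane 0 without an int->float->int round trip. */
static inline int hsum_psadbw(__m128i count32)
{
    __m128i s = _mm_add_epi32(count32,
                              _mm_shuffle_epi32(count32, _MM_SHUFFLE(2, 2, 2, 2)));
    return _mm_cvtsi128_si32(s);
}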
Example #21
static inline __m128 sigmoid_positive_ps( __m128 xin )
{
	union {
		__m128i i;
		int32_t i32[4];
	} i;
	__m128 ex;
	float *ex_elem = (float*) &ex;
	__m128 x1 = _mm_min_ps( xin, tens.ps );

	x1 = _mm_mul_ps( x1, tens.ps );
	i.i  = _mm_cvttps_epi32( x1 );
 	ex_elem[0] = e[i.i32[0]];
	ex_elem[1] = e[i.i32[1]];
	ex_elem[2] = e[i.i32[2]];
	ex_elem[3] = e[i.i32[3]];
	x1 = _mm_sub_ps( x1, _mm_cvtepi32_ps( i.i ) ); 
	x1 = _mm_add_ps( x1, tens.ps );
	x1 = _mm_mul_ps( x1, ex );
	x1 = _mm_add_ps( x1, ones.ps ); 
#ifdef __FAST_MATH__
	return _mm_rcp_ps( x1 );
#else
	return _mm_div_ps( ones.ps, x1 );
#endif
}
Example #22
static inline __v4sf
sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
{
    double norm = exponent*M_LN2/FLT_MANTISSA;
    __v4sf y = _mm_cvtepi32_ps((__m128i)((__v4si)x - splat4i(FLT_ONE)));
    return splat4f(c0) + splat4f(c1*norm)*y + splat4f(c2*norm*norm)*y*y;
}
Example #23
static inline __m128
lanczos_sse(__m128 width, __m128 t)
{
    /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
     * correct */
    __m128i a = _mm_cvtps_epi32(t);
    __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));

    // Compute the correct sign for sinf(pi.r)
    static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT))) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000};
    static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1};
    static const __m128 eps = {DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON};
    static const __m128 pi = {M_PI, M_PI, M_PI, M_PI};
    static const __m128 pi2 = {M_PI*M_PI, M_PI*M_PI, M_PI*M_PI, M_PI*M_PI};

    __m128i isign = _mm_and_si128(*(__m128i*)ione, a);
    isign = _mm_slli_epi64(isign, 31);
    isign = _mm_or_si128(*(__m128i*)fone, isign);
    __m128 fsign = _mm_castsi128_ps(isign);

    __m128 num = _mm_mul_ps(width, fsign);
    num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
    num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
    num = _mm_add_ps(eps, num);

    __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
    den = _mm_add_ps(eps, den);

    return _mm_div_ps(num, den);
}
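The isign block builds (-1)^a without a branch: the low bit of each rounded integer a is shifted up into the float sign position and OR'ed onto 1.0f, giving the sign that sin(pi*t) flips by when t is reduced by a whole number of periods. A scalar model of the trick (assumes IEEE-754 binary32; helper name is illustrative):

#include <stdint.h>
#include <string.h>

/* +1.0f for even a, -1.0f for odd a, built bitwise like isign above. */
static float parity_sign(int32_t a)
{
    uint32_t bits = 0x3f800000u | ((uint32_t)(a & 1) << 31);
    float s;
    memcpy(&s, &bits, sizeof s);
    return s;
}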
Example #24
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS], const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();
  // Load data into four SSE Registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data; // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10);   // Clear dataFlat[27..31]

  for(int i = 0; i < FitMatrix::ROWS; i++)
  {
    // Compute scalar product between ((short*)x)[0..31] and localFitMatrix
    __m128i sum =             _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
Example #25
void 
bp_ray_trace_packet (const ray4_t *ray, vector_t *colors, simd4i_t srcprim_id, int depth, simd4_t fdepth)
{
	unsigned int activeMask;
	intersect4_t isect4;

	ASSIGN (colors [0], background);
	ASSIGN (colors [1], background);
	ASSIGN (colors [2], background);
	ASSIGN (colors [3], background);

 	if ((depth > curr_scene->settings.max_trace_level) 
	    | (simd4_extract_sign (simd4_float_lt (fdepth, simd4_from_float (curr_scene->settings.adc_bailout))) == 0xf)) {
		return;
	}

 	isect4.prim_id = simd4i_minus_ones;

 	bp_kd_tree_packet_find_nearest (curr_scene->kd_tree_root, ray, &isect4);

	activeMask = _mm_movemask_ps (simd4_float_eq (_mm_cvtepi32_ps (isect4.prim_id), simd4f_minus_ones));

 	/* If there was no intersection terminate early */
	if (activeMask == 0xF)
		return;

 	bp_shade_packet (curr_scene, &isect4, ray, colors, depth, fdepth, srcprim_id);
}
Example #26
static inline __m128
exp2f4(__m128 x)
{
	__m128i ipart;
	__m128 fpart, expipart, expfpart;

	x = _mm_min_ps(x, _mm_load_ps(_one29_ps));
	x = _mm_max_ps(x, _mm_load_ps(_minusone27_ps));

	/* ipart = int(x - 0.5) */
	ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_load_ps(_half_ps)));

	/* fpart = x - ipart */
	fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

	/* expipart = (float) (1 << ipart) */
	expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_load_si128((__m128i*)_one27)), 23));

	/* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
	expfpart = POLY5(fpart, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, exp_p5_4, exp_p5_5);
#elif EXP_POLY_DEGREE == 4
	expfpart = POLY4(fpart, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, exp_p4_4);
#elif EXP_POLY_DEGREE == 3
	expfpart = POLY3(fpart, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3);
#elif EXP_POLY_DEGREE == 2
	expfpart = POLY2(fpart, exp_p2_0, exp_p2_1, exp_p2_2);
#else
#error
#endif

	return _mm_mul_ps(expipart, expfpart);
}
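expipart builds 2^ipart purely with integer ops: adding the 127 bias and shifting into the exponent field of a binary32 yields exactly 1 << ipart as a float, with no table or conversion. A scalar model, valid for exponents in [-126, 127] (helper name is illustrative):

#include <stdint.h>
#include <string.h>

/* 2^i constructed directly in the IEEE-754 exponent field. */
static float pow2i(int i)
{
    uint32_t bits = (uint32_t)(i + 127) << 23;
    float r;
    memcpy(&r, &bits, sizeof r);
    return r;
}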
Example #27
__m128 test_mm_cvtepi32_ps(__m128i A) {
  // DAG-LABEL: test_mm_cvtepi32_ps
  // DAG: call <4 x float> @llvm.x86.sse2.cvtdq2ps
  //
  // ASM-LABEL: test_mm_cvtepi32_ps
  // ASM: cvtdq2ps
  return _mm_cvtepi32_ps(A);
}
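Outside a FileCheck harness, the intrinsic is easy to exercise standalone; a minimal sketch (build with -msse2 on x86):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i v = _mm_setr_epi32(1, -2, 3, -4);
    float out[4];
    _mm_storeu_ps(out, _mm_cvtepi32_ps(v));  /* one cvtdq2ps */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 -2 3 -4 */
    return 0;
}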
Example #28
void SoundSSE::unpack_8bit_stereo(unsigned char *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/16)*16;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/128.0f);
	__m128i constant2 = _mm_set1_epi16(128);
	for (int i = 0; i < sse_size; i+=16)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
		__m128i isamples0 = _mm_sub_epi16(_mm_unpacklo_epi8(isamples, zero), constant2);
		__m128i isamples1 = _mm_sub_epi16(_mm_unpackhi_epi8(isamples, zero), constant2);
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples0), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples0), 16));
		__m128 samples2 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples1), 16));
		__m128 samples3 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples1), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		samples2 = _mm_mul_ps(samples2, constant1);
		samples3 = _mm_mul_ps(samples3, constant1);

		__m128 tmp0, tmp1, tmp2, tmp3;
		tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
		tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
		tmp2 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(2,0,2,0));
		tmp3 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(3,1,3,1));

		 _mm_storeu_ps(output[0]+i/2, tmp0);
		 _mm_storeu_ps(output[1]+i/2, tmp1);
		 _mm_storeu_ps(output[0]+i/2+4, tmp2);
		 _mm_storeu_ps(output[1]+i/2+4, tmp3);
	}
#else
	const int sse_size = 0;
#endif
	// unpack remaining
	for (int i = sse_size; i < size; i+=2)
	{
		int value = input[i];
		output[0][i/2] = ((float) (value - 128)) / 128.0f;

		value = input[i+1];
		output[1][i/2] = ((float) (value - 128)) / 128.0f;
	}
}
Example #29
/* Calculate modulus using vectors */
inline vec4 _mm2_mod_ps2(vec4 a, vec4 aDiv) {
	vec4 c = a / aDiv;
	ivec4 i = _mm_cvttps_epi32(c);
	i = _mm_abs_epi32(i);
	vec4 cTrunc = _mm_cvtepi32_ps(i);
	vec4 base = cTrunc * aDiv;
	vec4 r = a - base;
	return r;
}
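Read scalar-wise, this computes r = a - trunc(a / aDiv) * aDiv, i.e. fmodf for operands of matching sign; note the _mm_abs_epi32 drops the quotient's sign, so results diverge from fmodf when a and aDiv have opposite signs. A scalar model assuming non-negative operands (helper name is illustrative):

/* Scalar model of _mm2_mod_ps2 for a, aDiv >= 0. */
static float mod_model(float a, float aDiv)
{
    float c = a / aDiv;
    int i = (int)c;            /* _mm_cvttps_epi32: truncate toward zero */
    if (i < 0) i = -i;         /* _mm_abs_epi32 */
    return a - (float)i * aDiv;
}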
Example #30
 SIMD_INLINE __m128 WeightedSum32f(const WeightedRect & rect, size_t offset)
 {
     __m128i s0 = _mm_loadu_si128((__m128i*)(rect.p0 + offset));
     __m128i s1 = _mm_loadu_si128((__m128i*)(rect.p1 + offset));
     __m128i s2 = _mm_loadu_si128((__m128i*)(rect.p2 + offset));
     __m128i s3 = _mm_loadu_si128((__m128i*)(rect.p3 + offset));
     __m128i sum = _mm_sub_epi32(_mm_sub_epi32(s0, s1), _mm_sub_epi32(s2, s3));
     return _mm_mul_ps(_mm_cvtepi32_ps(sum), _mm_set1_ps(rect.weight));
 }
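The four pointers are the corners of a rectangle in an integral image, and (s0 - s1) - (s2 - s3) expands to s0 - s1 - s2 + s3, the standard inclusion-exclusion that recovers the pixel sum inside the rectangle before weighting. A scalar sketch of one lane (helper name is illustrative):

#include <stddef.h>
#include <stdint.h>

/* Integral-image rectangle sum by inclusion-exclusion, then weighted. */
static float weighted_rect_sum(const int32_t *p0, const int32_t *p1,
                               const int32_t *p2, const int32_t *p3,
                               size_t offset, float weight)
{
    int32_t sum = (p0[offset] - p1[offset]) - (p2[offset] - p3[offset]);
    return (float)sum * weight;
}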