Esempio n. 1
0
    // Packed complex-complex multiply: four interleaved single-precision
    // complex values per __m256 ([r0, i0, r1, i1, ...]).
    // NOTE(review): the template header and return type precede this excerpt;
    // presumably returns avx_simd_complex_float<T> -- confirm in the full file.
    mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img, ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img, ...]

        //ymm1 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm1 = _mm256_moveldup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real] (re/im swapped per pair)
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm3 = _mm256_movehdup_ps(rhs.value);

        //ymm4 = ymm2 * ymm3 = [x.img*y.img, x.real*y.img, ...]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);

        //result = [(lhs * ymm1) -+ ymm4]; use a fused multiply-add/sub when
        //the compiler advertises FMA3/FMA4, otherwise plain mul + addsub.

#ifdef __FMA__
        return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
        return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
#else
        __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
        return _mm256_addsub_ps(tmp, ymm4);
#endif
    }
	/****************************************************************
	 * Packed complex-complex multiply for four interleaved
	 * single-precision complex numbers per __m256 ([r0,i0,r1,i1,...]).
	 * Technique for efficient SIMD complex-complex multiplication from
	 *			https://software.intel.com/file/1000
	*****************************************************************/
	inline __m256 avx_multiply_float_complex_(const __m256& vecA, const __m256& vecB) {
		// Duplicate the real and imaginary part of every element of vecB.
		const __m256 b_real = _mm256_moveldup_ps(vecB);
		const __m256 b_imag = _mm256_movehdup_ps(vecB);
		// a*re(b), and a*im(b) with the re/im slots of each pair swapped.
		const __m256 prod_real = _mm256_mul_ps(vecA, b_real);
		const __m256 prod_imag = _mm256_permute_ps(_mm256_mul_ps(vecA, b_imag), 0xB1);
		// addsub: subtract in the real slots, add in the imaginary slots.
		return _mm256_addsub_ps(prod_real, prod_imag);
	}
Esempio n. 3
0
// =============================================================
// ====================== RGBX2BGRX_32F ========================
// =============================================================
// Swap the R and B channels of a packed 4-channel (RGBX) float image,
// writing the result into _dest.  Rows [_start, _stop] are processed
// INCLUSIVELY; _pitchs/_pitchd are the source/dest row pitches in floats
// (not bytes).  Safe for in-place use (_dest == _src).
void _rgbx2bgrx_32f(const float* _src, float* _dest, unsigned int _width,
                    unsigned int _pitchs, unsigned int _pitchd,
                    unsigned int _start, unsigned int _stop) {

#ifdef USE_SSE

    // Eight floats = two RGBX pixels per vector iteration.
    const unsigned int widthz = (_pitchs/8);

    // Get start positions for buffers
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<widthz; ++x ) {

#ifdef USE_AVX1
            const __m256 v0 = _mm256_load_ps(tsrc);
            tsrc+=8;

            // 0xc6 = per-lane shuffle (2,1,0,3): RGBX -> BGRX.
            __m256 r0 = _mm256_permute_ps(v0,0xc6);

            _mm256_store_ps(tdest, r0 );
            tdest+=8;
#else // NOT TESTED
            const __m128 v0 = _mm_load_ps(tsrc);
            tsrc+=4;
            const __m128 v1 = _mm_load_ps(tsrc);
            tsrc+=4;

            // _mm_shuffle_ps takes two source operands; with the same
            // register twice, imm 0xc6 swaps R<->B (fixes the broken
            // single-operand form that was commented out and left this
            // path storing nothing).
            const __m128 r0 = _mm_shuffle_ps(v0,v0,0xc6);
            const __m128 r1 = _mm_shuffle_ps(v1,v1,0xc6);

            _mm_store_ps(tdest, r0 ); tdest+=4;
            _mm_store_ps(tdest, r1 ); tdest+=4;
#endif

        }
    }

#else
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<_width; ++x ) {
            // Swap R and B, and also copy G and X so _dest is fully
            // written when it does not alias _src (the original left
            // channels 1 and 3 of _dest untouched, unlike the SIMD path).
            float t = tsrc[4*x];
            tdest[4*x] = tsrc[4*x+2];
            tdest[4*x+1] = tsrc[4*x+1];
            tdest[4*x+2] = t;
            tdest[4*x+3] = tsrc[4*x+3];
        }
    }
#endif
}
Esempio n. 4
0
// Primitive-assembly step for triangle lists: de-interleaves the vertices
// of a SIMD batch of triangles spread across three packed SIMD registers
// (a, b, c) into tri[0..2], one register of lane-aligned vertex components
// per triangle corner.
// NOTE(review): the blend immediates (0x92/0x24/0x49, ...), the in-lane
// permutes (0x6C/0xB1/0xC6) and the 0x21 cross-lane swap encode the exact
// de-interleave pattern; verify any mask change against the sibling
// PaTriList0/PaTriListSingle0 stages referenced below.
bool PaTriList2(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdvector &c = PaGetSimdVector(pa, 2, slot);
    simdscalar s;

    // One iteration per vertex component (x, y, z, w).
    for (int i = 0; i < 4; ++i)
    {
        // Gather corner 0 of every triangle: pick lanes from a/b/c, then
        // fix up their ordering in-lane and across the two 128-bit lanes.
        simdvector &v0 = tri[0];
        v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
        v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
        v0[i] = _mm256_permute_ps(v0[i], 0x6C);
        s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
        v0[i] = _simd_blend_ps(v0[i], s, 0x44);

        // Corner 1, same scheme with rotated masks.
        simdvector &v1 = tri[1];
        v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
        v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
        v1[i] = _mm256_permute_ps(v1[i], 0xB1);
        s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
        v1[i] = _simd_blend_ps(v1[i], s, 0x66);

        // Corner 2.
        simdvector &v2 = tri[2];
        v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
        v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
        v2[i] = _mm256_permute_ps(v2[i], 0xC6);
        s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
        v2[i] = _simd_blend_ps(v2[i], s, 0x22);
    }

    // Advance the PA state machine and account for the completed batch.
    SetNextPaState(pa, PaTriList0, PaTriListSingle0);
    pa.reset = true;
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
Esempio n. 5
0
    // Packed complex-complex division: four interleaved single-precision
    // complex values per __m256 ([r0, i0, r1, i1, ...]), computed lane-wise
    // as (lhs * conj(rhs)) / |rhs|^2.
    // NOTE(review): the template header and return type precede this excerpt;
    // presumably returns avx_simd_complex_float<T> -- confirm in the full file.
    div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img ...]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real] (re/im swapped per pair)
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4) -- the quotient's numerator

#ifdef __FMA__
        __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
        //No FMA: emulate fmsubadd by negating ymm4 first, since
        //addsub(t1, -ymm4) == subadd(t1, ymm4).
        __m256 t1    = _mm256_mul_ps(lhs.value, ymm0);
        __m256 t2    = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
        __m256 ymm5  = _mm256_addsub_ps(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3) = |rhs|^2 duplicated in both slots
        //of each pair (the denominator)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
        __m256 t3    = _mm256_mul_ps(ymm0, ymm0);
        ymm0         = _mm256_add_ps(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_ps(ymm5, ymm0);
    }
Esempio n. 6
0
/* GCC DejaGnu testcase: every intrinsic below requires a compile-time
   immediate operand; passing the non-constant variable k4 must produce
   exactly the diagnostic named in the accompanying dg-error directive.
   The dg-error comments are harness directives -- do not edit them.  */
void
test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
}
Esempio n. 7
0
// In-lane 32-bit element shuffle of a boolean AVX vector by compile-time
// indices i0..i3 (the same pattern is applied to both 128-bit halves).
// NOTE(review): the template header declaring i0..i3 lies outside this
// excerpt -- confirm they are non-type template parameters there.
INLINE avxb shuffle(const avxb& a) {
  return _mm256_permute_ps(a, _MM_SHUFFLE(i3, i2, i1, i0));
}
Esempio n. 8
0
 // In-lane 32-bit element shuffle of an avxi, routed through the float
 // permute unit: bit-cast to __m256, permute each 128-bit half by the
 // compile-time indices, and bit-cast back to integer.
 template<index_t index_0, index_t index_1, index_t index_2, index_t index_3> INLINE const avxi shuffle( const avxi& a ) {
   const __m256 as_float = _mm256_castsi256_ps(a);
   const __m256 permuted = _mm256_permute_ps(as_float, _MM_SHUFFLE(index_3, index_2, index_1, index_0));
   return _mm256_castps_si256(permuted);
 }
Esempio n. 9
0
// Sorts the 8 floats of `input` in place with a branch-free, 6-stage
// compare-exchange network implemented via AVX min/max/permute/blend.
// NOTE(review): the stage pairs in the commented table suggest a bitonic
// sorting network producing ascending order -- confirm with a quick test.
M_ALWAYS_INLINE
static void bar(float (& input)[8]) 
{
    
    /*
	static constexpr uint_fast8_t idx[][2] = {
		{0, 1}, {3, 2}, {4, 5}, {7, 6}, // (1)
		{0, 2}, {1, 3}, {6, 4}, {7, 5}, // (2)
		{0, 1}, {2, 3}, {5, 4}, {7, 6}, // (3)
		{0, 4}, {1, 5}, {2, 6}, {3, 7}, // (4)
		{0, 2}, {1, 3}, {4, 6}, {5, 7}, // (5)
		{0, 1}, {2, 3}, {4, 5}, {6, 7} // (6)
	};
    */
    // The indices must be expressed in a form convenient for the AVX
    // instructions.  A loop plus unrolling makes no sense here because
    // stage (4) is special... better done by hand.
    
    static constexpr int blend_mask_1 =0b10011001;
    static constexpr int blend_mask_2=0b11000011;
    static constexpr int blend_mask_3 =0b10100101;
    static constexpr int blend_mask_4 =0b00001111;
    static constexpr int blend_mask_5=0b00110011;
    static constexpr int blend_mask_6=0b01010101;
    
    // Used by stages (1), (3) and (6): swap neighbours within each pair.
    static constexpr int permute_mask_1=0b10110001;
    
    
    // Used by stages (2) and (5): swap the two pairs within each lane.
    static constexpr int permute_mask_2=0b01001110;
    
    
    __m256 result= _mm256_load_ps(input);
    
    // (1)  
    
    __m256 mapped=_mm256_permute_ps(result,permute_mask_1);
    
    __m256 min=_mm256_min_ps(result,mapped);
    __m256 max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_1);
    
    // (2)
    
    mapped=_mm256_permute_ps(result,permute_mask_2);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_2);
    
    // (3)
    
    mapped=_mm256_permute_ps(result,permute_mask_1);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_3);
    
    // (4) The special case: here we must permute between the two
    // 128-bit halves of the YMM register.
    
    mapped=_mm256_permute2f128_ps(result,result,1);
   
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_4);
   
    // (5)
    
    mapped=_mm256_permute_ps(result,permute_mask_2);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_5);
    
    // (6)
    
    mapped=_mm256_permute_ps(result,permute_mask_1);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_6);
     /**/
    _mm256_store_ps(input,result);
}
Esempio n. 10
0
  }
  nl = (1 << log);
  *newLength = nl;
  float *ret = mallocf(nl + additionalLength);
  memcpy(ret, ptr, length * sizeof(float));
  memsetf(ret + length, 0.f, nl - length);
  return ret;
}

float *rmemcpyf(float *__restrict dest,
                const float *__restrict src, size_t length) {
#ifdef __AVX__
  for (int i = 0; i < (int)length - 7; i += 8) {
    __m256 vec = _mm256_loadu_ps(src + i);
    vec = _mm256_permute2f128_ps(vec, vec, 1);
    vec = _mm256_permute_ps(vec, 0x1B);
    _mm256_storeu_ps(dest + length - i - 8, vec);
  }

  for (size_t i = (length & ~0x7); i < length; i++) {
    dest[length - i - 1] = src[i];
  }
#elif defined(__ARM_NEON__)
  for (int i = 0; i < (int)length - 3; i += 4) {
    float32x4_t vec = vld1q_f32(src + i);
    vec = vrev64q_f32(vec);
    vec = vcombine_f32(vget_high_f32(vec), vget_low_f32(vec));
    vst1q_f32(dest + length - i - 4, vec);
  }

  for (size_t i = (length & ~0x3); i < length; i++) {
	// Multiply four interleaved single-precision complex numbers by pure
	// imaginary factors, component-wise.
	// PRE: all vectors aligned,
	//		imag_c = [i1,i1,...,i4,i4] (each imaginary part duplicated)
	//		vec = [v1r,v1i,...,v4r,v4i]
	// POST: returns [-i1*v1i,i1*v1r,...,-i4*v4i,i4*v4r]
	inline __m256 avx_multiply_float_imag_(const __m256& imag_c, const __m256& vec) {
		// Element-wise product, then swap each (re, im) pair so the
		// imaginary product lands in the real slot and vice versa.
		__m256 swapped = _mm256_permute_ps(_mm256_mul_ps(imag_c, vec), 0xB1);
		// addsub against zero negates only the even (real) slots.
		return _mm256_addsub_ps(_mm256_setzero_ps(), swapped);
	}
Esempio n. 12
0
// Broadcast element 3 of the low 128-bit lane of `a` to all eight slots.
inline  __m256 _mm256_broadcast_3_ss(__m256 a) {
	// Replicate element 3 within each 128-bit lane, then overwrite the
	// high lane with the swapped copy so every slot holds a[3].
	const __m256 lane_bcast = _mm256_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3));
	const __m256 swapped = _mm256_permute2f128_ps(lane_bcast, lane_bcast, 1);
	return _mm256_blend_ps(lane_bcast, swapped, 0xF0);
}
Esempio n. 13
0
// Broadcast element 0 of the low 128-bit lane of `a` to all eight slots:
// replicate within each lane, then blend the lane-swapped copy into the
// high half.  (Removed the stray '\' line continuations that were left
// over from a macro version of this helper.)
inline  __m256 _mm256_broadcast_lo_ss(__m256 a) {
	__m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(0, 0, 0, 0));
	return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
Esempio n. 14
0
// Clang CodeGen test: _mm256_permute_ps with immediate 0x1b must lower to
// a shufflevector that reverses the four elements of each 128-bit half.
// The CHECK line is a FileCheck directive -- do not edit it.
__m256 test_mm256_permute_ps(__m256 a) {
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  return _mm256_permute_ps(a, 0x1b);
}