Example #1
File: main.cpp Project: sclc/DPP
//-------------------------------------------------------------------
// blend
void
effect(float *pBuffer[], const Cbmp *bmp, const float weight)
{
    size_t width = bmp->getWidth();
    size_t height = bmp->getAbsHeight();

    const float fmul0 = weight;
    const float fmul1 = 1.0f - weight;
    __m256 weight0 = _mm256_broadcast_ss(&fmul0);
    __m256 weight1 = _mm256_broadcast_ss(&fmul1);

    float *pF[2];
    pF[0] = pBuffer[0];
    pF[1] = pBuffer[1];

    for (size_t y = 0; y < height; y++)
    {
        for (size_t x = 0; x < width; x += 8, pF[0] += 8, pF[1] += 8)
        {
            __m256 p0 = _mm256_load_ps(pF[0]);
            p0 = _mm256_mul_ps(p0, weight0);

            __m256 p1 = _mm256_load_ps(pF[1]);
            p1 = _mm256_mul_ps(p1, weight1);

            __m256 r = _mm256_add_ps(p0, p1);

            _mm256_store_ps(pF[0], r);
        }
    }
}
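
For reference, a scalar sketch of the same blend follows (an illustration added here, not taken from the sclc/DPP project): the AVX version above assumes 32-byte-aligned buffers and a width that is a multiple of 8, and writes the weighted result back into pBuffer[0].

// Hypothetical scalar equivalent of the blend kernel above, for clarity only.
void effect_scalar(float *pBuffer[], size_t width, size_t height, const float weight)
{
    float *p0 = pBuffer[0];
    float *p1 = pBuffer[1];
    for (size_t i = 0; i < width * height; ++i)
        p0[i] = p0[i] * weight + p1[i] * (1.0f - weight);
}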
Example #2
/*
 * compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr
 */
void
comp_s(st_coords arr, int L)
{
  for(int i=0; i<L; i+=8) {
    __m256 x = _mm256_load_ps(&arr.x[i]);
    __m256 y = _mm256_load_ps(&arr.y[i]);
    __m256 z = _mm256_load_ps(&arr.z[i]);
    __m256 t = _mm256_load_ps(&arr.t[i]);
    register __m256 s0, s1; /* Temporaries */
    /*
     * s[i:i+8] = sqrt(t[i:i+8]**2 - (x[i:i+8]**2 + y[i:i+8]**2 + z[i:i+8]**2)),
     * where s, t, x, y, z are members of arr.
     */
    s1 = _mm256_mul_ps(x, x);
    s0 = _mm256_mul_ps(y, y);
    s1 = _mm256_add_ps(s0, s1);   /* x^2 + y^2 */
    s0 = _mm256_mul_ps(z, z);
    s1 = _mm256_add_ps(s0, s1);   /* x^2 + y^2 + z^2 */
    s0 = _mm256_mul_ps(t, t);
    s1 = _mm256_sub_ps(s0, s1);   /* t^2 - (x^2 + y^2 + z^2) */
    s0 = _mm256_sqrt_ps(s1);

    _mm256_store_ps(&arr.s[i], s0);
  }  
  return;
}
Example #3
File: main.cpp Project: sclc/DPP
//-----------------------------------------------------------------
// SOA -> AOS
//
//    pBlu:  b0, b1, b2, b3, b4, ...
//    pGrn:  g0, g1, g2, g3, g4, ...
//    pRed:  r0, r1, r2, r3, r4, ...
// ->
//    pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
//
void
soa2aos(float *pBlu, float *pGrn, float *pRed, float *pBgr, const size_t length)
{
    __m128 *bgr = (__m128 *)pBgr;

    // 24 units per iteration: 8x + 8y + 8z, with x, y, z = float
    for (size_t i = 0; i < length; i += 24)
    {
        __m256 b = _mm256_load_ps(pBlu + (i / 3));
        __m256 g = _mm256_load_ps(pGrn + (i / 3));
        __m256 r = _mm256_load_ps(pRed + (i / 3));

        __m256 bg = _mm256_shuffle_ps(b, g, _MM_SHUFFLE(2, 0, 2, 0));
        __m256 gr = _mm256_shuffle_ps(g, r, _MM_SHUFFLE(3, 1, 3, 1));
        __m256 rb = _mm256_shuffle_ps(r, b, _MM_SHUFFLE(3, 1, 2, 0));

        __m256 r03 = _mm256_shuffle_ps(bg, rb, _MM_SHUFFLE(2, 0, 2, 0));
        __m256 r14 = _mm256_shuffle_ps(gr, bg, _MM_SHUFFLE(3, 1, 2, 0));
        __m256 r25 = _mm256_shuffle_ps(rb, gr, _MM_SHUFFLE(3, 1, 3, 1));

        *bgr++ = _mm256_castps256_ps128(r03);
        *bgr++ = _mm256_castps256_ps128(r14);
        *bgr++ = _mm256_castps256_ps128(r25);
        *bgr++ = _mm256_extractf128_ps(r03, 1);
        *bgr++ = _mm256_extractf128_ps(r14, 1);
        *bgr++ = _mm256_extractf128_ps(r25, 1);
    }
}
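
The shuffle sequence above is easier to verify against a plain scalar interleave; this sketch is illustrative only and assumes, as the loop above does, that length counts the interleaved output floats and is a multiple of 24.

// Illustrative scalar SoA -> AoS interleave for comparison.
void soa2aos_scalar(const float *pBlu, const float *pGrn, const float *pRed,
                    float *pBgr, const size_t length)
{
    for (size_t i = 0; i < length / 3; ++i)
    {
        pBgr[3 * i + 0] = pBlu[i];
        pBgr[3 * i + 1] = pGrn[i];
        pBgr[3 * i + 2] = pRed[i];
    }
}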
Example #4
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly
 */
void
comp_s(st_coords arr, int L)
{
  for(int j=0; j<L; j+=8) {
    int i = (rand() % (L/8)) * 8;
    __m256 x = _mm256_load_ps(&arr.x[i]);
    __m256 y = _mm256_load_ps(&arr.y[i]);
    __m256 z = _mm256_load_ps(&arr.z[i]);
    __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
    register __m256 s0;
    s0 = _mm256_mul_ps(x, x);
    s0 = _mm256_fmadd_ps(y, y, s0);
    s0 = _mm256_fmadd_ps(z, z, s0);
    s0 = _mm256_fmsub_ps(t, t, s0);
    s0 = _mm256_sqrt_ps(s0);
#else
    register __m256 s0, s1;
    s1 = _mm256_mul_ps(x, x);
    s0 = _mm256_mul_ps(y, y);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(z, z);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(t, t);
    s1 = _mm256_sub_ps(s0, s1);
    s0 = _mm256_sqrt_ps(s1);
#endif
    
    _mm256_store_ps(&arr.s[i], s0);
  }  
  return;
}
Example #5
static inline void rectifier_kernel_avx5(float *a, const size_t blocks) {
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps( &a[i*8*5 + 0*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*5 + 0*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*5 + 1*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*5 + 1*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*5 + 2*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*5 + 2*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*5 + 3*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*5 + 3*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*5 + 4*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*5 + 4*8] ) , _mm256_setzero_ps() ) );
    }
}
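
The kernel above is an in-place rectifier (ReLU): each float is clamped to be non-negative, five 8-float blocks per iteration. A scalar sketch of the same operation, added for illustration and assuming n = blocks * 40:

// Hypothetical scalar equivalent: clamp every element to zero from below.
static inline void rectifier_scalar(float *a, const size_t n) {
    for (size_t i = 0; i < n; ++i)
        a[i] = a[i] > 0.0f ? a[i] : 0.0f;
}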
Example #6
void MaddMemcpy(float* arg1, float* arg2, float* arg3, int size1, int size2, float* result) {
    memcpy(arg2, arg1, size1);
    memcpy(arg3, arg1, size2);
    __m256 vec1 = _mm256_load_ps(arg1);
    __m256 vec2 = _mm256_load_ps(arg2);
    __m256 vec3 = _mm256_load_ps(arg3);
    __m256 res  = _mm256_fmadd_ps(vec1, vec2, vec3);
    _mm256_storeu_ps(result, res);
}
Example #7
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

    #define _float_cords_dot_prod(curr, next, index)                    \
        _mm256_dp_ps(                                                   \
            curr,                                                       \
            _mm256_xor_ps(                                              \
                _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
                _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)  \
            ),                                                          \
            0b11110000 | (1 << (index))                                 \
        )


    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...
    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
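
The macro above accumulates shoelace-style terms (x[i]*y[i+1] - y[i]*x[i+1]) four pairs at a time, and the commented-out return shows the final area would be half the absolute value of that sum. Below is a scalar sketch of the same accumulation, assuming cords is an array of {x, y} float pairs (an assumption of this note, since the real signature is generated by a macro):

/* Illustrative scalar version of the signed-area accumulation. */
static float poly_area_sign_scalar(const float cords[][2], unsigned long cords_len)
{
    float sum = 0.0f;
    for (unsigned long k = 0; k + 1 < cords_len; ++k)
        sum += cords[k][0] * cords[k + 1][1] - cords[k][1] * cords[k + 1][0];
    return sum; /* area = fabsf(sum) / 2 */
}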
Example #8
void AlignedAvxMult(float* d, float const* a, float const* b)
{
	for(int i = 0; i < gNumFloats; i += 8)
	{
		__m256 v1 = _mm256_load_ps(&a[i]);
		__m256 v2 = _mm256_load_ps(&b[i]);
		__m256 r = _mm256_mul_ps(v1, v2);
		_mm256_store_ps(&d[i], r);
	}
}
Example #9
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m256 sum_l             = _mm256_setzero_ps();
    __m256 sum_r             = _mm256_setzero_ps();

    const float *buffer_l    = resamp->buffer_l + resamp->ptr;
    const float *buffer_r    = resamp->buffer_r + resamp->ptr;

    unsigned taps            = resamp->taps;
    unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m256 delta             = _mm256_set1_ps((float)
                               (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 8)
    {
        __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
        __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m256 deltas = _mm256_load_ps(delta_table + i);
        __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                      _mm256_mul_ps(deltas, delta));
#else
        __m256 sinc   = _mm256_load_ps(phase_table + i);
#endif
        sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
        sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
    }

    /* hadd on AVX is weird, and acts on low-lanes
     * and high-lanes separately. */
    __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
    __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
    res_l        = _mm256_hadd_ps(res_l, res_l);
    res_r        = _mm256_hadd_ps(res_r, res_r);
    res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
    res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

    /* This is optimized to mov %xmmN, [mem].
     * There doesn't seem to be any _mm256_store_ss intrinsic. */
    _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
    _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
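
The lane-aware reduction above (hadd within each 128-bit lane, then an add of the swapped lanes) is the usual way to horizontally sum a __m256; a small stand-alone helper illustrating the same idea, added here as a sketch and not taken from the resampler source:

/* Illustrative horizontal sum of all eight floats in a __m256. */
static inline float hsum256_ps(__m256 v)
{
    __m128 lo  = _mm256_castps256_ps128(v);       /* lower 128 bits  */
    __m128 hi  = _mm256_extractf128_ps(v, 1);     /* upper 128 bits  */
    __m128 sum = _mm_add_ps(lo, hi);              /* add the lanes   */
    sum = _mm_hadd_ps(sum, sum);                  /* pairwise sums   */
    sum = _mm_hadd_ps(sum, sum);                  /* total in lane 0 */
    return _mm_cvtss_f32(sum);
}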
Example #10
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl)
{
	int zigzag;

	__m256 result, dct_values, quant_values;
	__m256 factor = _mm256_set1_ps(0.25f);

	for (zigzag = 0; zigzag < 64; zigzag += 8)
	{
		// Set the dct_values for the current iteration
		dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]],
				in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]],
				in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]],
				in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]);

		// Multiply with 0.25 to divide by 4.0
		result = _mm256_mul_ps(dct_values, factor);

		// Load quant-values and multiply with previous product
		quant_values = _mm256_load_ps(quant_tbl + zigzag);
		result = _mm256_div_ps(result, quant_values);

		// Round off values and store in out_data buffer
		result = c63_mm256_roundhalfawayfromzero_ps(result);
		_mm256_store_ps(out_data + zigzag, result);
	}
}
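
Written out per element, the quantization above gathers the DCT coefficients in zigzag order, scales them by 0.25 and divides by the quantization table. A scalar sketch follows; UV_indexes and the exact rounding behaviour are assumed to come from the surrounding c63 codec source, and roundf from <math.h>.

// Illustrative scalar form of quantize_block; roundf() rounds half away
// from zero, matching the intent of c63_mm256_roundhalfawayfromzero_ps.
static void quantize_block_scalar(const float *in_data, float *out_data,
                                  const float *quant_tbl)
{
    for (int zz = 0; zz < 64; ++zz)
        out_data[zz] = roundf((in_data[UV_indexes[zz]] * 0.25f) / quant_tbl[zz]);
}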
Example #11
static inline void rectifier_kernel_back_avx(float *a, const size_t blocks) {
    assert(blocks > 0);
    size_t i = blocks;
    do {
        --i; /* process blocks in reverse: blocks-1 down to 0 */
        _mm256_store_ps( &a[i*8], _mm256_max_ps( _mm256_load_ps( &a[i*8] ) , _mm256_setzero_ps() ) );
    } while(i);
}
Example #12
File: Play.cpp Project: zhangce/nn
void aaaa(const int c, float * const a3, float * const a4, const float * const fck){
	__m256 a_fck = _mm256_load_ps(fck);

	for(int i=0;i<c;i+=16){
		
		__m256 b_i = _mm256_load_ps(&a4[i]);
		__m256 out_i = _mm256_mul_ps(b_i, a_fck);

		__m256 c_i = _mm256_load_ps(&a4[i+8]);
		__m256 out_i2 = _mm256_mul_ps(c_i, a_fck);

		_mm256_store_ps(&a3[i], out_i);
		_mm256_store_ps(&a3[i+8], out_i2);

	}
}
Example #13
float 
nv_vector_dot(const nv_matrix_t *vec1, int m1,
			  const nv_matrix_t *vec2, int m2)
{
	NV_ASSERT(vec1->n == vec2->n);
	
#if NV_ENABLE_AVX
	{
		NV_ALIGNED(float, mm[8], 32);
		__m256 x, u;
		int n;
		int pk_lp = (vec1->n & 0xfffffff8);
		float dp = 0.0f;
		
		u = _mm256_setzero_ps();
		for (n = 0; n < pk_lp; n += 8) {
			x = _mm256_load_ps(&NV_MAT_V(vec2, m2, n));
			u = _mm256_add_ps(u, _mm256_mul_ps(x, *(__m256 *)&NV_MAT_V(vec1, m1, n)));
		}
		_mm256_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7];
		for (n = pk_lp; n < vec1->n; ++n) {
			dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
		}
		
		return dp;
	}
#elif NV_ENABLE_SSE2
	{
		NV_ALIGNED(float, mm[4], 16);
		__m128 x, u;
		int n;
		int pk_lp = (vec1->n & 0xfffffffc);
		float dp = 0.0f;

		u = _mm_setzero_ps();
		for (n = 0; n < pk_lp; n += 4) {
			x = _mm_load_ps(&NV_MAT_V(vec2, m2, n));
			u = _mm_add_ps(u,
				_mm_mul_ps(x, *(__m128 *)&NV_MAT_V(vec1, m1, n)));
		}
		_mm_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3];
		for (n = pk_lp; n < vec1->n; ++n) {
			dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
		}
  
		return dp;
	}
#else
	{
		int n;
		float dp = 0.0f;
		for (n = 0; n < vec1->n; ++n) {
			dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
		}
		return dp;
	}
#endif
}
Example #14
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{ 
#ifdef AVX
	//Black magic
	//But it does produce the same results as the non-AVX code
	__m256 accumulator;
	__m256 x_s = _mm256_load_ps(x->Features);
	__m256 y_s = _mm256_load_ps(y->Features);
	__m256 result = _mm256_sub_ps(x_s, y_s);
	accumulator = _mm256_mul_ps(result, result);

	x_s = _mm256_load_ps(&x->Features[8]);
	y_s = _mm256_load_ps(&y->Features[8]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[16]);
	y_s = _mm256_load_ps(&y->Features[16]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[24]);
	y_s = _mm256_load_ps(&y->Features[24]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);
	//We now have a vector of 8 floats

	__m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
	__m256 t2 = _mm256_hadd_ps(t1, t1);
	__m128 t3 = _mm256_extractf128_ps(t2, 1);
	__m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
	//And now we don't
	return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
	//Can be autovectorized
	float accumulator[32];
	float distance = 0;
	for (int i = 0; i < 30; i++)
	{
		accumulator[i] = x->Features[i] - y->Features[i];
	}

	//If done properly this should be 4(8) instructions
	for (int i = 0; i < 30; i++)
	{
		distance += accumulator[i] * accumulator[i];
	}

	return std::sqrtf(distance);
#endif

	
}
Example #15
// =============================================================
// ====================== RGBX2BGRX_32F ========================
// =============================================================
void _rgbx2bgrx_32f(const float* _src, float* _dest, unsigned int _width,
                    unsigned int _pitchs, unsigned int _pitchd,
                    unsigned int _start, unsigned int _stop) {

#ifdef USE_SSE

    const unsigned int widthz = (_pitchs/8);

    // Get start positions for buffers
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<widthz; ++x ) {

#ifdef USE_AVX1
            const __m256 v0 = _mm256_load_ps(tsrc);
            tsrc+=8;

            __m256 r0 = _mm256_permute_ps(v0,0xc6);

            _mm256_store_ps(tdest, r0 );
            tdest+=8;
#else // NOT TESTED

            const __m128 v0 = _mm_load_ps(tsrc);
            tsrc+=4;
            const __m128 v1 = _mm_load_ps(tsrc);
            tsrc+=4;

            __m128 r0 = _mm_shuffle_ps(v0,v0,0xc6);
            __m128 r1 = _mm_shuffle_ps(v1,v1,0xc6);

            _mm_store_ps(tdest, r0 ); tdest+=4;
            _mm_store_ps(tdest, r1 ); tdest+=4;
#endif

        }
    }

#else
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<_width; ++x ) {
            float t = tsrc[4*x];
            tdest[4*x] = tsrc[4*x+2];
            tdest[4*x+1] = tsrc[4*x+1];
            tdest[4*x+2] = t;
            tdest[4*x+3] = tsrc[4*x+3];
        }
    }
#endif
}
Example #16
void rectifier_avx_2(float *a, const size_t len) {
	float *p = a;
	for (; p + 8 <= &a[len]; p += 8) {
		_mm256_store_ps(p, _mm256_max_ps( _mm256_load_ps(p) , _mm256_setzero_ps() ) );
	}
	for (; p < &a[len]; ++p) {
		*p = *p > 0.0 ? *p : 0.0;
	}
}
Example #17
static __forceinline void
convert_float_to_half(uint8_t* dstp, const float* srcp, size_t count)
{
    for (size_t x = 0; x < count; x += 8) {
        __m256 s = _mm256_load_ps(srcp + x);
        __m128i d = _mm256_cvtps_ph(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_stream_si128(reinterpret_cast<__m128i*>(dstp + 2 * x), d);
    }
}
Example #18
int main(int argc, const char * argv[])
{
    ALIGN32 float a1[ 16 ] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    };
    ALIGN32 float a2[ 16 ] = {
        15, 12, 4, 7, 9, 0, 3, 13, 6, 10, 1, 8, 5, 11, 2, 14
    };
    ALIGN32 float aout[ 16 ];
    
    __m128 x1[ 4 ] = { _mm_load_ps( a1 ), _mm_load_ps( a1 + 4 ), _mm_load_ps( a1 + 8 ), _mm_load_ps( a1 + 12 ) };
    __m128 x2[ 4 ] = { _mm_load_ps( a2 ), _mm_load_ps( a2 + 4 ), _mm_load_ps( a2 + 8 ), _mm_load_ps( a2 + 12 ) };
    __m128 xout[ 4 ];
    
    __m256 y1[ 2 ] = { _mm256_load_ps( a1 ), _mm256_load_ps( a1 + 8 ) };
    __m256 y2[ 2 ] = { _mm256_load_ps( a2 ), _mm256_load_ps( a2 + 8 ) };
    __m256 yout[ 2 ];
    
    std::cout << "FPU Mult" << std::endl;
    mul( a1, a2, aout );
    trace( aout, 4 );
    
    std::cout << "SSE Mult" << std::endl;
    mulX4( x1, x2, xout );
    trace( xout, 4 );
    
    std::cout << "AVX2 Mult" << std::endl;
    mulX8( y1, y2, yout );
    trace( yout, 4 );
    
    std::cout << "FPU Transpose" << std::endl;
    transpose( a1, aout );
    trace( aout, 4 );
    
    std::cout << "SSE Transpose" << std::endl;
    transposeX4( x1, xout );
    trace( xout, 4 );
    
    std::cout << "AVX Transpose" << std::endl;
    transposeX8( y1, yout );
    trace( yout, 4 );
    
    return 0;
}
Example #19
static void scale_block(float *in_data, float *out_data)
{
	__m256 in_vector, result;

	// Load the a1 values into a register
	static float a1_values[8] __attribute__((aligned(32))) = { ISQRT2, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
			1.0f, 1.0f };
	__m256 a1 = _mm256_load_ps(a1_values);

	// Load the a2 values into a register for the exception case
	__m256 a2 = _mm256_set1_ps(ISQRT2);

	/* First row is an exception
	 * Requires two _mm256_mul_ps operations */
	in_vector = _mm256_load_ps(in_data);
	result = _mm256_mul_ps(in_vector, a1);
	result = _mm256_mul_ps(result, a2);
	_mm256_store_ps(out_data, result);

	// Remaining calculations can be done with one _mm256_mul_ps operation
	in_vector = _mm256_load_ps(in_data + 8);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 8, result);

	in_vector = _mm256_load_ps(in_data + 16);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 16, result);

	in_vector = _mm256_load_ps(in_data + 24);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 24, result);

	in_vector = _mm256_load_ps(in_data + 32);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 32, result);

	in_vector = _mm256_load_ps(in_data + 40);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 40, result);

	in_vector = _mm256_load_ps(in_data + 48);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 48, result);

	in_vector = _mm256_load_ps(in_data + 56);
	result = _mm256_mul_ps(in_vector, a1);
	_mm256_store_ps(out_data + 56, result);
}
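
In scalar terms, scale_block multiplies column 0 by ISQRT2 (via a1) and gives row 0 an additional ISQRT2 factor (via a2); an illustrative per-element sketch of the same 8x8 scaling, assuming ISQRT2 is defined as in the surrounding source:

// Illustrative scalar form of scale_block for an 8x8 block.
static void scale_block_scalar(const float *in_data, float *out_data)
{
	int x, y;
	for (y = 0; y < 8; ++y)
		for (x = 0; x < 8; ++x)
			out_data[y * 8 + x] = in_data[y * 8 + x]
				* (x == 0 ? ISQRT2 : 1.0f)
				* (y == 0 ? ISQRT2 : 1.0f);
}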
Example #20
float dot_product(const int N, const float *X, const int incX, const float *Y,
                  const int incY) {
  __m256 accum = _mm256_setzero_ps();
  for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
    __m256 xval = _mm256_load_ps(X);
    __m256 yval = _mm256_load_ps(Y);
    __m256 val = _mm256_mul_ps(xval, yval);
    accum = _mm256_add_ps(val, accum);
  }
  // Reduce the values in accum into the smallest 32-bit subsection
  // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
  __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
      _mm256_extractf128_ps(accum, 1));
  // b0 b1 b2 b3 -> c0 c1 b2 b3
  accum2 = _mm_add_ps(accum2,
      _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
  __m128 final_val = _mm_add_ss(
      _mm_insert_ps(accum2, accum2, 0x4e), accum2);
  // Add the two remaining partial sums and extract the scalar result
  return _mm_cvtss_f32(final_val);
}
Example #21
static void dct_1d_general(float* in_data, float* out_data, float lookup[64])
{
	__m256 current, dct_values, multiplied, sum;

	current = _mm256_broadcast_ss(in_data);
	dct_values = _mm256_load_ps(lookup);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = multiplied;

	// Broadcasts a single float (scalar) to every element in 'current'.
	current = _mm256_broadcast_ss(in_data + 1);
	// Loads DCT values from the lookup table. iDCT uses a transposed lookup table here.
	dct_values = _mm256_load_ps(lookup + 8);
	// Vertically multiply the scalar with the DCT values.
	multiplied = _mm256_mul_ps(dct_values, current);
	// Vertically add to the previous sum.
	sum = _mm256_add_ps(sum, multiplied);

	current = _mm256_broadcast_ss(in_data + 2);
	dct_values = _mm256_load_ps(lookup + 16);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = _mm256_add_ps(sum, multiplied);

	current = _mm256_broadcast_ss(in_data + 3);
	dct_values = _mm256_load_ps(lookup + 24);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = _mm256_add_ps(sum, multiplied);

	current = _mm256_broadcast_ss(in_data + 4);
	dct_values = _mm256_load_ps(lookup + 32);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = _mm256_add_ps(sum, multiplied);

	current = _mm256_broadcast_ss(in_data + 5);
	dct_values = _mm256_load_ps(lookup + 40);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = _mm256_add_ps(sum, multiplied);

	current = _mm256_broadcast_ss(in_data + 6);
	dct_values = _mm256_load_ps(lookup + 48);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = _mm256_add_ps(sum, multiplied);

	current = _mm256_broadcast_ss(in_data + 7);
	dct_values = _mm256_load_ps(lookup + 56);
	multiplied = _mm256_mul_ps(dct_values, current);
	sum = _mm256_add_ps(sum, multiplied);

	_mm256_store_ps(out_data, sum);
}
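
Each step above broadcasts one input sample and accumulates it against a row of the lookup table, so the whole kernel is an 8x8 matrix-vector product; an illustrative scalar equivalent:

// Illustrative scalar form of dct_1d_general:
// out[j] = sum over i of in[i] * lookup[i*8 + j].
static void dct_1d_general_scalar(const float *in_data, float *out_data,
                                  const float lookup[64])
{
	int i, j;
	for (j = 0; j < 8; ++j)
	{
		float sum = 0.0f;
		for (i = 0; i < 8; ++i)
			sum += in_data[i] * lookup[i * 8 + j];
		out_data[j] = sum;
	}
}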
Example #22
void rectifier_avx_3(float *a, const size_t len) {
	float *p = a;
	assert(len > 8);
	for (; (uintptr_t)p%32 != 0; ++p) {
		*p = *p > 0.0 ? *p : 0.0;
	}
	for (; p + 8 <= &a[len]; p += 8) {
		_mm256_stream_ps(p, _mm256_max_ps( _mm256_load_ps(p) , _mm256_setzero_ps() ) );
	}
	for (; p < &a[len]; ++p) {
		*p = *p > 0.0 ? *p : 0.0;
	}
}
Example #23
void 
nv_vector_add(nv_matrix_t *vec0, int m0,
			  const nv_matrix_t *vec1, int m1,
			  const nv_matrix_t *vec2, int m2)
{
	NV_ASSERT(vec1->n == vec2->n);
	NV_ASSERT(vec2->n == vec0->n);
	
#if NV_ENABLE_AVX
	{
		__m256 x;
		int n;
		int pk_lp = (vec1->n & 0xfffffff8);
		
		for (n = 0; n < pk_lp; n += 8) {
			x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
			_mm256_store_ps(&NV_MAT_V(vec0, m0, n),
							_mm256_add_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n)));
		}
		for (n = pk_lp; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
		}
	}
#elif NV_ENABLE_SSE2
	{
		
		int n;
		int pk_lp = (vec1->n & 0xfffffffc);

#ifdef _OPENMP
//#pragma omp parallel for
#endif
		for (n = 0; n < pk_lp; n += 4) {
			__m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
			_mm_store_ps(&NV_MAT_V(vec0, m0, n),
						 _mm_add_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n)));
		}
		for (n = pk_lp; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
		}
	}
#else
	{
		int n;
		for (n = 0; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
		}
	}
#endif
}
Example #24
static void dequantize_block(float *in_data, float *out_data, float *quant_tbl)
{
	int zigzag;

	// Temporary buffer
	float temp_buf[8] __attribute__((aligned(32)));

	__m256 result, dct_values, quant_values;
	__m256 factor = _mm256_set1_ps(0.25f);

	for (zigzag = 0; zigzag < 64; zigzag += 8)
	{
		// Load dct-values
		dct_values = _mm256_load_ps(in_data + zigzag);

		quant_values = _mm256_load_ps(quant_tbl + zigzag);
		result = _mm256_mul_ps(dct_values, quant_values);

		// Multiply with 0.25 to divide by 4.0
		result = _mm256_mul_ps(result, factor);

		// Round off products and store them temporarily
		result = c63_mm256_roundhalfawayfromzero_ps(result);
		_mm256_store_ps(temp_buf, result);

		// Store the results at the correct places in the out_data buffer
		out_data[UV_indexes[zigzag]] = temp_buf[0];
		out_data[UV_indexes[zigzag + 1]] = temp_buf[1];
		out_data[UV_indexes[zigzag + 2]] = temp_buf[2];
		out_data[UV_indexes[zigzag + 3]] = temp_buf[3];
		out_data[UV_indexes[zigzag + 4]] = temp_buf[4];
		out_data[UV_indexes[zigzag + 5]] = temp_buf[5];
		out_data[UV_indexes[zigzag + 6]] = temp_buf[6];
		out_data[UV_indexes[zigzag + 7]] = temp_buf[7];
	}
}
Example #25
void 
nv_vector_inv(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm)
{
	NV_ASSERT(a->n >= x->n);
#if NV_ENABLE_AVX
	{
		__m256 xx, vv;
		int n;
		int pk_lp = (x->n & 0xfffffff8);

		vv = _mm256_set1_ps(1.0f);

		for (n = 0; n < pk_lp; n += 8) {
			xx = _mm256_load_ps(&NV_MAT_V(x, xm, n));
			xx = _mm256_div_ps(vv, xx);
			_mm256_store_ps(&NV_MAT_V(a, am, n), xx);
		}
		for (n = pk_lp; n < x->n; ++n) {
			NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
		}
	}
#elif NV_ENABLE_SSE2
	{
		__m128 xx, vv;
		int n;
		int pk_lp = (x->n & 0xfffffffc);

		vv = _mm_set1_ps(1.0f);

		for (n = 0; n < pk_lp; n += 4) {
			xx = _mm_load_ps(&NV_MAT_V(x, xm, n));
			xx = _mm_div_ps(vv, xx);
			_mm_store_ps(&NV_MAT_V(a, am, n), xx);
		}
		for (n = pk_lp; n < x->n; ++n) {
			NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
		}
	}
#else
	{
		int n;
		for (n = 0; n < x->n; ++n) {
			NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
		}
	}
#endif
}
Example #26
void warmup(float *x, float *y, int size, float alpha)
{
    int i;

    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0);
    #pragma ivdep
    #pragma vector aligned
    for (i=0; i<size; i+=4)
    {
        __m256 t = _mm256_load_ps(x+2*i);
        __m256 l = _mm256_mul_ps(t, m); // premultiply
        __m256 r = _mm256_permute2f128_ps( l , l , 1); // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);
        __m128 s = _mm256_extractf128_ps (res, 0);
        _mm_store_ps(y+i,s); // store it
    }
}
Example #27
void polynomial(float *ret, const float *const r_values, int num) {
  // r*r*r*(10+r*(-15+r*6));

  __m256 const_6 = _mm256_set1_ps(6.0f);
  __m256 const_neg_15 = _mm256_set1_ps(-15.0f);
  __m256 const_10 = _mm256_set1_ps(10.0f);
  // constants

  const int loop_factor = 8;

  for (int i = 0; i < num; i+=loop_factor) {

#ifdef USE_IACA
  IACA_START
#endif
    __m256 r;
    __m256 left;
    __m256 right;
    // aligned load of 256 bits r
    r = _mm256_load_ps(&r_values[i]);
    left = _mm256_mul_ps(r, r); // r * r
#ifndef __FMA__

    right = _mm256_mul_ps(r, const_6); // r * 6
    left = _mm256_mul_ps(left, r); // r * r * r
    right = _mm256_add_ps(right, const_neg_15); //-15 + r * 6
    right = _mm256_mul_ps(right, r); //r * (-15 + r * 6)
    right = _mm256_add_ps(right, const_10); //10 + (r * (-15 + r * 6))

#else
    right = _mm256_fmadd_ps(r, const_6, const_neg_15);
    left = _mm256_mul_ps(left, r);

    right = _mm256_fmadd_ps(r, right, const_10);

#endif
    right = _mm256_mul_ps(right, left); // r*r*r *(10 + r * (-15 + r * 6))

    _mm256_store_ps(&ret[i], right); // store 8 values to ret[i]

  }
#ifdef USE_IACA
  IACA_END
#endif
}
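
The kernel above evaluates the quintic smoothstep factor r*r*r*(10 + r*(-15 + r*6)) eight values at a time, with an FMA variant when __FMA__ is defined; a scalar reference for comparison, illustrative only:

// Illustrative scalar reference for the polynomial kernel above.
void polynomial_scalar(float *ret, const float *const r_values, int num) {
  for (int i = 0; i < num; ++i) {
    const float r = r_values[i];
    ret[i] = r * r * r * (10.0f + r * (-15.0f + r * 6.0f));
  }
}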
Example #28
void	TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	const __m256   scale     = _mm256_set1_ps (1 << LINLUT_RES_L2);
	const __m256i  offset    =
		_mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
	const __m256i  val_min   = _mm256_setzero_si256 ();
	const __m256i  val_max   = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

	const __m256   v         =
		_mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_scl   = _mm256_mul_ps (v, scale);
	const __m256i  index_raw = _mm256_cvtps_epi32 (val_scl);
	__m256i        index_tmp = _mm256_add_epi32 (index_raw, offset);
	index_tmp = _mm256_min_epi32 (index_tmp, val_max);
	index     = _mm256_max_epi32 (index_tmp, val_min);
	frac      = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
Example #29
    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
    {
        const float* pSrc = (const float*)entry;
        float* pDst = (float*)&mCurBlock[mTail];

        auto lambda = [&](int32_t i)
        {
            __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
            _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
        };
            
        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
        static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
            "FIFO element size should be multiple of SIMD width.");

        UnrollerL<0, numSimdLines, 1>::step(lambda);

        mTail ++;
        if (mTail == mBlockSize)
        {
            if (++mCurBlockIdx < mBlocks.size())
            {
                mCurBlock = mBlocks[mCurBlockIdx];
            }
            else
            {
                T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
                SWR_ASSERT(newBlock);

                mBlocks.push_back(newBlock);
                mCurBlock = newBlock;
            }

            mTail = 0;
        }

        mNumEntries ++;
        return true;
    }
Example #30
// memory load and store operations
INLINE avxb load8b(const void* const a) {
  return _mm256_load_ps((const float*)a);
}