Example #1
bool PaQuadList1(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdscalar s1, s2;

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
        s2 = _mm256_permute2f128_ps(a0, b0, 0x31);

        simdvector &v0 = tri[0];
        v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));

        simdvector &v1 = tri[1];
        v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));

        simdvector &v2 = tri[2];
        v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
    }

    SetNextPaState(pa, PaQuadList0, PaQuadListSingle0);
    pa.reset = true;
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
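The 0x20/0x31 pair above interleaves the 128-bit halves of two registers: 0x20 yields {a.lo, b.lo} and 0x31 yields {a.hi, b.hi}. A minimal standalone sketch with hypothetical values (compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256 a  = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m256 b  = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    __m256 lo = _mm256_permute2f128_ps(a, b, 0x20); /* 0 1 2 3 8 9 10 11   */
    __m256 hi = _mm256_permute2f128_ps(a, b, 0x31); /* 4 5 6 7 12 13 14 15 */

    float out[16];
    _mm256_storeu_ps(out,     lo);
    _mm256_storeu_ps(out + 8, hi);
    for (int i = 0; i < 16; i++)
        printf("%g%c", out[i], (i == 7 || i == 15) ? '\n' : ' ');
    return 0;
}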
Example #2
static void NOINLINE mulX8( const __m256 *v1, const __m256 *v2, __m256 *vout )
{
    static const int ALIGN32 p1[ 8 ] = { 0, 0, 0, 0, 1, 1, 1, 1 };
    static const int ALIGN32 p2[ 8 ] = { 2, 2, 2, 2, 3, 3, 3, 3 };
    static const int ALIGN32 p3[ 8 ] = { 4, 4, 4, 4, 5, 5, 5, 5 };
    static const int ALIGN32 p4[ 8 ] = { 6, 6, 6, 6, 7, 7, 7, 7 };
    const __m256i perm1 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p1 ) );
    const __m256i perm2 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p2 ) );
    const __m256i perm3 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p3 ) );
    const __m256i perm4 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p4 ) );
    for( int r = 0; r < 2; r++ ) {
        __m256 a0 = _mm256_permutevar8x32_ps( v1[ r ], perm1 );
        __m256 a1 = _mm256_permutevar8x32_ps( v1[ r ], perm2 );
        __m256 a2 = _mm256_permutevar8x32_ps( v1[ r ], perm3 );
        __m256 a3 = _mm256_permutevar8x32_ps( v1[ r ], perm4 );
        
        __m256 b0 = _mm256_mul_ps( a0, v2[ 0 ] );
        __m256 b1 = _mm256_mul_ps( a1, v2[ 1 ] );
        __m256 b2 = _mm256_mul_ps( a2, v2[ 0 ] );
        __m256 b3 = _mm256_mul_ps( a3, v2[ 1 ] );
        
        __m256 c0 = _mm256_add_ps( b0, b1 );
        __m256 c1 = _mm256_add_ps( b2, b3 );
        __m256 d0 = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
        __m256 d1 = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
        vout[ r ] = _mm256_add_ps( d0, d1 );
    }
}
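mulX8 multiplies two row-major 4x4 matrices, each packed two rows per __m256: the permutevar broadcasts each element of A's rows, the two mul/add pairs accumulate against B's row pairs, and the final 0x20/0x31 permutes re-split the row sums. A hypothetical smoke test (identity on the right leaves A unchanged):

static void mulX8_smoke_test(void)
{
    __m256 A[2] = {
        _mm256_setr_ps(1,  2,  3,  4,  5,  6,  7,  8),  /* rows 0-1 */
        _mm256_setr_ps(9, 10, 11, 12, 13, 14, 15, 16),  /* rows 2-3 */
    };
    __m256 I[2] = {
        _mm256_setr_ps(1, 0, 0, 0,  0, 1, 0, 0),        /* identity rows 0-1 */
        _mm256_setr_ps(0, 0, 1, 0,  0, 0, 0, 1),        /* identity rows 2-3 */
    };
    __m256 C[2];
    mulX8(A, I, C); /* C[0], C[1] now equal A[0], A[1] */
}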
Example #3
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m256 sum_l             = _mm256_setzero_ps();
    __m256 sum_r             = _mm256_setzero_ps();

    const float *buffer_l    = resamp->buffer_l + resamp->ptr;
    const float *buffer_r    = resamp->buffer_r + resamp->ptr;

    unsigned taps            = resamp->taps;
    unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m256 delta             = _mm256_set1_ps((float)
                               (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 8)
    {
        __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
        __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m256 deltas = _mm256_load_ps(delta_table + i);
        __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                      _mm256_mul_ps(deltas, delta));
#else
        __m256 sinc   = _mm256_load_ps(phase_table + i);
#endif
        sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
        sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
    }

    /* hadd on AVX is weird, and acts on low-lanes
     * and high-lanes separately. */
    __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
    __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
    res_l        = _mm256_hadd_ps(res_l, res_l);
    res_r        = _mm256_hadd_ps(res_r, res_r);
    res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
    res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

    /* This is optimized to mov %xmmN, [mem].
     * There doesn't seem to be any _mm256_store_ss intrinsic. */
    _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
    _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
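A scalar reference sketch of what the vector loop and the hadd/permute reduction above compute (only the non-SINC_COEFF_LERP path, using the same struct fields):

static void process_sinc_scalar(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   float sum_l = 0.0f, sum_r = 0.0f;
   const float *buffer_l    = resamp->buffer_l + resamp->ptr;
   const float *buffer_r    = resamp->buffer_r + resamp->ptr;
   unsigned taps            = resamp->taps;
   unsigned phase           = resamp->time >> SUBPHASE_BITS;
   const float *phase_table = resamp->phase_table + phase * taps;

   for (unsigned i = 0; i < taps; i++)
   {
      sum_l += buffer_l[i] * phase_table[i];
      sum_r += buffer_r[i] * phase_table[i];
   }

   out_buffer[0] = sum_l; /* what lane 0 of res_l holds after the reduction */
   out_buffer[1] = sum_r;
}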
Example #4
bool PaTriStrip1(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    simdscalar s;

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];
        /* Tri Pattern
          v0 -> 02244668
          v1 -> 11335577
          v2 -> 23456789
        */
        simdvector &v1 = tri[1];
        v1[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(3, 3, 1, 1));

        simdvector &v2 = tri[2];
        s = _mm256_permute2f128_ps(a0, b0, 0x21);
        v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));

        simdvector &v0 = tri[0];
        v0[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 0, 2, 0));
    }

    SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
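The 0x21 immediate glues the high half of the first operand to the low half of the second, effectively shifting a 16-vertex window by 128 bits; that is how v2 reaches the vertices just past the lane boundary. A sketch with hypothetical values:

__m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);       /* vertices 0-7  */
__m256 b = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15); /* vertices 8-15 */
__m256 s = _mm256_permute2f128_ps(a, b, 0x21);           /* 4 5 6 7 8 9 10 11 */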
Example #5
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

    #define _float_cords_dot_prod(curr, next, index)                    \
        _mm256_dp_ps(                                                   \
            curr,                                                       \
            _mm256_xor_ps(                                              \
                _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
                _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)  \
            ),                                                          \
            0b11110000 | (1 << (index))                                 \
        )


    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...
    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
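In the macro above, the _mm256_dp_ps immediate 0b11110000 | (1 << index) selects all four elements of each 128-bit lane for the product (high nibble) and writes each lane's sum only into slot index (low nibble), so the four partial results land in disjoint slots and can be combined with plain adds. A sketch:

__m256 x  = _mm256_set1_ps(1.0f);
__m256 d0 = _mm256_dp_ps(x, x, 0b11110000 | (1 << 0)); /* {4,0,0,0, 4,0,0,0} */
__m256 d1 = _mm256_dp_ps(x, x, 0b11110000 | (1 << 1)); /* {0,4,0,0, 0,4,0,0} */
__m256 s  = _mm256_add_ps(d0, d1);                     /* {4,4,0,0, 4,4,0,0} */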
Example #6
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout )
{
#if 0 // AVX1
    __m256 a0 = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
    __m256 a1 = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );
    __m256 b0 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    __m256 b1 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
    __m256 c0 = _mm256_unpacklo_ps( b0, b1 );
    __m256 c1 = _mm256_unpackhi_ps( b0, b1 );
    vout[ 0 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    vout[ 1 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else // AVX2
    static const int ALIGN32 p1[ 8 ] = { 0, 4, 2, 6, 1, 5, 3, 7 };
    static const int ALIGN32 p2[ 8 ] = { 2, 6, 0, 4, 3, 7, 1, 5 };
    const __m256i perm1 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p1 ) );
    const __m256i perm2 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p2 ) );
    __m256 a0 = _mm256_permutevar8x32_ps( v1[ 0 ], perm1 );
    __m256 a1 = _mm256_permutevar8x32_ps( v1[ 1 ], perm2 );
    vout[ 0 ] = _mm256_blend_ps( a0, a1, 0xCC );
    vout[ 1 ] = _mm256_shuffle_ps( a0, a1, 0x4E );
#endif
}
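transposeX8 uses the same two-rows-per-register layout as mulX8 and writes the 4x4 transpose; in the AVX2 path the two cross-lane permutes pre-position elements so a single blend and one in-lane shuffle finish the job. A hypothetical sketch:

__m256 M[2] = {
    _mm256_setr_ps(11, 12, 13, 14,  21, 22, 23, 24), /* rows 0-1 */
    _mm256_setr_ps(31, 32, 33, 34,  41, 42, 43, 44), /* rows 2-3 */
};
__m256 T[2];
transposeX8(M, T);
/* T[0] = 11 21 31 41  12 22 32 42
   T[1] = 13 23 33 43  14 24 34 44 */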
Example #7
File: main.cpp Project: sclc/DPP
int
main(void)
{
    __m256  da = _mm256_setr_ps(1,2,3,4,5,6,7,8);
    __m256  db = _mm256_setr_ps(11,12,13,14,15,16,17,18);
    __m256  dc;

    printDc("da: ", da);
    printDc("db: ", db);
    printf("\n");


    dc = _mm256_permute2f128_ps(da, db, 0x02);
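    // 0x02: lo <- b.lo, hi <- a.lo  =>  dc = 11 12 13 14 1 2 3 4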
    printDc("dc: ", dc);

    dc = _mm256_permute2f128_ps(da, db, 0x02|0x08);
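    // 0x02|0x08 = 0x0A: bit 3 zeroes the low lane  =>  dc = 0 0 0 0 1 2 3 4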
    printDc("dc: ", dc);

    dc = _mm256_permute2f128_ps(da, db, 0x21);
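    // 0x21: lo <- a.hi, hi <- b.lo  =>  dc = 5 6 7 8 11 12 13 14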
    printDc("dc: ", dc);

    return 0;
}
Example #8
/* Adjust MBR to fit all child MBRs */
inline 
void 
adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
	ArraySTRNode *node, *child;
	ulong_t k;
	
	node = &nodes[cur];
	child = &nodes[node->pos];
	
	/* enlarge mbr to include all children's mbrs */
#ifdef ENABLE_SSE_ADJUST 
	{
		__m128 v_nlow = _mm_load_ps(child[0].mbr.low);
		__m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
		for (k = 1; k < node->len; k++) {
			v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
			v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
		}
		_mm_store_ps(node->mbr.low, v_nlow);
		_mm_store_ps(node->mbr.upp, v_nupp);
	}
#else
#ifdef ENABLE_AVX_TEST1	
	{
		__m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
		for (k = 1; k < node->len; k++) {
			__m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
			__m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
			__m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
			v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
		}
		_mm256_storeu_ps((float *)&node->mbr, v_nmbr);
	}
#else
	/* copy first child's mbr */
	node->mbr = child[0].mbr;
	for (k = 1; k < node->len; k++) {
		int i;
		for (i = 0; i < NDIMS; i++) {
			if (node->mbr.low[i] > child[k].mbr.low[i])
				node->mbr.low[i] = child[k].mbr.low[i];
			if (node->mbr.upp[i] < child[k].mbr.upp[i])
				node->mbr.upp[i] = child[k].mbr.upp[i];
		}
	}
#endif
#endif
}
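Note: with src1 = v_min and src2 = v_max, imm 0x12 selects v_max's low lane and v_min's high lane, which enlarges the box only if the Mbr struct stores upp before low. Under the {low[4], upp[4]} layout the SSE branch above suggests, the combining line would instead read (a sketch, assuming that layout):

			v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x30); /* {min of lows | max of upps} */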
Example #9
bool PaTriList2(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdvector &c = PaGetSimdVector(pa, 2, slot);
    simdscalar s;

    for (int i = 0; i < 4; ++i)
    {
        simdvector &v0 = tri[0];
        v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
        v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
        v0[i] = _mm256_permute_ps(v0[i], 0x6C);
        s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
        v0[i] = _simd_blend_ps(v0[i], s, 0x44);

        simdvector &v1 = tri[1];
        v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
        v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
        v1[i] = _mm256_permute_ps(v1[i], 0xB1);
        s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
        v1[i] = _simd_blend_ps(v1[i], s, 0x66);

        simdvector &v2 = tri[2];
        v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
        v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
        v2[i] = _mm256_permute_ps(v2[i], 0xC6);
        s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
        v2[i] = _simd_blend_ps(v2[i], s, 0x22);
    }

    SetNextPaState(pa, PaTriList0, PaTriListSingle0);
    pa.reset = true;
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
Example #10
void warmup(float *x, float *y, int size, float alpha)
{
    int i;

    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0);
    #pragma ivdep
    #pragma vector aligned
    for (i=0; i<size; i+=4)
    {
        __m256 t = _mm256_load_ps(x+2*i);
        __m256 l = _mm256_mul_ps(t, m); // premultiply
        __m256 r = _mm256_permute2f128_ps( l , l , 1); // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);
        __m128 s = _mm256_extractf128_ps (res, 0);
        _mm_store_ps(y+i,s); // store it
    }
}
Example #11
bool PaPoints0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);

    for (UINT i = 0; i < 4; ++i)
    {
        __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]);                // 0 0 1 1 4 4 5 5
        __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]);               // 2 2 3 3 6 6 7 7
        __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x20); // 0 0 1 1 2 2 3 3

        tri[0].v[i] = tri[1].v[i] = tri[2].v[i] = vCombined;
    }

    SetNextPaState(pa, PaPoints1, PaPointsSingle0, 1);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
Example #12
bool PaTriFan0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);

    // Extract vertex 0 to every lane of first vector
    for (int i = 0; i < 4; ++i)
    {
        __m256 a0 = a[i];
        simdvector &v0 = tri[0];
        v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0));
        v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00);
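        // 0x00 picks v0's low 128-bit lane for both halves, broadcasting it across the register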
    }

    // store off leading vertex for attributes
    pa.leadingVertex = pa.vout[pa.cur];

    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
    return false; // Not enough vertices to assemble 8 triangles.
}
Example #13
inline 
Mbr 
getMbrRTreeNode(RTreeNode *node)
{
	Mbr mbr;
	int k;
	
	mbr = node->mbrs[0];
	for (k = 1; k < node->nchilds; k++) {
#ifdef ENABLE_SSE_TEST1
                __m128 v_nlow = _mm_load_ps(mbr.low);
                __m128 v_nupp = _mm_load_ps(mbr.upp);
                __m128 v_clow = _mm_load_ps(node->mbrs[k].low);
                __m128 v_cupp = _mm_load_ps(node->mbrs[k].upp);
                _mm_store_ps(node->mbr.low, _mm_min_ps(v_nlow, v_clow));
                _mm_store_ps(node->mbr.upp, _mm_max_ps(v_nupp, v_cupp));
#else
#ifdef ENABLE_AVX_TEST1
                __m256 v_nmbr = _mm256_loadu_ps((float *)&mbr);
                __m256 v_cmbr = _mm256_loadu_ps((float *)&node->mbrs[k]);
                __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
                __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
                __m256 v_tmp;
                v_tmp = _mm256_permute2f128_ps(v_min, v_max, 0x12);
                _mm256_storeu_ps((float *)&mbr, v_tmp);
#else
		int i;
                for (i = 0; i < NDIMS; i++) {
                        if (mbr.low[i] > node->mbrs[k].low[i])
				mbr.low[i] = node->mbrs[k].low[i];
                        if (mbr.upp[i] < node->mbrs[k].upp[i])
                                mbr.upp[i] = node->mbrs[k].upp[i];
                }
#endif
#endif
	}
	return mbr;
}
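The same 0x12 selection as in Example #8 appears here (v_max's low lane, v_min's high lane), so the same caveat about the assumed field order of Mbr applies.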
Example #14
void
test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
}
Example #15
bool PaTriFan1(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    simdscalar s;

    // only need to fill vectors 1/2 with new verts
    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        simdvector &v2 = tri[2];
        s = _mm256_permute2f128_ps(a0, b0, 0x21);
        v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));

        simdvector &v1 = tri[1];
        v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
    }

    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
Example #16
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int 
		k, k_left;
	
	float 
		k_left_d;

	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};

	__m256
		mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	mask = _mm256_loadu_ps( mask_f ); 

	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();
	
	k=0;

	// corner
	if(tri==1)
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x00 );
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;

		}

	if(k<kna) // k_left can only be 1..7 here
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
	
		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;
		
		}
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	for(; k<kmax-7; k+=8)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		temp  = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_load_ps( &A[0+lda*1] );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_load_ps( &A[0+lda*2] );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_load_ps( &A[0+lda*3] );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	
	if(k<kmax) // k_left can only be 1..7 here
		{
		
		k_left = kmax-k;

		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		a_01  = _mm256_load_ps( &A[0+lda*1] );
		a_02  = _mm256_load_ps( &A[0+lda*2] );
		a_03  = _mm256_load_ps( &A[0+lda*3] );
		
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		
		}

	// reduction
	__m128
		z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);
	
	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	
	}
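The reduction above collapses four 8-wide accumulators into four scalars: two hadd levels leave each accumulator's low-half and high-half partial sums in opposite 128-bit lanes, and the permute2f128 with 0x01 swaps the lanes so one final add finishes all four sums at once. A standalone sketch with hypothetical values:

__m256 acc0 = _mm256_set1_ps(1.0f); /* sums to  8 */
__m256 acc1 = _mm256_set1_ps(2.0f); /* sums to 16 */
__m256 acc2 = _mm256_set1_ps(3.0f); /* sums to 24 */
__m256 acc3 = _mm256_set1_ps(4.0f); /* sums to 32 */
__m256 t    = _mm256_hadd_ps(_mm256_hadd_ps(acc0, acc1),
                             _mm256_hadd_ps(acc2, acc3));
__m256 swap = _mm256_permute2f128_ps(t, t, 0x01);
__m128 sums = _mm256_castps256_ps128(_mm256_add_ps(t, swap)); /* {8, 16, 24, 32} */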
Example #17
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 8;
/*	const int bs  = 8;*/
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	int
		k;
	
	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*4] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_4 = _mm256_add_ps( y_4, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*5] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_5 = _mm256_add_ps( y_5, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*6] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_6 = _mm256_add_ps( y_6, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*7] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_7 = _mm256_add_ps( y_7, ax_temp );

		A += sda*lda;
		x += lda;

		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	a_00 = _mm256_load_ps( &A[0+lda*4] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_4 = _mm256_add_ps( y_4, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*5] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_5 = _mm256_add_ps( y_5, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*6] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_6 = _mm256_add_ps( y_6, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*7] );
	/* no blend needed: a 0xff mask would keep the whole column anyway */
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_7 = _mm256_add_ps( y_7, ax_temp );

	// reduction
	__m256
		z_0;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);
	y_4 = _mm256_hadd_ps(y_4, y_5);
	y_6 = _mm256_hadd_ps(y_6, y_7);

	y_0 = _mm256_hadd_ps(y_0, y_2);
	y_4 = _mm256_hadd_ps(y_4, y_6);

	y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
	y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);
	
	y_0 = _mm256_add_ps(y_1, y_2);

	// store
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_add_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_sub_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}

	}
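The reduction here has eight accumulators instead of four, so after the two hadd levels the kernel uses the 0x20/0x31 pair (the same lane interleave as in Example #1) to gather the low lanes into y_1 and the high lanes into y_2; their sum leaves all eight dot products in y_0.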
Example #18
inline __m256 _mm256_broadcast_lo_ss(__m256 a) {
	__m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(0, 0, 0, 0));
	return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
Example #19
inline  __m256 _mm256_broadcast_3_ss(__m256 a) {
	__m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3));
	return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
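Both broadcast helpers follow one pattern: the in-lane permute replicates the chosen element within each 128-bit half, then blending with the lane-swapped copy (imm 1, mask 0xF0) carries the low half's value into the high half. A hypothetical usage sketch:

__m256 v = _mm256_setr_ps(42, 1, 2, 3, 4, 5, 6, 7);
__m256 r = _mm256_broadcast_lo_ss(v); /* all eight elements are 42 */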
Example #20
    log++;
  }
  nl = (1 << log);
  *newLength = nl;
  float *ret = mallocf(nl + additionalLength);
  memcpy(ret, ptr, length * sizeof(float));
  memsetf(ret + length, 0.f, nl - length);
  return ret;
}

float *rmemcpyf(float *__restrict dest,
                const float *__restrict src, size_t length) {
#ifdef __AVX__
  for (int i = 0; i < (int)length - 7; i += 8) {
    __m256 vec = _mm256_loadu_ps(src + i);
    vec = _mm256_permute2f128_ps(vec, vec, 1);
    vec = _mm256_permute_ps(vec, 0x1B);
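    /* lane swap (imm 1) + in-lane 0x1B reverse = full 8-float reverse */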
    _mm256_storeu_ps(dest + length - i - 8, vec);
  }

  for (size_t i = (length & ~0x7); i < length; i++) {
    dest[length - i - 1] = src[i];
  }
#elif defined(__ARM_NEON__)
  for (int i = 0; i < (int)length - 3; i += 4) {
    float32x4_t vec = vld1q_f32(src + i);
    vec = vrev64q_f32(vec);
    vec = vcombine_f32(vget_high_f32(vec), vget_low_f32(vec));
    vst1q_f32(dest + length - i - 4, vec);
  }
Example #21
M_ALWAYS_INLINE
static void bar(float (& input)[8]) 
{
    
    /*
	static constexpr uint_fast8_t idx[][2] = {
		{0, 1}, {3, 2}, {4, 5}, {7, 6}, // (1)
		{0, 2}, {1, 3}, {6, 4}, {7, 5}, // (2)
		{0, 1}, {2, 3}, {5, 4}, {7, 6}, // (3)
		{0, 4}, {1, 5}, {2, 6}, {3, 7}, // (4)
		{0, 2}, {1, 3}, {4, 6}, {5, 7}, // (5)
		{0, 1}, {2, 3}, {4, 5}, {6, 7} // (6)
	};
    */
    // The indices have to be put into a form more convenient for the
    // AVX instructions. A loop that then gets unrolled makes no sense here,
    // because case (4) is special... better to write it out by hand.
    
    static constexpr int blend_mask_1 = 0b10011001;
    static constexpr int blend_mask_2 = 0b11000011;
    static constexpr int blend_mask_3 = 0b10100101;
    static constexpr int blend_mask_4 = 0b00001111;
    static constexpr int blend_mask_5 = 0b00110011;
    static constexpr int blend_mask_6 = 0b01010101;
    
    // Covers (1), (3) and (6)
    static constexpr int permute_mask_1 = 0b10110001;
    
    // Covers (2) and (5)
    static constexpr int permute_mask_2 = 0b01001110;
    
    
    __m256 result = _mm256_load_ps(input);
    
    // (1)
    __m256 mapped = _mm256_permute_ps(result, permute_mask_1);
    __m256 min = _mm256_min_ps(result, mapped);
    __m256 max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_1);
    
    // (2)
    mapped = _mm256_permute_ps(result, permute_mask_2);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_2);
    
    // (3)
    mapped = _mm256_permute_ps(result, permute_mask_1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_3);
    
    // (4) The special case: here we have to permute across the two
    // halves of the YMM register.
    mapped = _mm256_permute2f128_ps(result, result, 1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_4);
    
    // (5)
    mapped = _mm256_permute_ps(result, permute_mask_2);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_5);
    
    // (6)
    mapped = _mm256_permute_ps(result, permute_mask_1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_6);
    
    _mm256_store_ps(input, result);
}
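The blend/min/max ladder above is an eight-element bitonic sorting network; stage (4) is the only exchange that crosses the 128-bit boundary, hence the single permute2f128. A hypothetical usage sketch (the load/store require 32-byte alignment):

alignas(32) float v[8] = {5.0f, 1.0f, 7.0f, 3.0f, 8.0f, 2.0f, 6.0f, 4.0f};
bar(v); /* v: 1 2 3 4 5 6 7 8 (ascending) */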
Example #22
 static BOOST_FORCEINLINE T perm2_ ( __m256 const& a0, __m256 const& a1, Mask const&)
 {
   return _mm256_permute2f128_ps(a0, a1, Mask::value);
 }
Example #23
template<size_t i0, size_t i1>
INLINE avxb shuffle(const avxb& a, const avxb& b) {
  return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
}
Example #24
__m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  return _mm256_permute2f128_ps(a, b, 0x13);
}