Example #1
File: main.cpp Project: sclc/DPP
//-----------------------------------------------------------------
// AOS -> SOA
//
//    pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
// ->
//    pBlu:  b0, b1, b2, b3, b4, ...
//    pGrn:  g0, g1, g2, g3, g4, ...
//    pRed:  r0, r1, r2, r3, r4, ...
void
aos2soa(float *pBgr, float *pBlu, float *pGrn, float *pRed, const size_t length)
{
    __m128 *bgr = (__m128 *)pBgr;
    float *b = pBlu;
    float *g = pGrn;
    float *r = pRed;

    for (size_t i = 0; i < length; i += 24, b += 8, g += 8, r += 8)
    {
        __m256 m03 = _mm256_castps128_ps256(*bgr++);  // load the lower halves
        __m256 m14 = _mm256_castps128_ps256(*bgr++);
        __m256 m25 = _mm256_castps128_ps256(*bgr++);
        m03 = _mm256_insertf128_ps(m03, *bgr++, 1);   // load the upper halves
        m14 = _mm256_insertf128_ps(m14, *bgr++, 1);
        m25 = _mm256_insertf128_ps(m25, *bgr++, 1);

        __m256 bg = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2, 1, 3, 2)); // upper parts of b and g
        __m256 gr = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1, 0, 2, 1)); // lower parts of g and r
        __m256 bb = _mm256_shuffle_ps(m03, bg,  _MM_SHUFFLE(2, 0, 3, 0));
        __m256 gg = _mm256_shuffle_ps(gr, bg,   _MM_SHUFFLE(3, 1, 2, 0));
        __m256 rr = _mm256_shuffle_ps(gr, m25,  _MM_SHUFFLE(3, 0, 3, 1));

        _mm256_store_ps(b, bb);
        _mm256_store_ps(g, gg);
        _mm256_store_ps(r, rr);
    }
}
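
A minimal call sketch (assumed setup, not part of the original project): the stores above use _mm256_store_ps, so the three output planes must be 32-byte aligned, and length must be a multiple of 24 floats (8 BGR pixels per iteration).

#include <immintrin.h>

alignas(32) static float bgr[24 * 100];   // 800 interleaved b,g,r pixels
alignas(32) static float blu[8 * 100];
alignas(32) static float grn[8 * 100];
alignas(32) static float red[8 * 100];

void demo()
{
    aos2soa(bgr, blu, grn, red, 24 * 100);
}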
Example #2
void DoubleToComplex(double *srcI, double *srcQ, Complex *dst, const unsigned int len)
{
    __m256d avxR_D, avxI_D, avxX_D, avxY_D, avxA_D, avxB_D;
    __m128 avxA, avxB;
#if 1
    __m256 avxD;
#endif
    for (unsigned int i=0; i+4<=len; i+=4) {
        avxR_D = _mm256_loadu_pd(srcI + i);
        avxI_D = _mm256_loadu_pd(srcQ + i);
        avxX_D = _mm256_unpacklo_pd(avxR_D, avxI_D); //swizzle
        avxY_D = _mm256_unpackhi_pd(avxR_D, avxI_D);
        avxA_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x20);
        avxB_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x31);
        avxA = _mm256_cvtpd_ps(avxA_D); //double to float
        avxB = _mm256_cvtpd_ps(avxB_D);
#if 0
        avxD = _mm256_castps128_ps256(avxA); 
        avxD = _mm256_insertf128_ps(avxD, avxB, 1);
        _mm256_storeu_ps((float*)(dst+i), avxD);
#else
        _mm_maskstore_ps((float*)(dst+i), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxA);
        _mm_maskstore_ps((float*)(dst+i+2), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxB);
#endif
    }

    for (unsigned int i=len-(len & 0x03); i<len; ++i) {
        dst[i].m_real = static_cast<float>(srcI[i]);
        dst[i].m_imag = static_cast<float>(srcQ[i]);
    }
}
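
SET_1 and Complex are not defined in the snippet. _mm_maskstore_ps only inspects the sign bit of each 32-bit mask lane, so for the maskstore path to write all four lanes SET_1 must expand to a value with bit 31 set; a minimal set of assumed definitions:

#define SET_1 (-1)   // sign bit set => the lane is stored

struct Complex {
    float m_real;
    float m_imag;
};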
Example #3
void PaLineStrip0Common(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];

        // index 0
        simdvector &v0 = tri[0];

        // 01234 -> 01122334
        __m128 vLow = _mm256_extractf128_ps(a0, 0);
        __m128 vHigh = _mm256_extractf128_ps(a0, 1);

        // 0123 -> 0112
        // 0123 -> 233x
        __m128 vOutLow = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(2, 1, 1, 0));
        __m128 vOutHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 3, 2));

        float f;
        _MM_EXTRACT_FLOAT(f, vHigh, 0);
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0);

        v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0);
        v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1);

        // index 1 is same as index 0, but position needs to be adjusted to bloat the line
        // into 2 tris 1 pixel wide
        simdvector &v1 = tri[1];
        v1[i] = v0[i];

        // index 2
        // 01234 -> 10213243
        simdvector &v2 = tri[2];

        // 0123 -> 1021
        // 0123 -> 32x3
        vOutLow = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(1, 2, 0, 1));
        vOutHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 2, 3));
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0);

        v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0);
        v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1);
    }
}
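
A standalone sketch (my own harness, not SWR code) of the low-lane swizzle used above, turning the first five vertices 0 1 2 3 4 into the line-strip pairs 0 1 1 2 2 3 3 4:

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m128 vLow  = _mm_setr_ps(0, 1, 2, 3);
    __m128 vHigh = _mm_setr_ps(4, 5, 6, 7);

    __m128 outLow  = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(2, 1, 1, 0)); // 0 1 1 2
    __m128 outHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 3, 2)); // 2 3 3 3

    float f;
    _MM_EXTRACT_FLOAT(f, vHigh, 0);                         // f = 4
    outHigh = _mm_insert_ps(outHigh, _mm_set1_ps(f), 0xf0); // 2 3 3 4

    float r[8];
    _mm_storeu_ps(r, outLow);
    _mm_storeu_ps(r + 4, outHigh);
    for (int i = 0; i < 8; ++i) printf("%g ", r[i]);        // 0 1 1 2 2 3 3 4
    return 0;
}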
Example #4
void
test1bit (void)
{
  d1 = _mm256_extractf128_pd (e2, k4);	  /* { dg-error "the last argument must be a 1-bit immediate" } */
  a1 = _mm256_extractf128_ps (b2, k4);	  /* { dg-error "the last argument must be a 1-bit immediate" } */
  i1 = _mm256_extractf128_si256 (l2, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  e1 = _mm256_insertf128_pd (e2, d1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  b1 = _mm256_insertf128_ps (b2, a1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  l1 = _mm256_insertf128_si256 (l2, i1, k4);/* { dg-error "the last argument must be a 1-bit immediate" } */
}
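
The lane index of these intrinsics is encoded in the instruction's immediate byte, so it must be a compile-time constant 0 or 1; the runtime variable k4 is what triggers each expected diagnostic. Correct usage, for contrast (a minimal sketch reusing the same test variables):

  a1 = _mm256_extractf128_ps (b2, 1);     /* literal 0 or 1 is required */
  b1 = _mm256_insertf128_ps (b2, a1, 0);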
Example #5
void PaLineStrip1Common(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        // index 0
        simdvector &v0 = tri[0];

        // 45670 -> 45566770
        __m128 vPrevHigh = _mm256_extractf128_ps(a0, 1);
        __m128 vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(2, 1, 1, 0));
        __m128 vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 3, 2));

        __m128 vCurLow = _mm256_extractf128_ps(b0, 0);
        float f;
        _MM_EXTRACT_FLOAT(f, vCurLow, 0);
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0);

        v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0);
        v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1);

        // index 1
        // 45670 -> 45566770
        // index 1 same as index 0
        simdvector &v1 = tri[1];
        v1[i] = v0[i];

        // index 2
        // 45670 -> 54657607
        simdvector &v2 = tri[2];
        vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(1, 2, 0, 1));
        vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 2, 3));
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0);

        v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0);
        v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1);
    }
}
Example #6
__m256 exp_256(
  const __m256& x) {

  //! Clip the value
  __m256 y = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(88.3762626647949f)),
                           _mm256_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m256 fx = y * _mm256_set1_ps(1.44269504088896341) + _mm256_set1_ps(0.5f);

  //! Floor
  const __m256 tmp = _mm256_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If greater, subtract 1
  const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS),
                                    _mm256_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm256_set1_ps(0.693359375 - 2.12194440e-4);
  const __m256 z = y * y;


  const __m256 t = (((((_mm256_set1_ps(1.9875691500E-4)  * y +
                        _mm256_set1_ps(1.3981999507E-3)) * y +
                        _mm256_set1_ps(8.3334519073E-3)) * y +
                        _mm256_set1_ps(4.1665795894E-2)) * y +
                        _mm256_set1_ps(1.6666665459E-1)) * y +
                        _mm256_set1_ps(5.0000001201E-1)) * z + y +
                        _mm256_set1_ps(1.f);

  //! Build 2^n (split into two SSE halves, since the AVX2 integer
  //! equivalents aren't available).
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_castps256_ps128(fx)), _mm_set1_epi32(0x7f));
  const __m128i emm1 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_extractf128_ps(fx, 1)), _mm_set1_epi32(0x7f));

  fx = _mm256_castps128_ps256(_mm_castsi128_ps(_mm_slli_epi32(emm0, 23)));
  fx = _mm256_insertf128_ps(fx, _mm_castsi128_ps(_mm_slli_epi32(emm1, 23)), 1);

  //! Return the result
  return t * fx;
}
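
Note that exp_256 applies +, -, and * directly to __m256 values, which relies on the GCC/Clang vector extensions. A quick lane-by-lane check against libm (my own harness, assumed names):

#include <immintrin.h>
#include <cmath>
#include <cstdio>

int main()
{
    __m256 x = _mm256_setr_ps(-10.f, -1.f, -0.1f, 0.f, 0.1f, 1.f, 10.f, 80.f);
    float in[8], out[8];
    _mm256_storeu_ps(in, x);
    _mm256_storeu_ps(out, exp_256(x));
    for (int i = 0; i < 8; ++i)
        printf("exp(%g): simd=%g  libm=%g\n", in[i], out[i], std::exp(in[i]));
    return 0;
}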
Example #7
// i selects the 128-bit lane, so it must be a compile-time constant
template <int i>
INLINE avxb insert (const avxb& a, const sseb& b) {
  return _mm256_insertf128_ps (a, b, i);
}
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m256d v_tmp;
    __m256d v00, v01, v02, v03;
    __m256d v10, v11, v12, v13;
    __m256d v20, v21, v22, v23;
    __m256d v30, v31, v32, v33;
    __m256i v_cur_roots;
    __m256 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // keep a signed copy: subtractions involving n can go negative

    int idx1, idx1_root;
    int idx2;
    int idx3, idx3_root;
    int pad_root, pad, pad_r;
    
    idx1      = ((int) mem->e_sz) - 1;
    idx1_root = ((int) mem->r_sz);
    // the convention is that in iteration i, idx1 points to the first element of line i+1
    e[idx1++] = q[n];
    
    // pad contains the padding for row i+1
    // for row n it's always 3
    pad = 3;
    pad_root = 7;
    for (i = n-1; i >= 0; --i) {
        idx1      -= 2*(n-i)+1 + pad;
        idx1_root -= 2*(n-i)+1 + pad_root;
        idx2       = idx1 + 1;
        e[idx1]    = q[i];
        w[idx1]    = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx2     += pad; // padding of line i+1
        // idx2 now points to the first element of the next line

        idx3      = idx1;
        idx3_root = idx1_root;
        pad_r     = pad;
        for (r = i; r < n; ++r) {
            pad_r     = (pad_r+1)&3; // padding of line r+1
            idx1      = idx3;
            idx1_root = idx3_root;
            l_end     = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp     = e[idx1++];
            idx1_root++;
            // calculate until a multiple of 16 doubles is left
            // 16 = 4 x 256-bit vectors of 4 doubles (the loop below consumes 16 per pass)
            l_end_pre = idx2 + ((n-r)&15);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1_root] = r;
                }
                idx1++;
                idx1_root++;
            }
            
            v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp );
            // process the bulk in 4 vectors of 4 doubles each
            v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r);
            for( ; idx2 < l_end; idx2 += 16 ) {
                v01 = _mm256_load_pd( &w[idx1   ] );
                v11 = _mm256_load_pd( &w[idx1+ 4] );
                v21 = _mm256_load_pd( &w[idx1+ 8] );
                v31 = _mm256_load_pd( &w[idx1+12] );

                v00 = _mm256_load_pd( &e[idx2   ] );
                v01 = _mm256_add_pd( v01, v_tmp ); 
                v10 = _mm256_load_pd( &e[idx2+ 4] );
                v11 = _mm256_add_pd( v11, v_tmp );
                v20 = _mm256_load_pd( &e[idx2+ 8] );
                v21 = _mm256_add_pd( v21, v_tmp );
                v30 = _mm256_load_pd( &e[idx2+12] );
                v31 = _mm256_add_pd( v31, v_tmp );

                v01 = _mm256_add_pd( v01, v00 );
                v03 = _mm256_load_pd( &e[idx1   ] );
                v11 = _mm256_add_pd( v11, v10 );
                v13 = _mm256_load_pd( &e[idx1+ 4] );
                v21 = _mm256_add_pd( v21, v20 );
                v23 = _mm256_load_pd( &e[idx1+ 8] );
                v31 = _mm256_add_pd( v31, v30 );
                v33 = _mm256_load_pd( &e[idx1+12] );

                v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ );
                v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ );
                v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ );
                v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ );

                _mm256_maskstore_pd( &e[idx1   ],
                        _mm256_castpd_si256( v02 ), v01 );
                _mm256_maskstore_pd( &e[idx1+ 4],
                        _mm256_castpd_si256( v12 ), v11 );

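                // v02/v12 are all-ones where the compare succeeded; as a double
                // that bit pattern is a NaN with the sign bit set, _mm256_cvtpd_ps
                // keeps the sign bit, and _mm256_maskstore_ps below only examines
                // the sign bit of each 32-bit lane.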
                v_rootmask0 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v02)),
                            _mm256_cvtpd_ps(v12) , 1
                    );

                _mm256_maskstore_pd( &e[idx1+ 8],
                        _mm256_castpd_si256( v22 ), v21 );
                _mm256_maskstore_pd( &e[idx1+12], 
                        _mm256_castpd_si256( v32 ), v31 );
                v_rootmask1 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v22)),
                            _mm256_cvtpd_ps(v32) , 1
                    );
                
                _mm256_maskstore_ps( (float*)&root[idx1_root    ],
                        _mm256_castps_si256( v_rootmask0 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                _mm256_maskstore_ps( (float*)&root[idx1_root + 8],
                        _mm256_castps_si256( v_rootmask1 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                idx1      += 16;
                idx1_root += 16;
            }
            idx2 += pad_r;
            idx3++;
            idx3_root++;
        }
        pad      = (pad     -1)&3;
        pad_root = (pad_root-1)&7;
    }
    // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding
    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[ ((n/4)+1)*4 - 1 ];
}
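
The snippet leans on a segments_t helper struct that is not shown; a plausible layout, inferred only from the accesses above (mem->e, mem->w, mem->r, mem->e_sz, mem->r_sz):

typedef struct {
    double* e;     // cost table, rows padded to 4-double boundaries
    double* w;     // weight table, same layout as e
    int*    r;     // root table, rows padded to 8-int boundaries
    size_t  n;
    size_t  e_sz;  // number of doubles allocated for e and w
    size_t  r_sz;  // number of ints allocated for r
} segments_t;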
Example #9
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float *s;
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 v1, v2, v3, v4;
  __m256 va, vb, vc, vd;
  __m128 va4, vb4, vc4, vd4;
  __m128 v128a, v128b;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float va4[4], vb4[4], vc4[4], vd4[4];
  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
#endif

#if defined(__AVX2__) && defined(__x86_64__)

// ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====

  va = _mm256_broadcast_ss(abcd);   // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);

  s = src;                          // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 0
  v1 = _mm256_castps128_ps256(v128a);  // low half; avoids reading uninitialized v1
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 1
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 0
  v2 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 1
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 0
  v3 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 1
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 0
  v4 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 1
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low)
  vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high)

  s = src + 2*ni;                   // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 2
  v1 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 3
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 2
  v2 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 3
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 2
  v3 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 3
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 2
  v4 = _mm256_castps128_ps256(v128a);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 3
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2  (v1 low)
  vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3  (v1 high)

// ==== interpolation along Y, vector length is 4 (4 rows) ====

  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

  va4 = _mm_broadcast_ss(&y0);      // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);        //    vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);  // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);  // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);  // += vy3 * vd4
  
  _mm_storeu_ps(dst,vy0);           // store 4 values along X
#else
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);
  for (i=0 ; i<4 ; i++){
    va4[i] = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] +  src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];
    vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] +  src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];
    vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] +  src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];
    vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] +  src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];
    dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3;
  }
#endif

// ==== interpolation along X, scalar ====

  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
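
The function references several file-scope names that the snippet does not define. Assumed context (ni and ninj are the row and plane strides of src; the c* values are the standard cubic Lagrange coefficients used in the y0..y3 and x0..x3 formulas):

static int ni, ninj;
static const float one = 1.0f, two = 2.0f;
static const float cm167 = -1.0f/6.0f, cp167 = 1.0f/6.0f;
static const float cm5   = -0.5f,      cp5   = 0.5f;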
Example #10
__m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
  // CHECK-LABEL: @test_mm256_insertf128_ps_1
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  return _mm256_insertf128_ps(a, b, 1);
}
Example #11
__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
  // CHECK-LABEL: @test_mm256_insertf128_ps_0
  // CHECK: shufflevector{{.*}}<i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  return _mm256_insertf128_ps(a, b, 0);
}
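
A scalar model of the semantics both CHECK lines assert (my own sketch): the shufflevector indices select from the concatenation of a (lanes 0-7) and the widened b (lanes 8-11), i.e. b replaces one 128-bit half of a.

void insertf128_ref(const float a[8], const float b[4], int lane, float out[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = a[i];            // start from a (lanes 0..7)
    for (int i = 0; i < 4; ++i)
        out[lane * 4 + i] = b[i]; // b overwrites the selected half (lanes 8..11)
}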
Example #12
 NT2_FUNCTOR_CALL_EVAL_IF(1,      float)
 {
   typedef typename meta::scalar_of<A0>::type sctype;
   typedef typename simd::native<sctype, tag::sse_ >  svtype;
   std::cout << " == a0 " << a0 << std::endl;
   svtype a011;
   a011=  _mm256_extractf128_ps(a0, 1);
   svtype a000;
   a000 =  _mm256_extractf128_ps(a0, 0);			
   std::cout << " == a000 " << a000 << std::endl;
   std::cout << " == a011 " << a011 << std::endl;
   svtype a00 =  cumsum(a000);
   svtype a01 =  cumsum(a011); 
   svtype z = splat<svtype>(a00[meta::cardinal_of<svtype>::value-1]);
   std::cout << " == a00 " << a00 << std::endl;
   std::cout << " == a01 " << a01 << std::endl;
   std::cout << " == z   " << z   << std::endl; 
   A0 that = {_mm256_castps128_ps256(a00)};
   that = _mm256_insertf128_ps(that, a01+z, 1);
   return that;
 }
 NT2_FUNCTOR_CALL_EVAL_IF(1,      double)
 {
   typedef typename meta::scalar_of<A0>::type sctype;		
   typedef typename simd::native<sctype, tag::sse_ >  svtype;
   svtype a000 = { _mm256_extractf128_pd(a0, 0)};			
   svtype a011 = { _mm256_extractf128_pd(a0, 1)};
   svtype a00 =  cumsum(a000);
   svtype a01 =  cumsum(a011); 
   svtype z = splat<svtype>(a00[meta::cardinal_of<svtype>::value-1]);
   A0 that = simd::native_cast<A0>(_mm256_castpd128_pd256(a00));
   that = simd::native_cast<A0>(_mm256_insertf128_pd(that, a01+z, 1));
   return that;
 }