Example #1
void PaLineStrip1Common(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
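    // 'a' holds the previous SIMD batch of vertices and 'b' the current one;
    // each simdvector stores one component (x/y/z/w) per simdscalar, so the
    // loop below applies the same lane shuffles to all four components.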

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        // index 0
        simdvector &v0 = tri[0];

        // 45670 -> 45566770
        __m128 vPrevHigh = _mm256_extractf128_ps(a0, 1);
        __m128 vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(2, 1, 1, 0));
        __m128 vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 3, 2));

        __m128 vCurLow = _mm256_extractf128_ps(b0, 0);
        float f;
        _MM_EXTRACT_FLOAT(f, vCurLow, 0);
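        // _MM_EXTRACT_FLOAT stores element 0 of vCurLow into f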
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0);

        v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0);
        v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1);

        // index 1
        // 45670 -> 45566770
        // index 1 same as index 0
        simdvector &v1 = tri[1];
        v1[i] = v0[i];

        // index 2
        // 45670 -> 54657607
        simdvector &v2 = tri[2];
        vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(1, 2, 0, 1));
        vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 2, 3));
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0);

        v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0);
        v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1);
    }
}
Example #2
File: testimm-3.c Project: 0day-ci/gcc
void
test1bit (void)
{
  d1 = _mm256_extractf128_pd (e2, k4);	  /* { dg-error "the last argument must be a 1-bit immediate" } */
  a1 = _mm256_extractf128_ps (b2, k4);	  /* { dg-error "the last argument must be a 1-bit immediate" } */
  i1 = _mm256_extractf128_si256 (l2, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  e1 = _mm256_insertf128_pd (e2, d1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  b1 = _mm256_insertf128_ps (b2, a1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  l1 = _mm256_insertf128_si256 (l2, i1, k4);/* { dg-error "the last argument must be a 1-bit immediate" } */
}
Example #3
// Thanks, Stack Overflow.
static inline float _mm256_reduce_add_ps(__m256 x) {
    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const int imm = 1;
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, imm), _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    /* Conversion to float is a no-op on x86-64 */
    return _mm_cvtss_f32(x32);
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
static inline float horizontal_sum_avx2(__m256 x)
{
  const __m128 hi_quad = _mm256_extractf128_ps(x, 1);
  const __m128 lo_quad = _mm256_castps256_ps128(x);
  const __m128 sum_quad = _mm_add_ps(lo_quad, hi_quad);
  const __m128 lo_dual = sum_quad;
  const __m128 hi_dual = _mm_movehl_ps(sum_quad, sum_quad);
  const __m128 sum_dual = _mm_add_ps(lo_dual, hi_dual);
  const __m128 lo = sum_dual;
  const __m128 hi = _mm_shuffle_ps(sum_dual, sum_dual, 0x1);
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
}
Example #5
void warmup(float *x, float *y, int size, float alpha)
{
    int i;

    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0);
    #pragma ivdep
    #pragma vector aligned
    for (i=0; i<size; i+=4)
    {
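        // each iteration consumes 8 floats from x and produces 4 results in y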
        __m256 t = _mm256_load_ps(x+2*i);
        __m256 l = _mm256_mul_ps(t, m); // premultiply
        __m256 r = _mm256_permute2f128_ps( l , l , 1); // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);
        __m128 s = _mm256_extractf128_ps (res, 0);
        _mm_store_ps(y+i,s); // store it
    }
}
Example #6
File: LibSSE.cpp Project: cpalmann/s2p
__m256 exp_256(
  const __m256& x) {
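  //! NOTE: the infix arithmetic on __m256 values below relies on GCC/Clang
  //! vector extensions; with MSVC, the _mm256_mul_ps / _mm256_add_ps /
  //! _mm256_sub_ps intrinsics would be needed instead.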

  //! Clip the value
  __m256 y = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(88.3762626647949f)),
                                            _mm256_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m256 fx = y * _mm256_set1_ps(1.44269504088896341) + _mm256_set1_ps(0.5f);

  //! Floor
  const __m256 tmp = _mm256_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If greater, subtract 1
  const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS),
                                    _mm256_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm256_set1_ps(0.693359375 - 2.12194440e-4);
  const __m256 z = y * y;


  const __m256 t = (((((_mm256_set1_ps(1.9875691500E-4)  * y +
                        _mm256_set1_ps(1.3981999507E-3)) * y +
                        _mm256_set1_ps(8.3334519073E-3)) * y +
                        _mm256_set1_ps(4.1665795894E-2)) * y +
                        _mm256_set1_ps(1.6666665459E-1)) * y +
                        _mm256_set1_ps(5.0000001201E-1)) * z + y +
                        _mm256_set1_ps(1.f);

  //! Build 2^n (split into two SSE integer halves, since the AVX2
  //! integer equivalents aren't assumed to be available).
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_castps256_ps128(fx)), _mm_set1_epi32(0x7f));
  const __m128i emm1 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_extractf128_ps(fx, 1)), _mm_set1_epi32(0x7f));

  fx = _mm256_castps128_ps256(_mm_castsi128_ps(_mm_slli_epi32(emm0, 23)));
  fx = _mm256_insertf128_ps(fx, _mm_castsi128_ps(_mm_slli_epi32(emm1, 23)), 1);

  //! Return the result
  return t * fx;
}
Example #7
float dot_product(const int N, const float *X, const int incX, const float *Y,
                  const int incY) {
  __m256 accum = _mm256_setzero_ps();
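  // NOTE: _mm256_load_ps reads 8 contiguous, 32-byte-aligned floats, so this
  // kernel is only correct for incX == incY == 1 and suitably aligned X and Y.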
  for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
    __m256 xval = _mm256_load_ps(X);
    __m256 yval = _mm256_load_ps(Y);
    __m256 val = _mm256_mul_ps(xval, yval);
    accum = _mm256_add_ps(val, accum);
  }
  // Reduce the values in accum into the smallest 32-bit subsection
  // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
  __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
      _mm256_extractf128_ps(accum, 1));
  // b0 b1 b2 b3 -> c0 c1 b2 b3
  accum2 = _mm_add_ps(accum2,
      _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
  __m128 final_val = _mm_add_ss(
      _mm_insert_ps(accum2, accum2, 0x4e), accum2);
  // the lowest lane of final_val now holds the full dot product
  return _mm_cvtss_f32(final_val);
}
Example #8
File: csr.cpp Project: dsheffie/Toys
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
inline float sum8(__m256 x) 
{
  // hiQuad = ( x7, x6, x5, x4 )
  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
  // loQuad = ( x3, x2, x1, x0 )
  const __m128 loQuad = _mm256_castps256_ps128(x);
  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
  // loDual = ( -, -, x1 + x5, x0 + x4 )
  const __m128 loDual = sumQuad;
  // hiDual = ( -, -, x3 + x7, x2 + x6 )
  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
  const __m128 lo = sumDual;
  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
}
Example #9
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float *s;
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
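  // ni, ninj, one, two and the cm167/cp5/cm5/cp167 coefficients are
  // file-scope constants defined elsewhere in the original source.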
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 v1, v2, v3, v4;
  __m256 va, vb, vc, vd;
  __m128 va4, vb4, vc4, vd4;
  __m128 v128a, v128b;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float va4[4], vb4[4], vc4[4], vd4[4];
  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
#endif

#if defined(__AVX2__) && defined(__x86_64__)

// ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====

  va = _mm256_broadcast_ss(abcd);   // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);
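
  // NOTE: v1..v4 below are built half-by-half with insertf128; the first insert
  // into each reads an uninitialized __m256 (formally indeterminate); a
  // _mm256_castps128_ps256 of the low half would avoid that.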

  s = src;                          // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 0
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 1
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 0
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 1
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 0
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 1
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 0
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 1
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low)
  vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high)

  s = src + 2*ni;                   // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 2
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 3
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 2
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 3
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 2
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 3
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 2
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 3
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2  (v1 low)
  vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3  (v1 high)

// ==== interpolation along Y, vector length is 4 (4 rows) ====

  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

  va4 = _mm_broadcast_ss(&y0);      // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);        //    vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);  // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);  // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);  // += vy3 * vd4
  
  _mm_storeu_ps(dst,vy0);           // store 4 values along X
#else
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);
  for (i=0 ; i<4 ; i++){
    va4[i] = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] +  src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];
    vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] +  src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];
    vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] +  src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];
    vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] +  src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];
    dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3;
  }
#endif

// ==== interpolation along x, scalar ====

  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
Example #10
void PaTriStripSingle0(PA_STATE &pa, UINT slot, UINT triIndex, __m128 triverts[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    simdscalar tmp0;
    simdscalar tmp1;

    // Convert from vertical to horizontal.
    switch (triIndex)
    {
    case 0:
        // Grab vertex 0 from lane 0 and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 1 and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 2 and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        break;
    case 1:
        // Grab vertex 2 from lane 2 and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 1 and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 0 from lane 3 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
        break;
    case 2:
        // Grab vertex 2 from lane 2 and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 0 from lane 3 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 4 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
        break;
    case 3:
        // Grab vertex 1 from lane 4 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 0 from lane 3 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
        break;

    case 4:
        // Grab vertex 1 from lane 4 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 0 from lane 6 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
        break;
    case 5:
        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        break;
    case 6:
        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 0 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
        break;
    case 7:
        // Grab vertex 2 from lane 0 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 0 from lane 1 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
        break;
    };
}
Example #11
__m128 test_mm256_extractf128_ps_0(__m256 a) {
  // CHECK-LABEL: @test_mm256_extractf128_ps_0
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
  return _mm256_extractf128_ps(a, 0);
}
Example #12
template<size_t i>
INLINE sseb extract(const avxb& a) {
  return _mm256_extractf128_ps(a,i);
}
Example #13
void neuralNet::feedForward_layer(layerIterator_t nLayer)
{
	constFloatIterator_t pActivations, cWeight, endWeight;
	__m256 vTotal, vSub0, vSub1;
	__m256 *vWeight, *vAct, *vEndWeight;

	// summate each neuron's contribution
	for (neuronIterator_t cNeuron = nLayer->begin(), end = nLayer->end(); 
		cNeuron != end; 
		++cNeuron)
	{
		// foreach [previous neuron, current weight], up to endWeight
		pActivations = activations.begin() + (nLayer - 1)->front().iNeuronIndex;
		cWeight = cNeuron->weightsBegin(*this);
		endWeight = cNeuron->weightsEnd(*this);

		// (first 15 neurons) (TODO: redesign preamble and remove assertions for multiple of 16 size widths in neuralNet.h!)

		// summate all neurons of previous layer: (remaining batches of 8 neurons)
		vWeight = (__m256*)&cWeight[0];
		vAct = (__m256*)&pActivations[0];

		vEndWeight = (__m256*)&endWeight[0];

		// initialize the activation of this neuron to its bias weight. The bias weight's neuron is always on:
		vTotal = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, *endWeight); // can this be made with an aligned load?

		do // Take advantage of SIMD instructions by doing 16 multiplies per iteration
		{
			/* 
			 * each neuron's contribution is:
			 * input[j] += weight[i,j] * activation[i]
			 */
			// multiply:
			vSub0 = _mm256_mul_ps(vWeight[0], vAct[0]);
			vSub1 = _mm256_mul_ps(vWeight[1], vAct[1]);

			// prefetch next values: (these don't appear to help, are the networks too small for this to matter?)
			//_mm_prefetch((char*)(vWeight0+4), _MM_HINT_T0);
			//_mm_prefetch((char*)(vAct0+4), _MM_HINT_T0);

			// add to accumulator:
			vTotal = _mm256_add_ps(vTotal, vSub0);
			vTotal = _mm256_add_ps(vTotal, vSub1);

			// increment pointers:
			vWeight += 2;
			vAct += 2;
		}
		while (vWeight != vEndWeight);

		//finalize: horizontally reduce the 8 lanes of vTotal to a single sum
		{
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			__m128 vUpperTotal = _mm256_extractf128_ps(vTotal, 1);
			vUpperTotal = _mm_add_ps(vUpperTotal, _mm256_castps256_ps128(vTotal));

			// store the lowest float into cInput:
			_mm_store_ss(&activations[cNeuron->iNeuronIndex], vUpperTotal);
		}
	}

	// activate all neurons in this layer:
	float* cActivation = (&activations.front() + nLayer->front().iNeuronIndex);
	float* lActivation = (&activations.front() + nLayer->back().iNeuronIndex + 1);
	float* lVectorActivation = lActivation - ((lActivation - cActivation)&(ALIGN_SIZE-1)); // equivalent to mod ALIGN_SIZE

	// aligned activations:
	while (cActivation != lVectorActivation)
	{
		activation_approx_avx(cActivation, cActivation);
		cActivation += ALIGN_SIZE;
	};

	// postscript: (unaligned activations):
	{
		size_t dActivation = (lActivation - cActivation);
		switch(dActivation)
		{
		case 7:
			activation_approx(cActivation+6,cActivation+6);
		case 6:
			activation_approx(cActivation+5,cActivation+5);
		case 5:
			activation_approx(cActivation+4,cActivation+4);
		case 4:
			activation_approx_sse(cActivation+0,cActivation+0);
			break;
		case 3:
			activation_approx(cActivation+2, cActivation+2);
		case 2:
			activation_approx(cActivation+1, cActivation+1);
		case 1:
			activation_approx(cActivation+0, cActivation+0);
		case 0:
			break;
		}
	}
} // end of feedForward_layer
Example #14
__m128 test_mm256_extractf128_ps_1(__m256 a) {
  // CHECK-LABEL: @test_mm256_extractf128_ps_1
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
  return _mm256_extractf128_ps(a, 1);
}
Example #15
 /*!
  * \brief Perform a horizontal sum of the given vector.
  * \param in The input vector
  * \return the horizontal sum of the vector
  */
 ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
     const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
     const __m128 x64  = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
     const __m128 x32  = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
     return _mm_cvtss_f32(x32);
 }
Example #16
void PaTriListSingle0(PA_STATE &pa, UINT slot, UINT triIndex, __m128 triverts[3])
{
    // We have 12 simdscalars contained within 3 simdvectors which
    // hold at least 8 triangles worth of data. We want to assemble a single
    // triangle with data in horizontal form.
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdvector &c = PaGetSimdVector(pa, 2, slot);
    simdscalar tmp0;
    simdscalar tmp1;

    // Convert from vertical to horizontal.
    switch (triIndex)
    {
    case 0:
        // Grab vertex 0 from lane 0 and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 1 and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 2 and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        break;
    case 1:
        // Grab vertex 0 from lane 3 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 4 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        break;
    case 2:
        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 0 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        break;
    case 3:
        // Grab vertex 0 from lane 1 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 2 from 'b' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 3 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        break;

    case 4:
        // Grab vertex 0 from lane 4 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 5 from 'b' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 6 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        break;
    case 5:
        // Grab vertex 0 from lane 7 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 0 from 'c' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 1 from 'c' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
        break;
    case 6:
        // Grab vertex 0 from lane 2 from 'c' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 3 from 'c' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 4 from 'c' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        break;
    case 7:
        // Grab vertex 0 from lane 5 from 'c' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 6 from 'c' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 7 from 'c' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
        break;
    };
}
Example #17
INLINE __m128 swizzleLane7(simdvector &a)
{
    simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
    simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
Example #18
    void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

        const auto num_full_blocks = output_width / C_data_stride;
        const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
        const auto subsimd_block_size = output_width % C_simd_width;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];

        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();
        float subsimd_sum = 0.0f;
        {
            auto input_buffer = input_f;
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
            case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
            case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
            case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
            case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
            case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
            case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        {
            __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
            intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

            acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
            subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

            acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
            subsimd_sum = 1.0f / subsimd_sum;
        }

        {
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
            case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
            case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
            case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
            case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
            case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
            case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }
        _mm_free(input_f);
    }
Example #19
File: cumsum.hpp Project: Mathieu-/nt2
      : meta::strip<A0>{};//

    NT2_FUNCTOR_CALL_DISPATCH(
      1,
      typename nt2::meta::scalar_of<A0>::type,
      (3, (double, float, arithmetic_))
    )
      
    NT2_FUNCTOR_CALL_EVAL_IF(1,      float)
    {
      cout << "problem related to gcc 4.5?" << std::endl;
      typedef typename meta::scalar_of<A0>::type sctype;		
      typedef simd::native<sctype, tag::sse_ >  svtype;
      std::cout << " == a0 " << a0 << std::endl;
      svtype a011;
      a011=  _mm256_extractf128_ps(a0, 1);
      svtype a000;
      a000 =  _mm256_extractf128_ps(a0, 0);			
      std::cout << " == a000 " << a000 << std::endl;
      std::cout << " == a011 " << a011 << std::endl;
      svtype a00 =  cumsum(a000);
      svtype a01 =  cumsum(a011); 
      svtype z = splat<svtype>(a00[meta::cardinal_of<svtype>::value-1]);
      std::cout << " == a00 " << a00 << std::endl;
      std::cout << " == a01 " << a01 << std::endl;
      std::cout << " == z   " << z   << std::endl; 
      A0 that = {_mm256_castps128_ps256(a00)};      // low half: cumsum of the low lanes
      that =  _mm256_insertf128_ps(that, a01+z, 1); // high half: cumsum of the high lanes plus carry
      return that;
    }
    NT2_FUNCTOR_CALL_EVAL_IF(1,      double)