//----------------------------------------------------------------- // AOS -> SOA // // pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ... // -> // pBlu: b0, b1, b2, b3, b4, ... // pGrn: g0, g1, g2, g3, g4, ... // pRed: r0, r1, r2, r3, r4, ... void aos2soa(float *pBgr, float *pBlu, float *pGrn, float *pRed, const size_t length) { __m128 *bgr = (__m128 *)pBgr; float *b = pBlu; float *g = pGrn; float *r = pRed; for (size_t i = 0; i < length; i += 24, b += 8, g += 8, r += 8) { __m256 m03 = _mm256_castps128_ps256(*bgr++); // 下半分のロード __m256 m14 = _mm256_castps128_ps256(*bgr++); __m256 m25 = _mm256_castps128_ps256(*bgr++); m03 = _mm256_insertf128_ps(m03, *bgr++, 1); // 上半分のロード m14 = _mm256_insertf128_ps(m14, *bgr++, 1); m25 = _mm256_insertf128_ps(m25, *bgr++, 1); __m256 bg = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2, 1, 3, 2)); // b と g の上部分 __m256 gr = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1, 0, 2, 1)); // g と r の下部分 __m256 bb = _mm256_shuffle_ps(m03, bg, _MM_SHUFFLE(2, 0, 3, 0)); __m256 gg = _mm256_shuffle_ps(gr, bg, _MM_SHUFFLE(3, 1, 2, 0)); __m256 rr = _mm256_shuffle_ps(gr, m25, _MM_SHUFFLE(3, 0, 3, 1)); _mm256_store_ps(b, bb); _mm256_store_ps(g, gg); _mm256_store_ps(r, rr); } }
// Interleave two planar double arrays into an array of float Complex values:
// dst[i] = { (float)srcI[i], (float)srcQ[i] }.
// The main loop converts 4 elements per iteration (unaligned loads, so no
// alignment requirement on srcI/srcQ); the tail loop handles the remaining
// len % 4 elements with scalar casts.
// NOTE(review): SET_1 is defined elsewhere -- _mm_maskstore_ps only writes
// lanes whose mask sign bit is set, so SET_1 presumably has its MSB set
// (e.g. -1 or 0x80000000); confirm against its definition.
void DoubleToComplex(double *srcI, double *srcQ, Complex *dst, const unsigned int len)
{
    __m256d avxR_D, avxI_D, avxX_D, avxY_D, avxA_D, avxB_D;
    __m128 avxA, avxB;
#if 1
    __m256 avxD;  // only used by the disabled single-store variant below
#endif
    for (unsigned int i=0; i+4<=len; i+=4)
    {
        avxR_D = _mm256_loadu_pd(srcI + i);
        avxI_D = _mm256_loadu_pd(srcQ + i);
        avxX_D = _mm256_unpacklo_pd(avxR_D, avxI_D); //swizzle: pairwise interleave inside each 128-bit lane
        avxY_D = _mm256_unpackhi_pd(avxR_D, avxI_D);
        avxA_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x20); // (r0,i0,r1,i1)
        avxB_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x31); // (r2,i2,r3,i3)
        avxA = _mm256_cvtpd_ps(avxA_D); //double to float
        avxB = _mm256_cvtpd_ps(avxB_D);
#if 0
        // Alternative: merge both halves and do one unmasked 256-bit store.
        avxD = _mm256_castps128_ps256(avxA);
        avxD = _mm256_insertf128_ps(avxD, avxB, 1);
        _mm256_storeu_ps((float*)(dst+i), avxD);
#else
        _mm_maskstore_ps((float*)(dst+i), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxA);
        _mm_maskstore_ps((float*)(dst+i+2), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxB);
#endif
    }
    // Scalar tail: convert the last len % 4 elements.
    for (unsigned int i=len-(len & 0x03); i<len; ++i)
    {
        dst[i].m_real = static_cast<float>(srcI[i]);
        dst[i].m_imag = static_cast<float>(srcQ[i]);
    }
}
// Line-strip primitive assembly, first-batch case: expands vertices 0..7 of
// the current simd vector into the three triangle-corner vectors (tri[0..2]);
// per the comment below, each line segment is later bloated into 2 tris
// 1 pixel wide.  The loop handles the 4 attribute components one at a time.
void PaLineStrip0Common(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];

        // index 0
        simdvector &v0 = tri[0];

        // 01234 -> 01122334
        __m128 vLow = _mm256_extractf128_ps(a0, 0);
        __m128 vHigh = _mm256_extractf128_ps(a0, 1);

        // 0123 -> 0112
        // 0123 -> 233x
        __m128 vOutLow = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(2, 1, 1, 0));
        __m128 vOutHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 3, 2));

        // patch vertex 4 (element 0 of the upper half) into the 'x' slot above
        float f;
        _MM_EXTRACT_FLOAT(f, vHigh, 0);
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0);  // 233x -> 2334

        v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0);
        v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1);

        // index 1 is same as index 0, but position needs to be adjusted to bloat the line
        // into 2 tris 1 pixel wide
        simdvector &v1 = tri[1];
        v1[i] = v0[i];

        // index 2
        // 01234 -> 10213243
        simdvector &v2 = tri[2];

        // 0123 -> 1021
        // 0123 -> 32x3
        vOutLow = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(1, 2, 0, 1));
        vOutHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 2, 3));
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0);  // 32x3 -> 3243 (vertex 4 into slot 2)
        v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0);
        v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1);
    }
}
/* Compile-time diagnostics test (GCC DejaGnu): each intrinsic below requires
   a constant-expression selector, and `k4` is a non-constant variable, so the
   compiler must emit the quoted error.  The dg-error comments are functional
   test directives matched against compiler output -- do not edit them.  */
void test1bit (void)
{
  d1 = _mm256_extractf128_pd (e2, k4);      /* { dg-error "the last argument must be a 1-bit immediate" } */
  a1 = _mm256_extractf128_ps (b2, k4);      /* { dg-error "the last argument must be a 1-bit immediate" } */
  i1 = _mm256_extractf128_si256 (l2, k4);   /* { dg-error "the last argument must be a 1-bit immediate" } */
  e1 = _mm256_insertf128_pd (e2, d1, k4);   /* { dg-error "the last argument must be a 1-bit immediate" } */
  b1 = _mm256_insertf128_ps (b2, a1, k4);   /* { dg-error "the last argument must be a 1-bit immediate" } */
  l1 = _mm256_insertf128_si256 (l2, i1, k4);/* { dg-error "the last argument must be a 1-bit immediate" } */
}
// Line-strip primitive assembly, continuation case: the 8 working vertices
// are 4..7 from the previous batch plus vertex 0 of the current batch
// (sequence "45670").  Same expansion pattern as PaLineStrip0Common.
void PaLineStrip1Common(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        // index 0
        simdvector &v0 = tri[0];

        // 45670 -> 45566770
        __m128 vPrevHigh = _mm256_extractf128_ps(a0, 1);
        __m128 vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(2, 1, 1, 0));   // 4556
        __m128 vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 3, 2));  // 677x

        // patch vertex 0 of the current batch into the 'x' slot above
        __m128 vCurLow = _mm256_extractf128_ps(b0, 0);
        float f;
        _MM_EXTRACT_FLOAT(f, vCurLow, 0);
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0);  // 677x -> 6770

        v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0);
        v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1);

        // index 1
        // 45670 -> 45566770
        // index 1 same as index 0
        simdvector &v1 = tri[1];
        v1[i] = v0[i];

        // index 2
        // 45670 -> 54657607
        simdvector &v2 = tri[2];
        vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(1, 2, 0, 1));   // 5465
        vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 2, 3));  // 76x7
        vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0);                  // 76x7 -> 7607
        v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0);
        v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1);
    }
}
__m256 exp_256( const __m256& x) { //! Clip the value __m256 y = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(88.3762626647949f)), _mm256_set1_ps(-88.3762626647949f)); //! Express exp(x) as exp(g + n * log(2)) __m256 fx = y * _mm256_set1_ps(1.44269504088896341) + _mm256_set1_ps(0.5f); //! Floor const __m256 tmp = _mm256_round_ps(fx, _MM_FROUND_TO_ZERO); //! If greater, substract 1 const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), _mm256_set1_ps(1.f)); fx = tmp - mask; y -= fx * _mm256_set1_ps(0.693359375 - 2.12194440e-4); const __m256 z = y * y; const __m256 t = (((((_mm256_set1_ps(1.9875691500E-4) * y + _mm256_set1_ps(1.3981999507E-3)) * y + _mm256_set1_ps(8.3334519073E-3)) * y + _mm256_set1_ps(4.1665795894E-2)) * y + _mm256_set1_ps(1.6666665459E-1)) * y + _mm256_set1_ps(5.0000001201E-1)) * z + y + _mm256_set1_ps(1.f); //! Build 2^n (split it into two SSE array, since AVX2 equivalent functions //! aren't available. const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_castps256_ps128(fx)), _mm_set1_epi32(0x7f)); const __m128i emm1 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_extractf128_ps(fx, 1)), _mm_set1_epi32(0x7f)); fx = _mm256_castps128_ps256(_mm_castsi128_ps(_mm_slli_epi32(emm0, 23))); fx = _mm256_insertf128_ps(fx, _mm_castsi128_ps(_mm_slli_epi32(emm1, 23)), 1); //! Return the result return t * fx; }
// Insert the 4-wide SSE boolean mask `b` into one 128-bit half of the 8-wide
// AVX boolean mask `a`, leaving the other half unchanged.
// NOTE(review): `i` (the target half: 0 = low, 1 = high) is not declared in
// this snippet -- presumably a non-type template parameter of an enclosing
// template declaration outside this view; confirm against the full header.
INLINE avxb insert (const avxb& a, const sseb& b) { return _mm256_insertf128_ps (a,b,i); }
// Optimal-BST dynamic program over the cost table e and weight table w,
// vectorized with AVX: the inner minimization over the split point r works on
// 16 doubles per iteration (4 x 4-wide __m256d), using maskstores to write
// back only the lanes where a strictly smaller cost was found, and recording
// the winning split in the root table with a matching 8-wide float maskstore.
// Rows are padded (pad/pad_root) so the aligned _mm256_load_pd calls stay on
// 32-byte boundaries.  Returns the optimal cost over all n keys.
//
// _bst_obj : opaque segments_t* holding the e/w tables and the root table
// p, q     : key / gap probabilities of the classic optimal-BST recurrence
// nn       : number of keys
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m256d v_tmp;
    __m256d v00, v01, v02, v03;
    __m256d v10, v11, v12, v13;
    __m256d v20, v21, v22, v23;
    __m256d v30, v31, v32, v33;
    __m256i v_cur_roots;
    __m256 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn;
    // index arithmetic is deliberately done in signed int: the row-stride
    // subtractions below can transiently go negative
    int idx1, idx1_root;
    int idx2;
    int idx3, idx3_root;
    int pad_root, pad, pad_r;
    idx1 = ((int) mem->e_sz) - 1;
    idx1_root = ((int) mem->r_sz);
    // convention: during iteration i, idx1 points to the first element of row i+1
    e[idx1++] = q[n];
    // pad holds the alignment padding of row i+1
    // for row n it's always 3 (7 for the int-typed root table)
    pad = 3;
    pad_root = 7;
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1 + pad;
        idx1_root -= 2*(n-i)+1 + pad_root;
        idx2 = idx1 + 1;
        // base case for row i, then the prefix-weight recurrence for w
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx2 += pad; // skip padding of row i+1
        // idx2 now points to the first element of the next row
        idx3 = idx1;
        idx3_root = idx1_root;
        pad_r = pad;
        for (r = i; r < n; ++r) {
            pad_r = (pad_r+1)&3; // padding of row r+1
            idx1 = idx3;
            idx1_root = idx3_root;
            l_end = idx2 + (n-r); // l_end points to the first entry after the current row
            e_tmp = e[idx1++];
            idx1_root++;
            // scalar peel until a multiple of 16 doubles remains
            // (16 = 4 x 4-wide 256-bit vectors per main-loop iteration)
            l_end_pre = idx2 + ((n-r)&15);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1_root] = r;
                }
                idx1++;
                idx1_root++;
            }
            v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp );
            // vector main loop: 16 candidate costs per iteration
            v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r);
            for( ; idx2 < l_end; idx2 += 16 ) {
                // candidate costs: e_tmp + w[...] + e[...]
                v01 = _mm256_load_pd( &w[idx1   ] );
                v11 = _mm256_load_pd( &w[idx1+ 4] );
                v21 = _mm256_load_pd( &w[idx1+ 8] );
                v31 = _mm256_load_pd( &w[idx1+12] );
                v00 = _mm256_load_pd( &e[idx2   ] );
                v01 = _mm256_add_pd( v01, v_tmp );
                v10 = _mm256_load_pd( &e[idx2+ 4] );
                v11 = _mm256_add_pd( v11, v_tmp );
                v20 = _mm256_load_pd( &e[idx2+ 8] );
                v21 = _mm256_add_pd( v21, v_tmp );
                v30 = _mm256_load_pd( &e[idx2+12] );
                v31 = _mm256_add_pd( v31, v_tmp );
                v01 = _mm256_add_pd( v01, v00 );
                v03 = _mm256_load_pd( &e[idx1   ] );
                v11 = _mm256_add_pd( v11, v10 );
                v13 = _mm256_load_pd( &e[idx1+ 4] );
                v21 = _mm256_add_pd( v21, v20 );
                v23 = _mm256_load_pd( &e[idx1+ 8] );
                v31 = _mm256_add_pd( v31, v30 );
                v33 = _mm256_load_pd( &e[idx1+12] );
                // mask = lanes where the candidate beats the current minimum
                v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ );
                v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ );
                v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ );
                v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ );
                _mm256_maskstore_pd( &e[idx1   ], _mm256_castpd_si256( v02 ), v01 );
                _mm256_maskstore_pd( &e[idx1+ 4], _mm256_castpd_si256( v12 ), v11 );
                // narrow two 4-wide double masks into one 8-wide float mask for
                // the root table; maskstore only inspects the sign bit, which
                // survives the double->float NaN conversion of the all-ones mask
                v_rootmask0 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256( _mm256_cvtpd_ps(v02)),
                        _mm256_cvtpd_ps(v12) , 1 );
                _mm256_maskstore_pd( &e[idx1+ 8], _mm256_castpd_si256( v22 ), v21 );
                _mm256_maskstore_pd( &e[idx1+12], _mm256_castpd_si256( v32 ), v31 );
                v_rootmask1 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256( _mm256_cvtpd_ps(v22)),
                        _mm256_cvtpd_ps(v32) , 1 );
                // NOTE(review): root is int* but _mm256_maskstore_ps takes float*;
                // this passes with a warning as C, but C++ requires a (float*)
                // cast here -- confirm how this file is compiled.
                _mm256_maskstore_ps( &root[idx1_root    ], _mm256_castps_si256( v_rootmask0 ), _mm256_castsi256_ps( v_cur_roots ) );
                _mm256_maskstore_ps( &root[idx1_root + 8], _mm256_castps_si256( v_rootmask1 ), _mm256_castsi256_ps( v_cur_roots ) );
                idx1 += 16;
                idx1_root += 16;
            }
            idx2 += pad_r;
            idx3++;
            idx3_root++;
        }
        pad = (pad -1)&3;
        pad_root = (pad_root-1)&7;
    }
    // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding
    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[ ((n/4)+1)*4 - 1 ];
}
// Tricubic interpolation kernel (AVX2+FMA fast path with a scalar fallback).
//
// src  : first corner of a 4x4x4 neighbourhood inside a 3-D field; `ni`
//        (row stride) and `ninj` (plane stride) are file-scope values defined
//        elsewhere -- TODO confirm their declarations.
// abcd : the 4 interpolation weights along Z.
// x, y : fractional positions along X and Y; the cubic weights are built from
//        the constants one/two/cm167/cp5/cm5/cp167 defined elsewhere
//        (presumably 1, 2, -1/6, +1/2, -1/2, +1/6 -- confirm).
//
// Pipeline: collapse the 4 Z-planes with weights abcd, then the 4 rows with
// cubic weights in y, then the 4 columns with cubic weights in x (scalar).
// NOTE(review): the AVX2 path inserts into v1..v4 before any full
// initialization; both 128-bit halves are written before the first arithmetic
// use, but compilers may warn about the uninitialized reads -- confirm intended.
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float *s;
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 v1, v2, v3, v4;
  __m256 va, vb, vc, vd;
  __m128 va4, vb4, vc4, vd4;
  __m128 v128a, v128b;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float va4[4], vb4[4], vc4[4], vd4[4];
  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
#endif

#if defined(__AVX2__) && defined(__x86_64__)
  // ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====
  va = _mm256_broadcast_ss(abcd);    // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);
  s = src;
  // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);           // Z0 row 0
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z0 row 1
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);         // v1 = v1*va
  s += ninj;
  v128a = _mm_loadu_ps(s);           // Z1 row 0
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z1 row 1
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);    // v1 += v2*vb
  s += ninj;
  v128a = _mm_loadu_ps(s);           // Z2 row 0
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z2 row 1
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);    // v1 += v3*vc
  s += ninj;
  v128a = _mm_loadu_ps(s);           // Z3 row 0
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z3 row 1
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);    // v1 += v4*vd
  // split vector of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v1,0); // Y0 : row 0 (v1 low)
  vy1 = _mm256_extractf128_ps(v1,1); // Y1 : row 1 (v1 high)
  s = src + 2*ni;
  // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);           // Z0 row 2
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z0 row 3
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);         // v1 = v1*va
  s += ninj;
  v128a = _mm_loadu_ps(s);           // Z1 row 2
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z1 row 3
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);    // v1 += v2*vb
  s += ninj;
  v128a = _mm_loadu_ps(s);           // Z2 row 2
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z2 row 3
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);    // v1 += v3*vc
  s += ninj;
  v128a = _mm_loadu_ps(s);           // Z3 row 2
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);        // Z3 row 3
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);    // v1 += v4*vd
  // split vector of length 8 into 2 vectors of length 4
  vy2 = _mm256_extractf128_ps(v1,0); // Y2 : row 2 (v1 low)
  vy3 = _mm256_extractf128_ps(v1,1); // Y3 : row 3 (v1 high)

  // ==== interpolation along Y, vector length is 4 (4 rows) ====
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

  va4 = _mm_broadcast_ss(&y0);       // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);         // vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);   // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);   // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);   // += vy3 * vd4

  _mm_storeu_ps(dst,vy0);            // store 4 values along X
#else
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);
  // scalar fallback: Z collapse and Y collapse fused, one column at a time
  for (i=0 ; i<4 ; i++){
    va4[i] = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] + src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];
    vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] + src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];
    vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] + src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];
    vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] + src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];
    dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3;
  }
#endif

  // ==== interpolation along x, scalar ====
  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
// Clang IR-generation test: verifies that _mm256_insertf128_ps with imm=1
// lowers to a shufflevector that keeps a's low lane (elements 0-3) and
// replaces the high lane with b (elements 8-11 of the concatenation).
// The CHECK comments are FileCheck directives -- do not edit them.
__m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
  // CHECK-LABEL: @test_mm256_insertf128_ps_1
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  return _mm256_insertf128_ps(a, b, 1);
}
// Clang IR-generation test: verifies that _mm256_insertf128_ps with imm=0
// lowers to a shufflevector that replaces a's low lane with b (elements 8-11
// of the concatenation) and keeps a's high lane (elements 4-7).
// The CHECK comments are FileCheck directives -- do not edit them.
__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
  // CHECK-LABEL: @test_mm256_insertf128_ps_0
  // CHECK: shufflevector{{.*}}<i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  return _mm256_insertf128_ps(a, b, 0);
}
typedef typename meta::scalar_of<A0>::type sctype; typedef typename simd::native<sctype, tag::sse_ > svtype; std::cout << " == a0 " << a0 << std::endl; svtype a011; a011= _mm256_extractf128_ps(a0, 1); svtype a000; a000 = _mm256_extractf128_ps(a0, 0); std::cout << " == a000 " << a000 << std::endl; std::cout << " == a011 " << a011 << std::endl; svtype a00 = cumsum(a000); svtype a01 = cumsum(a011); svtype z = splat<svtype>(a00[meta::cardinal_of<svtype>::value-1]); std::cout << " == a00 " << a00 << std::endl; std::cout << " == a01 " << a01 << std::endl; std::cout << " == z " << z << std::endl; A0 that = {_mm256_insertf128_ps(that,a00, 0)}; that = _mm256_insertf128_ps(that, a01+z, 1); return that; } NT2_FUNCTOR_CALL_EVAL_IF(1, double) { typedef typename meta::scalar_of<A0>::type sctype; typedef typename simd::native<sctype, tag::sse_ > svtype; svtype a000 = { _mm256_extractf128_pd(a0, 0)}; svtype a011 = { _mm256_extractf128_pd(a0, 1)}; svtype a00 = cumsum(a000); svtype a01 = cumsum(a011); svtype z = splat<svtype>(a00[meta::cardinal_of<svtype>::value-1]); A0 that = simd::native_cast<A0>(_mm256_insertf128_pd(that,a00, 0)); that = simd::native_cast<A0>(_mm256_insertf128_pd(that, a01+z, 1)) ; return that;