bool PaQuadList1(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);

    simdscalar s1, s2;

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
        s2 = _mm256_permute2f128_ps(a0, b0, 0x31);

        simdvector &v0 = tri[0];
        v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));

        simdvector &v1 = tri[1];
        v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));

        simdvector &v2 = tri[2];
        v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
    }

    SetNextPaState(pa, PaQuadList0, PaQuadListSingle0);
    pa.reset = true;
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
static void NOINLINE mulX8( const __m256 *v1, const __m256 *v2, __m256 *vout )
{
    static const int ALIGN32 p1[ 8 ] = { 0, 0, 0, 0, 1, 1, 1, 1 };
    static const int ALIGN32 p2[ 8 ] = { 2, 2, 2, 2, 3, 3, 3, 3 };
    static const int ALIGN32 p3[ 8 ] = { 4, 4, 4, 4, 5, 5, 5, 5 };
    static const int ALIGN32 p4[ 8 ] = { 6, 6, 6, 6, 7, 7, 7, 7 };

    const __m256i perm1 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p1 ) );
    const __m256i perm2 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p2 ) );
    const __m256i perm3 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p3 ) );
    const __m256i perm4 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p4 ) );

    for( int r = 0; r < 2; r++ )
    {
        __m256 a0 = _mm256_permutevar8x32_ps( v1[ r ], perm1 );
        __m256 a1 = _mm256_permutevar8x32_ps( v1[ r ], perm2 );
        __m256 a2 = _mm256_permutevar8x32_ps( v1[ r ], perm3 );
        __m256 a3 = _mm256_permutevar8x32_ps( v1[ r ], perm4 );

        __m256 b0 = _mm256_mul_ps( a0, v2[ 0 ] );
        __m256 b1 = _mm256_mul_ps( a1, v2[ 1 ] );
        __m256 b2 = _mm256_mul_ps( a2, v2[ 0 ] );
        __m256 b3 = _mm256_mul_ps( a3, v2[ 1 ] );

        __m256 c0 = _mm256_add_ps( b0, b1 );
        __m256 c1 = _mm256_add_ps( b2, b3 );

        __m256 d0 = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
        __m256 d1 = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 3, 0, 1 ) );

        vout[ r ] = _mm256_add_ps( d0, d1 );
    }
}
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m256 sum_l = _mm256_setzero_ps();
   __m256 sum_r = _mm256_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps  = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m256 delta = _mm256_set1_ps((float)
         (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 8)
   {
      __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
      __m256 buf_r = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m256 deltas = _mm256_load_ps(delta_table + i);
      __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
            _mm256_mul_ps(deltas, delta));
#else
      __m256 sinc = _mm256_load_ps(phase_table + i);
#endif
      sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
      sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
   }

   /* hadd on AVX is weird, and acts on low-lanes
    * and high-lanes separately. */
   __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
   __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
   res_l = _mm256_hadd_ps(res_l, res_l);
   res_r = _mm256_hadd_ps(res_r, res_r);
   res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
   res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

   /* This is optimized to mov %xmmN, [mem].
    * There doesn't seem to be any _mm256_store_ss intrinsic. */
   _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
   _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
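The comment above describes the usual AVX horizontal-sum idiom: _mm256_hadd_ps only adds within each 128-bit lane, so a cross-lane _mm256_permute2f128_ps is still needed to fold the upper lane in. Pulled out on its own, the pattern looks like this (helper name ours, not RetroArch's):

#include <immintrin.h>

static float hsum256_ps(__m256 v)
{
   __m256 t = _mm256_hadd_ps(v, v);                        /* pair sums within each lane */
   t = _mm256_hadd_ps(t, t);                               /* lane-local totals in every slot */
   t = _mm256_add_ps(t, _mm256_permute2f128_ps(t, t, 1));  /* fold the upper lane onto the lower */
   return _mm_cvtss_f32(_mm256_castps256_ps128(t));
}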
bool PaTriStrip1(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    simdscalar s;

    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        /* Tri Pattern
             v0 -> 02244668
             v1 -> 11335577
             v2 -> 23456789
        */
        simdvector &v1 = tri[1];
        v1[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(3, 3, 1, 1));

        simdvector &v2 = tri[2];
        s = _mm256_permute2f128_ps(a0, b0, 0x21);
        v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));

        simdvector &v0 = tri[0];
        v0[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 0, 2, 0));
    }

    SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

#define _float_cords_dot_prod(curr, next, index)                                                \
    _mm256_dp_ps(                                                                               \
        curr,                                                                                   \
        _mm256_xor_ps(                                                                          \
            _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
            _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)                              \
        ),                                                                                      \
        0b11110000 | (1 << (index))                                                             \
    )

    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...

    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux; // return scalar_half(scalar_abs(accum_sum_aux));
}
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout )
{
#if 0
    // AVX1
    __m256 a0 = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
    __m256 a1 = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );
    __m256 b0 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    __m256 b1 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
    __m256 c0 = _mm256_unpacklo_ps( b0, b1 );
    __m256 c1 = _mm256_unpackhi_ps( b0, b1 );
    vout[ 0 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    vout[ 1 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else
    // AVX2
    static const int ALIGN32 p1[ 8 ] = { 0, 4, 2, 6, 1, 5, 3, 7 };
    static const int ALIGN32 p2[ 8 ] = { 2, 6, 0, 4, 3, 7, 1, 5 };
    const __m256i perm1 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p1 ) );
    const __m256i perm2 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p2 ) );
    __m256 a0 = _mm256_permutevar8x32_ps( v1[ 0 ], perm1 );
    __m256 a1 = _mm256_permutevar8x32_ps( v1[ 1 ], perm2 );
    vout[ 0 ] = _mm256_blend_ps( a0, a1, 0xCC );
    vout[ 1 ] = _mm256_shuffle_ps( a0, a1, 0x4E );
#endif
}
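Both branches compute the same thing: the transpose of a 4x4 float matrix held as two __m256 registers, rows 0-1 in v1[0] and rows 2-3 in v1[1]. A possible driver, assuming a row-major, 32-byte-aligned buffer of 16 floats (that layout is our assumption, not shown above):

// Hypothetical driver: transpose a row-major 4x4 matrix in place.
static void transpose4x4( float *m /* 16 floats, 32-byte aligned */ )
{
    __m256 rows[ 2 ] = { _mm256_load_ps( m ), _mm256_load_ps( m + 8 ) };
    __m256 cols[ 2 ];
    transposeX8( rows, cols );            // cols[0] = columns 0-1, cols[1] = columns 2-3
    _mm256_store_ps( m,     cols[ 0 ] );
    _mm256_store_ps( m + 8, cols[ 1 ] );
}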
int main(void)
{
    __m256 da = _mm256_setr_ps(1,2,3,4,5,6,7,8);
    __m256 db = _mm256_setr_ps(11,12,13,14,15,16,17,18);
    __m256 dc;

    printDc("da: ", da);
    printDc("db: ", db);
    printf("\n");

    dc = _mm256_permute2f128_ps(da, db, 0x02);
    printDc("dc: ", dc);

    dc = _mm256_permute2f128_ps(da, db, 0x02|0x08);
    printDc("dc: ", dc);

    dc = _mm256_permute2f128_ps(da, db, 0x21);
    printDc("dc: ", dc);

    return 0;
}
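printDc is not defined in the snippet; a plausible implementation is sketched below, together with the values each call should print given the imm8 semantics (bits 1:0 select the low 128 bits, bits 5:4 the high 128 bits, and bits 3 and 7 zero the corresponding half):

#include <immintrin.h>
#include <stdio.h>

/* Hypothetical helper, not part of the original snippet. */
static void printDc(const char *label, __m256 v)
{
    float f[8];
    _mm256_storeu_ps(f, v);
    printf("%s%g %g %g %g %g %g %g %g\n",
           label, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
}

/* Expected output of the three permutes above:
 *   0x02       -> 11 12 13 14  1  2  3  4   (low = db low, high = da low)
 *   0x02|0x08  ->  0  0  0  0  1  2  3  4   (bit 3 zeroes the low half)
 *   0x21       ->  5  6  7  8 11 12 13 14   (low = da high, high = db low)
 */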
/* Adjust MBR to fit all child MBRs */
inline void adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
    ArraySTRNode *node, *child;
    ulong_t k;

    node = &nodes[cur];
    child = &nodes[node->pos];

    /* enlarge mbr to include all children's mbr */
#ifdef ENABLE_SSE_ADJUST
    {
        __m128 v_nlow = _mm_load_ps(child[0].mbr.low);
        __m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
        for (k = 1; k < node->len; k++) {
            v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
            v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
        }
        _mm_store_ps(node->mbr.low, v_nlow);
        _mm_store_ps(node->mbr.upp, v_nupp);
    }
#else
#ifdef ENABLE_AVX_TEST1
    {
        __m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
        for (k = 1; k < node->len; k++) {
            __m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
            __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
            __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
            v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        }
        _mm256_storeu_ps((float *)&node->mbr, v_nmbr);
    }
#else
    /* copy first child's mbr */
    node->mbr = child[0].mbr;
    for (k = 1; k < node->len; k++) {
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (node->mbr.low[i] > child[k].mbr.low[i])
                node->mbr.low[i] = child[k].mbr.low[i];
            if (node->mbr.upp[i] < child[k].mbr.upp[i])
                node->mbr.upp[i] = child[k].mbr.upp[i];
        }
    }
#endif
#endif
}
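For comparison, a minimal sketch of the same min/max merge under the assumption that Mbr is laid out as { float low[4]; float upp[4]; } (the layout is not shown in the snippet). With that layout the union needs the minima in the low 128 bits and the maxima in the high 128 bits, i.e. immediate 0x30; the 0x12 used above instead places the maxima in the low half and the minima in the high half, which is what you would want if upp precedes low in the struct.

/* Sketch only: union of two MBRs, each stored as {low[4], upp[4]} in one __m256. */
static inline __m256 mbr_union(__m256 a, __m256 b)
{
    __m256 mins = _mm256_min_ps(a, b);   /* component-wise minima */
    __m256 maxs = _mm256_max_ps(a, b);   /* component-wise maxima */
    /* imm 0x30: result low 128 = mins' low 128, result high 128 = maxs' high 128 */
    return _mm256_permute2f128_ps(mins, maxs, 0x30);
}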
bool PaTriList2(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdvector &c = PaGetSimdVector(pa, 2, slot);
    simdscalar s;

    for (int i = 0; i < 4; ++i)
    {
        simdvector &v0 = tri[0];
        v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
        v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
        v0[i] = _mm256_permute_ps(v0[i], 0x6C);
        s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
        v0[i] = _simd_blend_ps(v0[i], s, 0x44);

        simdvector &v1 = tri[1];
        v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
        v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
        v1[i] = _mm256_permute_ps(v1[i], 0xB1);
        s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
        v1[i] = _simd_blend_ps(v1[i], s, 0x66);

        simdvector &v2 = tri[2];
        v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
        v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
        v2[i] = _mm256_permute_ps(v2[i], 0xC6);
        s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
        v2[i] = _simd_blend_ps(v2[i], s, 0x22);
    }

    SetNextPaState(pa, PaTriList0, PaTriListSingle0);
    pa.reset = true;
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
void warmup(float *x, float *y, int size, float alpha)
{
    #pragma ivdep
    int i;
    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0);
    #pragma vector aligned
    for (i=0; i<size; i+=4)
    {
        __m256 t = _mm256_load_ps(x+2*i);
        __m256 l = _mm256_mul_ps(t, m);                  // premultiply
        __m256 r = _mm256_permute2f128_ps( l , l , 1);   // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);
        __m128 s = _mm256_extractf128_ps (res, 0);
        _mm_store_ps(y+i,s);                             // store it
    }
}
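A scalar reference for what the loop computes, as a sketch (it assumes size is a multiple of 4 and that x holds 2*size floats, which the vector code also relies on): each output element is a weighted sum of one input pair.

void warmup_ref(float *x, float *y, int size, float alpha)
{
    for (int i = 0; i < size; i++)
        y[i] = 2.0f * x[2*i] + x[2*i + 1] / alpha;
}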
bool PaPoints0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);

    for (UINT i = 0; i < 4; ++i)
    {
        __m256 vLow128   = _mm256_unpacklo_ps(a.v[i], a.v[i]);               // 0 0 1 1 4 4 5 5
        __m256 vHigh128  = _mm256_unpackhi_ps(a.v[i], a.v[i]);               // 2 2 3 3 6 6 7 7
        __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x20);  // 0 0 1 1 2 2 3 3
        tri[0].v[i] = tri[1].v[i] = tri[2].v[i] = vCombined;
    }

    SetNextPaState(pa, PaPoints1, PaPointsSingle0, 1);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
bool PaTriFan0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);

    // Extract vertex 0 to every lane of first vector
    for (int i = 0; i < 4; ++i)
    {
        __m256 a0 = a[i];
        simdvector &v0 = tri[0];
        v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0));
        v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00);
    }

    // store off leading vertex for attributes
    pa.leadingVertex = pa.vout[pa.cur];

    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
    return false;    // Not enough vertices to assemble 8 triangles.
}
inline Mbr getMbrRTreeNode(RTreeNode *node)
{
    Mbr mbr;
    int k;

    mbr = node->mbrs[0];
    for (k = 1; k < node->nchilds; k++) {
#ifdef ENABLE_SSE_TEST1
        __m128 v_nlow = _mm_load_ps(mbr.low);
        __m128 v_nupp = _mm_load_ps(mbr.upp);
        __m128 v_clow = _mm_load_ps(node->mbrs[k].low);
        __m128 v_cupp = _mm_load_ps(node->mbrs[k].upp);
        _mm_store_ps(mbr.low, _mm_min_ps(v_nlow, v_clow));
        _mm_store_ps(mbr.upp, _mm_max_ps(v_nupp, v_cupp));
#else
#ifdef ENABLE_AVX_TEST1
        __m256 v_nmbr = _mm256_loadu_ps((float *)&mbr);
        __m256 v_cmbr = _mm256_loadu_ps((float *)&node->mbrs[k]);
        __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
        __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
        __m256 v_tmp;
        v_tmp = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        _mm256_storeu_ps((float *)&mbr, v_tmp);
#else
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (mbr.low[i] > node->mbrs[k].low[i])
                mbr.low[i] = node->mbrs[k].low[i];
            if (mbr.upp[i] < node->mbrs[k].upp[i])
                mbr.upp[i] = node->mbrs[k].upp[i];
        }
#endif
#endif
    }
    return mbr;
}
void test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);                 /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);           /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);          /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4);  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);              /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);      /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
}
bool PaTriFan1(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    simdscalar s;

    // only need to fill vectors 1/2 with new verts
    for (int i = 0; i < 4; ++i)
    {
        simdscalar a0 = a[i];
        simdscalar b0 = b[i];

        simdvector &v2 = tri[2];
        s = _mm256_permute2f128_ps(a0, b0, 0x21);
        v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));

        simdvector &v1 = tri[1];
        v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
    }

    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
{
    if(kmax<=0)
        return;

    const int lda = 8;

    __builtin_prefetch( A + 0*lda );
    __builtin_prefetch( A + 2*lda );

    int k, k_left, ii;
    float k_left_d;

    const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
    float temp_space[8] = {};

    __m256
        mask, zeros, temp,
        a_00, a_01, a_02, a_03,
        x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
        x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;

    mask  = _mm256_loadu_ps( mask_f );
    zeros = _mm256_setzero_ps();

    x_n_0 = _mm256_broadcast_ss( &x_n[0] );
    x_n_1 = _mm256_broadcast_ss( &x_n[1] );
    x_n_2 = _mm256_broadcast_ss( &x_n[2] );
    x_n_3 = _mm256_broadcast_ss( &x_n[3] );

    if(alg==-1) // TODO xor
    {
        x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
        x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
        x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
        x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
    }

    y_t_0 = _mm256_setzero_ps();
    y_t_1 = _mm256_setzero_ps();
    y_t_2 = _mm256_setzero_ps();
    y_t_3 = _mm256_setzero_ps();

    k=0;

    // corner
    if(tri==1)
    {
        k_left = kna-k;
        k_left_d = 8.0 - k_left;
        /*printf("\nk_left = %d\n", k_left);*/

/*      y_n_0 = _mm_load_ps( &y_n[0] );*/
/*      y_n_0 = _mm_setzero_ps();*/
        x_t_0 = _mm256_loadu_ps( &x_t[0] );
        x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
        /*_mm256_storeu_ps( temp_space, x_t_0 ); */
        /*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
        /*exit(1);*/

        a_00 = _mm256_loadu_ps( &A[0+lda*0] );
        a_00 = _mm256_blend_ps( a_00, zeros, 0x00 );
/*      temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*      y_n_0 = _mm256_add_ps( y_n_0, temp );*/
        y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
        a_00 = _mm256_blend_ps( a_00, zeros, 0x01 );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );

        a_01 = _mm256_loadu_ps( &A[0+lda*1] );
        a_01 = _mm256_blend_ps( a_01, zeros, 0x01 );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        a_01 = _mm256_blend_ps( a_01, zeros, 0x03 );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );

        a_02 = _mm256_loadu_ps( &A[0+lda*2] );
        a_02 = _mm256_blend_ps( a_02, zeros, 0x03 );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        a_02 = _mm256_blend_ps( a_02, zeros, 0x07 );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );

        a_03 = _mm256_loadu_ps( &A[0+lda*3] );
        a_03 = _mm256_blend_ps( a_03, zeros, 0x07 );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        a_03 = _mm256_blend_ps( a_03, zeros, 0x0f );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        /*_mm256_storeu_ps( temp_space, y_n_0 ); */
        /*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/

        y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

        /*_mm256_storeu_ps( temp_space, y_n_0 ); */
        /*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/

        x_t_0 = _mm256_loadu_ps( &y_n[0] );
        y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
        _mm256_storeu_ps( &y_n[0], y_n_0 );

        A += k_left;
        y_n += k_left;
        x_t += k_left;
        k += k_left;
    }
    if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*  for(; k<kna; k++)*/
    {
        k_left = kna-k;
        k_left_d = 8.0 - k_left;
        /*printf("\nk_left = %d\n", k_left);*/

/*      y_n_0 = _mm_load_ps( &y_n[0] );*/
/*      y_n_0 = _mm_setzero_ps();*/
        x_t_0 = _mm256_loadu_ps( &x_t[0] );
        x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
        /*_mm256_storeu_ps( temp_space, x_t_0 ); */
        /*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/

        a_00 = _mm256_loadu_ps( &A[0+lda*0] );
        a_01 = _mm256_loadu_ps( &A[0+lda*1] );
        a_02 = _mm256_loadu_ps( &A[0+lda*2] );
        a_03 = _mm256_loadu_ps( &A[0+lda*3] );

/*      temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*      y_n_0 = _mm256_add_ps( y_n_0, temp );*/
        y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

        x_t_0 = _mm256_loadu_ps( &y_n[0] );
        y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
        _mm256_storeu_ps( &y_n[0], y_n_0 );
/*      _mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
/*      _mm256_storeu_ps( temp_space, y_n_0 );*/
/*      for(ii=0; ii<k_left; ii++)*/
/*          y_n[ii] = temp_space[ii];*/
        /*printf("\nk_left = %d\n", k_left);*/
        /*exit(1);*/

        A += k_left;
        y_n += k_left;
        x_t += k_left;
        k += k_left;
    }
    if(kna>0 || tri==1)
    {
        A += (sda-1)*lda;
    }
    for(; k<kmax-7; k+=8)
    {
        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        y_n_0 = _mm256_loadu_ps( &y_n[0] );
        x_t_0 = _mm256_loadu_ps( &x_t[0] );

        a_00 = _mm256_load_ps( &A[0+lda*0] );
        temp = _mm256_mul_ps( a_00, x_n_0 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );

        a_01 = _mm256_load_ps( &A[0+lda*1] );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );

        a_02 = _mm256_load_ps( &A[0+lda*2] );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );

        a_03 = _mm256_load_ps( &A[0+lda*3] );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        _mm256_storeu_ps( &y_n[0], y_n_0 );

        A += sda*lda;
        y_n += 8;
        x_t += 8;
    }
    if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
    {
        k_left = kmax-k;
        k_left_d = 8.0 - k_left;

/*      y_n_0 = _mm_load_ps( &y_n[0] );*/
/*      y_n_0 = _mm_setzero_ps();*/
        x_t_0 = _mm256_loadu_ps( &x_t[0] );
        x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
        /*printf("\nk_left2 = %d\n", k_left, kmax, k);*/

        a_00 = _mm256_load_ps( &A[0+lda*0] );
        /*printf("\nk_left2 = %d\n", k_left);*/
        a_01 = _mm256_load_ps( &A[0+lda*1] );
        a_02 = _mm256_load_ps( &A[0+lda*2] );
        a_03 = _mm256_load_ps( &A[0+lda*3] );

/*      temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*      y_n_0 = _mm256_add_ps( y_n_0, temp );*/
        y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

        x_t_0 = _mm256_loadu_ps( &y_n[0] );
        y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
        _mm256_storeu_ps( &y_n[0], y_n_0 );
/*      _mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
/*      _mm256_storeu_ps( temp_space, y_n_0 );*/
/*      for(ii=0; ii<k_left; ii++)*/
/*          y_n[ii] = temp_space[ii];*/

/*      A += 1;*/
/*      y_n += 1;*/
/*      x_t += 1;*/
    }

    // reduction
    __m128 z_0, z_1;

    y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
    y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

    y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

    y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);

    z_0 = _mm256_castps256_ps128(y_t_0);
    z_1 = _mm256_castps256_ps128(y_t_1);

    z_1 = _mm_add_ps(z_0, z_1);

    if(alg==1)
    {
        z_0 = _mm_loadu_ps( &y_t[0] );
        z_0 = _mm_add_ps(z_0, z_1);
        _mm_storeu_ps( &y_t[0], z_0 );
    }
    else // alg==-1
    {
        z_0 = _mm_loadu_ps( &y_t[0] );
        z_0 = _mm_sub_ps(z_0, z_1);
        _mm_storeu_ps( &y_t[0], z_0 );
    }
}
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
{
/*  if(kmax<=0) */
/*      return;*/

    const int lda = 8;
/*  const int bs = 8;*/

    __builtin_prefetch( A + 0*lda );
    __builtin_prefetch( A + 2*lda );
    __builtin_prefetch( A + 4*lda );
    __builtin_prefetch( A + 6*lda );

    int k;

    __m256
        zeros, ax_temp,
        a_00, a_01, a_02, a_03,
        x_0,
        y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;

    zeros = _mm256_setzero_ps();

    y_0 = _mm256_setzero_ps();
    y_1 = _mm256_setzero_ps();
    y_2 = _mm256_setzero_ps();
    y_3 = _mm256_setzero_ps();
    y_4 = _mm256_setzero_ps();
    y_5 = _mm256_setzero_ps();
    y_6 = _mm256_setzero_ps();
    y_7 = _mm256_setzero_ps();

    k=0;
    for(; k<kmax-7; k+=8)
    {
        x_0 = _mm256_loadu_ps( &x[0] );

        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        a_00 = _mm256_load_ps( &A[0+lda*0] );
        ax_temp = _mm256_mul_ps( a_00, x_0 );
        y_0 = _mm256_add_ps( y_0, ax_temp );
        a_01 = _mm256_load_ps( &A[0+lda*1] );
        ax_temp = _mm256_mul_ps( a_01, x_0 );
        y_1 = _mm256_add_ps( y_1, ax_temp );
        a_02 = _mm256_load_ps( &A[0+lda*2] );
        ax_temp = _mm256_mul_ps( a_02, x_0 );
        y_2 = _mm256_add_ps( y_2, ax_temp );
        a_03 = _mm256_load_ps( &A[0+lda*3] );
        ax_temp = _mm256_mul_ps( a_03, x_0 );
        y_3 = _mm256_add_ps( y_3, ax_temp );

        __builtin_prefetch( A + sda*lda + 4*lda );
        __builtin_prefetch( A + sda*lda + 6*lda );

        a_00 = _mm256_load_ps( &A[0+lda*4] );
        ax_temp = _mm256_mul_ps( a_00, x_0 );
        y_4 = _mm256_add_ps( y_4, ax_temp );
        a_01 = _mm256_load_ps( &A[0+lda*5] );
        ax_temp = _mm256_mul_ps( a_01, x_0 );
        y_5 = _mm256_add_ps( y_5, ax_temp );
        a_02 = _mm256_load_ps( &A[0+lda*6] );
        ax_temp = _mm256_mul_ps( a_02, x_0 );
        y_6 = _mm256_add_ps( y_6, ax_temp );
        a_03 = _mm256_load_ps( &A[0+lda*7] );
        ax_temp = _mm256_mul_ps( a_03, x_0 );
        y_7 = _mm256_add_ps( y_7, ax_temp );

        A += sda*lda;
        x += lda;
    }

    x_0 = _mm256_loadu_ps( &x[0] );

    a_00 = _mm256_load_ps( &A[0+lda*0] );
    a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
    ax_temp = _mm256_mul_ps( a_00, x_0 );
    y_0 = _mm256_add_ps( y_0, ax_temp );
    a_01 = _mm256_load_ps( &A[0+lda*1] );
    a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
    ax_temp = _mm256_mul_ps( a_01, x_0 );
    y_1 = _mm256_add_ps( y_1, ax_temp );
    a_02 = _mm256_load_ps( &A[0+lda*2] );
    a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
    ax_temp = _mm256_mul_ps( a_02, x_0 );
    y_2 = _mm256_add_ps( y_2, ax_temp );
    a_03 = _mm256_load_ps( &A[0+lda*3] );
    a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
    ax_temp = _mm256_mul_ps( a_03, x_0 );
    y_3 = _mm256_add_ps( y_3, ax_temp );

    a_00 = _mm256_load_ps( &A[0+lda*4] );
    a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
    ax_temp = _mm256_mul_ps( a_00, x_0 );
    y_4 = _mm256_add_ps( y_4, ax_temp );
    a_01 = _mm256_load_ps( &A[0+lda*5] );
    a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
    ax_temp = _mm256_mul_ps( a_01, x_0 );
    y_5 = _mm256_add_ps( y_5, ax_temp );
    a_02 = _mm256_load_ps( &A[0+lda*6] );
    a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
    ax_temp = _mm256_mul_ps( a_02, x_0 );
    y_6 = _mm256_add_ps( y_6, ax_temp );
    a_03 = _mm256_load_ps( &A[0+lda*7] );
/*  a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/
    ax_temp = _mm256_mul_ps( a_03, x_0 );
    y_7 = _mm256_add_ps( y_7, ax_temp );

    // reduction
    __m256 z_0;

    y_0 = _mm256_hadd_ps(y_0, y_1);
    y_2 = _mm256_hadd_ps(y_2, y_3);
    y_4 = _mm256_hadd_ps(y_4, y_5);
    y_6 = _mm256_hadd_ps(y_6, y_7);

    y_0 = _mm256_hadd_ps(y_0, y_2);
    y_4 = _mm256_hadd_ps(y_4, y_6);

    y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
    y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);

    y_0 = _mm256_add_ps(y_1, y_2);

    // store
    if(alg==0)
    {
        _mm256_storeu_ps(&y[0], y_0);
    }
    else if(alg==1)
    {
        z_0 = _mm256_loadu_ps( &y[0] );
        z_0 = _mm256_add_ps(z_0, y_0);
        _mm256_storeu_ps(&y[0], z_0);
    }
    else // alg==-1
    {
        z_0 = _mm256_loadu_ps( &y[0] );
        z_0 = _mm256_sub_ps(z_0, y_0);
        _mm256_storeu_ps(&y[0], z_0);
    }
}
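The reduction at the end is a reusable pattern: eight accumulators collapse into one vector whose element i is the full horizontal sum of accumulator i, using three rounds of hadd plus two lane permutes. Isolated as a helper (name ours, not part of the kernel):

#include <immintrin.h>

static __m256 hsum8x8_ps(const __m256 acc[8])
{
    __m256 h01   = _mm256_hadd_ps(acc[0], acc[1]);
    __m256 h23   = _mm256_hadd_ps(acc[2], acc[3]);
    __m256 h45   = _mm256_hadd_ps(acc[4], acc[5]);
    __m256 h67   = _mm256_hadd_ps(acc[6], acc[7]);
    __m256 q0123 = _mm256_hadd_ps(h01, h23);   /* per-lane quarter sums of acc[0..3] */
    __m256 q4567 = _mm256_hadd_ps(h45, h67);   /* per-lane quarter sums of acc[4..7] */
    __m256 lo    = _mm256_permute2f128_ps(q0123, q4567, 0x20);  /* low lanes side by side  */
    __m256 hi    = _mm256_permute2f128_ps(q0123, q4567, 0x31);  /* high lanes side by side */
    return _mm256_add_ps(lo, hi);   /* element i = sum of all 8 floats in acc[i] */
}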
inline __m256 _mm256_broadcast_lo_ss(__m256 a)
{
    __m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
inline __m256 _mm256_broadcast_3_ss(__m256 a)
{
    __m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3));
    return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
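Both helpers splat one element of the low 128-bit lane across the whole register using only AVX: the in-lane permute duplicates the element within each half, and the permute2f128/blend pair copies the low half's value into the upper half. With AVX2 the same thing is a single cross-lane permute; a sketch (ours):

// AVX2-only alternative: broadcast element i (0..7) of a to all lanes.
inline __m256 broadcast_lane_avx2(__m256 a, int i)
{
    return _mm256_permutevar8x32_ps(a, _mm256_set1_epi32(i));
}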
        log++;
    }
    nl = (1 << log);
    *newLength = nl;
    float *ret = mallocf(nl + additionalLength);
    memcpy(ret, ptr, length * sizeof(float));
    memsetf(ret + length, 0.f, nl - length);
    return ret;
}

float *rmemcpyf(float *__restrict dest, const float *__restrict src, size_t length)
{
#ifdef __AVX__
    for (int i = 0; i < (int)length - 7; i += 8) {
        __m256 vec = _mm256_loadu_ps(src + i);
        vec = _mm256_permute2f128_ps(vec, vec, 1);
        vec = _mm256_permute_ps(vec, 0x1B);
        _mm256_storeu_ps(dest + length - i - 8, vec);
    }
    for (size_t i = (length & ~0x7); i < length; i++) {
        dest[length - i - 1] = src[i];
    }
#elif defined(__ARM_NEON__)
    for (int i = 0; i < (int)length - 3; i += 4) {
        float32x4_t vec = vld1q_f32(src + i);
        vec = vrev64q_f32(vec);
        vec = vcombine_f32(vget_high_f32(vec), vget_low_f32(vec));
        vst1q_f32(dest + length - i - 4, vec);
    }
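A scalar reference for rmemcpyf as a sketch (the return value is our assumption, since the snippet is cut off before the function ends): the AVX body reverses eight floats at a time by swapping the two 128-bit lanes (permute2f128 imm 1) and then reversing within each lane (permute imm 0x1B maps 0,1,2,3 to 3,2,1,0).

float *rmemcpyf_ref(float *__restrict dest, const float *__restrict src, size_t length)
{
    for (size_t i = 0; i < length; i++)
        dest[length - i - 1] = src[i];
    return dest;
}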
M_ALWAYS_INLINE static void bar(float (& input)[8])
{
    /*
    static constexpr uint_fast8_t idx[][2] = {
        {0, 1}, {3, 2}, {4, 5}, {7, 6}, // (1)
        {0, 2}, {1, 3}, {6, 4}, {7, 5}, // (2)
        {0, 1}, {2, 3}, {5, 4}, {7, 6}, // (3)
        {0, 4}, {1, 5}, {2, 6}, {3, 7}, // (4)
        {0, 2}, {1, 3}, {4, 6}, {5, 7}, // (5)
        {0, 1}, {2, 3}, {4, 5}, {6, 7}  // (6)
    };
    */
    // The indices have to be put into a form that is more convenient for the
    // AVX instructions. There is no point in writing a loop and then unrolling
    // it, because stage (4) is a special case... better done by hand.
    static constexpr int blend_mask_1 = 0b10011001;
    static constexpr int blend_mask_2 = 0b11000011;
    static constexpr int blend_mask_3 = 0b10100101;
    static constexpr int blend_mask_4 = 0b00001111;
    static constexpr int blend_mask_5 = 0b00110011;
    static constexpr int blend_mask_6 = 0b01010101;

    // Used by stages (1), (3) and (6)
    static constexpr int permute_mask_1 = 0b10110001;
    // Used by stages (2) and (5)
    static constexpr int permute_mask_2 = 0b01001110;

    __m256 result = _mm256_load_ps(input);

    // (1)
    __m256 mapped = _mm256_permute_ps(result, permute_mask_1);
    __m256 min = _mm256_min_ps(result, mapped);
    __m256 max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_1);

    // (2)
    mapped = _mm256_permute_ps(result, permute_mask_2);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_2);

    // (3)
    mapped = _mm256_permute_ps(result, permute_mask_1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_3);

    // (4) The special case: here we have to permute between
    // the two halves of the YMM register.
    mapped = _mm256_permute2f128_ps(result, result, 1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_4);

    // (5)
    mapped = _mm256_permute_ps(result, permute_mask_2);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_5);

    // (6)
    mapped = _mm256_permute_ps(result, permute_mask_1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_6);
    /**/

    _mm256_store_ps(input, result);
}
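A usage sketch for the network above (our harness, assumed to live in the same translation unit as bar): the input must be 32-byte aligned because bar uses _mm256_load_ps/_mm256_store_ps.

#include <cstdio>

int main()
{
    alignas(32) float v[8] = { 5.f, 1.f, 8.f, 3.f, 7.f, 2.f, 6.f, 4.f };
    bar(v);
    for (float f : v)
        std::printf("%g ", f);
    std::printf("\n");
    return 0;
}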
static BOOST_FORCEINLINE T perm2_ ( __m256 const& a0, __m256 const& a1, Mask const&)
{
    return _mm256_permute2f128_ps(a0, a1, Mask::value);
}
template<size_t i0, size_t i1>
INLINE avxb shuffle(const avxb& a, const avxb& b)
{
    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
}
__m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  return _mm256_permute2f128_ps(a, b, 0x13);
}