static inline void hyperloglog_count_avx2(const uint8_t *registers, uint32_t n_registers,
                                          float *inverse_sum, uint32_t *n_zeros)
{
    const __m256i ones = _mm256_castps_si256(_mm256_set1_ps(1.0f));
    __m256 agg = _mm256_set1_ps(0.0f);

    for (size_t i = 0; i < n_registers / sizeof(__m256i); ++i) {
        const __m256i simd = _mm256_load_si256((__m256i *)registers + i);

        /* VPSRLDQ shifts within each 128-bit lane rather than across the full
         * 256-bit register, so process the two halves separately. */
        const __m128i low = _mm256_extracti128_si256(simd, 0);
        const __m128i high = _mm256_extracti128_si256(simd, 1);

        __m256i sums = inverse_power_avx2(low);
        agg = _mm256_add_ps(agg, _mm256_castsi256_ps(sums));

        sums = inverse_power_avx2(_mm_srli_si128(low, 8));
        agg = _mm256_add_ps(agg, _mm256_castsi256_ps(sums));

        sums = inverse_power_avx2(high);
        agg = _mm256_add_ps(agg, _mm256_castsi256_ps(sums));

        sums = inverse_power_avx2(_mm_srli_si128(high, 8));
        agg = _mm256_add_ps(agg, _mm256_castsi256_ps(sums));

        *n_zeros += _mm256_cntz_epi8(simd);
    }

    *inverse_sum = horizontal_sum_avx2(agg);
}
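/*
 * The snippet above relies on three helpers defined elsewhere in its original
 * source: inverse_power_avx2(), _mm256_cntz_epi8() and horizontal_sum_avx2().
 * As an illustration only, here is a minimal sketch of a horizontal-sum helper
 * compatible with the call above; the name comes from the snippet, but this
 * body is an assumption, not the original code.
 */
#include <immintrin.h>

static inline float horizontal_sum_avx2(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);              /* elements 0..3 */
    __m128 hi = _mm256_extractf128_ps(v, 1);            /* elements 4..7 */
    lo = _mm_add_ps(lo, hi);                            /* four partial sums */
    lo = _mm_add_ps(lo, _mm_movehl_ps(lo, lo));         /* two partial sums */
    lo = _mm_add_ss(lo, _mm_shuffle_ps(lo, lo, 0x55));  /* total in lane 0 */
    return _mm_cvtss_f32(lo);
}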
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member arrays of the st_coords structure arr.
 *
 * Traverse elements randomly, eight at a time.
 */
void comp_s(st_coords arr, int L)
{
    for (int j = 0; j < L; j += 8) {
        int i = (rand() % (L / 8)) * 8;
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);

#ifdef FMA
        register __m256 s0;
        s0 = _mm256_mul_ps(x, x);
        s0 = _mm256_fmadd_ps(y, y, s0);
        s0 = _mm256_fmadd_ps(z, z, s0);
        s0 = _mm256_fmsub_ps(t, t, s0);   /* t*t - (x*x + y*y + z*z) */
        s0 = _mm256_sqrt_ps(s0);
#else
        register __m256 s0, s1;
        s1 = _mm256_mul_ps(x, x);
        s0 = _mm256_mul_ps(y, y);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(z, z);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(t, t);
        s1 = _mm256_sub_ps(s0, s1);
        s0 = _mm256_sqrt_ps(s1);
#endif

        _mm256_store_ps(&arr.s[i], s0);
    }
    return;
}
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ,
                                              const float3 posJ, float *accI) {
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x,
                               posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y,
                               posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z,
                               posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2),
                                               _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);
}
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ,
                                              const float3 posJ, float3(&accI)[8]) {
    // _mm256_set_ps takes arguments from the highest lane down, so lane i holds
    // posI[7 - i]; the store loop below reverses the order again.
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x,
                               posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y,
                               posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z,
                               posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2),
                                               _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    // .m256_f32 element access is MSVC-specific.
    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }
}
static inline void blend_unorm8_argb(struct reg *src, __m256i dst_argb)
{
    if (gt.blend.enable) {
        const __m256i mask = _mm256_set1_epi32(0xff);
        const __m256 scale = _mm256_set1_ps(1.0f / 255.0f);
        struct reg dst[4];

        /* Convert to float */
        dst[2].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[1].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[0].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[3].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);

        /* Blend, assuming src BLENDFACTOR_SRC_ALPHA, dst
         * BLENDFACTOR_INV_SRC_ALPHA, and BLENDFUNCTION_ADD. */
        const __m256 inv_alpha = _mm256_sub_ps(_mm256_set1_ps(1.0f), src[3].reg);
        src[0].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[0].reg),
                                   _mm256_mul_ps(inv_alpha, dst[0].reg));
        src[1].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[1].reg),
                                   _mm256_mul_ps(inv_alpha, dst[1].reg));
        src[2].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[2].reg),
                                   _mm256_mul_ps(inv_alpha, dst[2].reg));
        src[3].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[3].reg),
                                   _mm256_mul_ps(inv_alpha, dst[3].reg));
    }
}
static void NOINLINE mulX8(const __m256 *v1, const __m256 *v2, __m256 *vout)
{
    static const int ALIGN32 p1[8] = { 0, 0, 0, 0, 1, 1, 1, 1 };
    static const int ALIGN32 p2[8] = { 2, 2, 2, 2, 3, 3, 3, 3 };
    static const int ALIGN32 p3[8] = { 4, 4, 4, 4, 5, 5, 5, 5 };
    static const int ALIGN32 p4[8] = { 6, 6, 6, 6, 7, 7, 7, 7 };

    const __m256i perm1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(p1));
    const __m256i perm2 = _mm256_load_si256(reinterpret_cast<const __m256i*>(p2));
    const __m256i perm3 = _mm256_load_si256(reinterpret_cast<const __m256i*>(p3));
    const __m256i perm4 = _mm256_load_si256(reinterpret_cast<const __m256i*>(p4));

    for (int r = 0; r < 2; r++) {
        __m256 a0 = _mm256_permutevar8x32_ps(v1[r], perm1);
        __m256 a1 = _mm256_permutevar8x32_ps(v1[r], perm2);
        __m256 a2 = _mm256_permutevar8x32_ps(v1[r], perm3);
        __m256 a3 = _mm256_permutevar8x32_ps(v1[r], perm4);

        __m256 b0 = _mm256_mul_ps(a0, v2[0]);
        __m256 b1 = _mm256_mul_ps(a1, v2[1]);
        __m256 b2 = _mm256_mul_ps(a2, v2[0]);
        __m256 b3 = _mm256_mul_ps(a3, v2[1]);

        __m256 c0 = _mm256_add_ps(b0, b1);
        __m256 c1 = _mm256_add_ps(b2, b3);

        __m256 d0 = _mm256_permute2f128_ps(c0, c1, _MM_SHUFFLE(0, 2, 0, 0));
        __m256 d1 = _mm256_permute2f128_ps(c0, c1, _MM_SHUFFLE(0, 3, 0, 1));

        vout[r] = _mm256_add_ps(d0, d1);
    }
}
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{
#ifdef AVX
    // Black magic
    // But it does produce the same results as the non-AVX code
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    // We now have a vector of 8 floats
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    // And now we don't
    return std::sqrtf(_mm_cvtss_f32(t4));
#else
    // Can be autovectorized
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++) {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    // If done properly this should be 4 (8) instructions
    for (int i = 0; i < 30; i++) {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrtf(distance);
#endif
}
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

#define _float_cords_dot_prod(curr, next, index)                                                \
    _mm256_dp_ps(                                                                               \
        curr,                                                                                   \
        _mm256_xor_ps(                                                                          \
            _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
            _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)                              \
        ),                                                                                      \
        0b11110000 | (1 << (index))                                                             \
    )

    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...

    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum));
         index < (cords_len - 1);
         index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//  return scalar_half(scalar_abs(accum_sum_aux));
}
/* AVX implementation for Minimum Mean Squared Error (MMSE) solver */
inline void srslte_mat_2x2_mmse_avx(__m256 y0, __m256 y1,
                                    __m256 h00, __m256 h01, __m256 h10, __m256 h11,
                                    __m256 *x0, __m256 *x1,
                                    float noise_estimate, float norm) {
  __m256 _noise_estimate = _mm256_set_ps(0.0f, noise_estimate, 0.0f, noise_estimate,
                                         0.0f, noise_estimate, 0.0f, noise_estimate);
  __m256 _norm = _mm256_set1_ps(norm);

  /* Create conjugated matrix */
  __m256 _h00 = _MM256_CONJ_PS(h00);
  __m256 _h01 = _MM256_CONJ_PS(h01);
  __m256 _h10 = _MM256_CONJ_PS(h10);
  __m256 _h11 = _MM256_CONJ_PS(h11);

  /* 1. A = H' x H + No */
#ifdef LV_HAVE_FMA
  __m256 a00 = _MM256_SQMOD_ADD_PS(h00, h10, _noise_estimate);
  __m256 a01 = _MM256_PROD_ADD_PS(_h00, h01, _MM256_PROD_PS(_h10, h11));
  __m256 a10 = _MM256_PROD_ADD_PS(_h01, h00, _MM256_PROD_PS(_h11, h10));
  __m256 a11 = _MM256_SQMOD_ADD_PS(h01, h11, _noise_estimate);
#else
  __m256 a00 = _mm256_add_ps(_MM256_SQMOD_PS(h00, h10), _noise_estimate);
  __m256 a01 = _mm256_add_ps(_MM256_PROD_PS(_h00, h01), _MM256_PROD_PS(_h10, h11));
  __m256 a10 = _mm256_add_ps(_MM256_PROD_PS(_h01, h00), _MM256_PROD_PS(_h11, h10));
  __m256 a11 = _mm256_add_ps(_MM256_SQMOD_PS(h01, h11), _noise_estimate);
#endif /* LV_HAVE_FMA */

  /* 2. B = inv(H' x H + No) = inv(A) */
  __m256 b00 = a11;
  __m256 b01 = _mm256_xor_ps(a01, _mm256_set1_ps(-0.0f));
  __m256 b10 = _mm256_xor_ps(a10, _mm256_set1_ps(-0.0f));
  __m256 b11 = a00;
  _norm = _mm256_mul_ps(_norm, srslte_mat_cf_recip_avx(srslte_mat_2x2_det_avx(a00, a01, a10, a11)));

  /* 3. W = inv(H' x H + No) x H' = B x H' */
#ifdef LV_HAVE_FMA
  __m256 w00 = _MM256_PROD_ADD_PS(b00, _h00, _MM256_PROD_PS(b01, _h01));
  __m256 w01 = _MM256_PROD_ADD_PS(b00, _h10, _MM256_PROD_PS(b01, _h11));
  __m256 w10 = _MM256_PROD_ADD_PS(b10, _h00, _MM256_PROD_PS(b11, _h01));
  __m256 w11 = _MM256_PROD_ADD_PS(b10, _h10, _MM256_PROD_PS(b11, _h11));
#else
  __m256 w00 = _mm256_add_ps(_MM256_PROD_PS(b00, _h00), _MM256_PROD_PS(b01, _h01));
  __m256 w01 = _mm256_add_ps(_MM256_PROD_PS(b00, _h10), _MM256_PROD_PS(b01, _h11));
  __m256 w10 = _mm256_add_ps(_MM256_PROD_PS(b10, _h00), _MM256_PROD_PS(b11, _h01));
  __m256 w11 = _mm256_add_ps(_MM256_PROD_PS(b10, _h10), _MM256_PROD_PS(b11, _h11));
#endif /* LV_HAVE_FMA */

  /* 4. X = W x Y */
#ifdef LV_HAVE_FMA
  *x0 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w00, _MM256_PROD_PS(y1, w01)), _norm);
  *x1 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w10, _MM256_PROD_PS(y1, w11)), _norm);
#else
  *x0 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w00), _MM256_PROD_PS(y1, w01)), _norm);
  *x1 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w10), _MM256_PROD_PS(y1, w11)), _norm);
#endif /* LV_HAVE_FMA */
}
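/*
 * _MM256_PROD_PS, _MM256_SQMOD_PS and the *_ADD_PS variants above are srsLTE
 * macros defined elsewhere. For orientation only, a common AVX idiom for the
 * interleaved complex multiply such a macro performs is sketched below; this
 * is an assumption for illustration, not the library's actual macro.
 */
#include <immintrin.h>

static inline __m256 cplx_prod_avx_sketch(__m256 a, __m256 b)
{
    __m256 b_swap = _mm256_permute_ps(b, 0xB1);  /* swap re/im in each pair */
    __m256 a_re   = _mm256_moveldup_ps(a);       /* duplicate real parts */
    __m256 a_im   = _mm256_movehdup_ps(a);       /* duplicate imaginary parts */
    /* even lanes: ar*br - ai*bi, odd lanes: ar*bi + ai*br */
    return _mm256_addsub_ps(_mm256_mul_ps(a_re, b), _mm256_mul_ps(a_im, b_swap));
}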
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst,
                                 float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;

        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m256 sum_l = _mm256_setzero_ps();
   __m256 sum_r = _mm256_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps  = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m256 delta = _mm256_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 8)
   {
      __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
      __m256 buf_r = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m256 deltas = _mm256_load_ps(delta_table + i);
      __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                    _mm256_mul_ps(deltas, delta));
#else
      __m256 sinc = _mm256_load_ps(phase_table + i);
#endif
      sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
      sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
   }

   /* hadd on AVX is weird, and acts on low-lanes
    * and high-lanes separately. */
   __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
   __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
   res_l = _mm256_hadd_ps(res_l, res_l);
   res_r = _mm256_hadd_ps(res_r, res_r);
   res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
   res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

   /* This is optimized to mov %xmmN, [mem].
    * There doesn't seem to be any _mm256_store_ss intrinsic. */
   _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
   _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
void avx2_csr_spmv(float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y)
{
    int32_t A_offset = 0;
    for (int32_t i = 0; i < n; i++) {
        int32_t nElem = nIdx[i];
        float t = 0.0f;
        __m256 vT = _mm256_setzero_ps();
        int32_t smLen = nElem - (nElem & 7);

        for (int32_t j = 0; j < smLen; j += 8) {
            __m256i vIdx = _mm256_load_si256((__m256i*)&(indices[i][j]));
            __m256 vX = _mm256_i32gather_ps((float const*)x, vIdx, 4);
            __m256 vA = _mm256_loadu_ps(&A[A_offset + j]);
            vT = _mm256_add_ps(vT, _mm256_mul_ps(vX, vA));
        }
        t += sum8(vT);

        /* Scalar tail for the remaining (nElem % 8) entries of the row. */
        for (int32_t j = smLen; j < nElem; j++) {
            int32_t idx = indices[i][j];
            t += x[idx] * A[A_offset + j];
        }

        y[i] = t;
        A_offset += nElem;
    }
}
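/*
 * sum8() above is a helper not shown in the snippet. A minimal sketch of a
 * compatible horizontal sum is given below for illustration; the original
 * implementation may differ.
 */
#include <immintrin.h>

static inline float sum8(__m256 v)
{
    __m128 lo = _mm_add_ps(_mm256_castps256_ps128(v),
                           _mm256_extractf128_ps(v, 1)); /* fold 256 -> 128 */
    lo = _mm_hadd_ps(lo, lo);                            /* pairwise sums */
    lo = _mm_hadd_ps(lo, lo);                            /* total in lane 0 */
    return _mm_cvtss_f32(lo);
}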
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  __m256 YMM0, YMM1;
  for (i = 0; i <= ((n)-16); i += 16) {
    YMM0 = _mm256_loadu_ps(x+i);
    YMM1 = _mm256_loadu_ps(x+i+8);
    YMM0 = _mm256_add_ps(YMM0, YMM15);
    YMM1 = _mm256_add_ps(YMM1, YMM15);
    _mm256_storeu_ps(y+i, YMM0);
    _mm256_storeu_ps(y+i+8, YMM1);
  }
  for (; i < (n); i++) {
    y[i] = x[i] + c;
  }
}
//-------------------------------------------------------------------
// blend
void effect(float *pBuffer[], const Cbmp *bmp, const float weight)
{
    size_t width = bmp->getWidth();
    size_t height = bmp->getAbsHeight();

    const float fmul0 = weight;
    const float fmul1 = 1.0f - weight;

    __m256 weight0 = _mm256_broadcast_ss(&fmul0);
    __m256 weight1 = _mm256_broadcast_ss(&fmul1);

    float *pF[2];
    pF[0] = pBuffer[0];
    pF[1] = pBuffer[1];

    for (size_t y = 0; y < height; y++) {
        for (size_t x = 0; x < width; x += 8, pF[0] += 8, pF[1] += 8) {
            __m256 p0 = _mm256_load_ps(pF[0]);
            p0 = _mm256_mul_ps(p0, weight0);
            __m256 p1 = _mm256_load_ps(pF[1]);
            p1 = _mm256_mul_ps(p1, weight1);
            __m256 r = _mm256_add_ps(p0, p1);
            _mm256_store_ps(pF[0], r);
        }
    }
}
float avx_dot_product(std::vector<float> &av, std::vector<float> &bv)
{
    /* Get SIMD-vector pointers to the start of each vector */
    unsigned int niters = av.size() / 8;
    float *a = (float *) aligned_alloc(32, av.size()*sizeof(float));
    float *b = (float *) aligned_alloc(32, av.size()*sizeof(float));
    memcpy(a, &av[0], av.size()*sizeof(float));
    memcpy(b, &bv[0], bv.size()*sizeof(float));
    __m256 *ptrA = (__m256*) &a[0], *ptrB = (__m256*) &b[0];
    __m256 res = _mm256_set1_ps(0.0);

    for (unsigned int i = 0; i < niters; i++, ptrA++, ptrB++)
        res = _mm256_add_ps(_mm256_dp_ps(*ptrA, *ptrB, 255), res);

    /* Get result back from the SIMD vector */
    float fres[8];
    _mm256_storeu_ps(fres, res);
    int q = 8 * niters;

    /* Scalar tail for the last (size % 8) elements */
    for (unsigned int i = 0; i < av.size() % 8; i++)
        fres[0] += (a[i+q] * b[i+q]);

    free(a);
    free(b);

    /* _mm256_dp_ps sums within each 128-bit lane, so the two lane totals
       end up in elements 0 and 4. */
    return fres[0] + fres[4];
}
float nv_vector_dot(const nv_matrix_t *vec1, int m1, const nv_matrix_t *vec2, int m2)
{
    NV_ASSERT(vec1->n == vec2->n);

#if NV_ENABLE_AVX
    {
        NV_ALIGNED(float, mm[8], 32);
        __m256 x, u;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);
        float dp = 0.0f;

        u = _mm256_setzero_ps();
        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec2, m2, n));
            u = _mm256_add_ps(u, _mm256_mul_ps(x, *(__m256 *)&NV_MAT_V(vec1, m1, n)));
        }
        _mm256_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7];
        for (n = pk_lp; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#elif NV_ENABLE_SSE2
    {
        NV_ALIGNED(float, mm[4], 16);
        __m128 x, u;
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);
        float dp = 0.0f;

        u = _mm_setzero_ps();
        for (n = 0; n < pk_lp; n += 4) {
            x = _mm_load_ps(&NV_MAT_V(vec2, m2, n));
            u = _mm_add_ps(u, _mm_mul_ps(x, *(__m128 *)&NV_MAT_V(vec1, m1, n)));
        }
        _mm_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3];
        for (n = pk_lp; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#else
    {
        int n;
        float dp = 0.0f;
        for (n = 0; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#endif
}
void polynomial(float *ret, const float *const r_values, int num)
{
    // r*r*r*(10+r*(-15+r*6));
    __m256 const_6      = _mm256_set1_ps(6.0f);
    __m256 const_neg_15 = _mm256_set1_ps(-15.0f);
    __m256 const_10     = _mm256_set1_ps(10.0f);   // constants

    const int loop_factor = 8;

    for (int i = 0; i < num; i += loop_factor) {
#ifdef USE_IACA
        IACA_START
#endif
        __m256 r;
        __m256 left;
        __m256 right;

        // aligned load of 256 bits into r
        r = _mm256_load_ps(&r_values[i]);
        left = _mm256_mul_ps(r, r);                    // r * r
#ifndef __FMA__
        right = _mm256_mul_ps(r, const_6);             // r * 6
        left = _mm256_mul_ps(left, r);                 // r * r * r
        right = _mm256_add_ps(right, const_neg_15);    // -15 + r * 6
        right = _mm256_mul_ps(right, r);               // r * (-15 + r * 6)
        right = _mm256_add_ps(right, const_10);        // 10 + (r * (-15 + r * 6))
#else
        right = _mm256_fmadd_ps(r, const_6, const_neg_15);
        left = _mm256_mul_ps(left, r);
        right = _mm256_fmadd_ps(r, right, const_10);
#endif
        right = _mm256_mul_ps(right, left);            // r*r*r * (10 + r * (-15 + r * 6))
        _mm256_store_ps(&ret[i], right);               // store 8 values to ret[i]
    }
#ifdef USE_IACA
    IACA_END
#endif
}
__m256 mm256_exp_ps(__m256 x)
{
    __m256 tmp = _mm256_setzero_ps(), fx;
    __m256i emm0;
    __m256 one = *(__m256*)m256_ps_1;

    x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi);
    x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF);
    fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5);

    /* how to perform a floorf with SSE: just below */
    /* step 1 : cast to int */
    emm0 = _mm256_cvttps_epi32(fx);
    /* step 2 : cast back to float */
    tmp = _mm256_cvtepi32_ps(emm0);

    /* if greater, subtract 1 */
    __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
    mask = _mm256_and_ps(mask, one);
    fx = _mm256_sub_ps(tmp, mask);

    tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1);
    __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2);
    x = _mm256_sub_ps(x, tmp);
    x = _mm256_sub_ps(x, z);

    z = _mm256_mul_ps(x, x);

    __m256 y = *(__m256*)m256_ps_cephes_exp_p0;
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, x);
    y = _mm256_add_ps(y, one);

    /* build 2^n */
    emm0 = _mm256_cvttps_epi32(fx);
    emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f);
    emm0 = _mm256_slli_epi32(emm0, 23);
    __m256 pow2n = _mm256_castsi256_ps(emm0);

    y = _mm256_mul_ps(y, pow2n);
    _mm256_zeroupper();
    return y;
}
v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp  = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x, x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
/* AVX implementation for complex reciprocal */
inline __m256 srslte_mat_cf_recip_avx(__m256 a) {
  __m256 conj = _MM256_CONJ_PS(a);
  __m256 sqabs = _mm256_mul_ps(a, a);
  sqabs = _mm256_add_ps(_mm256_movehdup_ps(sqabs), _mm256_moveldup_ps(sqabs));

  __m256 recp = _mm256_rcp_ps(sqabs);

  return _mm256_mul_ps(recp, conj);
}
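/*
 * _MM256_CONJ_PS is defined elsewhere in srsLTE. Judging by how sqabs is built
 * above (interleaved re/im pairs), an assumed, illustrative definition would
 * flip the sign bit of every imaginary component; this is a sketch, not the
 * library's verbatim macro.
 */
#define _MM256_CONJ_PS_SKETCH(a)                                        \
    _mm256_xor_ps((a), _mm256_setr_ps(0.0f, -0.0f, 0.0f, -0.0f,         \
                                      0.0f, -0.0f, 0.0f, -0.0f))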
void mandel_avx(unsigned char *image, const struct spec *s)
{
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4,
                                      x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout? */
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 8; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ,
                                                       const float3 posJ, float3(&accI)[8],
                                                       unsigned int(&numNeighbours)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x,
                               posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y,
                               posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z,
                               posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2),
                                               _mm256_add_ps(rz2, eps2)));

    __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale));
    __m256 close = _mm256_cmp_ps(rabs, cmpDistance, 2);   // predicate 2 == _CMP_LE_OS

    for (int i = 0; i < 8; i++) {
        if (close.m256_f32[i] == 0) {
            numNeighbours[7 - i] = 0;
        }
    }

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }
}
static void
sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t,
                                               const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const int cpp = 4;

    struct reg *src = &t->grf[args->src];

    const __m256 scale = _mm256_set1_ps(255.0f);
    const __m256 half = _mm256_set1_ps(0.5f);
    __m256i r, g, b, a;
    __m256i rgba;

    switch (args->rt.format) {
    case SF_R8G8B8A8_UNORM:
        r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
        g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
        b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
        a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
        break;
    case SF_B8G8R8A8_UNORM:
        b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
        g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
        r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
        a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
        break;
    default:
        stub("unorm8 ymajor format");
        return;
    }

    rgba = _mm256_slli_epi32(a, 8);
    rgba = _mm256_or_si256(rgba, b);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, g);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, r);

    /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
     * form linear owords of pixels. */
    rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

    _mm_maskstore_epi32(base,
                        _mm256_extractf128_si256(mask, 0),
                        _mm256_extractf128_si256(rgba, 0));
    _mm_maskstore_epi32(base + 16,
                        _mm256_extractf128_si256(mask, 1),
                        _mm256_extractf128_si256(rgba, 1));
}
__m256 distance(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
{
    const __m256 x_diff = _mm256_sub_ps(x1, x2);
    const __m256 y_diff = _mm256_sub_ps(y1, y2);
    const __m256 x_diff2 = _mm256_mul_ps(x_diff, x_diff);
    const __m256 y_diff2 = _mm256_mul_ps(y_diff, y_diff);
    const __m256 sum = _mm256_add_ps(x_diff2, y_diff2);
    const __m256 dist = _mm256_sqrt_ps(sum);
    return dist;
}
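/*
 * Illustrative usage only: the buffer, inputs and function name below are made
 * up for demonstration; they assume the distance() overload above is visible
 * and compute eight 2-D Euclidean distances at once.
 */
#include <immintrin.h>

void distance_demo()
{
    alignas(32) float out[8];
    const __m256 x1 = _mm256_setzero_ps();
    const __m256 y1 = _mm256_setzero_ps();
    const __m256 x2 = _mm256_setr_ps(3.f, 0.f, 1.f, 2.f, 5.f, 8.f, 6.f, 9.f);
    const __m256 y2 = _mm256_setr_ps(4.f, 1.f, 0.f, 0.f, 12.f, 15.f, 8.f, 12.f);
    _mm256_store_ps(out, distance(x1, y1, x2, y2));  /* {5, 1, 1, 2, 13, 17, 10, 15} */
}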
static inline __m256i
to_unorm(__m256 reg, float scale_f)
{
    const __m256 scale = _mm256_set1_ps(scale_f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 zero = _mm256_set1_ps(0.0f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 clamped = _mm256_max_ps(_mm256_min_ps(reg, one), zero);

    return _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(clamped, scale), half));
}
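/*
 * Illustrative note: to quantize a normalized color channel to 8-bit unorm,
 * the scale would typically be 255.0f. The wrapper name below is hypothetical
 * and exists only to show the call.
 */
static inline __m256i quantize_channel_demo(__m256 channel)
{
    return to_unorm(channel, 255.0f);   /* e.g. lanes holding 0.5f become 128 */
}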
void Scaler::process_vect_flt_avx2(__m256 &sum0, __m256 &sum1, int kernel_size,
                                   const float *coef_base_ptr,
                                   typename SRC::PtrConst::Type pix_ptr,
                                   const __m256i &zero, int src_stride,
                                   const __m256 &add_cst, int len)
{
    // Possible optimization: initialize the sum with DST::OFFSET + _add_cst_flt
    // and save the add in the write proxy.
    sum0 = add_cst;
    sum1 = add_cst;

    for (int k = 0; k < kernel_size; ++k)
    {
        __m256 coef = _mm256_set1_ps(coef_base_ptr[k]);

        __m256 src0;
        __m256 src1;
        ReadWrapperFlt<SRC, PF>::read(pix_ptr, src0, src1, zero, len);

        const __m256 val0 = _mm256_mul_ps(src0, coef);
        const __m256 val1 = _mm256_mul_ps(src1, coef);

        sum0 = _mm256_add_ps(sum0, val0);
        sum1 = _mm256_add_ps(sum1, val1);

        SRC::PtrConst::jump(pix_ptr, src_stride);
    }
}
inline avx_m256_t newsin_ps(avx_m256_t x) {
    avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
    x = _mm256_and_ps(x, _ps_inv_sign_mask);

    avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

    avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
    emm2 = _mm256_add_epi32(emm2, _pi32_1);
    emm2 = _mm256_and_si256(emm2, _pi32_inv1);
    y = _mm256_cvtepi32_ps(emm2);

    avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
    emm0 = _mm256_slli_epi32(emm0, 29);

    emm2 = _mm256_and_si256(emm2, _pi32_2);
    emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

    avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
    avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
    sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

    avx_m256_t temp = _ps_minus_cephes_DP123;
    temp = _mm256_mul_ps(y, temp);
    x = _mm256_add_ps(x, temp);

    avx_m256_t x2 = _mm256_mul_ps(x, x);
    avx_m256_t x3 = _mm256_mul_ps(x2, x);
    avx_m256_t x4 = _mm256_mul_ps(x2, x2);

    y = _ps_coscof_p0;
    avx_m256_t y2 = _ps_sincof_p0;
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p1);
    y2 = _mm256_add_ps(y2, _ps_sincof_p1);
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p2);
    y2 = _mm256_add_ps(y2, _ps_sincof_p2);
    y = _mm256_mul_ps(y, x4);
    y2 = _mm256_mul_ps(y2, x3);

    temp = _mm256_mul_ps(x2, _ps_0p5);
    temp = _mm256_sub_ps(temp, _ps_1);
    y = _mm256_sub_ps(y, temp);
    y2 = _mm256_add_ps(y2, x);

    y = _mm256_andnot_ps(poly_mask, y);
    y2 = _mm256_and_ps(poly_mask, y2);
    y = _mm256_add_ps(y, y2);

    y = _mm256_xor_ps(y, sign_bit);

    return y;
} // newsin_ps()
void somap::train(const imgdata &obj)
{
    if (this->weight <= 0.0)
        return;

    __m256 tmp;
    __m256 *v1 = (__m256*)(this->fvex);
    const __m256 *v2 = (__m256*)(obj.fvex);
    const __m256 ws = _mm256_set1_ps(this->weight);

    for (int i = 0; i < f; i++) {
        tmp = _mm256_sub_ps(v2[i], v1[i]);
        tmp = _mm256_mul_ps(tmp, ws);
        v1[i] = _mm256_add_ps(v1[i], tmp);
    }
}
double compute_pi_leibniz_avx_opt_single(size_t n)
{
    double pi = 0.0;
    register __m256 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256 ymm9, ymm10, ymm11, ymm12, ymm13;

    ymm0 = _mm256_set_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
    ymm1 = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
    ymm2 = _mm256_set_ps(17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, 31.0);
    ymm3 = _mm256_set_ps(33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0);
    ymm4 = _mm256_set_ps(49.0, 51.0, 53.0, 55.0, 57.0, 59.0, 61.0, 63.0);
    ymm13 = _mm256_set1_ps(64.0);

    ymm5 = _mm256_setzero_ps();
    ymm6 = _mm256_setzero_ps();
    ymm7 = _mm256_setzero_ps();
    ymm8 = _mm256_setzero_ps();

    for (int i = 0; i <= n - 32; i += 32) {
        ymm9 = _mm256_div_ps(ymm0, ymm1);
        ymm1 = _mm256_add_ps(ymm1, ymm13);
        ymm10 = _mm256_div_ps(ymm0, ymm2);
        ymm2 = _mm256_add_ps(ymm2, ymm13);
        ymm11 = _mm256_div_ps(ymm0, ymm3);
        ymm3 = _mm256_add_ps(ymm3, ymm13);
        ymm12 = _mm256_div_ps(ymm0, ymm4);
        ymm4 = _mm256_add_ps(ymm4, ymm13);

        ymm5 = _mm256_add_ps(ymm5, ymm9);
        ymm6 = _mm256_add_ps(ymm6, ymm10);
        ymm7 = _mm256_add_ps(ymm7, ymm11);
        ymm8 = _mm256_add_ps(ymm8, ymm12);
    }

    float tmp[8] __attribute__((aligned(32)));

    _mm256_store_ps(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] +
          tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] +
          tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] +
          tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] +
          tmp[4] + tmp[5] + tmp[6] + tmp[7];

    return pi * 4.0;
}
void nv_vector_add(nv_matrix_t *vec0, int m0,
                   const nv_matrix_t *vec1, int m1,
                   const nv_matrix_t *vec2, int m2)
{
    NV_ASSERT(vec1->n == vec2->n);
    NV_ASSERT(vec2->n == vec0->n);

#if NV_ENABLE_AVX
    {
        __m256 x;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);

        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
            _mm256_store_ps(&NV_MAT_V(vec0, m0, n),
                            _mm256_add_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n)));
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);
#ifdef _OPENMP
//#pragma omp parallel for
#endif
        for (n = 0; n < pk_lp; n += 4) {
            __m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
            _mm_store_ps(&NV_MAT_V(vec0, m0, n),
                         _mm_add_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n)));
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#endif
}