inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{
#ifdef AVX
    // Black magic, but it produces the same results as the non-AVX code.
    // Accumulate squared differences of the features, 8 floats at a time.
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    // We now have a vector of 8 partial sums...
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    // ...and now we don't: it has been reduced to a single float.
    return std::sqrt(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
    // Scalar fallback; simple enough that it can be auto-vectorized.
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++)
    {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    // If vectorized properly this should be 4 (8-wide) instructions.
    for (int i = 0; i < 30; i++)
    {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrt(distance);
#endif
}
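// The reduction above is the standard AVX horizontal-sum idiom: two hadd
// passes collapse each 128-bit lane to its own sum, then the two lanes are
// combined explicitly. As a minimal standalone sketch (hsum256_ps is a
// hypothetical helper, not part of the original code):
#include <immintrin.h>

static inline float hsum256_ps(__m256 v)
{
    __m256 t1 = _mm256_hadd_ps(v, v);          // pairwise sums within each 128-bit lane
    __m256 t2 = _mm256_hadd_ps(t1, t1);        // each lane now holds its own 4-element sum
    __m128 hi = _mm256_extractf128_ps(t2, 1);  // upper-lane sum
    __m128 lo = _mm256_castps256_ps128(t2);    // lower-lane sum
    return _mm_cvtss_f32(_mm_add_ss(lo, hi));  // combine the two lanes
}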
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

#define _float_cords_dot_prod(curr, next, index)                                                \
    _mm256_dp_ps(                                                                               \
        curr,                                                                                   \
        _mm256_xor_ps(                                                                          \
            _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
            _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)                              \
        ),                                                                                      \
        0b11110000 | (1 << (index))                                                             \
    )

    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...

    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
    // return scalar_half(scalar_abs(accum_sum_aux));
}
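/* Each _float_cords_dot_prod result packs one batch of "differences of
 * adjacent products", i.e. shoelace-formula terms, into a single lane of the
 * accumulator; the scalar tail (_calc_diff_of_adj_prods) finishes the
 * remaining vertices. A scalar sketch of the same accumulation, under one
 * common sign convention (hypothetical reference, not the library's code;
 * the commented-out return above halves and takes the absolute value of the
 * signed sum elsewhere): */
static float poly_area_signed_ref(const float (*cords)[2], unsigned long n)
{
    float sum = 0.0f;
    for (unsigned long i = 0; i + 1 < n; i++)
        sum += cords[i][0] * cords[i + 1][1] - cords[i][1] * cords[i + 1][0];
    return sum; /* signed; area = fabsf(sum) / 2 */
}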
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m256 sum_l             = _mm256_setzero_ps();
   __m256 sum_r             = _mm256_setzero_ps();

   const float *buffer_l    = resamp->buffer_l + resamp->ptr;
   const float *buffer_r    = resamp->buffer_r + resamp->ptr;

   unsigned taps            = resamp->taps;
   unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m256 delta             = _mm256_set1_ps((float)
         (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 8)
   {
      __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
      __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m256 deltas = _mm256_load_ps(delta_table + i);
      __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
            _mm256_mul_ps(deltas, delta));
#else
      __m256 sinc   = _mm256_load_ps(phase_table + i);
#endif
      sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
      sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
   }

   /* hadd on AVX is weird, and acts on low-lanes
    * and high-lanes separately. */
   __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
   __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
   res_l        = _mm256_hadd_ps(res_l, res_l);
   res_r        = _mm256_hadd_ps(res_r, res_r);
   res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
   res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

   /* This is optimized to mov %xmmN, [mem].
    * There doesn't seem to be any _mm256_store_ss intrinsic. */
   _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
   _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
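/* To see why the extra permute2f128/add step is needed: _mm256_hadd_ps never
 * mixes the two 128-bit lanes. A minimal self-contained check (hypothetical
 * test program, not part of the resampler): */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
   __m256 a = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
   __m256 h = _mm256_hadd_ps(a, a);
   float out[8];
   _mm256_storeu_ps(out, h);
   /* prints: 3 7 3 7 11 15 11 15 -- the lanes stay separate */
   for (int i = 0; i < 8; i++)
      printf("%g ", out[i]);
   printf("\n");
   return 0;
}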
void warmup(float *x, float *y, int size, float alpha)
{
    int i;
    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0,
                             1.0/alpha, 2.0, 1.0/alpha, 2.0);
#pragma ivdep
#pragma vector aligned
    for (i = 0; i < size; i += 4) {
        __m256 t   = _mm256_load_ps(x + 2*i);          // load 4 input pairs
        __m256 l   = _mm256_mul_ps(t, m);              // premultiply
        __m256 r   = _mm256_permute2f128_ps(l, l, 1);  // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);             // pairwise sums from both lanes
        __m128 s   = _mm256_extractf128_ps(res, 0);
        _mm_store_ps(y + i, s);                        // store 4 results
    }
}
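// Element by element, the kernel above forms a "premultiplied pair sum" of
// two adjacent inputs per output. A scalar reference sketch (hypothetical,
// assuming size is a multiple of 4 as the vector loop requires and that x
// holds 2*size floats):
void warmup_reference(const float *x, float *y, int size, float alpha)
{
    for (int i = 0; i < size; ++i)
        y[i] = 2.0f * x[2*i] + x[2*i + 1] / alpha;
}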
void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
{
    nn_workload_data_t *input_view = work_item->input[0]->output;
    const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

    const auto input_width  = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
    const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

    const auto num_full_blocks = output_width / C_data_stride;
    const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
    const auto subsimd_block_size = output_width % C_simd_width;

    const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];
    const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

    const auto out_fraction = arguments.input_fraction;

    float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

    auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

    auto shift = out_fraction;
    if (shift > 0)
    {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]) / (1 << shift);
    }
    else if (shift < 0)
    {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
    }
    else
    {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]);
    }

    __m256 acc_sum = _mm256_setzero_ps();
    float subsimd_sum = 0.0f;

    {
        auto input_buffer = input_f;
        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        for (auto block = 0u; block < num_full_blocks; ++block)
        {
            // Run computation.
            softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
        }

        switch (partial_block_size)
        {
        case  0: break;
        case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
        case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
        case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
        case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
        case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
        case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
        case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
        case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
        case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
        case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
        case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
        case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
        case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
        case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }

        switch (subsimd_block_size)
        {
        case 0: break;
        case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
        case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
        case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
        case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
        case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
        case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
        case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    {
        __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
        intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
        intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
        intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

        acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
        subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

        acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
        subsimd_sum = 1.0f / subsimd_sum;
    }

    {
        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        for (auto block = 0u; block < num_full_blocks; ++block)
        {
            // Run computation.
            softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
        }

        switch (partial_block_size)
        {
        case  0: break;
        case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
        case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
        case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
        case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
        case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
        case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
        case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
        case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
        case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
        case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
        case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
        case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
        case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
        case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }

        switch (subsimd_block_size)
        {
        case 0: break;
        case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
        case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
        case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
        case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
        case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
        case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
        case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    _mm_free(input_f);
}
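// For reference, the two passes above follow the usual softmax split: a
// compute pass that accumulates the sum of exponentials, and a finalize pass
// that scales every exponential by the reciprocal of that sum. A scalar
// sketch of the same idea (hypothetical, simplified, with no max-subtraction;
// this is not the actual softmax_compute_block/softmax_finalize_block code):
#include <cmath>
#include <cstddef>

static void softmax_reference(const float *in, float *out, std::size_t n)
{
    float sum = 0.0f;
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = std::exp(in[i]); // store exp(x_i) so the second pass can reuse it
        sum += out[i];
    }
    const float inv_sum = 1.0f / sum;
    for (std::size_t i = 0; i < n; ++i)
        out[i] *= inv_sum;        // normalize: out[i] = exp(x_i) / sum
}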
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{
/*	if(kmax<=0) */
/*		return;*/

	const int lda = 8;
/*	const int bs  = 8;*/

	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	int k;

	__m256
		zeros, ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;

	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );

		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*4] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_4 = _mm256_add_ps( y_4, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*5] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_5 = _mm256_add_ps( y_5, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*6] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_6 = _mm256_add_ps( y_6, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*7] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_7 = _mm256_add_ps( y_7, ax_temp );

		A += sda*lda;
		x += lda;
		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	a_00 = _mm256_load_ps( &A[0+lda*4] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_4 = _mm256_add_ps( y_4, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*5] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_5 = _mm256_add_ps( y_5, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*6] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_6 = _mm256_add_ps( y_6, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*7] );
/*	a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_7 = _mm256_add_ps( y_7, ax_temp );

	// reduction
	__m256 z_0;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);
	y_4 = _mm256_hadd_ps(y_4, y_5);
	y_6 = _mm256_hadd_ps(y_6, y_7);

	y_0 = _mm256_hadd_ps(y_0, y_2);
	y_4 = _mm256_hadd_ps(y_4, y_6);

	y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
	y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);

	y_0 = _mm256_add_ps(y_1, y_2);

	// store
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_add_ps(z_0, y_0);
		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_sub_ps(z_0, y_0);
		_mm256_storeu_ps(&y[0], z_0);
		}

	}
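// The reduction in the kernel above folds eight 8-wide accumulators
// (y_0..y_7) into a single vector whose lane j holds the full horizontal sum
// of y_j. A minimal standalone sketch of that hadd/hadd/permute2f128 pattern
// (reduce8x8 is a hypothetical helper, not part of the BLAS kernel):
#include <immintrin.h>

static inline __m256 reduce8x8(__m256 y0, __m256 y1, __m256 y2, __m256 y3,
                               __m256 y4, __m256 y5, __m256 y6, __m256 y7)
{
	y0 = _mm256_hadd_ps(y0, y1);                       // pairwise sums of y0/y1, per lane
	y2 = _mm256_hadd_ps(y2, y3);
	y4 = _mm256_hadd_ps(y4, y5);
	y6 = _mm256_hadd_ps(y6, y7);
	y0 = _mm256_hadd_ps(y0, y2);                       // lane-local sums of y0..y3
	y4 = _mm256_hadd_ps(y4, y6);                       // lane-local sums of y4..y7
	__m256 lo = _mm256_permute2f128_ps(y0, y4, 0x20);  // low lanes of both halves
	__m256 hi = _mm256_permute2f128_ps(y0, y4, 0x31);  // high lanes of both halves
	return _mm256_add_ps(lo, hi);                      // lane j = horizontal sum of yj
}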
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn,
    void (*sendMessage)(HvBase *, int, const HvMessage *)) {
#if HV_SIMD_AVX
  _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m256 sum = _mm256_setzero_ps();
    while (n4) {
      __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm256_mul_ps(x, h);
      sum = _mm256_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm256_hadd_ps(sum, sum); // horizontal sum
    sum = _mm256_hadd_ps(sum, sum);
    sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer
  }
#elif HV_SIMD_SSE
  _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m128 sum = _mm_setzero_ps();
    while (n4) {
      __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm_mul_ps(x, h);
      sum = _mm_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm_hadd_ps(sum, sum); // horizontal sum
    sum = _mm_hadd_ps(sum, sum);
    sEnv_sendMessage(_c, o, sum[0], sendMessage);
  }
#elif HV_SIMD_NEON
  vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    float32x4_t sum = vdupq_n_f32(0.0f);
    while (n4) {
      float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD);
      float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD);
      x = vmulq_f32(x, h);
      sum = vaddq_f32(sum, x);
      n4 -= HV_N_SIMD;
    }
    sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage);
  }
#else // HV_SIMD_NONE
  o->buffer[o->numSamplesInBuffer] = (bIn*bIn);
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    float sum = 0.0f;
    for (int i = 0; i < o->windowSize; ++i) {
      sum += (o->hanningWeights[i] * o->buffer[i]);
    }
    sEnv_sendMessage(_c, o, sum, sendMessage);
  }
#endif
}
void animate() {
	float mx;
	float my;

	if(ManualControl) {
		POINT pos;
		GetCursorPos(&pos);
		RECT rc;
		GetClientRect(hMainWnd, &rc);
		ScreenToClient(hMainWnd, &pos);
		mx = pos.x;
		my = pos.y;
	} else {
		UpdatePosition(mx, my);
	}

	const auto size = partCount;
	VertexData *pVertexBuffer;
	pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);
	_mm256_zeroall();

#pragma omp parallel \
	shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
	{
#pragma omp for nowait
		for(int i = 0; i < size; i += 4) {
			float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };
			float *particleCoordsVec = (float*)particlesCoord + i;
			float *velocityVec = (float*)particlesVel + i;

			auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
			auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));
			auto squares = _mm256_mul_ps(hwTempData, hwTempData);
			auto distSquare = _mm256_hadd_ps(squares, squares);
			distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);
			auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

			if(distSquare.m256_f32[0] < 400) {
				theForce.m256_f32[0] = 0;
				theForce.m256_f32[1] = 0;
			}
			if(distSquare.m256_f32[2] < 400) {
				theForce.m256_f32[2] = 0;
				theForce.m256_f32[3] = 0;
			}
			if(distSquare.m256_f32[4] < 400) {
				theForce.m256_f32[4] = 0;
				theForce.m256_f32[5] = 0;
			}
			if(distSquare.m256_f32[6] < 400) {
				theForce.m256_f32[6] = 0;
				theForce.m256_f32[7] = 0;
			}

			auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);
			auto xyVelocities = _mm256_loadu_ps(velocityVec);
			xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
			xyVelocities = _mm256_add_ps(xyVelocities, xyForces);
			xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

			_mm256_storeu_ps(velocityVec, xyVelocities);
			_mm256_storeu_ps(particleCoordsVec, xyCoord);

			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

			pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
			pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
			pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
			pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
			pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
			pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
			pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
			pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
		}
	}

	pVertexObject->Unlock();
	_mm256_zeroall();
}
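// Per particle, the SIMD loop above reduces to the following scalar update
// (a reference sketch only; G and Resistance are assumed to be the same
// globals used above, and the 400-unit squared-distance dead zone around the
// cursor matches the masked stores on theForce):
static void update_particle(float &px, float &py, float &vx, float &vy,
                            float mx, float my)
{
	float dx = px - mx, dy = py - my;
	float dist2 = dx * dx + dy * dy;
	float force = (dist2 < 400.0f) ? 0.0f : G / dist2; // dead zone near the cursor
	vx = vx * Resistance - dx * force;                 // accelerate toward the cursor
	vy = vy * Resistance - dy * force;
	px += vx;
	py += vy;
}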
void electric_field(struct Structure This_Structure, float grid_span, int grid_size, fftw_real * grid,
                    int *shared_x, struct atom_values *atoms, int natoms_in) {

/************/

  /* Counters */
  int residue, atom, i;

  /* Co-ordinates */
  int x, y, z;
  float x_centre, y_centre, z_centre;

  /* Variables */
  float distance;
  float phi, epsilon;

  while (1) {
    pthread_mutex_lock(&shared_x_mutex);
    x = *shared_x;
    *shared_x = *shared_x + 1;
    pthread_mutex_unlock(&shared_x_mutex);
    if (x >= grid_size)
      break;

    printf(".");

    x_centre = gcentre(x, grid_span, grid_size);
    __mtype mx_centre = _set1_ps(x_centre);

    for (y = 0; y < grid_size; y++) {
      y_centre = gcentre(y, grid_span, grid_size);
      __mtype my_centre = _set1_ps(y_centre);

      for (z = 0; z < grid_size; z++) {
        z_centre = gcentre(z, grid_span, grid_size);
        __mtype mz_centre = _set1_ps(z_centre);

        phi = 0;
        __mtype phis = _set1_ps(0.0);

        for (i = 0; i < natoms_in; i++) {
          __mtype xs = _load_ps(atoms[i].xs);
          __mtype ys = _load_ps(atoms[i].ys);
          __mtype zs = _load_ps(atoms[i].zs);
          __mtype charges = _load_ps(atoms[i].charges);
          __mtype distances;

          // Compute the distances (the original pythagoras)
          __mtype diffxs = _sub_ps(xs, mx_centre);
          __mtype diffys = _sub_ps(ys, my_centre);
          __mtype diffzs = _sub_ps(zs, mz_centre);

          diffxs = _mul_ps(diffxs, diffxs);
          diffys = _mul_ps(diffys, diffys);
          diffzs = _mul_ps(diffzs, diffzs);

          distances = _add_ps(diffxs, diffys);
          distances = _add_ps(distances, diffzs);
          distances = _sqrt_ps(distances);

          // From here on, the original if's are implemented using only bit masks

          // Clamp to a minimum of 2
          distances = _max_ps(distances, _set1_ps(2.0));

          __mtype epsilons = _set1_ps(0.0);
          __mtype tmp;
          __mtype tmp2;

          // if >= 8
          tmp = _cmpge_ps(distances, _set1_ps(8.0));
          epsilons = _and_ps(tmp, _set1_ps(80.0));

          // else if <= 6
          tmp = _cmple_ps(distances, _set1_ps(6.0));
          tmp = _and_ps(tmp, _set1_ps(4.0));
          epsilons = _or_ps(epsilons, tmp);

          // else
          tmp = _cmpgt_ps(distances, _set1_ps(6.0));
          tmp2 = _cmpeq_ps(epsilons, _set1_ps(0.0));
          tmp = _and_ps(tmp, tmp2);
          tmp2 = _mul_ps(distances, _set1_ps(38.0));
          tmp2 = _sub_ps(tmp2, _set1_ps(224.0));
          tmp = _and_ps(tmp, tmp2);

          // Final value
          epsilons = _or_ps(epsilons, tmp);

          // Compute the phis
          tmp = _mul_ps(epsilons, distances);
          tmp = _div_ps(charges, tmp);

          // Accumulate the phis
          phis = _add_ps(phis, tmp);
        }

#ifdef USE_AVX
        phis = _mm256_hadd_ps(phis, phis);
        phis = _mm256_hadd_ps(phis, phis);
        phi = phis[0] + phis[4];
#else
        float tmp, tmp2;
        tmp = phis[0] + phis[1];
        tmp2 = phis[2] + phis[3];
        phi = tmp + tmp2;
#endif

        grid[gaddress(x, y, z, grid_size)] = (fftw_real) phi;
      }
    }
  }

/************/

  return;
}
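/* The mask logic above is a branchless encoding of the original scalar rule
 * for the distance-dependent dielectric. A scalar sketch of the same rule
 * (hypothetical reference that mirrors the comparisons in the SIMD code): */
static float epsilon_of_distance(float distance)
{
  if (distance < 2.0f) distance = 2.0f;   /* clamp to a minimum of 2 */
  if (distance >= 8.0f) return 80.0f;     /* far: bulk-solvent value */
  if (distance <= 6.0f) return 4.0f;      /* close: low dielectric */
  return 38.0f * distance - 224.0f;       /* 6 < d < 8: linear ramp */
}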
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	if(kmax<=0)
		return;

	const int lda = 8;

	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int k, k_left, ii;
	float k_left_d;

	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
	float temp_space[8] = {};

	__m256
		mask, zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;

	mask  = _mm256_loadu_ps( mask_f );
	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();

	k=0;

	// corner
	if(tri==1)
		{
		k_left = kna-k;
		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 ); */
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
/*exit(1);*/

		a_00 = _mm256_loadu_ps( &A[0+lda*0] );
		a_00 = _mm256_blend_ps( a_00, zeros, 0x00 );
/*		temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00 = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01 = _mm256_loadu_ps( &A[0+lda*1] );
		a_01 = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01 = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02 = _mm256_loadu_ps( &A[0+lda*2] );
		a_02 = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02 = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03 = _mm256_loadu_ps( &A[0+lda*3] );
		a_03 = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03 = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

/*_mm256_storeu_ps( temp_space, y_n_0 ); */
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, y_n_0 ); */
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/

		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );

		A += k_left;
		y_n += k_left;
		x_t += k_left;
		k += k_left;
		}

	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*	for(; k<kna; k++)*/
		{
		k_left = kna-k;
		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 ); */
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/

		a_00 = _mm256_loadu_ps( &A[0+lda*0] );
		a_01 = _mm256_loadu_ps( &A[0+lda*1] );
		a_02 = _mm256_loadu_ps( &A[0+lda*2] );
		a_03 = _mm256_loadu_ps( &A[0+lda*3] );

/*		temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/
/*printf("\nk_left = %d\n", k_left);*/
/*exit(1);*/

		A += k_left;
		y_n += k_left;
		x_t += k_left;
		k += k_left;
		}

	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}

	for(; k<kmax-7; k+=8)
		{
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		temp = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		_mm256_storeu_ps( &y_n[0], y_n_0 );

		A += sda*lda;
		y_n += 8;
		x_t += 8;
		}

	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		k_left = kmax-k;
		k_left_d = 8.0 - k_left;

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*printf("\nk_left2 = %d\n", k_left, kmax, k);*/

		a_00 = _mm256_load_ps( &A[0+lda*0] );
/*printf("\nk_left2 = %d\n", k_left);*/
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		a_03 = _mm256_load_ps( &A[0+lda*3] );

/*		temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*		A += 1;*/
/*		y_n += 1;*/
/*		x_t += 1;*/
		}

	// reduction
	__m128 z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);

	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);

	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );
		z_0 = _mm_add_ps(z_0, z_1);
		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );
		z_0 = _mm_sub_ps(z_0, z_1);
		_mm_storeu_ps( &y_t[0], z_0 );
		}

	}
void neuralNet::feedForward_layer(layerIterator_t nLayer)
{
	constFloatIterator_t pActivations, cWeight, endWeight;
	__m256 vTotal, vSub0, vSub1;
	__m256 *vWeight, *vAct, *vEndWeight;

	// summate each neuron's contribution
	for (neuronIterator_t cNeuron = nLayer->begin(), end = nLayer->end(); cNeuron != end; ++cNeuron)
	{
		// foreach [previous neuron, current weight], up to endWeight
		pActivations = activations.begin() + (nLayer - 1)->front().iNeuronIndex;
		cWeight = cNeuron->weightsBegin(*this);
		endWeight = cNeuron->weightsEnd(*this);

		// (first 15 neurons) (TODO: redesign preamble and remove assertions for multiple of 16 size widths in neuralNet.h!)

		// summate all neurons of previous layer: (remaining batches of 8 neurons)
		vWeight = (__m256*)&cWeight[0];
		vAct = (__m256*)&pActivations[0];
		vEndWeight = (__m256*)&endWeight[0];

		// initialize the activation of this neuron to its bias weight. The bias weight's neuron is always on:
		vTotal = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, *endWeight); // can this be made with an aligned load?

		do // Take advantage of SIMD instructions by doing 16 multiplies per iteration
		{
			/*
			 * each neuron's contribution is:
			 * input[j] += weight[i,j] * activation[i]
			 */
			// multiply:
			vSub0 = _mm256_mul_ps(vWeight[0], vAct[0]);
			vSub1 = _mm256_mul_ps(vWeight[1], vAct[1]);

			// prefetch next values: (these don't appear to help, are the networks too small for this to matter?)
			//_mm_prefetch((char*)(vWeight0+4), _MM_HINT_T0);
			//_mm_prefetch((char*)(vAct0+4), _MM_HINT_T0);

			// add to accumulator:
			vTotal = _mm256_add_ps(vTotal, vSub0);
			vTotal = _mm256_add_ps(vTotal, vSub1);

			// increment pointers:
			vWeight += 2;
			vAct += 2;
		}
		while (vWeight != vEndWeight);

		// finalize: (reduce the 8-lane accumulator to a single float)
		{
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			vTotal = _mm256_hadd_ps(vTotal, vTotal);

			__m128 vUpperTotal = _mm256_extractf128_ps(vTotal, 1);
			vUpperTotal = _mm_add_ps(vUpperTotal, _mm256_castps256_ps128(vTotal));

			// store the lowest float into cInput:
			_mm_store_ss(&activations[cNeuron->iNeuronIndex], vUpperTotal);
		}
	}

	// activate all neurons in this layer:
	float* cActivation = (&activations.front() + nLayer->front().iNeuronIndex);
	float* lActivation = (&activations.front() + nLayer->back().iNeuronIndex + 1);
	float* lVectorActivation = lActivation - ((lActivation - cActivation)&(ALIGN_SIZE-1)); // equivalent to mod ALIGN_SIZE

	// aligned activations:
	while (cActivation != lVectorActivation)
	{
		activation_approx_avx(cActivation, cActivation);
		cActivation += ALIGN_SIZE;
	};

	// postscript: (unaligned activations):
	{
		size_t dActivation = (lActivation - cActivation);
		switch(dActivation)
		{
		case 7: activation_approx(cActivation+6,cActivation+6);
		case 6: activation_approx(cActivation+5,cActivation+5);
		case 5: activation_approx(cActivation+4,cActivation+4);
		case 4: activation_approx_sse(cActivation+0,cActivation+0); break;
		case 3: activation_approx(cActivation+2, cActivation+2);
		case 2: activation_approx(cActivation+1, cActivation+1);
		case 1: activation_approx(cActivation+0, cActivation+0);
		case 0: break;
		}
	}
}; // endOf feedForward_layer
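// For clarity, the vectorized accumulation above computes, per neuron, a
// plain dot product of the previous layer's activations with the neuron's
// weights, plus a bias term; the result is then passed through the
// activation function. A scalar sketch of that inner computation
// (hypothetical reference, simplified out of the member-function context):
#include <cstddef>

static float neuron_input(const float *weights, const float *activations,
                          std::size_t count, float bias_weight)
{
	float total = bias_weight;          // the bias neuron is always "on"
	for (std::size_t i = 0; i < count; ++i)
		total += weights[i] * activations[i];
	return total;                       // fed to the activation function afterwards
}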