//-----------------------------------------------------------------
// SOA -> AOS
//
// pBlu: b0, b1, b2, b3, b4, ...
// pGrn: g0, g1, g2, g3, g4, ...
// pRed: r0, r1, r2, r3, r4, ...
// ->
// pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
//
void soa2aos(float *pBlu, float *pGrn, float *pRed, float *pBgr, const size_t length)
{
    __m128 *bgr = (__m128 *)pBgr;

    // 24 units per iteration: 8 blue + 8 green + 8 red floats
    for (size_t i = 0; i < length; i += 24)
    {
        __m256 b = _mm256_load_ps(pBlu + (i / 3));
        __m256 g = _mm256_load_ps(pGrn + (i / 3));
        __m256 r = _mm256_load_ps(pRed + (i / 3));

        __m256 bg = _mm256_shuffle_ps(b, g, _MM_SHUFFLE(2, 0, 2, 0));
        __m256 gr = _mm256_shuffle_ps(g, r, _MM_SHUFFLE(3, 1, 3, 1));
        __m256 rb = _mm256_shuffle_ps(r, b, _MM_SHUFFLE(3, 1, 2, 0));

        __m256 r03 = _mm256_shuffle_ps(bg, rb, _MM_SHUFFLE(2, 0, 2, 0));
        __m256 r14 = _mm256_shuffle_ps(gr, bg, _MM_SHUFFLE(3, 1, 2, 0));
        __m256 r25 = _mm256_shuffle_ps(rb, gr, _MM_SHUFFLE(3, 1, 3, 1));

        *bgr++ = _mm256_castps256_ps128(r03);
        *bgr++ = _mm256_castps256_ps128(r14);
        *bgr++ = _mm256_castps256_ps128(r25);
        *bgr++ = _mm256_extractf128_ps(r03, 1);
        *bgr++ = _mm256_extractf128_ps(r14, 1);
        *bgr++ = _mm256_extractf128_ps(r25, 1);
    }
}
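// Minimal call-site sketch for soa2aos (not part of the original snippet):
// it assumes 32-byte-aligned buffers and a `length` that counts *output*
// floats and is a multiple of 24. The buffer sizes and test values here are
// illustrative only.
#include <immintrin.h>
#include <cstddef>
#include <cstdio>

void soa2aos(float *pBlu, float *pGrn, float *pRed, float *pBgr, const size_t length);

int main()
{
    constexpr size_t pixels = 8;                       // 8 pixels -> 24 interleaved floats
    alignas(32) float blu[pixels], grn[pixels], red[pixels];
    alignas(32) float bgr[pixels * 3];

    for (size_t i = 0; i < pixels; ++i) {
        blu[i] = float(i);
        grn[i] = 100.0f + float(i);
        red[i] = 200.0f + float(i);
    }

    soa2aos(blu, grn, red, bgr, pixels * 3);           // length in output floats

    std::printf("%f %f %f\n", bgr[0], bgr[1], bgr[2]); // expect b0 g0 r0 = 0 100 200
    return 0;
}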
static void
sfid_render_cache_rt_write_rep16_bgra_unorm8_xmajor(struct thread *t,
                                                    const struct sfid_render_cache_args *args)
{
    const __m128 scale = _mm_set1_ps(255.0f);
    const __m128 half = _mm_set1_ps(0.5f);
    struct reg src[1];

    memcpy(src, &t->grf[args->src], sizeof(src));

    if (srgb_format(args->rt.format)) {
        const __m256 inv_gamma = _mm256_set1_ps(1.0f / 2.4f);
        src[0].reg = _ZGVdN8vv_powf(src[0].reg, inv_gamma);
        /* Don't gamma correct alpha */
        src[0].f[3] = t->grf[args->src].f[3];
    }

    __m128 bgra = _mm_shuffle_ps(_mm256_castps256_ps128(src[0].reg),
                                 _mm256_castps256_ps128(src[0].reg),
                                 SWIZZLE(2, 1, 0, 3));

    bgra = _mm_mul_ps(bgra, scale);
    bgra = _mm_add_ps(bgra, half);

    __m128i bgra_i = _mm_cvtps_epi32(bgra);
    bgra_i = _mm_packus_epi32(bgra_i, bgra_i);
    bgra_i = _mm_packus_epi16(bgra_i, bgra_i);

    /* Swizzle two middle mask pairs so that dword 0-3 and 4-7
     * form linear owords of pixels. */
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x0 = t->grf[1].uw[4];
    const int y0 = t->grf[1].uw[5] + slice_y;
    const int cpp = 4;

    void *base0 = xmajor_offset(args->rt.pixels, x0, y0, args->rt.stride, cpp);

    _mm_maskstore_epi32(base0,
                        _mm256_extractf128_si256(mask, 0), bgra_i);
    _mm_maskstore_epi32(base0 + 512,
                        _mm256_extractf128_si256(mask, 1), bgra_i);

    const int x1 = t->grf[1].uw[8];
    const int y1 = t->grf[1].uw[9] + slice_y;
    void *base1 = xmajor_offset(args->rt.pixels, x1, y1, args->rt.stride, 4);

    __m256i mask1 = _mm256_permute4x64_epi64(t->mask_q2, SWIZZLE(0, 2, 1, 3));

    _mm_maskstore_epi32(base1,
                        _mm256_extractf128_si256(mask1, 0), bgra_i);
    _mm_maskstore_epi32(base1 + 512,
                        _mm256_extractf128_si256(mask1, 1), bgra_i);
}
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{
#ifdef AVX
    //Black magic
    //But it does produce the same results as the not AVX code
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    //We now have a vector of 8 floats
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    //And now we don't
    return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
    //Can be autovectorized
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++)
    {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    //If done properly this should be 4(8) instructions
    for (int i = 0; i < 30; i++)
    {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrtf(distance);
#endif
}
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

#define _float_cords_dot_prod(curr, next, index)                                                \
    _mm256_dp_ps(                                                                               \
        curr,                                                                                   \
        _mm256_xor_ps(                                                                          \
            _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
            _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)                              \
        ),                                                                                      \
        0b11110000 | (1 << (index))                                                             \
    )

    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...

    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux; // return scalar_half(scalar_abs(accum_sum_aux));
}
//Thanks stack overflow.
static inline float _mm256_reduce_add_ps(__m256 x) {
    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const int imm = 1;
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, imm), _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    /* Conversion to float is a no-op on x86-64 */
    return _mm_cvtss_f32(x32);
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
static inline float horizontal_sum_avx2(__m256 x) {
    const __m128 hi_quad = _mm256_extractf128_ps(x, 1);
    const __m128 lo_quad = _mm256_castps256_ps128(x);
    const __m128 sum_quad = _mm_add_ps(lo_quad, hi_quad);
    const __m128 lo_dual = sum_quad;
    const __m128 hi_dual = _mm_movehl_ps(sum_quad, sum_quad);
    const __m128 sum_dual = _mm_add_ps(lo_dual, hi_dual);
    const __m128 lo = sum_dual;
    const __m128 hi = _mm_shuffle_ps(sum_dual, sum_dual, 0x1);
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
__m256 exp_256(
  const __m256& x) {

  //! Clip the value
  __m256 y = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(88.3762626647949f)),
                                            _mm256_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m256 fx = y * _mm256_set1_ps(1.44269504088896341) + _mm256_set1_ps(0.5f);

  //! Floor
  const __m256 tmp = _mm256_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If greater, subtract 1
  const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS),
                                    _mm256_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm256_set1_ps(0.693359375 - 2.12194440e-4);
  const __m256 z = y * y;
  const __m256 t = (((((_mm256_set1_ps(1.9875691500E-4)  * y +
                        _mm256_set1_ps(1.3981999507E-3)) * y +
                        _mm256_set1_ps(8.3334519073E-3)) * y +
                        _mm256_set1_ps(4.1665795894E-2)) * y +
                        _mm256_set1_ps(1.6666665459E-1)) * y +
                        _mm256_set1_ps(5.0000001201E-1)) * z + y + _mm256_set1_ps(1.f);

  //! Build 2^n (split it into two SSE arrays, since the AVX2 integer
  //! equivalents aren't available).
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_castps256_ps128(fx)), _mm_set1_epi32(0x7f));
  const __m128i emm1 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_extractf128_ps(fx, 1)), _mm_set1_epi32(0x7f));
  fx = _mm256_castps128_ps256(_mm_castsi128_ps(_mm_slli_epi32(emm0, 23)));
  fx = _mm256_insertf128_ps(fx, _mm_castsi128_ps(_mm_slli_epi32(emm1, 23)), 1);

  //! Return the result
  return t * fx;
}
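// Minimal check harness for exp_256 (not part of the original snippet). It
// assumes a GCC/Clang build with AVX enabled, since exp_256 relies on the
// compiler-provided arithmetic operators for __m256; the test values are
// illustrative only.
#include <immintrin.h>
#include <cmath>
#include <cstdio>

__m256 exp_256(const __m256& x);

int main()
{
    alignas(32) float in[8]  = { -2.f, -1.f, -0.5f, 0.f, 0.5f, 1.f, 2.f, 10.f };
    alignas(32) float out[8];

    _mm256_store_ps(out, exp_256(_mm256_load_ps(in)));

    for (int i = 0; i < 8; ++i)
        std::printf("exp(%g) ~ %g (libm: %g)\n", in[i], out[i], std::exp(in[i]));
    return 0;
}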
float dot_product(const int N, const float *X, const int incX,
                  const float *Y, const int incY) {
    // Note: the aligned 8-wide loads below read contiguous floats, so this loop
    // effectively assumes 32-byte-aligned inputs, unit strides, and N a multiple of 8.
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        __m256 xval = _mm256_load_ps(X);
        __m256 yval = _mm256_load_ps(Y);
        __m256 val = _mm256_mul_ps(xval, yval);
        accum = _mm256_add_ps(val, accum);
    }
    // Reduce the values in accum into the smallest 32-bit subsection
    // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
    __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
                               _mm256_extractf128_ps(accum, 1));
    // b0 b1 b2 b3 -> c0 c1 b2 b3
    accum2 = _mm_add_ps(accum2,
                        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
    __m128 final_val = _mm_add_ss(
        _mm_insert_ps(accum2, accum2, 0x4e), accum2); // Add the two remaining partial sums
    return final_val[0]; // element access via GCC/Clang vector subscripting
}
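// Usage sketch for dot_product (not part of the original snippet), under the
// assumptions noted above: 32-byte-aligned inputs, N a multiple of 8, unit
// strides. Test data is illustrative only.
#include <cstdio>

float dot_product(const int N, const float *X, const int incX,
                  const float *Y, const int incY);

int main()
{
    alignas(32) float x[16], y[16];
    for (int i = 0; i < 16; ++i) { x[i] = 1.0f; y[i] = float(i); }

    // Expected: 0 + 1 + ... + 15 = 120
    std::printf("%f\n", dot_product(16, x, 1, y, 1));
    return 0;
}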
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
inline float sum8(__m256 x) {
    // hiQuad = ( x7, x6, x5, x4 )
    const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
    // loQuad = ( x3, x2, x1, x0 )
    const __m128 loQuad = _mm256_castps256_ps128(x);
    // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
    const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
    // loDual = ( -, -, x1 + x5, x0 + x4 )
    const __m128 loDual = sumQuad;
    // hiDual = ( -, -, x3 + x7, x2 + x6 )
    const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
    // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
    const __m128 sumDual = _mm_add_ps(loDual, hiDual);
    // lo = ( -, -, -, x0 + x2 + x4 + x6 )
    const __m128 lo = sumDual;
    // hi = ( -, -, -, x1 + x3 + x5 + x7 )
    const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
    // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
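// Quick sanity check of sum8 against a scalar loop (not part of the original
// snippet). It assumes sum8 from the snippet above is defined in the same
// translation unit, since sum8 is declared inline.
#include <immintrin.h>
#include <cstdio>

int main()
{
    alignas(32) float v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

    float scalar = 0.0f;
    for (float f : v)
        scalar += f;

    const float simd = sum8(_mm256_load_ps(v));
    std::printf("scalar = %f, sum8 = %f\n", scalar, simd); // both should be 36
    return 0;
}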
void kernel_strmv_u_t_4_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
{
/*  if(kmax<=0) */
/*      return;*/

    const int lda = 8;
/*  const int bs = 8;*/

    __builtin_prefetch( A + 0*lda );
    __builtin_prefetch( A + 2*lda );

    int k;

    __m256
        zeros, ax_temp,
        a_00, a_01, a_02, a_03,
        x_0,
        y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;

    zeros = _mm256_setzero_ps();

    y_0 = _mm256_setzero_ps();
    y_1 = _mm256_setzero_ps();
    y_2 = _mm256_setzero_ps();
    y_3 = _mm256_setzero_ps();
    y_4 = _mm256_setzero_ps();
    y_5 = _mm256_setzero_ps();
    y_6 = _mm256_setzero_ps();
    y_7 = _mm256_setzero_ps();

    k=0;
    for(; k<kmax-7; k+=8)
    {
        x_0 = _mm256_loadu_ps( &x[0] );

        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        a_00 = _mm256_load_ps( &A[0+lda*0] );
        ax_temp = _mm256_mul_ps( a_00, x_0 );
        y_0 = _mm256_add_ps( y_0, ax_temp );
        a_01 = _mm256_load_ps( &A[0+lda*1] );
        ax_temp = _mm256_mul_ps( a_01, x_0 );
        y_1 = _mm256_add_ps( y_1, ax_temp );
        a_02 = _mm256_load_ps( &A[0+lda*2] );
        ax_temp = _mm256_mul_ps( a_02, x_0 );
        y_2 = _mm256_add_ps( y_2, ax_temp );
        a_03 = _mm256_load_ps( &A[0+lda*3] );
        ax_temp = _mm256_mul_ps( a_03, x_0 );
        y_3 = _mm256_add_ps( y_3, ax_temp );

        A += sda*lda;
        x += 8;
    }

    x_0 = _mm256_loadu_ps( &x[0] );

    a_00 = _mm256_load_ps( &A[0+lda*0] );
    a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
    ax_temp = _mm256_mul_ps( a_00, x_0 );
    y_0 = _mm256_add_ps( y_0, ax_temp );
    a_01 = _mm256_load_ps( &A[0+lda*1] );
    a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
    ax_temp = _mm256_mul_ps( a_01, x_0 );
    y_1 = _mm256_add_ps( y_1, ax_temp );
    a_02 = _mm256_load_ps( &A[0+lda*2] );
    a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
    ax_temp = _mm256_mul_ps( a_02, x_0 );
    y_2 = _mm256_add_ps( y_2, ax_temp );
    a_03 = _mm256_load_ps( &A[0+lda*3] );
    a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
    ax_temp = _mm256_mul_ps( a_03, x_0 );
    y_3 = _mm256_add_ps( y_3, ax_temp );

    // reduction
    __m128 z_0, z_1;

    y_0 = _mm256_hadd_ps(y_0, y_1);
    y_2 = _mm256_hadd_ps(y_2, y_3);

    y_0 = _mm256_hadd_ps(y_0, y_2);

    y_1 = _mm256_permute2f128_ps(y_0, y_0, 0x01);

    z_0 = _mm256_castps256_ps128(y_0);
    z_1 = _mm256_castps256_ps128(y_1);

    z_1 = _mm_add_ps(z_0, z_1);

    if(alg==0)
    {
        _mm_storeu_ps(&y[0], z_1);
    }
    else if(alg==1)
    {
        z_0 = _mm_loadu_ps( &y[0] );
        z_0 = _mm_add_ps(z_0, z_1);
        _mm_storeu_ps(&y[0], z_0);
    }
    else // alg==-1
    {
        z_0 = _mm_loadu_ps( &y[0] );
        z_0 = _mm_sub_ps(z_0, z_1);
        _mm_storeu_ps(&y[0], z_0);
    }
}
/*!
 * \brief Perform a horizontal sum of the given vector.
 * \param in The input vector type
 * \return the horizontal sum of the vector
 */
ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
    const __m128 x64  = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    const __m128 x32  = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    return _mm_cvtss_f32(x32);
}
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
{
    if(kmax<=0)
        return;

    const int lda = 8;

    __builtin_prefetch( A + 0*lda );
    __builtin_prefetch( A + 2*lda );

    int k, k_left, ii;

    float k_left_d;

    const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
    float temp_space[8] = {};

    __m256
        mask,
        zeros, temp,
        a_00, a_01, a_02, a_03,
        x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
        x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;

    mask = _mm256_loadu_ps( mask_f );

    zeros = _mm256_setzero_ps();

    x_n_0 = _mm256_broadcast_ss( &x_n[0] );
    x_n_1 = _mm256_broadcast_ss( &x_n[1] );
    x_n_2 = _mm256_broadcast_ss( &x_n[2] );
    x_n_3 = _mm256_broadcast_ss( &x_n[3] );

    if(alg==-1) // TODO xor
    {
        x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
        x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
        x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
        x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
    }

    y_t_0 = _mm256_setzero_ps();
    y_t_1 = _mm256_setzero_ps();
    y_t_2 = _mm256_setzero_ps();
    y_t_3 = _mm256_setzero_ps();

    k=0;

    // corner
    if(tri==1)
    {
        k_left = kna-k;
        k_left_d = 8.0 - k_left;

/*printf("\nk_left = %d\n", k_left);*/
/*      y_n_0 = _mm_load_ps( &y_n[0] );*/
/*      y_n_0 = _mm_setzero_ps();*/
        x_t_0 = _mm256_loadu_ps( &x_t[0] );
        x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 ); */
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
/*exit(1);*/

        a_00 = _mm256_loadu_ps( &A[0+lda*0] );
        a_00 = _mm256_blend_ps( a_00, zeros, 0x00 );
/*      temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*      y_n_0 = _mm256_add_ps( y_n_0, temp );*/
        y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
        a_00 = _mm256_blend_ps( a_00, zeros, 0x01 );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );

        a_01 = _mm256_loadu_ps( &A[0+lda*1] );
        a_01 = _mm256_blend_ps( a_01, zeros, 0x01 );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        a_01 = _mm256_blend_ps( a_01, zeros, 0x03 );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );

        a_02 = _mm256_loadu_ps( &A[0+lda*2] );
        a_02 = _mm256_blend_ps( a_02, zeros, 0x03 );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        a_02 = _mm256_blend_ps( a_02, zeros, 0x07 );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );

        a_03 = _mm256_loadu_ps( &A[0+lda*3] );
        a_03 = _mm256_blend_ps( a_03, zeros, 0x07 );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        a_03 = _mm256_blend_ps( a_03, zeros, 0x0f );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

/*_mm256_storeu_ps( temp_space, y_n_0 ); */
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
        y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, y_n_0 ); */
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
        x_t_0 = _mm256_loadu_ps( &y_n[0] );
        y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
        _mm256_storeu_ps( &y_n[0], y_n_0 );

        A += k_left;
        y_n += k_left;
        x_t += k_left;

        k += k_left;
    }
    if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*  for(; k<kna; k++)*/
    {
        k_left = kna-k;
        k_left_d = 8.0 - k_left;

/*printf("\nk_left = %d\n", k_left);*/
/*      y_n_0 = _mm_load_ps( &y_n[0] );*/
/*      y_n_0 = _mm_setzero_ps();*/
        x_t_0 = _mm256_loadu_ps( &x_t[0] );
        x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 ); */
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/

        a_00 = _mm256_loadu_ps( &A[0+lda*0] );
        a_01 = _mm256_loadu_ps( &A[0+lda*1] );
        a_02 = _mm256_loadu_ps( &A[0+lda*2] );
        a_03 = _mm256_loadu_ps( &A[0+lda*3] );

/*      temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*      y_n_0 = _mm256_add_ps( y_n_0, temp );*/
        y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

        x_t_0 = _mm256_loadu_ps( &y_n[0] );
        y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
        _mm256_storeu_ps( &y_n[0], y_n_0 );
/*      _mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
/*      _mm256_storeu_ps( temp_space, y_n_0 );*/
/*      for(ii=0; ii<k_left; ii++)*/
/*          y_n[ii] = temp_space[ii];*/
/*printf("\nk_left = %d\n", k_left);*/
/*exit(1);*/

        A += k_left;
        y_n += k_left;
        x_t += k_left;

        k += k_left;
    }
    if(kna>0 || tri==1)
    {
        A += (sda-1)*lda;
    }

    for(; k<kmax-7; k+=8)
    {
        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        y_n_0 = _mm256_loadu_ps( &y_n[0] );
        x_t_0 = _mm256_loadu_ps( &x_t[0] );

        a_00 = _mm256_load_ps( &A[0+lda*0] );
        temp = _mm256_mul_ps( a_00, x_n_0 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );

        a_01 = _mm256_load_ps( &A[0+lda*1] );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );

        a_02 = _mm256_load_ps( &A[0+lda*2] );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );

        a_03 = _mm256_load_ps( &A[0+lda*3] );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        _mm256_storeu_ps( &y_n[0], y_n_0 );

        A += sda*lda;
        y_n += 8;
        x_t += 8;
    }

    if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
    {
        k_left = kmax-k;
        k_left_d = 8.0 - k_left;

/*      y_n_0 = _mm_load_ps( &y_n[0] );*/
/*      y_n_0 = _mm_setzero_ps();*/
        x_t_0 = _mm256_loadu_ps( &x_t[0] );
        x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

/*printf("\nk_left2 = %d\n", k_left, kmax, k);*/
        a_00 = _mm256_load_ps( &A[0+lda*0] );
/*printf("\nk_left2 = %d\n", k_left);*/
        a_01 = _mm256_load_ps( &A[0+lda*1] );
        a_02 = _mm256_load_ps( &A[0+lda*2] );
        a_03 = _mm256_load_ps( &A[0+lda*3] );

/*      temp = _mm256_mul_ps( a_00, x_n_0 );*/
/*      y_n_0 = _mm256_add_ps( y_n_0, temp );*/
        y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
        temp = _mm256_mul_ps( a_00, x_t_0 );
        y_t_0 = _mm256_add_ps( y_t_0, temp );
        temp = _mm256_mul_ps( a_01, x_n_1 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_01, x_t_0 );
        y_t_1 = _mm256_add_ps( y_t_1, temp );
        temp = _mm256_mul_ps( a_02, x_n_2 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_02, x_t_0 );
        y_t_2 = _mm256_add_ps( y_t_2, temp );
        temp = _mm256_mul_ps( a_03, x_n_3 );
        y_n_0 = _mm256_add_ps( y_n_0, temp );
        temp = _mm256_mul_ps( a_03, x_t_0 );
        y_t_3 = _mm256_add_ps( y_t_3, temp );

        y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

        x_t_0 = _mm256_loadu_ps( &y_n[0] );
        y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
        _mm256_storeu_ps( &y_n[0], y_n_0 );
/*      _mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
/*      _mm256_storeu_ps( temp_space, y_n_0 );*/
/*      for(ii=0; ii<k_left; ii++)*/
/*          y_n[ii] = temp_space[ii];*/

/*      A += 1;*/
/*      y_n += 1;*/
/*      x_t += 1;*/
    }

    // reduction
    __m128 z_0, z_1;

    y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
    y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

    y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

    y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);

    z_0 = _mm256_castps256_ps128(y_t_0);
    z_1 = _mm256_castps256_ps128(y_t_1);

    z_1 = _mm_add_ps(z_0, z_1);

    if(alg==1)
    {
        z_0 = _mm_loadu_ps( &y_t[0] );
        z_0 = _mm_add_ps(z_0, z_1);
        _mm_storeu_ps( &y_t[0], z_0 );
    }
    else // alg==-1
    {
        z_0 = _mm_loadu_ps( &y_t[0] );
        z_0 = _mm_sub_ps(z_0, z_1);
        _mm_storeu_ps( &y_t[0], z_0 );
    }
}
void neuralNet::feedForward_layer(layerIterator_t nLayer)
{
    constFloatIterator_t pActivations, cWeight, endWeight;
    __m256 vTotal, vSub0, vSub1;
    __m256 *vWeight, *vAct, *vEndWeight;

    // summate each neuron's contribution
    for (neuronIterator_t cNeuron = nLayer->begin(), end = nLayer->end();
        cNeuron != end;
        ++cNeuron)
    {
        // foreach [previous neuron, current weight], up to endWeight
        pActivations = activations.begin() + (nLayer - 1)->front().iNeuronIndex;
        cWeight = cNeuron->weightsBegin(*this);
        endWeight = cNeuron->weightsEnd(*this);

        // (first 15 neurons) (TODO: redesign preamble and remove assertions for multiple of 16 size widths in neuralNet.h!)

        // summate all neurons of previous layer: (remaining batches of 8 neurons)
        vWeight = (__m256*)&cWeight[0];
        vAct = (__m256*)&pActivations[0];
        vEndWeight = (__m256*)&endWeight[0];

        // initialize the activation of this neuron to its bias weight. The bias weight's neuron is always on:
        vTotal = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, *endWeight); // can this be made with an aligned load?

        do // Take advantage of SIMD instructions by doing 16 multiplies per iteration
        {
            /*
             * each neuron's contribution is:
             * input[j] += weight[i,j] * activation[i]
             */
            // multiply:
            vSub0 = _mm256_mul_ps(vWeight[0], vAct[0]);
            vSub1 = _mm256_mul_ps(vWeight[1], vAct[1]);

            // prefetch next values: (these don't appear to help, are the networks too small for this to matter?)
            //_mm_prefetch((char*)(vWeight0+4), _MM_HINT_T0);
            //_mm_prefetch((char*)(vAct0+4), _MM_HINT_T0);

            // add to accumulator:
            vTotal = _mm256_add_ps(vTotal, vSub0);
            vTotal = _mm256_add_ps(vTotal, vSub1);

            // increment pointers:
            vWeight += 2;
            vAct += 2;
        }
        while (vWeight != vEndWeight);

        //finalize: (combine all 4 accumulators)
        {
            vTotal = _mm256_hadd_ps(vTotal, vTotal);
            vTotal = _mm256_hadd_ps(vTotal, vTotal);
            __m128 vUpperTotal = _mm256_extractf128_ps(vTotal, 1);
            vUpperTotal = _mm_add_ps(vUpperTotal, _mm256_castps256_ps128(vTotal));

            // store the lowest float into cInput:
            _mm_store_ss(&activations[cNeuron->iNeuronIndex], vUpperTotal);
        }
    }

    // activate all neurons in this layer:
    float* cActivation = (&activations.front() + nLayer->front().iNeuronIndex);
    float* lActivation = (&activations.front() + nLayer->back().iNeuronIndex + 1);
    float* lVectorActivation = lActivation - ((lActivation - cActivation)&(ALIGN_SIZE-1)); // equivalent to mod ALIGN_SIZE

    // aligned activations:
    while (cActivation != lVectorActivation)
    {
        activation_approx_avx(cActivation, cActivation);
        cActivation += ALIGN_SIZE;
    };

    // postscript: (unaligned activations):
    {
        size_t dActivation = (lActivation - cActivation);
        switch(dActivation)
        {
        case 7: activation_approx(cActivation+6,cActivation+6);
        case 6: activation_approx(cActivation+5,cActivation+5);
        case 5: activation_approx(cActivation+4,cActivation+4);
        case 4: activation_approx_sse(cActivation+0,cActivation+0); break;
        case 3: activation_approx(cActivation+2, cActivation+2);
        case 2: activation_approx(cActivation+1, cActivation+1);
        case 1: activation_approx(cActivation+0, cActivation+0);
        case 0: break;
        }
    }
}; // endOf feedForward_layer