#include <xmmintrin.h>

/* Computes C = A * A^T for a column-major m-by-n matrix A (C is m-by-m).
 * Fixes to the original: the accumulator is zeroed for every (i, j), the
 * intrinsics are single precision (__m128 / _ps) to match the float data
 * (the original mixed float arrays with __m128d double-precision intrinsics
 * and used _mm_mul_sd, which only multiplies one lane), and the scalar
 * remainder loop reads the transposed copy B. */
void sgemm(int m, int n, float *A, float *C)
{
    int i, j, k, ceiling;
    float B[n * m];        /* B = A^T, so both dot-product operands are contiguous */
    float buf[4];
    __m128 sum, ai, aj;

    transpose(m, n, A, B); /* external helper from the original code */

    for (i = 0; i < m; i += 1) {
        for (j = 0; j < m; j += 1) {
            sum = _mm_setzero_ps();
            for (k = 0, ceiling = n - 3; k < ceiling; k += 4) {
                ai  = _mm_loadu_ps(B + k + i * n);
                aj  = _mm_loadu_ps(B + k + j * n);
                sum = _mm_add_ps(sum, _mm_mul_ps(ai, aj));
            }
            _mm_storeu_ps(buf, sum);
            C[i + j * m] = buf[0] + buf[1] + buf[2] + buf[3];
            for ( ; k < n; k += 1) {   /* remainder when n % 4 != 0 */
                C[i + j * m] += B[k + i * n] * B[k + j * n];
            }
        }
    }
}
__m128d test_mm_mul_sd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_mul_sd
  // DAG: fmul double %{{.*}}, %{{.*}}
  //
  // ASM-LABEL: test_mm_mul_sd
  // ASM: mulsd
  return _mm_mul_sd(A, B);
}
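The FileCheck-style test above pins down what _mm_mul_sd does: only the low double of each operand is multiplied, and the high double of the first operand is passed through unchanged. A minimal standalone sketch (not part of any test suite) illustrating that behaviour:

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d a = _mm_set_pd(3.0, 2.0);   /* a = (hi = 3.0, lo = 2.0) */
    __m128d b = _mm_set_pd(5.0, 4.0);   /* b = (hi = 5.0, lo = 4.0) */
    double out[2];

    /* low lanes multiplied, high lane copied from the first operand */
    _mm_storeu_pd(out, _mm_mul_sd(a, b));
    printf("lo = %f, hi = %f\n", out[0], out[1]);   /* prints lo = 8, hi = 3 */
    return 0;
}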
/* Returns, in the low lane, the Euclidean length sqrt(x*x + y*y + z*z) of a
 * 3D vector packed as vec1 = (lo = x, hi = y) and vec2 = (lo = z, hi = unused).
 * The high lane of the result is copied from vec1. */
__inline __m128d Length(__m128d vec1, __m128d vec2)
{
    __m128d result1 = _mm_mul_pd(vec1, vec1);               /* (x*x, y*y)             */
    __m128d result2 = _mm_mul_sd(vec2, vec2);               /* (z*z, hi of vec2)      */
    __m128d result3 = _mm_shuffle_pd(result1, result1, 1);  /* bring y*y to low lane  */
    __m128d result4 = _mm_add_sd(result1, result2);         /* x*x + z*z              */
    __m128d result5 = _mm_add_sd(result4, result3);         /* x*x + z*z + y*y        */
    __m128d result6 = _mm_sqrt_sd(vec1, result5);           /* sqrt in the low lane   */
    return result6;
}
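A hedged usage sketch for Length(), assuming the caller packs a 3D vector with x and y in vec1 (low and high lane) and z in the low lane of vec2; the high lane of vec2 is ignored:

/* illustrative only: the length of (1, 2, 2) is 3 */
__m128d vec1 = _mm_set_pd(2.0, 1.0);               /* hi = y = 2, lo = x = 1 */
__m128d vec2 = _mm_set_sd(2.0);                    /* lo = z = 2             */
double  len  = _mm_cvtsd_f64(Length(vec1, vec2));  /* len == 3.0             */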
// it moves vertically across blocks void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg) { if(kmax<=0) return; /*printf("\nciao %d\n", kmax); */ const int bs = 4; __builtin_prefetch( A + bs*0 ); __builtin_prefetch( A + bs*2 ); int k, ka; ka = kmax; // number from aligned positon double k_left; // double *sA, *sy_n, *sx_t; static double d_mask[4] = {0.5, 1.5, 2.5, 3.5}; __m256d v_mask, zeros, temp, a_00, a_01, a_02, a_03, x_n_0, x_n_1, x_n_2, x_n_3, y_n_0, x_t_0, y_t_0, y_t_1, y_t_2, y_t_3; __m256i i_mask; #if 0 __m128d stemp, sa_00, sa_01, sa_02, sa_03, sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0, sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3; #endif zeros = _mm256_setzero_pd(); x_n_0 = _mm256_broadcast_sd( &x_n[0] ); x_n_1 = _mm256_broadcast_sd( &x_n[1] ); x_n_2 = _mm256_broadcast_sd( &x_n[2] ); x_n_3 = _mm256_broadcast_sd( &x_n[3] ); if(alg==-1) // TODO xor { x_n_0 = _mm256_sub_pd( zeros, x_n_0 ); x_n_1 = _mm256_sub_pd( zeros, x_n_1 ); x_n_2 = _mm256_sub_pd( zeros, x_n_2 ); x_n_3 = _mm256_sub_pd( zeros, x_n_3 ); } y_t_0 = _mm256_setzero_pd(); y_t_1 = _mm256_setzero_pd(); y_t_2 = _mm256_setzero_pd(); y_t_3 = _mm256_setzero_pd(); #if 0 sx_n_0 = _mm256_castpd256_pd128( x_n_0 ); sx_n_1 = _mm256_castpd256_pd128( x_n_1 ); sx_n_2 = _mm256_castpd256_pd128( x_n_2 ); sx_n_3 = _mm256_castpd256_pd128( x_n_3 ); sy_t_0 = _mm256_castpd256_pd128( y_t_0 ); sy_t_1 = _mm256_castpd256_pd128( y_t_1 ); sy_t_2 = _mm256_castpd256_pd128( y_t_2 ); sy_t_3 = _mm256_castpd256_pd128( y_t_3 ); k = bs*(ka/bs); sA = A + (ka/bs)*sda*bs; sy_n = y_n + (ka/bs)*bs; sx_t = x_t + (ka/bs)*bs; for(; k<ka; k++) { sy_n_0 = _mm_load_sd( &sy_n[0] ); sx_t_0 = _mm_load_sd( &sx_t[0] ); sa_00 = _mm_load_sd( &sA[0+bs*0] ); sa_01 = _mm_load_sd( &sA[0+bs*1] ); sa_02 = _mm_load_sd( &sA[0+bs*2] ); sa_03 = _mm_load_sd( &sA[0+bs*3] ); stemp = _mm_mul_sd( sa_00, sx_n_0 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_00, sx_t_0 ); sy_t_0 = _mm_add_sd( sy_t_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_n_1 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_t_0 ); sy_t_1 = _mm_add_sd( sy_t_1, stemp ); stemp = _mm_mul_sd( sa_02, sx_n_2 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_02, sx_t_0 ); sy_t_2 = _mm_add_sd( sy_t_2, stemp ); stemp = _mm_mul_sd( sa_03, sx_n_3 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_03, sx_t_0 ); sy_t_3 = _mm_add_sd( sy_t_3, stemp ); _mm_store_sd( &sy_n[0], sy_n_0 ); sA += 1; sy_n += 1; sx_t += 1; } y_t_0 = _mm256_castpd128_pd256( sy_t_0 ); y_t_1 = _mm256_castpd128_pd256( sy_t_1 ); y_t_2 = _mm256_castpd128_pd256( sy_t_2 ); y_t_3 = _mm256_castpd128_pd256( sy_t_3 ); #endif k=0; // corner if(tri==1) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 12 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); temp = 
_mm256_blend_pd( zeros, temp, 12 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; k += 4; } for(; k<ka-7; k+=2*bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } for(; k<ka-3; k+=bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } if(k<ka) { k_left = ka-k; v_mask = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) ); i_mask = _mm256_castpd_si256( v_mask ); // 
__builtin_prefetch( A + sda*bs +bs*0 ); // __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 ); // A += sda*bs; // y_n += 4; // z_n += 4; // x_t += 4; } __m256d y_0_1_2_3; y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 ); y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 ); y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2 ); y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 ); y_t_0 = _mm256_add_pd( y_t_0, y_t_1 ); if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } }
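The end of the kernel above folds the four 256-bit accumulators y_t_0..y_t_3 (each holding four partial sums of one output element) into a single vector with one finished sum per lane, using _mm256_hadd_pd plus _mm256_permute2f128_pd. A hypothetical standalone helper (not part of the library) showing the same reduction with more conventional immediates:

#include <immintrin.h>

/* Returns (sum of a's lanes, sum of b's, sum of c's, sum of d's). */
static inline __m256d reduce4_pd(__m256d a, __m256d b, __m256d c, __m256d d)
{
    __m256d ab = _mm256_hadd_pd(a, b);                  /* (a0+a1, b0+b1, a2+a3, b2+b3) */
    __m256d cd = _mm256_hadd_pd(c, d);                  /* (c0+c1, d0+d1, c2+c3, d2+d3) */
    __m256d lo = _mm256_permute2f128_pd(ab, cd, 0x20);  /* low 128-bit halves           */
    __m256d hi = _mm256_permute2f128_pd(ab, cd, 0x31);  /* high 128-bit halves          */
    return _mm256_add_pd(lo, hi);
}

The kernel above (and kernel_dgemv_t_4_lib4 further down) uses immediates 2 and 19 with the operands swapped; the result is the same lane ordering.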
// it moves horizontally inside a block void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg) { if(kmax<=0) return; double *A1 = A0 + 4*sda; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_40_50_60_70, a_41_51_61_71, x_0, x_1, y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3, y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7; /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_4_5_6_7 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_4_5_6_7_b = _mm256_setzero_pd(); */ zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ /* y_0_1_2_3_d = _mm256_setzero_pd();*/ // upper triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A0[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A0[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, y_0_1_2_3_b, 0xc ); // forth col (avoid zero y_0_1_2_3) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // first col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; // upper squared x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); // lower triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A1[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A1[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 ); y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, y_4_5_6_7_b, 0xc ); // forth col (avoid zero y_4_5_6_7) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] ); y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // first col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; k=8; for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A0 + 4*lda );*/ /* __builtin_prefetch( A1 + 4*lda );*/ x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); 
a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); /* __builtin_prefetch( A0 + 5*lda );*/ /* __builtin_prefetch( A1 + 5*lda );*/ x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; } if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 2*lda; A1 += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); /* A0 += 1*lda;*/ /* A1 += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } }
// it moves horizontally inside a block (A upper triangular) void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0, x_1, x_2, x_3, y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3; zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ y_0_1_2_3_d = _mm256_setzero_pd(); // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc ); // forth col (avoid zero y_0_1_2_3) x_3 = _mm256_broadcast_sd( &x[3] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 ); // first col x_2 = _mm256_broadcast_sd( &x[2] ); x_2 = _mm256_blend_pd( x_2, zeros, 0x8 ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 ); A += 4*lda; x += 4; k=4; for(; k<kmax-3; k+=4) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); x_2 = _mm256_broadcast_sd( &x[2] ); x_3 = _mm256_broadcast_sd( &x[3] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 ); y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp ); ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 ); y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp ); A += 4*lda; x += 4; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d ); if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } }
// it moves horizontally inside a block void kernel_dtrmv_u_n_2_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0, x_1, x_2, x_3, y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1; /* y_0_1 = _mm_setzero_pd(); */ // second col (avoid zero y_0_1) x_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, x_0 ); // first col x_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); ax_temp = _mm_mul_sd( a_00_10, x_0 ); y_0_1 = _mm_add_sd( y_0_1, ax_temp ); A += 2*lda; x += 2; k=2; for(; k<kmax-1; k+=2) { x_0 = _mm_loaddup_pd( &x[0] ); x_1 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); a_01_11 = _mm_load_pd( &A[0+lda*1] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_1 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); A += 2*lda; x += 2; } if(kmax%2==1) { x_0 = _mm_loaddup_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); } if(alg==0) { _mm_storeu_pd(&y[0], y_0_1); } else if(alg==1) { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_add_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } else // alg==-1 { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_sub_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } }
// it moves vertically across blocks void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; double *tA, *tx; int k; __m256d tmp0, a_00_10_20_30, x_0_1_2_3, y_00; y_00 = _mm256_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); A += 4 + (sda-1)*lda; x += 4; } __m128d tm0, a_00_10, a_01_11, x_0_1, y_0, y_1, y_0_1; tm0 = _mm256_extractf128_pd( y_00, 0x1 ); y_0 = _mm256_castpd256_pd128( y_00 ); y_0 = _mm_add_pd( y_0, tm0 ); if(k<kmax-1) { x_0_1 = _mm_loadu_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); tm0 = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd( y_0, tm0 ); A += 2; x += 2; } x_0_1 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tm0 = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd( y_0, tm0 ); y_0 = _mm_hadd_pd( y_0, y_0 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
// it moves horizontally inside a block void kernel_dgemv_n_1_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d ax_temp, a_00, a_01, a_02, a_03, x_0, x_1, x_2, x_3, y_0, y_0_b, y_0_c, y_0_d, z_0; y_0 = _mm_setzero_pd(); y_0_b = _mm_setzero_pd(); y_0_c = _mm_setzero_pd(); y_0_d = _mm_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0 = _mm_load_sd( &x[0] ); x_1 = _mm_load_sd( &x[1] ); a_00 = _mm_load_sd( &A[0+lda*0] ); a_01 = _mm_load_sd( &A[0+lda*1] ); x_2 = _mm_load_sd( &x[2] ); x_3 = _mm_load_sd( &x[3] ); a_02 = _mm_load_sd( &A[0+lda*2] ); a_03 = _mm_load_sd( &A[0+lda*3] ); /* y_0 += a_00 * x_0;*/ ax_temp = _mm_mul_sd( a_00, x_0 ); y_0 = _mm_add_sd( y_0, ax_temp ); /* y_0 += a_01 * x_1;*/ ax_temp = _mm_mul_sd( a_01, x_1 ); y_0_b = _mm_add_sd( y_0_b, ax_temp ); /* y_0 += a_02 * x_2;*/ ax_temp = _mm_mul_sd( a_02, x_2 ); y_0_c = _mm_add_sd( y_0_c, ax_temp ); /* y_0 += a_03 * x_3;*/ ax_temp = _mm_mul_sd( a_03, x_3 ); y_0_d = _mm_add_sd( y_0_d, ax_temp ); A += 4*lda; x += 4; } y_0 = _mm_add_pd( y_0, y_0_c ); y_0_b = _mm_add_pd( y_0_b, y_0_d ); if(kmax%4>=2) { x_0 = _mm_load_sd( &x[0] ); x_1 = _mm_load_sd( &x[1] ); a_00 = _mm_load_sd( &A[0+lda*0] ); a_01 = _mm_load_sd( &A[0+lda*1] ); /* y_0 += a_00 * x_0;*/ ax_temp = _mm_mul_sd( a_00, x_0 ); y_0 = _mm_add_sd( y_0, ax_temp ); /* y_0 += a_01 * x_1;*/ ax_temp = _mm_mul_sd( a_01, x_1 ); y_0_b = _mm_add_sd( y_0_b, ax_temp ); A += 2*lda; x += 2; } y_0 = _mm_add_pd( y_0, y_0_b ); if(kmax%2==1) { x_0 = _mm_load_sd( &x[0] ); a_00 = _mm_load_sd( &A[0+lda*0] ); /* y_0 += a_00 * x_0;*/ ax_temp = _mm_mul_sd( a_00, x_0 ); y_0 = _mm_add_sd( y_0, ax_temp ); /* A += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { z_0 = _mm_load_sd( &y[0] ); /* z_0 += y_0;*/ z_0 = _mm_add_sd( z_0, y_0 ); _mm_store_sd(&y[0], z_0); } else // alg==-1 { z_0 = _mm_load_sd( &y[0] ); /* z_0 -= y_0;*/ z_0 = _mm_sub_sd( z_0, y_0 ); _mm_store_sd(&y[0], z_0); } }
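kernel_dgemv_n_1_lib4 above keeps four independent accumulators (y_0, y_0_b, y_0_c, y_0_d) and merges them only after the loop, which shortens the dependency chain through the adds. A scalar sketch of the same idea (illustrative, not library code):

/* dot product with four accumulators to break the add dependency chain */
static double dot_4acc(int n, const double *a, const double *b)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    int k = 0;
    for ( ; k < n - 3; k += 4) {
        s0 += a[k + 0] * b[k + 0];
        s1 += a[k + 1] * b[k + 1];
        s2 += a[k + 2] * b[k + 2];
        s3 += a[k + 3] * b[k + 3];
    }
    for ( ; k < n; k += 1)          /* remainder */
        s0 += a[k] * b[k];
    return (s0 + s2) + (s1 + s3);   /* pairwise merge, as the kernel does */
}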
void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda1, const TYPE *X, TYPE *Y) {/* BEGIN GEMV: nMU=1, MU=2, NU=8 */ ATL_INT i, j; ATL_CINT MAp = ((((size_t)A)&0xF) || M==1) ? 1 : 2; ATL_CINT MA = M - MAp; #define A0 A const TYPE *A1=A0+lda1, *A2=A1+lda1, *A3=A2+lda1, *A4=A3+lda1, *A5=A4+lda1, *A6=A5+lda1, *A7=A6+lda1; ATL_CINT M2=((((((MA) >> 1)) << 1)))+MAp, N8=(((((N) >> 3)) << 3)), lda8=(((lda1) << 3)); __m128d x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, a0_0, a0_1, a0_2, a0_3, a0_4, a0_5, a0_6, a0_7; if (!M || !N) return; for (j=0; j < N8; j += 8, A0 += lda8, A1 += lda8, A2 += lda8, A3 += lda8, A4 += lda8, A5 += lda8, A6 += lda8, A7 += lda8) {/* BEGIN N-LOOP UR=8 */ if (MAp != 1) {/* peel to zero Y */ i=0; x0 = _mm_load_pd(X+i+0); y0 = _mm_load_pd(A0+i); y0 = _mm_mul_pd(y0, x0); y1 = _mm_load_pd(A1+i); y1 = _mm_mul_pd(y1, x0); y2 = _mm_load_pd(A2+i); y2 = _mm_mul_pd(y2, x0); y3 = _mm_load_pd(A3+i); y3 = _mm_mul_pd(y3, x0); y4 = _mm_load_pd(A4+i); y4 = _mm_mul_pd(y4, x0); y5 = _mm_load_pd(A5+i); y5 = _mm_mul_pd(y5, x0); y6 = _mm_load_pd(A6+i); y6 = _mm_mul_pd(y6, x0); y7 = _mm_load_pd(A7+i); y7 = _mm_mul_pd(y7, x0); } /* end zero Y peel */ else /* if (MAp == 1)*/ {/* peel to force X/A alignment, zero Y */ i=0; x0 = _mm_load_sd(X+i+0); y0 = _mm_load_sd(A0+i); y0 = _mm_mul_sd(y0, x0); y1 = _mm_load_sd(A1+i); y1 = _mm_mul_sd(y1, x0); y2 = _mm_load_sd(A2+i); y2 = _mm_mul_sd(y2, x0); y3 = _mm_load_sd(A3+i); y3 = _mm_mul_sd(y3, x0); y4 = _mm_load_sd(A4+i); y4 = _mm_mul_sd(y4, x0); y5 = _mm_load_sd(A5+i); y5 = _mm_mul_sd(y5, x0); y6 = _mm_load_sd(A6+i); y6 = _mm_mul_sd(y6, x0); y7 = _mm_load_sd(A7+i); y7 = _mm_mul_sd(y7, x0); } /* end force-align/zeroY peel */ for (i=MAp; i < M2; i += 2) {/* ----- BEGIN M-LOOP BODY ----- */ /* --- BEGIN MUxNU UNROLL 0 --- */ x0 = _mm_load_pd(X+i+0); a0_0 = _mm_load_pd(A0+i); a0_0 = _mm_mul_pd(a0_0, x0); y0 = _mm_add_pd(y0, a0_0); a0_1 = _mm_load_pd(A1+i); a0_1 = _mm_mul_pd(a0_1, x0); y1 = _mm_add_pd(y1, a0_1); a0_2 = _mm_load_pd(A2+i); a0_2 = _mm_mul_pd(a0_2, x0); y2 = _mm_add_pd(y2, a0_2); a0_3 = _mm_load_pd(A3+i); a0_3 = _mm_mul_pd(a0_3, x0); y3 = _mm_add_pd(y3, a0_3); a0_4 = _mm_load_pd(A4+i); a0_4 = _mm_mul_pd(a0_4, x0); y4 = _mm_add_pd(y4, a0_4); a0_5 = _mm_load_pd(A5+i); a0_5 = _mm_mul_pd(a0_5, x0); y5 = _mm_add_pd(y5, a0_5); a0_6 = _mm_load_pd(A6+i); a0_6 = _mm_mul_pd(a0_6, x0); y6 = _mm_add_pd(y6, a0_6); a0_7 = _mm_load_pd(A7+i); a0_7 = _mm_mul_pd(a0_7, x0); y7 = _mm_add_pd(y7, a0_7); /* --- END MUxNU UNROLL 0 --- */ }/* ----- END M-LOOP BODY ----- */ if (M != M2) {/* ----- BEGIN SCALAR M CLEANUP ----- */ x0 = _mm_load_sd(X+i+0); a0_0 = _mm_load_sd(A0+i); a0_0 = _mm_mul_sd(a0_0, x0); y0 = _mm_add_sd(y0, a0_0); a0_1 = _mm_load_sd(A1+i); a0_1 = _mm_mul_sd(a0_1, x0); y1 = _mm_add_sd(y1, a0_1); a0_2 = _mm_load_sd(A2+i); a0_2 = _mm_mul_sd(a0_2, x0); y2 = _mm_add_sd(y2, a0_2); a0_3 = _mm_load_sd(A3+i); a0_3 = _mm_mul_sd(a0_3, x0); y3 = _mm_add_sd(y3, a0_3); a0_4 = _mm_load_sd(A4+i); a0_4 = _mm_mul_sd(a0_4, x0); y4 = _mm_add_sd(y4, a0_4); a0_5 = _mm_load_sd(A5+i); a0_5 = _mm_mul_sd(a0_5, x0); y5 = _mm_add_sd(y5, a0_5); a0_6 = _mm_load_sd(A6+i); a0_6 = _mm_mul_sd(a0_6, x0); y6 = _mm_add_sd(y6, a0_6); a0_7 = _mm_load_sd(A7+i); a0_7 = _mm_mul_sd(a0_7, x0); y7 = _mm_add_sd(y7, a0_7); }/* ----- END SCALAR M CLEANUP ----- */ _my_hadd_pd(y0, y1); #ifndef BETA0 a0_0 = _mm_load_pd(Y+j+0); y0 = _mm_add_pd(y0, a0_0); #endif _mm_store_pd(Y+j+0, y0); _my_hadd_pd(y2, y3); #ifndef BETA0 a0_1 = _mm_load_pd(Y+j+2); y2 = _mm_add_pd(y2, a0_1); #endif 
_mm_store_pd(Y+j+2, y2); _my_hadd_pd(y4, y5); #ifndef BETA0 a0_2 = _mm_load_pd(Y+j+4); y4 = _mm_add_pd(y4, a0_2); #endif _mm_store_pd(Y+j+4, y4); _my_hadd_pd(y6, y7); #ifndef BETA0 a0_3 = _mm_load_pd(Y+j+6); y6 = _mm_add_pd(y6, a0_3); #endif _mm_store_pd(Y+j+6, y6); }/* END N-LOOP UR=8 */ for (j=N8; j < N; j++, A0 += lda1) {/* BEGIN N-LOOP UR=1 */ if (MAp != 1) {/* peel to zero Y */ i=0; x0 = _mm_load_pd(X+i+0); y0 = _mm_load_pd(A0+i); y0 = _mm_mul_pd(y0, x0); } /* end zero Y peel */ else /* if (MAp == 1)*/ {/* peel to force X/A alignment, zero Y */ i=0; x0 = _mm_load_sd(X+i+0); y0 = _mm_load_sd(A0+i); y0 = _mm_mul_sd(y0, x0); } /* end force-align/zeroY peel */ for (i=MAp; i < M2; i += 2) {/* ----- BEGIN M-LOOP BODY ----- */ /* --- BEGIN MUxNU UNROLL 0 --- */ x0 = _mm_load_pd(X+i+0); a0_0 = _mm_load_pd(A0+i); a0_0 = _mm_mul_pd(a0_0, x0); y0 = _mm_add_pd(y0, a0_0); /* --- END MUxNU UNROLL 0 --- */ }/* ----- END M-LOOP BODY ----- */ if (M != M2) {/* ----- BEGIN SCALAR M CLEANUP ----- */ x0 = _mm_load_sd(X+i+0); a0_0 = _mm_load_sd(A0+i); a0_0 = _mm_mul_sd(a0_0, x0); y0 = _mm_add_sd(y0, a0_0); }/* ----- END SCALAR M CLEANUP ----- */ _my_hadd_pd(y0, y0); #ifndef BETA0 a0_0 = _mm_load_sd(Y+j+0); y0 = _mm_add_sd(y0, a0_0); #endif _mm_store_sd(Y+j+0, y0); }/* END N-LOOP UR=1 */ }/* END GEMV: nMU=1, MU=2, NU=8 */
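The ATLAS kernel above relies on a macro _my_hadd_pd(d, s) whose definition is not shown here; from its use immediately before the packed stores to Y it apparently must leave (d[0]+d[1], s[0]+s[1]) in d. One possible definition on SSE3, offered purely as an assumption about that contract:

#include <pmmintrin.h>   /* SSE3 haddpd */

/* Hypothetical: combine two accumulators into one packed pair of sums. */
#define _my_hadd_pd(d_, s_)  ((d_) = _mm_hadd_pd((d_), (s_)))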
void nb_kernel430_ia32_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * vc, int * type, int * p_ntype, double * vdwparam, double * vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads; int n,ii,is3,ii3,k,nj0,nj1,ggid; double shX,shY,shZ; int offset,nti; int jnrA,jnrB; int j3A,j3B; int tjA,tjB; gmx_gbdata_t *gbdata; double * gpol; __m128d iq,qq,jq,isai; __m128d ix,iy,iz; __m128d jx,jy,jz; __m128d dx,dy,dz; __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor; __m128d fix,fiy,fiz,tx,ty,tz,rsq; __m128d rinv,isaj,isaprod; __m128d vcoul,fscal,gbscale,c6,c12; __m128d rinvsq,r,rtab; __m128d eps,Y,F,G,H; __m128d VV,FF,Fp; __m128d vgb,fijGB,dvdatmp; __m128d rinvsix,vvdw6,vvdw12,vvdwtmp; __m128d facel,gbtabscale,dvdaj; __m128d fijD,fijR; __m128d xmm1,tabscale,eps2; __m128i n0, nnn; const __m128d neg = _mm_set1_pd(-1.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d minushalf = _mm_set1_pd(-0.5); const __m128d two = _mm_set1_pd(2.0); gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent))); gbtabscale = _mm_load1_pd(p_gbtabscale); facel = _mm_load1_pd(p_facel); tabscale = _mm_load1_pd(p_tabscale); nj1 = 0; jnrA = jnrB = 0; j3A = j3B = 0; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); c6 = _mm_setzero_pd(); c12 = _mm_setzero_pd(); for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; ii = iinr[n]; ii3 = 3*ii; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shY+pos[ii3+1]); iz = _mm_set1_pd(shZ+pos[ii3+2]); iq = _mm_load1_pd(charge+ii); iq = _mm_mul_pd(iq,facel); isai = _mm_load1_pd(invsqrta+ii); nti = 2*ntype*type[ii]; vctot = _mm_setzero_pd(); vvdwtot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); for(k=nj0;k<nj1-1; k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = jnrA * 3; j3B = jnrB * 3; GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz); dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinvsq = _mm_mul_pd(rinv,rinv); /***********************************/ /* INTERACTION SECTION STARTS HERE */ /***********************************/ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq); GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj); /* Lennard-Jones */ tjA = nti+2*type[jnrA]; tjB = nti+2*type[jnrB]; GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12); isaprod = _mm_mul_pd(isai,isaj); qq = _mm_mul_pd(iq,jq); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); vctot = _mm_add_pd(vctot,vcoul); /* Polarization interaction */ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor)); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Calculate GB table index */ r = _mm_mul_pd(rsq,rinv); rtab = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0)); nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_pd */ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); F = 
_mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) ); F = _mm_add_pd(F, _mm_add_pd( G , H ) ); Y = _mm_add_pd(Y, _mm_mul_pd(F, eps)); F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two))); vgb = _mm_mul_pd(Y, qq); fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale)); dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf); vgbtot = _mm_add_pd(vgbtot, vgb); dvdasum = _mm_add_pd(dvdasum, dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj)); GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp); /* Calculate VDW table index */ rtab = _mm_mul_pd(r,tabscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0)); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Dispersion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); vvdw6 = _mm_mul_pd(c6,VV); fijD = _mm_mul_pd(c6,FF); /* Dispersion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); vvdw12 = _mm_mul_pd(c12,VV); fijR = _mm_mul_pd(c12,FF); vvdwtmp = _mm_add_pd(vvdw12,vvdw6); vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); xmm1 = _mm_add_pd(fijD,fijR); xmm1 = _mm_mul_pd(xmm1,tabscale); xmm1 = _mm_add_pd(xmm1,fijGB); xmm1 = _mm_sub_pd(xmm1,fscal); fscal = _mm_mul_pd(xmm1,neg); fscal = _mm_mul_pd(fscal,rinv); /***********************************/ /* INTERACTION SECTION ENDS HERE */ /***********************************/ /* Calculate temporary vectorial force */ tx = _mm_mul_pd(fscal,dx); ty = _mm_mul_pd(fscal,dy); tz = _mm_mul_pd(fscal,dz); /* Increment i atom force */ fix = _mm_add_pd(fix,tx); fiy = _mm_add_pd(fiy,ty); fiz = _mm_add_pd(fiz,tz); /* Store j forces back */ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz); } /* In double precision, offset can only be either 0 or 1 */ if(k<nj1) { jnrA = jjnr[k]; j3A = jnrA * 3; GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinvsq = _mm_mul_sd(rinv,rinv); /* These reason for zeroing these variables here is for fixing bug 585 * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0], * and r1=0, but it should be r1=a[1]. * This might be a compiler issue (tested with gcc-4.1.3 and -O3). 
* To work around it, we zero these variables and use _mm_add_pd (**) instead * Note that the only variables that get affected are the energies since * the total sum needs to be correct */ vgb = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); dvdatmp = _mm_setzero_pd(); vvdw6 = _mm_setzero_pd(); vvdw12 = _mm_setzero_pd(); /***********************************/ /* INTERACTION SECTION STARTS HERE */ /***********************************/ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq); GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj); /* Lennard-Jones */ tjA = nti+2*type[jnrA]; GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12); isaprod = _mm_mul_sd(isai,isaj); qq = _mm_mul_sd(jq,iq); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); vctot = _mm_add_pd(vctot,vcoul); /* (**) */ /* Polarization interaction */ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor)); gbscale = _mm_mul_sd(isaprod,gbtabscale); /* Calculate GB table index */ r = _mm_mul_sd(rsq,rinv); rtab = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0)); nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_pd */ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) ); F = _mm_add_sd(F, _mm_add_sd( G , H ) ); Y = _mm_add_sd(Y, _mm_mul_sd(F, eps)); F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two))); vgb = _mm_mul_sd(Y, qq); fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale)); dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf); vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */ dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj)); GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp); /* Calculate VDW table index */ rtab = _mm_mul_sd(r,tabscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0)); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Dispersion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); vvdw6 = _mm_mul_sd(c6,VV); fijD = _mm_mul_sd(c6,FF); /* Dispersion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); F = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); H = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); vvdw12 = _mm_mul_sd(c12,VV); fijR = _mm_mul_sd(c12,FF); vvdwtmp = _mm_add_sd(vvdw12,vvdw6); vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */ xmm1 = _mm_add_sd(fijD,fijR); xmm1 = _mm_mul_sd(xmm1,tabscale); xmm1 = _mm_add_sd(xmm1,fijGB); xmm1 = _mm_sub_sd(xmm1,fscal); fscal = _mm_mul_sd(xmm1,neg); fscal = _mm_mul_sd(fscal,rinv); /***********************************/ /* INTERACTION SECTION ENDS HERE */ /***********************************/ /* Calculate temporary vectorial force */ tx = _mm_mul_sd(fscal,dx); ty = _mm_mul_sd(fscal,dy); tz 
= _mm_mul_sd(fscal,dz); /* Increment i atom force */ fix = _mm_add_sd(fix,tx); fiy = _mm_add_sd(fiy,ty); fiz = _mm_add_sd(fiz,tz); /* Store j forces back */ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz); } dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai)); gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3); ggid = gid[n]; gmx_mm_update_1pot_pd(vctot,vc+ggid); gmx_mm_update_1pot_pd(vgbtot,gpol+ggid); gmx_mm_update_1pot_pd(dvdasum,dvda+ii); gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid); } *outeriter = nri; *inneriter = nj1; }
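The Y/F/G/H loads in nb_kernel430_ia32_sse2 above implement GROMACS' cubic-spline table lookup: with eps the fractional offset within a table slot and (Y, F, G, H) the stored coefficients, the code evaluates the energy as V = Y + eps*(F + eps*G + eps*eps*H) and its derivative with respect to the table coordinate as F + 2*eps*G + 3*eps*eps*H (later scaled by tabscale to give a force). The intermediate Fp, VV and FF registers are exactly those two polynomials, with eps already folded into G and H.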
void kernel_dgemv_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0_1, y_0, y_1, y_2, y_3; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); y_1 = _mm256_castpd256_pd128(y_11); y_2 = _mm256_castpd256_pd128(y_22); y_3 = _mm256_castpd256_pd128(y_33); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; for(; k<ka; k++) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); a_01_11 = _mm_load_sd( &tA[0+lda*1] ); a_02_12 = _mm_load_sd( &tA[0+lda*2] ); a_03_13 = _mm_load_sd( &tA[0+lda*3] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_1 = _mm_add_sd (y_1, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_2 = _mm_add_sd (y_2, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_3 = _mm_add_sd (y_3, ax_temp ); tA += 1; tx += 1; } y_00 = _mm256_castpd128_pd256(y_0); y_11 = _mm256_castpd128_pd256(y_1); y_22 = _mm256_castpd128_pd256(y_2); y_33 = _mm256_castpd128_pd256(y_3); k=0; for(; k<ka-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } for(; k<ka-3; k+=4) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); 
aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } __m256d y_0_1_2_3; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_00 = _mm256_add_pd( y_00, y_11 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } }
int calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *x, double *f, double *fshift, double *shiftvec, int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md) { int i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1; int jnrA,jnrB; int j3A,j3B; int * jjnr; double rbi,shX,shY,shZ; double *rb; __m128d ix,iy,iz; __m128d jx,jy,jz; __m128d fix,fiy,fiz; __m128d dx,dy,dz; __m128d tx,ty,tz; __m128d rbai,rbaj,f_gb, f_gb_ai; __m128d xmm1,xmm2,xmm3; const __m128d two = _mm_set1_pd(2.0); rb = born->work; jjnr = nl->jjnr; /* Loop to get the proper form for the Born radius term, sse style */ n0 = 0; n1 = natoms; if(gb_algorithm==egbSTILL) { for(i=n0;i<n1;i++) { rbi = born->bRad[i]; rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0; } } else if(gb_algorithm==egbHCT) { for(i=n0;i<n1;i++) { rbi = born->bRad[i]; rb[i] = rbi * rbi * dvda[i]; } } else if(gb_algorithm==egbOBC) { for(i=n0;i<n1;i++) { rbi = born->bRad[i]; rb[i] = rbi * rbi * born->drobc[i] * dvda[i]; } } jz = _mm_setzero_pd(); n = j3A = j3B = 0; for(i=0;i<nl->nri;i++) { ii = nl->iinr[i]; ii3 = ii*3; is3 = 3*nl->shift[i]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = nl->jindex[i]; nj1 = nl->jindex[i+1]; ix = _mm_set1_pd(shX+x[ii3+0]); iy = _mm_set1_pd(shY+x[ii3+1]); iz = _mm_set1_pd(shZ+x[ii3+2]); rbai = _mm_load1_pd(rb+ii); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); for(k=nj0;k<nj1-1;k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = 3*jnrA; j3B = 3*jnrB; GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz); dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj); /* load chain rule terms for j1-4 */ f_gb = _mm_load_pd(dadx); dadx += 2; f_gb_ai = _mm_load_pd(dadx); dadx += 2; /* calculate scalar force */ f_gb = _mm_mul_pd(f_gb,rbai); f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj); f_gb = _mm_add_pd(f_gb,f_gb_ai); tx = _mm_mul_pd(f_gb,dx); ty = _mm_mul_pd(f_gb,dy); tz = _mm_mul_pd(f_gb,dz); fix = _mm_add_pd(fix,tx); fiy = _mm_add_pd(fiy,ty); fiz = _mm_add_pd(fiz,tz); GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz); } /*deal with odd elements */ if(k<nj1) { jnrA = jjnr[k]; j3A = 3*jnrA; GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj); /* load chain rule terms */ f_gb = _mm_load_pd(dadx); dadx += 2; f_gb_ai = _mm_load_pd(dadx); dadx += 2; /* calculate scalar force */ f_gb = _mm_mul_sd(f_gb,rbai); f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj); f_gb = _mm_add_sd(f_gb,f_gb_ai); tx = _mm_mul_sd(f_gb,dx); ty = _mm_mul_sd(f_gb,dy); tz = _mm_mul_sd(f_gb,dz); fix = _mm_add_sd(fix,tx); fiy = _mm_add_sd(fiy,ty); fiz = _mm_add_sd(fiz,tz); GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz); } /* fix/fiy/fiz now contain four partial force terms, that all should be * added to the i particle forces and shift forces. */ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3); } return 0; }
int calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top, const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born) { int i,k,n,ii,is3,ii3,nj0,nj1,offset; int jnrA,jnrB,j3A,j3B; int *mdtype; double shX,shY,shZ; int *jjnr; double *shiftvec; double gpi_ai,gpi2; double factor; double *gb_radius; double *vsolv; double *work; double *dadx; __m128d ix,iy,iz; __m128d jx,jy,jz; __m128d dx,dy,dz; __m128d tx,ty,tz; __m128d rsq,rinv,rinv2,rinv4,rinv6; __m128d ratio,gpi,rai,raj,vai,vaj,rvdw; __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp; __m128d mask,icf4,icf6,mask_cmp; const __m128d half = _mm_set1_pd(0.5); const __m128d three = _mm_set1_pd(3.0); const __m128d one = _mm_set1_pd(1.0); const __m128d two = _mm_set1_pd(2.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d four = _mm_set1_pd(4.0); const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV); const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5); const __m128d still_p4 = _mm_set1_pd(STILL_P4); factor = 0.5 * ONE_4PI_EPS0; gb_radius = born->gb_radius; vsolv = born->vsolv; work = born->gpol_still_work; jjnr = nl->jjnr; shiftvec = fr->shift_vec[0]; dadx = fr->dadx; jnrA = jnrB = 0; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); n = 0; for(i=0;i<natoms;i++) { work[i]=0; } for(i=0;i<nl->nri;i++) { ii = nl->iinr[i]; ii3 = ii*3; is3 = 3*nl->shift[i]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = nl->jindex[i]; nj1 = nl->jindex[i+1]; ix = _mm_set1_pd(shX+x[ii3+0]); iy = _mm_set1_pd(shY+x[ii3+1]); iz = _mm_set1_pd(shZ+x[ii3+2]); /* Polarization energy for atom ai */ gpi = _mm_setzero_pd(); rai = _mm_load1_pd(gb_radius+ii); prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]); for(k=nj0;k<nj1-1;k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = 3*jnrA; j3B = 3*jnrB; GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz); GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj); GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj); dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinv2 = _mm_mul_pd(rinv,rinv); rinv4 = _mm_mul_pd(rinv2,rinv2); rinv6 = _mm_mul_pd(rinv4,rinv2); rvdw = _mm_add_pd(rai,raj); ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw))); mask_cmp = _mm_cmple_pd(ratio,still_p5inv); /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! 
*/ if( 0 == _mm_movemask_pd(mask_cmp) ) { /* if ratio>still_p5inv for ALL elements */ ccf = one; dccf = _mm_setzero_pd(); } else { ratio = _mm_min_pd(ratio,still_p5inv); theta = _mm_mul_pd(ratio,still_pip5); gmx_mm_sincos_pd(theta,&sinq,&cosq); term = _mm_mul_pd(half,_mm_sub_pd(one,cosq)); ccf = _mm_mul_pd(term,term); dccf = _mm_mul_pd(_mm_mul_pd(two,term), _mm_mul_pd(sinq,theta)); } prod = _mm_mul_pd(still_p4,vaj); icf4 = _mm_mul_pd(ccf,rinv4); icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6); GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4)); gpi = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) ); _mm_store_pd(dadx,_mm_mul_pd(prod,icf6)); dadx+=2; _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6)); dadx+=2; } if(k<nj1) { jnrA = jjnr[k]; j3A = 3*jnrA; GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz); GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj); GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinv2 = _mm_mul_sd(rinv,rinv); rinv4 = _mm_mul_sd(rinv2,rinv2); rinv6 = _mm_mul_sd(rinv4,rinv2); rvdw = _mm_add_sd(rai,raj); ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw))); mask_cmp = _mm_cmple_sd(ratio,still_p5inv); /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */ if( 0 == _mm_movemask_pd(mask_cmp) ) { /* if ratio>still_p5inv for ALL elements */ ccf = one; dccf = _mm_setzero_pd(); } else { ratio = _mm_min_sd(ratio,still_p5inv); theta = _mm_mul_sd(ratio,still_pip5); gmx_mm_sincos_pd(theta,&sinq,&cosq); term = _mm_mul_sd(half,_mm_sub_sd(one,cosq)); ccf = _mm_mul_sd(term,term); dccf = _mm_mul_sd(_mm_mul_sd(two,term), _mm_mul_sd(sinq,theta)); } prod = _mm_mul_sd(still_p4,vaj); icf4 = _mm_mul_sd(ccf,rinv4); icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6); GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4)); gpi = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) ); _mm_store_pd(dadx,_mm_mul_pd(prod,icf6)); dadx+=2; _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6)); dadx+=2; } gmx_mm_update_1pot_pd(gpi,work+ii); } /* Sum up the polarization energy from other nodes */ if(PARTDECOMP(cr)) { gmx_sum(natoms, work, cr); } else if(DOMAINDECOMP(cr)) { dd_atom_sum_real(cr->dd, work); } /* Compute the radii */ for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */ { if(born->use[i] != 0) { gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/ gpi2 = gpi_ai * gpi_ai; born->bRad[i] = factor*gmx_invsqrt(gpi2); fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); } } /* Extra (local) communication required for DD */ if(DOMAINDECOMP(cr)) { dd_atom_spread_real(cr->dd, born->bRad); dd_atom_spread_real(cr->dd, fr->invsqrta); } return 0; }
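Both inner loops of calc_gb_rad_still_sse2_double above test _mm_movemask_pd on the comparison mask and call the expensive gmx_mm_sincos_pd only when at least one lane actually needs it. A self-contained sketch of that branch-skip idiom, with _mm_sqrt_pd standing in for the expensive per-lane function:

#include <emmintrin.h>

static __m128d ccf_or_one(__m128d ratio, __m128d threshold)
{
    __m128d mask = _mm_cmple_pd(ratio, threshold);     /* lanes that need the full evaluation */
    if (_mm_movemask_pd(mask) == 0)                    /* predicate false in every lane...    */
        return _mm_set1_pd(1.0);                       /* ...so return the cheap constant     */
    return _mm_sqrt_pd(_mm_min_pd(ratio, threshold));  /* otherwise take the expensive path   */
}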
int calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top, const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm) { int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset; int jnrA,jnrB; int j3A,j3B; double shX,shY,shZ; double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr; double sum_ai2, sum_ai3,tsum,tchain,doffset; double *obc_param; double *gb_radius; double *work; int * jjnr; double *dadx; double *shiftvec; double min_rad,rad; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3,t4; __m128d rsq,rinv,r; __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij; __m128d uij,lij2,uij2,lij3,uij3,diff2; __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum; __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv; __m128d dadx1,dadx2; __m128d logterm; __m128d mask; __m128d obc_mask1,obc_mask2,obc_mask3; __m128d oneeighth = _mm_set1_pd(0.125); __m128d onefourth = _mm_set1_pd(0.25); const __m128d half = _mm_set1_pd(0.5); const __m128d three = _mm_set1_pd(3.0); const __m128d one = _mm_set1_pd(1.0); const __m128d two = _mm_set1_pd(2.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d neg = _mm_set1_pd(-1.0); /* Set the dielectric offset */ doffset = born->gb_doffset; gb_radius = born->gb_radius; obc_param = born->param; work = born->gpol_hct_work; jjnr = nl->jjnr; dadx = fr->dadx; shiftvec = fr->shift_vec[0]; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); jnrA = jnrB = 0; for(i=0;i<born->nr;i++) { work[i] = 0; } for(i=0;i<nl->nri;i++) { ii = nl->iinr[i]; ii3 = ii*3; is3 = 3*nl->shift[i]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = nl->jindex[i]; nj1 = nl->jindex[i+1]; ix = _mm_set1_pd(shX+x[ii3+0]); iy = _mm_set1_pd(shY+x[ii3+1]); iz = _mm_set1_pd(shZ+x[ii3+2]); rai = _mm_load1_pd(gb_radius+ii); rai_inv= gmx_mm_inv_pd(rai); sum_ai = _mm_setzero_pd(); sk_ai = _mm_load1_pd(born->param+ii); sk2_ai = _mm_mul_pd(sk_ai,sk_ai); for(k=nj0;k<nj1-1;k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = 3*jnrA; j3B = 3*jnrB; GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz); GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj); GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj); dx = _mm_sub_pd(ix, jx); dy = _mm_sub_pd(iy, jy); dz = _mm_sub_pd(iz, jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); r = _mm_mul_pd(rsq,rinv); /* Compute raj_inv aj1-4 */ raj_inv = gmx_mm_inv_pd(raj); /* Evaluate influence of atom aj -> ai */ t1 = _mm_add_pd(r,sk_aj); t2 = _mm_sub_pd(r,sk_aj); t3 = _mm_sub_pd(sk_aj,r); obc_mask1 = _mm_cmplt_pd(rai, t1); obc_mask2 = _mm_cmplt_pd(rai, t2); obc_mask3 = _mm_cmplt_pd(rai, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,rai_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_pd(uij, uij); uij3 = _mm_mul_pd(uij2,uij); lij2 = _mm_mul_pd(lij, lij); lij3 = _mm_mul_pd(lij2,lij); diff2 = _mm_sub_pd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_aj = _mm_mul_pd(sk_aj,sk_aj); sk2_rinv = _mm_mul_pd(sk2_aj,rinv); prod = _mm_mul_pd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv)); t1 = _mm_sub_pd(lij,uij); t2 = _mm_mul_pd(diff2, _mm_sub_pd(_mm_mul_pd(onefourth,r), prod)); t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm)); t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3)); t4 = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4)); sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) ); t1 = 
_mm_add_pd(_mm_mul_pd(half,lij2), _mm_mul_pd(prod,lij3)); t1 = _mm_sub_pd(t1, _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(lij,rinv), _mm_mul_pd(lij3,r)))); t2 = _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(uij,rinv), _mm_mul_pd(uij3,r))); t2 = _mm_sub_pd(t2, _mm_add_pd(_mm_mul_pd(half,uij2), _mm_mul_pd(prod,uij3))); t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm), _mm_mul_pd(rinv,rinv)); t3 = _mm_sub_pd(t3, _mm_mul_pd(_mm_mul_pd(diff2,oneeighth), _mm_add_pd(one, _mm_mul_pd(sk2_rinv,rinv)))); t1 = _mm_mul_pd(rinv, _mm_add_pd(_mm_mul_pd(dlij,t1), _mm_add_pd(t2,t3))); dadx1 = _mm_and_pd(t1,obc_mask1); /* Evaluate influence of atom ai -> aj */ t1 = _mm_add_pd(r,sk_ai); t2 = _mm_sub_pd(r,sk_ai); t3 = _mm_sub_pd(sk_ai,r); obc_mask1 = _mm_cmplt_pd(raj, t1); obc_mask2 = _mm_cmplt_pd(raj, t2); obc_mask3 = _mm_cmplt_pd(raj, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,raj_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_pd(uij, uij); uij3 = _mm_mul_pd(uij2,uij); lij2 = _mm_mul_pd(lij, lij); lij3 = _mm_mul_pd(lij2,lij); diff2 = _mm_sub_pd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_rinv = _mm_mul_pd(sk2_ai,rinv); prod = _mm_mul_pd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv)); t1 = _mm_sub_pd(lij,uij); t2 = _mm_mul_pd(diff2, _mm_sub_pd(_mm_mul_pd(onefourth,r), prod)); t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm)); t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3)); t4 = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4)); GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1)); t1 = _mm_add_pd(_mm_mul_pd(half,lij2), _mm_mul_pd(prod,lij3)); t1 = _mm_sub_pd(t1, _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(lij,rinv), _mm_mul_pd(lij3,r)))); t2 = _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(uij,rinv), _mm_mul_pd(uij3,r))); t2 = _mm_sub_pd(t2, _mm_add_pd(_mm_mul_pd(half,uij2), _mm_mul_pd(prod,uij3))); t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm), _mm_mul_pd(rinv,rinv)); t3 = _mm_sub_pd(t3, _mm_mul_pd(_mm_mul_pd(diff2,oneeighth), _mm_add_pd(one, _mm_mul_pd(sk2_rinv,rinv)))); t1 = _mm_mul_pd(rinv, _mm_add_pd(_mm_mul_pd(dlij,t1), _mm_add_pd(t2,t3))); dadx2 = _mm_and_pd(t1,obc_mask1); _mm_store_pd(dadx,dadx1); dadx += 2; _mm_store_pd(dadx,dadx2); dadx += 2; } /* end normal inner loop */ if(k<nj1) { jnrA = jjnr[k]; j3A = 3*jnrA; GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz); GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj); GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj); dx = _mm_sub_sd(ix, jx); dy = _mm_sub_sd(iy, jy); dz = _mm_sub_sd(iz, jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); r = _mm_mul_sd(rsq,rinv); /* Compute raj_inv aj1-4 */ raj_inv = gmx_mm_inv_pd(raj); /* Evaluate influence of atom aj -> ai */ t1 = _mm_add_sd(r,sk_aj); t2 = _mm_sub_sd(r,sk_aj); t3 = _mm_sub_sd(sk_aj,r); obc_mask1 = _mm_cmplt_sd(rai, t1); obc_mask2 = _mm_cmplt_sd(rai, t2); obc_mask3 = _mm_cmplt_sd(rai, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,rai_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_sd(uij, uij); uij3 = _mm_mul_sd(uij2,uij); lij2 = _mm_mul_sd(lij, lij); lij3 = _mm_mul_sd(lij2,lij); diff2 = _mm_sub_sd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_aj = _mm_mul_sd(sk_aj,sk_aj); sk2_rinv = _mm_mul_sd(sk2_aj,rinv); prod = _mm_mul_sd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv)); t1 = _mm_sub_sd(lij,uij); t2 = _mm_mul_sd(diff2, 
_mm_sub_sd(_mm_mul_pd(onefourth,r), prod)); t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm)); t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3)); t4 = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4)); sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) ); t1 = _mm_add_sd(_mm_mul_sd(half,lij2), _mm_mul_sd(prod,lij3)); t1 = _mm_sub_sd(t1, _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(lij,rinv), _mm_mul_sd(lij3,r)))); t2 = _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(uij,rinv), _mm_mul_sd(uij3,r))); t2 = _mm_sub_sd(t2, _mm_add_sd(_mm_mul_sd(half,uij2), _mm_mul_sd(prod,uij3))); t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm), _mm_mul_sd(rinv,rinv)); t3 = _mm_sub_sd(t3, _mm_mul_sd(_mm_mul_sd(diff2,oneeighth), _mm_add_sd(one, _mm_mul_sd(sk2_rinv,rinv)))); t1 = _mm_mul_sd(rinv, _mm_add_sd(_mm_mul_sd(dlij,t1), _mm_add_pd(t2,t3))); dadx1 = _mm_and_pd(t1,obc_mask1); /* Evaluate influence of atom ai -> aj */ t1 = _mm_add_sd(r,sk_ai); t2 = _mm_sub_sd(r,sk_ai); t3 = _mm_sub_sd(sk_ai,r); obc_mask1 = _mm_cmplt_sd(raj, t1); obc_mask2 = _mm_cmplt_sd(raj, t2); obc_mask3 = _mm_cmplt_sd(raj, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,raj_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_sd(uij, uij); uij3 = _mm_mul_sd(uij2,uij); lij2 = _mm_mul_sd(lij, lij); lij3 = _mm_mul_sd(lij2,lij); diff2 = _mm_sub_sd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_rinv = _mm_mul_sd(sk2_ai,rinv); prod = _mm_mul_sd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv)); t1 = _mm_sub_sd(lij,uij); t2 = _mm_mul_sd(diff2, _mm_sub_sd(_mm_mul_sd(onefourth,r), prod)); t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm)); t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3)); t4 = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4)); GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1)); t1 = _mm_add_sd(_mm_mul_sd(half,lij2), _mm_mul_sd(prod,lij3)); t1 = _mm_sub_sd(t1, _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(lij,rinv), _mm_mul_sd(lij3,r)))); t2 = _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(uij,rinv), _mm_mul_sd(uij3,r))); t2 = _mm_sub_sd(t2, _mm_add_sd(_mm_mul_sd(half,uij2), _mm_mul_sd(prod,uij3))); t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm), _mm_mul_sd(rinv,rinv)); t3 = _mm_sub_sd(t3, _mm_mul_sd(_mm_mul_sd(diff2,oneeighth), _mm_add_sd(one, _mm_mul_sd(sk2_rinv,rinv)))); t1 = _mm_mul_sd(rinv, _mm_add_sd(_mm_mul_sd(dlij,t1), _mm_add_sd(t2,t3))); dadx2 = _mm_and_pd(t1,obc_mask1); _mm_store_pd(dadx,dadx1); dadx += 2; _mm_store_pd(dadx,dadx2); dadx += 2; } gmx_mm_update_1pot_pd(sum_ai,work+ii); } /* Parallel summations */ if(PARTDECOMP(cr)) { gmx_sum(natoms, work, cr); } else if(DOMAINDECOMP(cr)) { dd_atom_sum_real(cr->dd, work); } if(gb_algorithm==egbHCT) { /* HCT */ for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */ { if(born->use[i] != 0) { rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset; sum = 1.0/rr - work[i]; min_rad = rr + doffset; rad = 1.0/sum; born->bRad[i] = rad > min_rad ? 
rad : min_rad; fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); } } /* Extra communication required for DD */ if(DOMAINDECOMP(cr)) { dd_atom_spread_real(cr->dd, born->bRad); dd_atom_spread_real(cr->dd, fr->invsqrta); } } else { /* OBC */ for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */ { if(born->use[i] != 0) { rr = top->atomtypes.gb_radius[md->typeA[i]]; rr_inv2 = 1.0/rr; rr = rr-doffset; rr_inv = 1.0/rr; sum = rr * work[i]; sum2 = sum * sum; sum3 = sum2 * sum; tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3); born->bRad[i] = rr_inv - tsum*rr_inv2; born->bRad[i] = 1.0 / born->bRad[i]; fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]); tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2); born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2; } } /* Extra (local) communication required for DD */ if(DOMAINDECOMP(cr)) { dd_atom_spread_real(cr->dd, born->bRad); dd_atom_spread_real(cr->dd, fr->invsqrta); dd_atom_spread_real(cr->dd, born->drobc); } } return 0; }
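/* Illustrative sketch (not part of the kernel above): the GB radius code
 * repeatedly uses SSE2 compare masks from _mm_cmplt_pd combined with
 * _mm_and_pd/_mm_andnot_pd as a branchless per-lane select, e.g. when
 * choosing between gmx_mm_inv_pd(t2) and rai_inv via obc_mask2. The
 * helper name sel_pd and the sample values below are made up for
 * illustration only. */
#include <emmintrin.h>
#include <stdio.h>

/* res[lane] = mask[lane] ? a[lane] : b[lane], for each 64-bit lane */
static inline __m128d sel_pd(__m128d mask, __m128d a, __m128d b)
{
    return _mm_or_pd(_mm_and_pd(mask, a), _mm_andnot_pd(mask, b));
}

int main(void)
{
    __m128d r    = _mm_set_pd(2.0, 0.5);     /* lane1 = 2.0, lane0 = 0.5 */
    __m128d rai  = _mm_set1_pd(1.0);
    __m128d mask = _mm_cmplt_pd(rai, r);     /* all-ones where rai < r   */
    __m128d res  = sel_pd(mask, r, rai);     /* per-lane max(rai, r)     */
    double out[2];
    _mm_storeu_pd(out, res);
    printf("%f %f\n", out[0], out[1]);       /* prints 1.000000 2.000000 */
    return 0;
}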
void kernel_dgemv_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, x_0_1_2_3, y_00; __m128d ax_temp, a_00_10, x_0_1, y_0, y_1, y_0_1; y_00 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; for(; k<ka; k++) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); /* y_0 += a_00_10 * x_0_1;*/ ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); tA += 1; tx += 1; } y_00 = _mm256_castpd128_pd256(y_0); k=0; for(; k<ka-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); /* y_00 += a_00_10_20_30 * x_0_1_2_3;*/ aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } y_00 = _mm256_hadd_pd(y_00, y_00); y_1 = _mm256_extractf128_pd(y_00, 1); y_0 = _mm256_castpd256_pd128(y_00); /* y_0 += y_1;*/ y_0 = _mm_add_sd( y_0, y_1 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); /* y_0_1 += y_0;*/ y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); /* y_0_1 -= y_0;*/ y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
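/* Illustrative sketch (not part of the kernel above): the tail of
 * kernel_dgemv_t_1_lib4 reduces the 4-lane accumulator y_00 with
 * _mm256_hadd_pd, an extract of the upper 128 bits and a final
 * _mm_add_sd. The standalone helper below shows the same reduction;
 * the name hsum256_pd is made up for illustration. */
#include <immintrin.h>
#include <stdio.h>

static inline double hsum256_pd(__m256d v)
{
    __m256d h  = _mm256_hadd_pd(v, v);          /* {a+b, a+b, c+d, c+d} */
    __m128d hi = _mm256_extractf128_pd(h, 1);   /* {c+d, c+d}           */
    __m128d lo = _mm256_castpd256_pd128(h);     /* {a+b, a+b}           */
    return _mm_cvtsd_f64(_mm_add_sd(lo, hi));   /* (a+b) + (c+d)        */
}

int main(void)
{
    __m256d v = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
    printf("%f\n", hsum256_pd(v));              /* prints 10.000000     */
    return 0;
}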
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0_1, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); y_1 = _mm256_castpd256_pd128(y_11); y_2 = _mm256_castpd256_pd128(y_22); y_3 = _mm256_castpd256_pd128(y_33); y_4 = _mm256_castpd256_pd128(y_44); y_5 = _mm256_castpd256_pd128(y_55); y_6 = _mm256_castpd256_pd128(y_66); y_7 = _mm256_castpd256_pd128(y_77); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; if(ka-k>0) // it can be only ka-k = {1, 2, 3} { if((ka-k)>=2) { x_0_1 = _mm_load_pd( &tx[0] ); a_00_10 = _mm_load_pd( &tA[0+lda*0] ); a_01_11 = _mm_load_pd( &tA[0+lda*1] ); a_02_12 = _mm_load_pd( &tA[0+lda*2] ); a_03_13 = _mm_load_pd( &tA[0+lda*3] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd (y_0, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_1 = _mm_add_pd (y_1, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_2 = _mm_add_pd (y_2, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_3 = _mm_add_pd (y_3, ax_temp ); a_00_10 = _mm_load_pd( &tA[0+lda*4] ); a_01_11 = _mm_load_pd( &tA[0+lda*5] ); a_02_12 = _mm_load_pd( &tA[0+lda*6] ); a_03_13 = _mm_load_pd( &tA[0+lda*7] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_4 = _mm_add_pd (y_4, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_5 = _mm_add_pd (y_5, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_6 = _mm_add_pd (y_6, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_7 = _mm_add_pd (y_7, ax_temp ); tA += 2; tx += 2; k+=2; } if((ka-k)==1) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); a_01_11 = _mm_load_sd( &tA[0+lda*1] ); a_02_12 = _mm_load_sd( &tA[0+lda*2] ); a_03_13 = _mm_load_sd( &tA[0+lda*3] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_1 = _mm_add_sd (y_1, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_2 = _mm_add_sd (y_2, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_3 = _mm_add_sd (y_3, ax_temp ); a_00_10 = _mm_load_sd( &tA[0+lda*4] ); a_01_11 = _mm_load_sd( &tA[0+lda*5] ); a_02_12 = _mm_load_sd( &tA[0+lda*6] ); a_03_13 = _mm_load_sd( &tA[0+lda*7] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_4 = _mm_add_sd (y_4, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_5 = _mm_add_sd (y_5, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_6 = _mm_add_sd (y_6, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_7 = _mm_add_sd (y_7, ax_temp ); tA += 1; tx += 1; k++; } } y_00 = _mm256_castpd128_pd256(y_0); y_11 = _mm256_castpd128_pd256(y_1); y_22 = _mm256_castpd128_pd256(y_2); y_33 = _mm256_castpd128_pd256(y_3); y_44 = _mm256_castpd128_pd256(y_4); y_55 = _mm256_castpd128_pd256(y_5); y_66 = _mm256_castpd128_pd256(y_6); y_77 = _mm256_castpd128_pd256(y_7); k=0; for(; k<ka-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); 
x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } for(; k<ka-3; k+=4) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = 
_mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
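/* Scalar reference sketch (not part of the kernel above), assuming the
 * panel-major layout the dgemv_t kernels appear to use: A is stored in
 * row-panels of height bs=4, each panel keeping its columns contiguous
 * (stride sda*bs between panels), which is what the update
 * A += 4 + (sda-1)*lda implies. The function name dgemv_t_ref and the
 * exact layout interpretation are assumptions for illustration; it
 * covers only the alg==0 (overwrite) case. */
static void dgemv_t_ref(int kmax, int ncols, const double *A, int sda,
                        const double *x, double *y)
{
    const int bs = 4;
    for (int j = 0; j < ncols; j++) {
        double acc = 0.0;
        for (int k = 0; k < kmax; k++) {
            int panel = k / bs;                 /* which 4-row panel     */
            int row   = k % bs;                 /* row inside the panel  */
            acc += A[panel * sda * bs + row + j * bs] * x[k];
        }
        y[j] = acc;                             /* alg==0: overwrite y   */
    }
}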
double GetResult(double * LeftMatrix, double * RightMatrix, int N, int L, int M)
{
    // LeftMatrix is stored row-major
    // RightMatrix is stored row-major
    // L is the number of columns of LeftMatrix and the number of rows of RightMatrix
    // N is the number of rows of LeftMatrix
    // M is the number of columns of RightMatrix
    // The return value is the sum of all elements of the product LeftMatrix * RightMatrix (in that order)
    int i=0;
    int j=0;
    int k=0;
    int k0,ktop;
    int leftindex=0;
    int rightindex=0;
    double sum=0.0;
#ifdef __SSE2__
    int MX = (M&1) ? M : 0;
    int M2 = M & ~1;
#endif
    int kstride = MIN(L2_CACHE*3/L/sizeof(double)/4, TLB_SIZE*PAGE_SIZE*3/L/sizeof(double)/4);
    int istride = TLB_SIZE/4;
    int jstride = L1_CACHE*3/sizeof(double)/4;
#pragma omp parallel private(i, j, k, k0, ktop) reduction(+: sum)
    {
#ifdef __SSE2__
        double temp[2];
        __m128d sum2 = _mm_set1_pd(0.0);
        __m128d sum3 = _mm_set1_pd(0.0);
        __m128d sum4 = _mm_set1_pd(0.0);
        __m128d sum5 = _mm_set1_pd(0.0);
        __m128d sum6 = _mm_set1_pd(0.0);
        __m128d sum7 = _mm_set1_pd(0.0);
#endif
        for(k0=0;k0<L;k0+=kstride) {
            ktop = MIN(k0+kstride,L);
#ifdef _OPENMP
            for(int i0=omp_get_thread_num()*istride;i0<N;i0+=omp_get_num_threads()*istride)
#else
            for(int i0=0;i0<N;i0+=istride)
#endif
            {
                int itop = MIN(i0+istride,N);
                for(k=k0;k<ktop;k++) {
                    for (int j0=0;j0<M;j0+=jstride) {
#ifdef __SSE2__
                        int jtop = MIN(jstride,M2-j0);
                        int MX2 = (jtop < jstride ? MX-j0 : 0);
#else
                        int jtop = MIN(jstride,M-j0);
#endif
                        double *pright = RightMatrix + k*M + j0;
                        for(i=i0;i<itop;i++) {
                            double left = LeftMatrix[i*L+k];
#ifdef __SSE2__
                            __m128d left2 = _mm_set1_pd(left);
                            if (((long)pright)&0xF) {
                                for(j=0;j<jtop-10;j+=12) {
                                    sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_loadu_pd(pright+j)));
                                    sum3 = _mm_add_pd(sum3, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+2)));
                                    sum4 = _mm_add_pd(sum4, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+4)));
                                    sum5 = _mm_add_pd(sum5, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+6)));
                                    sum6 = _mm_add_pd(sum6, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+8)));
                                    sum7 = _mm_add_pd(sum7, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+10)));
                                }
                                for(;j<jtop;j+=2)
                                    sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_loadu_pd(pright+j)));
                            } else {
                                for(j=0;j<jtop-10;j+=12) {
                                    sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_load_pd(pright+j)));
                                    sum3 = _mm_add_pd(sum3, _mm_mul_pd(left2, _mm_load_pd(pright+j+2)));
                                    sum4 = _mm_add_pd(sum4, _mm_mul_pd(left2, _mm_load_pd(pright+j+4)));
                                    sum5 = _mm_add_pd(sum5, _mm_mul_pd(left2, _mm_load_pd(pright+j+6)));
                                    sum6 = _mm_add_pd(sum6, _mm_mul_pd(left2, _mm_load_pd(pright+j+8)));
                                    sum7 = _mm_add_pd(sum7, _mm_mul_pd(left2, _mm_load_pd(pright+j+10)));
                                }
                                for(;j<jtop;j+=2)
                                    sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_load_pd(pright+j)));
                            }
                            if (MX2)
                                sum3 = _mm_add_sd(sum3, _mm_mul_sd(left2, _mm_load_sd(pright+MX2-1)));
#else
                            double s1=0,s2=0,s3=0,s4=0;
                            for(j=0;j<jtop-3;j+=4) {
                                s1 += left*pright[j];
                                s2 += left*pright[j+1];
                                s3 += left*pright[j+2];
                                s4 += left*pright[j+3];
                            }
                            for(;j<jtop;j++)
                                sum += left*pright[j];
                            sum += s1 + s2 + s3 + s4;
#endif
                        }
                    }
                }
            }
        }
#ifdef __SSE2__
        _mm_storeu_pd(temp, _mm_add_pd(_mm_add_pd(sum2,_mm_add_pd(sum3,sum6)),_mm_add_pd(sum4,_mm_add_pd(sum5,sum7))));
        sum += temp[0]+temp[1];
#endif
    }
    return sum;
}
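/* Minimal scalar reference sketch (not part of the original code) for
 * checking GetResult: it computes the same quantity -- the sum of all
 * entries of LeftMatrix * RightMatrix -- without blocking, OpenMP or
 * SSE2, assuming the row-major layout documented above. The function
 * name GetResultRef is made up for illustration. */
double GetResultRef(const double *LeftMatrix, const double *RightMatrix,
                    int N, int L, int M)
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
        for (int k = 0; k < L; k++) {
            double left = LeftMatrix[i * L + k];        /* A(i,k)        */
            for (int j = 0; j < M; j++)
                sum += left * RightMatrix[k * M + j];   /* times B(k,j)  */
        }
    return sum;
}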
void CalcGravity(int sp, PSpot* allSpot,int* length) { __m128d force1 = _mm_set1_pd(0); __m128d force2 = _mm_set1_pd(0); PSpot* spotSp = &allSpot[sp]; for(int i=0;i<sp;i++) { __m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1); __m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2); __m128d r = Length(diff1, diff2); if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass)) { if (allSpot[i].mass > spotSp->mass) { allSpot[i].heading1 = _mm_add_pd( allSpot[i].heading1, _mm_mul_pd( _mm_sub_pd(spotSp->heading1, allSpot[i].heading1), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].heading2 = _mm_add_sd( allSpot[i].heading2, _mm_mul_sd( _mm_sub_sd(spotSp->heading2, allSpot[i].heading2), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].mass += spotSp->mass; allSpot[i].qmass = pow(allSpot[i].mass, 0.33333); spotSp->mass = 0; (*length)--; PSpot temp; temp = allSpot[sp]; allSpot[sp] = allSpot[*length]; allSpot[*length] = temp; return; } else { spotSp->heading1 = _mm_add_pd( spotSp->heading1, _mm_mul_pd( _mm_sub_pd(allSpot[i].heading1, spotSp->heading1), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->heading2 = _mm_add_sd( spotSp->heading2, _mm_mul_sd( _mm_sub_sd(allSpot[i].heading2, spotSp->heading2), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->mass += allSpot[i].mass; spotSp->qmass = pow(spotSp->mass, 0.33333); allSpot[i].mass = 0; (*length)--; PSpot temp; temp = allSpot[i]; allSpot[i] = allSpot[*length]; allSpot[*length] = temp; return; } } //float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]); __m128d r1 = r; r1.m128d_f64[1] = G; __m128d r2 = r; r2.m128d_f64[1] = spotSp->mass; __m128d r3 = r; r3.m128d_f64[1] = allSpot[i].mass; __m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3); __m128d r5 = _mm_shuffle_pd(r4, r4, 3); r4 = _mm_shuffle_pd(r4, r4, 0); __m128d r6 = _mm_div_pd(r5, r4); force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6)); force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6)); } for(int i=sp+1;i<*length;i++) { __m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1); __m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2); __m128d r = Length(diff1, diff2); if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass)) { if (allSpot[i].mass > spotSp->mass) { allSpot[i].heading1 = _mm_add_pd( allSpot[i].heading1, _mm_mul_pd( _mm_sub_pd(spotSp->heading1, allSpot[i].heading1), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].heading2 = _mm_add_sd( allSpot[i].heading2, _mm_mul_sd( _mm_sub_sd(spotSp->heading2, allSpot[i].heading2), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].mass += spotSp->mass; allSpot[i].qmass = pow(allSpot[i].mass, 0.33333); spotSp->mass = 0; (*length)--; PSpot temp; temp = allSpot[sp]; allSpot[sp] = allSpot[*length]; allSpot[*length] = temp; return; } else { spotSp->heading1 = _mm_add_pd( spotSp->heading1, _mm_mul_pd( _mm_sub_pd(allSpot[i].heading1, spotSp->heading1), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->heading2 = _mm_add_sd( spotSp->heading2, _mm_mul_sd( _mm_sub_sd(allSpot[i].heading2, spotSp->heading2), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->mass += allSpot[i].mass; spotSp->qmass = pow(spotSp->mass, 0.33333); allSpot[i].mass = 0; (*length)--; PSpot temp; temp = allSpot[i]; allSpot[i] = allSpot[*length]; allSpot[*length] = temp; return; } } //float f = (G * 
spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]); __m128d r1 = r; r1.m128d_f64[1] = G; __m128d r2 = r; r2.m128d_f64[1] = spotSp->mass; __m128d r3 = r; r3.m128d_f64[1] = allSpot[i].mass; __m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3); __m128d r5 = _mm_shuffle_pd(r4, r4, 3); r4 = _mm_shuffle_pd(r4, r4, 0); __m128d r6 = _mm_div_pd(r5, r4); force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6)); force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6)); } force1 = _mm_div_pd(force1, _mm_set1_pd(spotSp->mass)); force2 = _mm_div_sd(force2, _mm_set1_pd(spotSp->mass)); __m128d forcef = Length(force1, force2); if (forcef.m128d_f64[0] > 0) { double gate = 0.001f; double step = gate / forcef.m128d_f64[0]; if (spotSp->process + step < 1) { spotSp->process += step; } else { step = 1 - spotSp->process; spotSp->process = 1; } __m128d stepd = _mm_set1_pd(step); spotSp->heading1 = _mm_add_pd(spotSp->heading1,_mm_mul_pd(force1,stepd)); spotSp->heading2 = _mm_add_sd(spotSp->heading2,_mm_mul_sd(force2,stepd)); spotSp->pos1 = _mm_add_pd(spotSp->pos1, _mm_mul_pd(spotSp->heading1,stepd)); spotSp->pos2 = _mm_add_sd(spotSp->pos2, _mm_mul_sd(spotSp->heading2,stepd)); } else { spotSp->pos1 = _mm_add_pd(spotSp->pos1, spotSp->heading1); spotSp->pos2 = _mm_add_sd(spotSp->pos2, spotSp->heading2); spotSp->process = 1; } }
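/* Portability note as a sketch (not part of CalcGravity): the code above
 * reads and writes individual lanes through the MSVC-specific
 * .m128d_f64[] member of __m128d. On compilers without that member the
 * same access can be expressed with explicit loads/stores, as below.
 * The helper names get_lane and set_lane1 are made up for illustration. */
#include <emmintrin.h>

/* read lane 0 or 1 of a __m128d */
static inline double get_lane(__m128d v, int lane)
{
    double tmp[2];
    _mm_storeu_pd(tmp, v);
    return tmp[lane];
}

/* replace the upper lane with hi, keep the lower lane of v */
static inline __m128d set_lane1(__m128d v, double hi)
{
    return _mm_move_sd(_mm_set1_pd(hi), v);   /* {v[0], hi}              */
}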
static inline __m128d my_invrsq_pd(__m128d x) { const __m128d three = (const __m128d) {3.0f, 3.0f}; const __m128d half = (const __m128d) {0.5f, 0.5f}; __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */ __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */ /* First Newton-Rapson step, accuracy is now 24 bits */ __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1))))); /* Return second Newton-Rapson step, accuracy 48 bits */ return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2))))); } /* to extract single integers from a __m128i datatype */ #define _mm_extract_epi64(x, imm) \ _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm))) void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vgbt; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0f,-1.0f}; const __m128d zero = {0.0f,0.0f}; const __m128d half = {0.5f,0.5f}; const __m128d two = {2.0f,2.0f}; const __m128d three = {3.0f,3.0f}; gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ dvdatmp = _mm_setzero_pd(); vgb = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); jnr1=jnr2=0; j13=j23=0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = 
_mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = _mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q = _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); fscal = _mm_sub_pd(fijC,fscal); fscal = _mm_mul_pd(fscal,neg); fscal = _mm_mul_pd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_pd(fscal,dx); t2 = _mm_mul_pd(fscal,dy); t3 = _mm_mul_pd(fscal,dz); /* update the i force */ fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); /* accumulate forces from memory */ xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */ xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */ xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */ xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */ /* transpose */ xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */ xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */ xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* subtract partial forces */ xmm5 = _mm_sub_pd(xmm5,t1); xmm6 = _mm_sub_pd(xmm6,t2); xmm7 = _mm_sub_pd(xmm7,t3); xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */ xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* store fx and fy */ _mm_storeu_pd(faction+j13,xmm1); _mm_storeu_pd(faction+j23,xmm2); /* .. 
then fz */ _mm_storel_pd(faction+j13+2,xmm7); _mm_storel_pd(faction+j23+2,xmm7); } /* In double precision, offset can only be either 0 or 1 */ if(offset!=0) { jnr1 = jjnr[k]; j13 = jnr1*3; jx = _mm_load_sd(pos+j13); jy = _mm_load_sd(pos+j13+1); jz = _mm_load_sd(pos+j13+2); isaj = _mm_load_sd(invsqrta+jnr1); isaprod = _mm_mul_sd(isai,isaj); dvdaj = _mm_load_sd(dvda+jnr1); q = _mm_load_sd(charge+jnr1); qq = _mm_mul_sd(iq,q); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); qq = _mm_mul_sd(isaprod,qq); qq = _mm_mul_sd(qq,neg); gbscale = _mm_mul_sd(isaprod,gbtabscale); r = _mm_mul_sd(rsq11,rinv); rt = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); H = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,H); vgb = _mm_mul_sd(qq,VV); fijC = _mm_mul_sd(qq,FF); fijC = _mm_mul_sd(fijC,gbscale); dvdatmp = _mm_mul_sd(fijC,r); dvdatmp = _mm_add_sd(vgb,dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp,neg); dvdatmp = _mm_mul_sd(dvdatmp,half); dvdasum = _mm_add_sd(dvdasum,dvdatmp); xmm1 = _mm_mul_sd(dvdatmp,isaj); xmm1 = _mm_mul_sd(xmm1,isaj); dvdaj = _mm_add_sd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); vctot = _mm_add_sd(vctot,vcoul); vgbtot = _mm_add_sd(vgbtot,vgb); fscal = _mm_sub_sd(fijC,fscal); fscal = _mm_mul_sd(fscal,neg); fscal = _mm_mul_sd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_sd(fscal,dx); t2 = _mm_mul_sd(fscal,dy); t3 = _mm_mul_sd(fscal,dz); /* update the i force */ fix = _mm_add_sd(fix,t1); fiy = _mm_add_sd(fiy,t2); fiz = _mm_add_sd(fiz,t3); /* accumulate forces from memory */ xmm5 = _mm_load_sd(faction+j13); /* fx */ xmm6 = _mm_load_sd(faction+j13+1); /* fy */ xmm7 = _mm_load_sd(faction+j13+2); /* fz */ /* subtract partial forces */ xmm5 = _mm_sub_sd(xmm5,t1); xmm6 = _mm_sub_sd(xmm6,t2); xmm7 = _mm_sub_sd(xmm7,t3); /* store forces */ _mm_store_sd(faction+j13,xmm5); _mm_store_sd(faction+j13+1,xmm6); _mm_store_sd(faction+j13+2,xmm7); } /* fix/fiy/fiz now contain four partial terms, that all should be * added to the i particle forces */ t1 = _mm_unpacklo_pd(t1,fix); t2 = _mm_unpacklo_pd(t2,fiy); t3 = _mm_unpacklo_pd(t3,fiz); fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1)); fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1)); fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1)); /* Load i forces from memory */ xmm1 = _mm_load_sd(faction+ii3); xmm2 = _mm_load_sd(faction+ii3+1); xmm3 = _mm_load_sd(faction+ii3+2); /* Add to i force */ fix = _mm_add_sd(fix,xmm1); fiy = _mm_add_sd(fiy,xmm2); fiz = _mm_add_sd(fiz,xmm3); /* store i forces to memory */ _mm_store_sd(faction+ii3,fix); _mm_store_sd(faction+ii3+1,fiy); 
_mm_store_sd(faction+ii3+2,fiz); /* now do dvda */ dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum); dvdasum = _mm_add_pd(dvdasum,dvdatmp); _mm_storeh_pd(&dva,dvdasum); dvda[ii] = dvda[ii] + dva*isai_d*isai_d; ggid = gid[n]; /* Coulomb potential */ vcoul = _mm_unpacklo_pd(vcoul,vctot); vctot = _mm_add_pd(vctot,vcoul); _mm_storeh_pd(&vct,vctot); Vc[ggid] = Vc[ggid] + vct; /* GB potential */ vgb = _mm_unpacklo_pd(vgb,vgbtot); vgbtot = _mm_add_pd(vgbtot,vgb); _mm_storeh_pd(&vgbt,vgbtot); gpol[ggid] = gpol[ggid] + vgbt; } *outeriter = nri; *inneriter = nj1; }
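/* Scalar sketch (not part of the kernel above) of the Y/F/G/H table
 * evaluation used for the GB interaction: given r scaled by the table
 * spacing it returns the interpolated potential and writes the
 * derivative factor, mirroring VV = Y + eps*(F + G*eps + H*eps^2) and
 * FF = F + 2*G*eps + 3*H*eps^2 in the SIMD code. It assumes four
 * doubles per table point, as the indexing nnn = n0 << 2 implies; the
 * helper name table_eval is made up for illustration. */
static double table_eval(const double *tab, double r_scaled, double *force_term)
{
    int    n   = (int)r_scaled;          /* table interval index         */
    double eps = r_scaled - n;           /* fraction within the interval */
    const double *p = tab + 4 * n;       /* Y F G H for this interval    */
    double Y = p[0], F = p[1], G = p[2], H = p[3];
    double Geps  = G * eps;
    double Heps2 = H * eps * eps;
    double Fp    = F + Geps + Heps2;
    *force_term  = Fp + Geps + 2.0 * Heps2;   /* = F + 2*G*eps + 3*H*eps^2 */
    return Y + Fp * eps;                      /* interpolated potential    */
}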
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C) { //#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__)) #ifdef OGDF_SSE3_EXTENSIONS const int n = C.numberOfNodes(); #ifdef _OPENMP const int work = 256; const int nThreadsRep = min(omp_get_max_threads(), 1 + n*n/work); const int nThreadsPrev = min(omp_get_max_threads(), 1 + n /work); #endif const double k = m_idealEdgeLength; const double kSquare = k*k; const double c_rep = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal const double minDist = 10e-6;//100*DBL_EPSILON; const double minDistSquare = minDist*minDist; double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); __m128d mm_kSquare = _mm_set1_pd(kSquare); __m128d mm_minDist = _mm_set1_pd(minDist); __m128d mm_minDistSquare = _mm_set1_pd(minDistSquare); __m128d mm_c_rep = _mm_set1_pd(c_rep); #pragma omp parallel num_threads(nThreadsRep) { double tx = m_txNull; double ty = m_tyNull; int cF = 1; for(int i = 1; i <= m_iterations; i++) { // repulsive forces #pragma omp for for(int v = 0; v < n; ++v) { __m128d mm_disp_xv = _mm_setzero_pd(); __m128d mm_disp_yv = _mm_setzero_pd(); __m128d mm_xv = _mm_set1_pd(C.m_x[v]); __m128d mm_yv = _mm_set1_pd(C.m_y[v]); int u; for(u = 0; u+1 < v; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } int uStart = u+2; if(u == v) ++u; if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } for(u = uStart; u < n; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, 
_mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv); mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv); _mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep)); _mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep)); } // attractive forces #pragma omp single for(int e = 0; e < C.numberOfEdges(); ++e) { int v = C.m_src[e]; int u = C.m_tgt[e]; double delta_x = C.m_x[v] - C.m_x[u]; double delta_y = C.m_y[v] - C.m_y[u]; double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y)); disp_x[v] -= delta_x * dist / k; disp_y[v] -= delta_y * dist / k; disp_x[u] += delta_x * dist / k; disp_y[u] += delta_y * dist / k; } // limit the maximum displacement to the temperature (m_tx,m_ty) __m128d mm_tx = _mm_set1_pd(tx); __m128d mm_ty = _mm_set1_pd(ty); #pragma omp for nowait for(int v = 0; v < n-1; v += 2) { __m128d mm_disp_xv = _mm_load_pd(&disp_x[v]); __m128d mm_disp_yv = _mm_load_pd(&disp_y[v]); __m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd( _mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv)) )); _mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]), _mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx)) )); _mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]), _mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty)) )); } #pragma omp single nowait { if(n % 2) { int v = n-1; double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v])); C.m_x[v] += disp_x[v] / dist * min(dist,tx); C.m_y[v] += disp_y[v] / dist * min(dist,ty); } } cool(tx,ty,cF); #pragma omp barrier } } System::alignedMemoryFree(disp_x); System::alignedMemoryFree(disp_y); #else mainStep(C); #endif }
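/* Scalar reference sketch (not part of mainStep_sse3) of the repulsive
 * displacement that the SSE3 loops above accumulate two nodes at a
 * time: sum over u != v of (pos[v]-pos[u]) * nodeWeight[u] / max(d^2,
 * minDistSquare), scaled by c_rep at the end, matching the stores into
 * disp_x/disp_y. The function name repulsive_ref is made up for
 * illustration. */
static void repulsive_ref(int n, int v, const double *x, const double *y,
                          const double *nodeWeight, double minDistSquare,
                          double c_rep, double *dx_out, double *dy_out)
{
    double dx = 0.0, dy = 0.0;
    for (int u = 0; u < n; u++) {
        if (u == v) continue;
        double ddx = x[v] - x[u];
        double ddy = y[v] - y[u];
        double d2  = ddx * ddx + ddy * ddy;
        if (d2 < minDistSquare) d2 = minDistSquare;   /* clamp distance  */
        double t = nodeWeight[u] / d2;
        dx += ddx * t;
        dy += ddy * t;
    }
    *dx_out = dx * c_rep;
    *dy_out = dy * c_rep;
}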