/* R_i <- R_i + c * S_i for i = 0..N-1 (single precision).
 * Each spinor32 holds 24 floats, processed as six 4-wide QPX vectors.
 * Runs the loop under OpenMP when TM_USE_OMP is defined. */
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) {
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
    vector4double a0, a1, a2, a3, a4, a5;   /* six vectors of R */
    vector4double b0, b1, b2, b3, b4, b5;   /* six vectors of S */
    vector4double m0, m1, m2, m3, m4, m5;   /* fused multiply-add results */
    vector4double coef;
    float *rp, *sp;
    float ALIGN32 cval;

    cval = c;
    __prefetch_by_load(S);
    __prefetch_by_load(R);
    /* broadcast the scalar into all four lanes; QPX splats take a double */
    coef = vec_splats((double)cval);
    /* alignment hints for the compiler (pointers are assigned in the loop) */
    __alignx(16, sp);
    __alignx(16, rp);
    __alignx(16, S);
    __alignx(16, R);

#ifdef TM_USE_OMP
#pragma omp for
#else
#pragma unroll(2)
#endif
    for (int i = 0; i < N; i++) {
      sp = (float*)((spinor32 *) S + i);
      rp = (float*)((spinor32 *) R + i);
      /* prefetch the next iteration's operands */
      __prefetch_by_load(S + i + 1);
      __prefetch_by_stream(1, R + i + 1);

      a0 = vec_ld(0, rp);
      a1 = vec_ld(0, rp + 4);
      a2 = vec_ld(0, rp + 8);
      a3 = vec_ld(0, rp + 12);
      a4 = vec_ld(0, rp + 16);
      a5 = vec_ld(0, rp + 20);

      b0 = vec_ld(0, sp);
      b1 = vec_ld(0, sp + 4);
      b2 = vec_ld(0, sp + 8);
      b3 = vec_ld(0, sp + 12);
      b4 = vec_ld(0, sp + 16);
      b5 = vec_ld(0, sp + 20);

      /* m = coef * S + R */
      m0 = vec_madd(coef, b0, a0);
      m1 = vec_madd(coef, b1, a1);
      m2 = vec_madd(coef, b2, a2);
      m3 = vec_madd(coef, b3, a3);
      m4 = vec_madd(coef, b4, a4);
      m5 = vec_madd(coef, b5, a5);

      vec_st(m0, 0, rp);
      vec_st(m1, 0, rp + 4);
      vec_st(m2, 0, rp + 8);
      vec_st(m3, 0, rp + 12);
      vec_st(m4, 0, rp + 16);
      vec_st(m5, 0, rp + 20);
    }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
  return;
}
/* Returns the squared norm ||P||^2 summed over N spinor32 fields (24 floats
 * each), accumulated in 4-wide QPX vectors with a compensated (Kahan-style)
 * summation to limit single-precision rounding error.
 * If `parallel` is non-zero and MPI is enabled, the result is summed over
 * all ranks with MPI_Allreduce. */
float square_norm_32(spinor32 * const P, const int N, const int parallel) {
  float ALIGN32 res = 0.0;
#ifdef TM_USE_MPI
  float ALIGN32 mres;
#endif

#ifdef TM_USE_OMP
#pragma omp parallel
  {
  int thread_num = omp_get_thread_num();
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double ds,tt,tr,ts,kc,ks,buffer;
  float *s ALIGN32;

  /* ks = running sum, kc = compensation term of the Kahan accumulation */
  ks = vec_splats(0.);
  kc = vec_splats(0.);

#ifndef TM_USE_OMP
#pragma unroll(4)
#else
#pragma omp for
#endif
  for(int i = 0; i < N; i++) {
    s = (float*)((spinor32*) P+i);
    __prefetch_by_load(P+i+1);

    /* load the 24 floats of one spinor as six 4-wide vectors */
    x0 = vec_ld(0, s);
    x1 = vec_ld(0, s+4);
    x2 = vec_ld(0, s+8);
    x3 = vec_ld(0, s+12);
    x4 = vec_ld(0, s+16);
    x5 = vec_ld(0, s+20);

    /* square element-wise ... */
    y0 = vec_mul(x0, x0);
    y1 = vec_mul(x1, x1);
    y2 = vec_mul(x2, x2);
    y3 = vec_mul(x3, x3);
    y4 = vec_mul(x4, x4);
    y5 = vec_mul(x5, x5);

    /* ... and reduce the six partial vectors to one (ds) */
    x0 = vec_add(y0, y1);
    x1 = vec_add(y2, y3);
    x2 = vec_add(y4, y5);
    x3 = vec_add(x0, x1);
    ds = vec_add(x2, x3);

    /* compensated accumulation: the order of these five statements is
     * essential for the error compensation — do not reorder */
    tr = vec_add(ds, kc);
    ts = vec_add(tr, ks);
    tt = vec_sub(ts, ks);
    ks = ts;
    kc = vec_sub(tr, tt);
  }
  buffer = vec_add(kc,ks);

#ifdef TM_USE_OMP
  /* horizontal sum of the four lanes into this thread's accumulator;
   * g_omp_acc_re is presumably sized >= omp_num_threads — confirm at caller */
  g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3];
  } /* OpenMP closing brace */

  /* serial reduction over the per-thread partial sums */
  for(int i = 0; i < omp_num_threads; ++i)
    res += g_omp_acc_re[i];
#else
  res = buffer[0] + buffer[1] + buffer[2] + buffer[3];
#endif

# ifdef TM_USE_MPI
  if(parallel) {
    /* global sum across all MPI ranks */
    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
    return mres;
  }
# endif
  return res;
}
/* Q_i <- R_i - S_i for i = 0..N-1 (double precision).
 * Each spinor holds 24 doubles, processed as six 4-wide QPX vectors.
 *
 * Fix: the OpenMP guards used the stale macro name `OMP`; the sibling
 * kernels in this file (assign_add_mul_r_32, square_norm_32) test
 * `TM_USE_OMP`, so with `OMP` undefined the parallel path was never
 * compiled in. The guards now use `TM_USE_OMP` consistently. */
void diff(spinor * const Q,const spinor * const R,const spinor * const S, const int N) {
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5;   /* six vectors of R */
  vector4double y0, y1, y2, y3, y4, y5;   /* six vectors of S */
  vector4double z0, z1, z2, z3, z4, z5;   /* differences */
  double *s, *r, *q;

  /* alignment hints for the compiler (pointers are assigned in the loop) */
  __alignx(32, s);
  __alignx(32, r);
  __alignx(32, q);
  __alignx(32, S);
  __alignx(32, R);
  __prefetch_by_load(S);
  __prefetch_by_load(R);
  __prefetch_by_load(Q);

#ifndef TM_USE_OMP
#pragma unroll(2)
#else
#pragma omp for
#endif
  for (int ix = 0; ix < N; ++ix) {
    s=(double*)((spinor *) S + ix);
    r=(double*)((spinor *) R + ix);
    q=(double*)((spinor *) Q + ix);
    /* prefetch the next iteration's operands; Q is write-streamed */
    __prefetch_by_load(S + ix + 1);
    __prefetch_by_load(R + ix + 1);
    __prefetch_by_stream(1, Q + ix + 1);

    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);

    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);

    /* z = R - S */
    z0 = vec_sub(x0, y0);
    z1 = vec_sub(x1, y1);
    z2 = vec_sub(x2, y2);
    z3 = vec_sub(x3, y3);
    z4 = vec_sub(x4, y4);
    z5 = vec_sub(x5, y5);

    vec_st(z0, 0, q);
    vec_st(z1, 0, q+4);
    vec_st(z2, 0, q+8);
    vec_st(z3, 0, q+12);
    vec_st(z4, 0, q+16);
    vec_st(z5, 0, q+20);
  }
#ifdef TM_USE_OMP
  } /* OpenMP parallel closing brace */
#endif
  return;
}