/* R <- R + c*S for N single-precision spinors, vectorized with QPX
 * (vector4double) intrinsics.  A spinor32 is treated as 24 contiguous
 * floats (12 complex components), processed as six 4-wide vector ops.
 * NOTE(review): assumes R and S are 16-byte aligned (see __alignx hints)
 * and that vec_ld/vec_st on float pointers is the QPX single-precision
 * load/store form -- confirm against the XL compiler documentation. */
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) {
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
    vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
    vector4double z0, z1, z2, z3, z4, z5, k;
    float *s, *r;
    float ALIGN32 _c;
    _c = c;
    __prefetch_by_load(S);
    __prefetch_by_load(R);
    /* broadcast the scalar c into all four vector lanes */
    k = vec_splats((double)_c);
    __alignx(16, s);
    __alignx(16, r);
    __alignx(16, S);
    __alignx(16, R);
#ifdef TM_USE_OMP
#pragma omp for
#else
#pragma unroll(2)
#endif
    for(int i = 0; i < N; i++) {
      s=(float*)((spinor32 *) S + i);
      r=(float*)((spinor32 *) R + i);
      /* prefetch the next site while working on this one */
      __prefetch_by_load(S + i + 1);
      __prefetch_by_stream(1, R + i + 1);
      /* load the 24 floats of R[i] ... */
      x0 = vec_ld(0, r);
      x1 = vec_ld(0, r+4);
      x2 = vec_ld(0, r+8);
      x3 = vec_ld(0, r+12);
      x4 = vec_ld(0, r+16);
      x5 = vec_ld(0, r+20);
      /* ... and the 24 floats of S[i] */
      y0 = vec_ld(0, s);
      y1 = vec_ld(0, s+4);
      y2 = vec_ld(0, s+8);
      y3 = vec_ld(0, s+12);
      y4 = vec_ld(0, s+16);
      y5 = vec_ld(0, s+20);
      /* fused multiply-add: z = k*y + x = R[i] + c*S[i] */
      z0 = vec_madd(k, y0, x0);
      z1 = vec_madd(k, y1, x1);
      z2 = vec_madd(k, y2, x2);
      z3 = vec_madd(k, y3, x3);
      z4 = vec_madd(k, y4, x4);
      z5 = vec_madd(k, y5, x5);
      vec_st(z0, 0, r);
      vec_st(z1, 0, r+4);
      vec_st(z2, 0, r+8);
      vec_st(z3, 0, r+12);
      vec_st(z4, 0, r+16);
      vec_st(z5, 0, r+20);
    }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
  return;
}
/* Real part of the scalar product <S,R> summed over N spinors:
 * sum over all 12 colour/Dirac components of Re(conj(s)*r), accumulated
 * with Kahan compensated summation.  If `parallel` is non-zero and MPI
 * is enabled, the local sums are combined with MPI_Allreduce.
 * Fix: the accumulators were `static`, which made the function
 * non-reentrant and unsafe to call from multiple threads; they are now
 * ordinary automatic variables (no observable change single-threaded). */
double scalar_prod_r(spinor * const S,spinor * const R, const int N, const int parallel){
  int ix;
  double ks, kc, ds, tr, ts, tt;
  spinor *s, *r;
  ks = 0.0;
  kc = 0.0;
#if (defined BGL && defined XLC)
  __alignx(16, S);
  __alignx(16, R);
#endif
  for (ix = 0; ix < N; ix++){
    s = (spinor *) S + ix;
    r = (spinor *) R + ix;
    /* sum of re*re + im*im over all components = Re(conj(s)*r) */
    ds=(*r).s0.c0.re*(*s).s0.c0.re + (*r).s0.c0.im*(*s).s0.c0.im +
       (*r).s0.c1.re*(*s).s0.c1.re + (*r).s0.c1.im*(*s).s0.c1.im +
       (*r).s0.c2.re*(*s).s0.c2.re + (*r).s0.c2.im*(*s).s0.c2.im +
       (*r).s1.c0.re*(*s).s1.c0.re + (*r).s1.c0.im*(*s).s1.c0.im +
       (*r).s1.c1.re*(*s).s1.c1.re + (*r).s1.c1.im*(*s).s1.c1.im +
       (*r).s1.c2.re*(*s).s1.c2.re + (*r).s1.c2.im*(*s).s1.c2.im +
       (*r).s2.c0.re*(*s).s2.c0.re + (*r).s2.c0.im*(*s).s2.c0.im +
       (*r).s2.c1.re*(*s).s2.c1.re + (*r).s2.c1.im*(*s).s2.c1.im +
       (*r).s2.c2.re*(*s).s2.c2.re + (*r).s2.c2.im*(*s).s2.c2.im +
       (*r).s3.c0.re*(*s).s3.c0.re + (*r).s3.c0.im*(*s).s3.c0.im +
       (*r).s3.c1.re*(*s).s3.c1.re + (*r).s3.c1.im*(*s).s3.c1.im +
       (*r).s3.c2.re*(*s).s3.c2.re + (*r).s3.c2.im*(*s).s3.c2.im;
    /* Kahan summation: kc carries the rounding error of ks */
    tr = ds + kc;
    ts = tr + ks;
    tt = ts - ks;
    ks = ts;
    kc = tr - tt;
  }
  kc = ks + kc;
#if defined MPI
  if(parallel) {
    MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return ks;
  }
#endif
  return kc;
}
/* Real part of the scalar product <S,R> over N spinors, using C99
 * complex arithmetic: ds receives the complex sum r*conj(s); the
 * implicit conversion to double keeps only the real part.  Kahan
 * compensated summation reduces rounding error; with MPI and
 * `parallel` non-zero the result is globally reduced.
 * Fix: accumulators were `static`, making the function non-reentrant
 * and thread-unsafe; they are now automatic locals (same results). */
double scalar_prod_r(spinor * const S, spinor * const R, const int N, const int parallel)
{
  double ks, kc, ds, tr, ts, tt;
  spinor *s, *r;
  ks = 0.0;
  kc = 0.0;
#if (defined BGL && defined XLC)
  __alignx(16, S);
  __alignx(16, R);
#endif
  for (int ix = 0; ix < N; ++ix) {
    s=(spinor *) S + ix;
    r=(spinor *) R + ix;
    /* complex -> double assignment drops the imaginary part */
    ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
         r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
         r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) +
         r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
    /* Kahan summation step */
    tr=ds+kc;
    ts=tr+ks;
    tt=ts-ks;
    ks=ts;
    kc=tr-tt;
  }
  kc=ks+kc;
#if defined MPI
  if(parallel) {
    double buffer = kc;
    MPI_Allreduce(&buffer, &kc, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  }
#endif
  return kc;
}
/* 3. */
/* Fires the persistent halfspinor boundary exchange set up by
 * init_xchange_halffield() (MPI_Startall) and blocks until all
 * sends/receives complete (MPI_Waitall).  reqcount matches the number
 * of persistent requests created for the active parallelization.
 * Fix: removed the unused locals x0, x1, x2, ix from the
 * PARALLELXYZT branch. */
void xchange_halffield() {
# ifdef MPI
  MPI_Status status[16];
# ifdef PARALLELT
  int reqcount = 4;
# elif defined PARALLELXT
  int reqcount = 8;
# elif defined PARALLELXYT
  int reqcount = 12;
# elif defined PARALLELXYZT
  int reqcount = 16;
# endif
# if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor);
# endif
  MPI_Startall(reqcount, prequests);
  MPI_Waitall(reqcount, prequests, status);
# endif /* MPI */
  return;
}
/* Applies the Wilson hopping matrix to the source field k, writing the
 * result to l, on the even (ieo==0) or odd (ieo==1) sublattice.  Each
 * site is processed in two passes driven by the _hop_*_pre/_hop_*_post
 * macros: the "pre" pass projects the source into halfspinors (stored
 * via the NBPointer tables), the halves are exchanged between MPI ranks,
 * and the "post" pass multiplies by the gauge links and reconstructs the
 * full spinor.  When both g_sloppy_precision flags are set, the whole
 * pipeline runs in single precision (the *32 variants).
 * NOTE(review): the exact s++/U++/ix++ stepping pattern is consumed by
 * the hop macros; do not reorder. */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)
#ifdef _GAUGE_COPY
  /* refresh the reordered gauge copy if the gauge field changed */
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif
  __alignx(16, l);
  __alignx(16, k);
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    /* ---- single precision (sloppy) branch ---- */
    __alignx(16, HalfSpinor32);
    /* We will run through the source vector now */
    /* instead of the solution vector */
    s = k;
    _prefetch_spinor(s);
    /* s contains the source vector */
    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi32 = NBPointer32[ieo];
    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /* direction +0 */
      _hop_t_p_pre32();
      s++;
      U++;
      ix++;
      /* direction -0 */
      _hop_t_m_pre32();
      ix++;
      /* direction +1 */
      _hop_x_p_pre32();
      ix++;
      U++;
      /* direction -1 */
      _hop_x_m_pre32();
      ix++;
      /* direction +2 */
      _hop_y_p_pre32();
      ix++;
      U++;
      /* direction -2 */
      _hop_y_m_pre32();
      ix++;
      /* direction +3 */
      _hop_z_p_pre32();
      ix++;
      U++;
      /* direction -3 */
      _hop_z_m_pre32();
      ix++;
    }
# if (defined TM_USE_MPI && !defined _NO_COMM)
    /* exchange the boundary halfspinors between ranks */
    xchange_halffield32();
# endif
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /* direction +0 */
      _hop_t_p_post32();
      ix++;
      /* direction -0 */
      _hop_t_m_post32();
      U++;
      ix++;
      /* direction +1 */
      _hop_x_p_post32();
      ix++;
      /* direction -1 */
      _hop_x_m_post32();
      U++;
      ix++;
      /* direction +2 */
      _hop_y_p_post32();
      ix++;
      /* direction -2 */
      _hop_y_m_post32();
      U++;
      ix++;
      /* direction +3 */
      _hop_z_p_post32();
      ix++;
      /* direction -3 */
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    /* ---- full double precision branch (mirrors the 32-bit one) ---- */
    __alignx(16, HalfSpinor);
    /* We will run through the source vector now */
    /* instead of the solution vector */
    s = k;
    _prefetch_spinor(s);
    /* s contains the source vector */
    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi = NBPointer[ieo];
    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /* direction +0 */
      _hop_t_p_pre();
      s++;
      U++;
      ix++;
      /* direction -0 */
      _hop_t_m_pre();
      ix++;
      /* direction +1 */
      _hop_x_p_pre();
      ix++;
      U++;
      /* direction -1 */
      _hop_x_m_pre();
      ix++;
      /* direction +2 */
      _hop_y_p_pre();
      ix++;
      U++;
      /* direction -2 */
      _hop_y_m_pre();
      ix++;
      /* direction +3 */
      _hop_z_p_pre();
      ix++;
      U++;
      /* direction -3 */
      _hop_z_m_pre();
      ix++;
    }
# if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield();
# endif
    s = l;
    phi = NBPointer[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /* direction +0 */
      _hop_t_p_post();
      ix++;
      /* direction -0 */
      _hop_t_m_post();
      U++;
      ix++;
      /* direction +1 */
      _hop_x_p_post();
      ix++;
      /* direction -1 */
      _hop_x_m_post();
      U++;
      ix++;
      /* direction +2 */
      _hop_y_p_post();
      ix++;
      /* direction -2 */
      _hop_y_m_post();
      U++;
      ix++;
      /* direction +3 */
      _hop_z_p_post();
      ix++;
      /* direction -3 */
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
/* 2. */
/* Creates the persistent MPI requests (prequests[0..reqcount-1]) for the
 * halfspinor boundary exchange later started by xchange_halffield().
 * The send buffers live at HalfSpinor + 4*VOLUME, the receive buffers
 * behind them at an additional offset of RAND/2; within each region the
 * t/x/y/z face blocks follow each other, each split into a down- and an
 * up-half.  Each face carries 12 doubles per boundary site (a halfspinor),
 * hence the (face size)*12/2 element counts.
 * Fixes: removed the unused locals x0, x1, x2, ix from the PARALLELXYZT
 * branch and normalized one count written as 12*T*LX*LY/2 to the
 * T*LX*LY*12/2 form used everywhere else (same value). */
void init_xchange_halffield() {
# ifdef MPI
# ifdef PARALLELT
  int reqcount = 4;
# elif defined PARALLELXT
  int reqcount = 8;
# elif defined PARALLELXYT
  int reqcount = 12;
# elif defined PARALLELXYZT
  int reqcount = 16;
# endif
# if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor);
# endif
  /* send the data to the neighbour on the right in t direction */
  /* receive the data from the neighbour on the left in t direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME), LX*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_t_up, 81, g_cart_grid, &prequests[0]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_t_dn, 81, g_cart_grid, &prequests[1]);
  /* send the data to the neighbour on the left in t direction */
  /* receive the data from the neighbour on the right in t direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_t_dn, 82, g_cart_grid, &prequests[2]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2), LX*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_t_up, 82, g_cart_grid, &prequests[3]);
# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_x_up, 91, g_cart_grid, &prequests[4]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_x_dn, 91, g_cart_grid, &prequests[5]);
  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_x_dn, 92, g_cart_grid, &prequests[6]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE,
                g_nb_x_up, 92, g_cart_grid, &prequests[7]);
# endif
# if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE,
                g_nb_y_up, 101, g_cart_grid, &prequests[8]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE,
                g_nb_y_dn, 101, g_cart_grid, &prequests[9]);
  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE,
                g_nb_y_dn, 102, g_cart_grid, &prequests[10]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE,
                g_nb_y_up, 102, g_cart_grid, &prequests[11]);
# endif
# if (defined PARALLELXYZT)
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ), T*LX*LY*12/2, MPI_DOUBLE,
                g_nb_z_up, 503, g_cart_grid, &prequests[12]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE,
                g_nb_z_dn, 503, g_cart_grid, &prequests[13]);
  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE,
                g_nb_z_dn, 504, g_cart_grid, &prequests[14]);
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ), T*LX*LY*12/2, MPI_DOUBLE,
                g_nb_z_up, 504, g_cart_grid, &prequests[15]);
# endif
# endif /* MPI */
  return;
}
/* Exchanges the boundary slices of the lexicographically ordered
 * single-precision spinor field l with the neighbouring MPI ranks in
 * all parallelized directions (non-blocking Isend/Irecv pairs, then a
 * single Waitall).  Receive buffers start at l+VOLUME and follow the
 * t/x/y/z halo layout encoded in the offsets below.
 * Fixes: the MPI_Request/MPI_Status declarations are now inside the
 * TM_USE_MPI guard (matching xchange_lexicfield); the copy-pasted
 * "y direction" comments on the z exchange now say z; the KOJAK
 * pomp-end pragma was after `return` and thus dead -- moved before it. */
void xchange_lexicfield32(spinor32 * const l) {
# ifdef TM_USE_MPI
  MPI_Request requests[16];
  MPI_Status status[16];
# endif
# ifdef PARALLELT
  int reqcount = 4;
# elif defined PARALLELXT
  int reqcount = 8;
# elif defined PARALLELXYT
  int reqcount = 12;
# elif defined PARALLELXYZT
  int reqcount = 16;
# endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange_lexicfield32)
#endif
# if (defined BGL && defined XLC)
  __alignx(16, l);
# endif
# ifdef TM_USE_MPI
  /* send the data to the neighbour on the left */
  /* receive the data from the neighbour on the right */
  MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
  MPI_Irecv((void*)(l+VOLUME), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid, &requests[1]);
# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid, &requests[4]);
  MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, g_cart_grid, &requests[5]);
# endif
# if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
  MPI_Irecv((void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY)), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, g_cart_grid, &requests[9]);
# endif
# if (defined PARALLELXYZT)
  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Isend((void*)l, 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, g_cart_grid, &requests[12]);
  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, g_cart_grid, &requests[13]);
# endif
  /* send the data to the neighbour on the right */
  /* receive the data from the neighbour on the left */
  MPI_Isend((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, g_cart_grid, &requests[2]);
  MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, g_cart_grid, &requests[3]);
# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid, &requests[6]);
  MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, g_cart_grid, &requests[7]);
# endif
# if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(l+(LY-1)*LZ), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid, &requests[10]);
  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
# endif
# if defined PARALLELXYZT
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(l+LZ-1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, g_cart_grid, &requests[14]);
  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, g_cart_grid, &requests[15]);
# endif
  MPI_Waitall(reqcount, requests, status);
# endif
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange_lexicfield32)
#endif
  return;
}
/* Exchanges the boundary slices of the lexicographically ordered
 * double-precision spinor field l with the neighbouring ranks in every
 * parallelized direction.  Requests are laid out in groups of four per
 * direction: the left-going sends/receives fill slots ireq, ireq+1
 * starting at 0, then ireq is reset to 2 and the right-going pairs fill
 * slots ireq, ireq+1 -- together they occupy indices 0..reqcount-1 for
 * the final Waitall.  The gI_* macros map lattice coordinates to
 * lexicographic indices (L = local extent, Lm1 = L-1, m1 = -1 halo).
 * Fixes: the `int ireq;` declaration is now inside the TM_USE_MPI guard
 * (it was an unused-variable warning otherwise); removed the unused
 * `ix` from the PARALLELXYZT branch; the z-direction comments no longer
 * say "y direction". */
void xchange_lexicfield(spinor * const l) {
#ifdef TM_USE_MPI
  MPI_Request requests[16];
  MPI_Status status[16];
  int ireq;
#endif
# if ( defined PARALLELT || defined PARALLELX )
  int reqcount = 4;
# elif ( defined PARALLELXT || defined PARALLELXY )
  int reqcount = 8;
# elif ( defined PARALLELXYT || defined PARALLELXYZ )
  int reqcount = 12;
# elif defined PARALLELXYZT
  int reqcount = 16;
# endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange_lexicfield)
#endif
# if (defined BGL && defined XLC)
  __alignx(16, l);
# endif
# ifdef TM_USE_MPI
  ireq=0;
# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the left */
  /* receive the data from the neighbour on the right */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5081, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
# endif
# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath, g_nb_x_dn, 5091, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont, g_nb_x_up, 5091, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
# endif
# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath, g_nb_y_dn, 5101, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont, g_nb_y_up, 5101, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
# endif
# if (defined PARALLELXYZT || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath, g_nb_z_dn, 5503, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont, g_nb_z_up, 5503, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
# endif
  /* second sweep: right-going pairs occupy the odd slots 2,3 of each group */
  ireq=2;
# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the right */
  /* receive the data from the neighbour on the left */
  MPI_Isend((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5082, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5082, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#endif
# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath, g_nb_x_up, 5092, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont, g_nb_x_dn, 5092, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
# endif
# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath, g_nb_y_up, 5102, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont, g_nb_y_dn, 5102, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
# endif
# if ( defined PARALLELXYZT || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath, g_nb_z_up, 5504, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont, g_nb_z_dn, 5504, g_cart_grid, &requests[ireq+1]);
# endif
  MPI_Waitall(reqcount, requests, status);
# endif
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange_lexicfield)
#endif
  return;
}
/* Accumulates into the global derivative field df0 the contribution of
 * the Wilson-Dirac-operator force term built from the left field l and
 * the right field k.  For every site ix and direction mu the code forms
 * the projected spinor bilinears (phia,phib from gamma5*l, psia,psib
 * from k at the neighbouring site), takes their tensor product, applies
 * the gauge link and hopping parameter ka_mu, and adds the traceless
 * generator projection to df0.  The +mu terms go to df0[ix][mu], the
 * -mu terms to df0[iy][mu] where iy is the down-neighbour.
 * NOTE(review): the static scratch variables (v1, v2, psia.., rr) make
 * this function non-reentrant -- do not call it from multiple threads. */
void deriv_Sb_D_psi(spinor * const l, spinor * const k) {
  int ix,iy;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;
#ifdef _KOJAK_INST
#pragma pomp inst begin(derivSb)
#endif
#ifdef XLC
#pragma disjoint(*sp, *sm, *up, *um)
#endif
#ifdef BGL
  __alignx(16, l);
  __alignx(16, k);
#endif
  /* for parallelization: make halo sites of both fields valid */
#ifdef MPI
  xchange_lexicfield(k);
  xchange_lexicfield(l);
#endif
  /************** loop over all lattice sites ****************/
  for(ix = 0; ix < (VOLUME); ix++){
    rr = (*(l + ix));
    /* multiply the left vector with gamma5 (negate lower components) */
    _vector_minus_assign(rr.s2, rr.s2);
    _vector_minus_assign(rr.s3, rr.s3);
    /*********************** direction +0 ********************/
    iy=g_iup[ix][0];
    sp = k + iy;
    up=&g_gauge_field[ix][0];
    _vector_add(psia,(*sp).s0,(*sp).s2);
    _vector_add(psib,(*sp).s1,(*sp).s3);
    _vector_add(phia,rr.s0,rr.s2);
    _vector_add(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_add_assign(df0[ix][0], v1);
    /************** direction -0 ****************************/
    iy=g_idn[ix][0];
    sm = k + iy;
    um=&g_gauge_field[iy][0];
    _vector_sub(psia,(*sm).s0,(*sm).s2);
    _vector_sub(psib,(*sm).s1,(*sm).s3);
    _vector_sub(phia,rr.s0,rr.s2);
    _vector_sub(phib,rr.s1,rr.s3);
    /* note: operand order swapped relative to +0 */
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_add_assign(df0[iy][0], v1);
    /*************** direction +1 **************************/
    iy=g_iup[ix][1];
    sp = k + iy;
    up=&g_gauge_field[ix][1];
    _vector_i_add(psia,(*sp).s0,(*sp).s3);
    _vector_i_add(psib,(*sp).s1,(*sp).s2);
    _vector_i_add(phia,rr.s0,rr.s3);
    _vector_i_add(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_add_assign(df0[ix][1], v1);
    /**************** direction -1 *************************/
    iy=g_idn[ix][1];
    sm = k + iy;
    um=&g_gauge_field[iy][1];
    _vector_i_sub(psia,(*sm).s0,(*sm).s3);
    _vector_i_sub(psib,(*sm).s1,(*sm).s2);
    _vector_i_sub(phia,rr.s0,rr.s3);
    _vector_i_sub(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_add_assign(df0[iy][1], v1);
    /*************** direction +2 **************************/
    iy=g_iup[ix][2];
    sp = k + iy;
    up=&g_gauge_field[ix][2];
    _vector_add(psia,(*sp).s0,(*sp).s3);
    _vector_sub(psib,(*sp).s1,(*sp).s2);
    _vector_add(phia,rr.s0,rr.s3);
    _vector_sub(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_add_assign(df0[ix][2], v1);
    /***************** direction -2 ************************/
    iy=g_idn[ix][2];
    sm = k + iy;
    um=&g_gauge_field[iy][2];
    _vector_sub(psia,(*sm).s0,(*sm).s3);
    _vector_add(psib,(*sm).s1,(*sm).s2);
    _vector_sub(phia,rr.s0,rr.s3);
    _vector_add(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_add_assign(df0[iy][2], v1);
    /****************** direction +3 ***********************/
    iy=g_iup[ix][3];
    sp = k + iy;
    up=&g_gauge_field[ix][3];
    _vector_i_add(psia,(*sp).s0,(*sp).s2);
    _vector_i_sub(psib,(*sp).s1,(*sp).s3);
    _vector_i_add(phia,rr.s0,rr.s2);
    _vector_i_sub(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_add_assign(df0[ix][3], v1);
    /***************** direction -3 ************************/
    iy=g_idn[ix][3];
    sm = k + iy;
    um=&g_gauge_field[iy][3];
    _vector_i_sub(psia,(*sm).s0,(*sm).s2);
    _vector_i_add(psib,(*sm).s1,(*sm).s3);
    _vector_i_sub(phia,rr.s0,rr.s2);
    _vector_i_add(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_add_assign(df0[iy][3], v1);
    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif
}
/* Q = R - S element-wise over N spinors, using BG/L double-hummer
 * intrinsics (__lfpd/__fpsub/__stfpd operate on pairs of doubles; each
 * spinor is 24 doubles = 12 pairs).  The loop is software-pipelined:
 * the first and last iterations are peeled out so that rp/sp always
 * prefetch one spinor ahead without reading past the arrays.
 * NOTE(review): this structure processes elements 0, 1..N-2, and then
 * one final element -- it assumes N >= 2; for N == 1 the tail block
 * would read/write element 1 out of bounds.  Confirm callers never pass
 * N < 2. */
void diff(spinor * const Q,spinor * const R,spinor * const S, const int N) {
  int ix = 1;
  double *s ALIGN;
  double *sp ALIGN;
  double *r ALIGN;
  double *rp ALIGN;
  double *q ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11;
#pragma disjoint(*R, *S)
  __alignx(16, Q);
  __alignx(16, R);
  __alignx(16, S);
  r = (double*) R;
  s = (double*) S;
  q = (double*) Q;
  /* prologue: prefetch element 1 while processing element 0 */
  rp = r + 24;
  sp = s + 24;
  _prefetch_spinor(rp);
  _prefetch_spinor(sp);
  x00 = __lfpd(r);    x01 = __lfpd(r+2);  x02 = __lfpd(r+4);
  x03 = __lfpd(r+6);  x04 = __lfpd(r+8);  x05 = __lfpd(r+10);
  x06 = __lfpd(r+12); x07 = __lfpd(r+14); x08 = __lfpd(r+16);
  x09 = __lfpd(r+18); x10 = __lfpd(r+20); x11 = __lfpd(r+22);
  y00 = __lfpd(s);    y01 = __lfpd(s+2);  y02 = __lfpd(s+4);
  y03 = __lfpd(s+6);  y04 = __lfpd(s+8);  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12); y07 = __lfpd(s+14); y08 = __lfpd(s+16);
  y09 = __lfpd(s+18); y10 = __lfpd(s+20); y11 = __lfpd(s+22);
  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));
  s = sp;
  r = rp;
  q+=24;
  /* steady state: elements 1 .. N-2, prefetching the next element */
#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    rp+=24;
    sp+=24;
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);
    x00 = __lfpd(r);    x01 = __lfpd(r+2);  x02 = __lfpd(r+4);
    x03 = __lfpd(r+6);  x04 = __lfpd(r+8);  x05 = __lfpd(r+10);
    x06 = __lfpd(r+12); x07 = __lfpd(r+14); x08 = __lfpd(r+16);
    x09 = __lfpd(r+18); x10 = __lfpd(r+20); x11 = __lfpd(r+22);
    y00 = __lfpd(s);    y01 = __lfpd(s+2);  y02 = __lfpd(s+4);
    y03 = __lfpd(s+6);  y04 = __lfpd(s+8);  y05 = __lfpd(s+10);
    y06 = __lfpd(s+12); y07 = __lfpd(s+14); y08 = __lfpd(s+16);
    y09 = __lfpd(s+18); y10 = __lfpd(s+20); y11 = __lfpd(s+22);
    __stfpd(q, __fpsub(x00, y00));
    __stfpd(q+2, __fpsub(x01, y01));
    __stfpd(q+4, __fpsub(x02, y02));
    __stfpd(q+6, __fpsub(x03, y03));
    __stfpd(q+8, __fpsub(x04, y04));
    __stfpd(q+10, __fpsub(x05, y05));
    __stfpd(q+12, __fpsub(x06, y06));
    __stfpd(q+14, __fpsub(x07, y07));
    __stfpd(q+16, __fpsub(x08, y08));
    __stfpd(q+18, __fpsub(x09, y09));
    __stfpd(q+20, __fpsub(x10, y10));
    __stfpd(q+22, __fpsub(x11, y11));
    s = sp;
    r = rp;
    q+=24;
  }
  /* epilogue: last element, no prefetch beyond the arrays */
  x00 = __lfpd(r);    x01 = __lfpd(r+2);  x02 = __lfpd(r+4);
  x03 = __lfpd(r+6);  x04 = __lfpd(r+8);  x05 = __lfpd(r+10);
  x06 = __lfpd(r+12); x07 = __lfpd(r+14); x08 = __lfpd(r+16);
  x09 = __lfpd(r+18); x10 = __lfpd(r+20); x11 = __lfpd(r+22);
  y00 = __lfpd(s);    y01 = __lfpd(s+2);  y02 = __lfpd(s+4);
  y03 = __lfpd(s+6);  y04 = __lfpd(s+8);  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12); y07 = __lfpd(s+14); y08 = __lfpd(s+16);
  y09 = __lfpd(s+18); y10 = __lfpd(s+20); y11 = __lfpd(s+22);
  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));
  return;
}
/* Q = R - S element-wise over N spinors, vectorized with QPX
 * vector4double intrinsics (a spinor = 24 doubles = six 4-wide vectors).
 * Optionally parallelized over sites with OpenMP.
 * NOTE(review): this translation unit guards OpenMP with `OMP` while the
 * 32-bit kernel above uses `TM_USE_OMP` -- presumably they come from
 * different build eras; confirm which macro the build system defines. */
void diff(spinor * const Q,const spinor * const R,const spinor * const S, const int N)
{
#ifdef OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5;
  double *s, *r, *q;
  __alignx(32, s);
  __alignx(32, r);
  __alignx(32, q);
  __alignx(32, S);
  __alignx(32, R);
  __prefetch_by_load(S);
  __prefetch_by_load(R);
  __prefetch_by_load(Q);
#ifndef OMP
#pragma unroll(2)
#else
#pragma omp for
#endif
  for (int ix = 0; ix < N; ++ix) {
    s=(double*)((spinor *) S + ix);
    r=(double*)((spinor *) R + ix);
    q=(double*)((spinor *) Q + ix);
    /* prefetch the next site; Q is write-only, stream it */
    __prefetch_by_load(S + ix + 1);
    __prefetch_by_load(R + ix + 1);
    __prefetch_by_stream(1, Q + ix + 1);
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);
    z0 = vec_sub(x0, y0);
    z1 = vec_sub(x1, y1);
    z2 = vec_sub(x2, y2);
    z3 = vec_sub(x3, y3);
    z4 = vec_sub(x4, y4);
    z5 = vec_sub(x5, y5);
    vec_st(z0, 0, q);
    vec_st(z1, 0, q+4);
    vec_st(z2, 0, q+8);
    vec_st(z3, 0, q+12);
    vec_st(z4, 0, q+16);
    vec_st(z5, 0, q+20);
  }
#ifdef OMP
  } /* OpenMP parallel closing brace */
#endif
  return;
}
/* <S,R>=S^* times R */
/* Complex scalar product sum_ix conj(S[ix]) * R[ix] over N spinors,
 * Kahan-compensated per thread.  With OpenMP each thread deposits its
 * compensated partial sum into g_omp_acc_cp[thread] and the main thread
 * sums them afterwards; with MPI and parallel==1 the result is reduced
 * across ranks. */
_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel)
{
  _Complex double ALIGN res = 0.0;
#ifdef MPI
  _Complex double ALIGN mres;
#endif
#ifdef OMP
#pragma omp parallel
  {
  int thread_num = omp_get_thread_num();
#endif
  _Complex double ALIGN ds,tr,ts,tt,ks,kc;
  const spinor *s,*r;
  ks = 0.0;
  kc = 0.0;
#if (defined BGL && defined XLC)
  __alignx(16, S);
  __alignx(16, R);
#endif
#ifdef OMP
#pragma omp for
#endif
  for (int ix = 0; ix < N; ix++) {
    s= S + ix;
    r= R + ix;
    /* component-wise r * conj(s), summed over colour and Dirac indices */
    ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
         r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
         r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) +
         r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
    /* Kahan Summation */
    tr=ds+kc;
    ts=tr+ks;
    tt=ts-ks;
    ks=ts;
    kc=tr-tt;
  }
  kc=ks+kc;
#ifdef OMP
  g_omp_acc_cp[thread_num] = kc;
  } /* OpenMP closing brace */
  /* having left the parallel section, we can now sum up the Kahan
     corrected sums from each thread into kc */
  for(int i = 0; i < omp_num_threads; ++i)
    res += g_omp_acc_cp[i];
#else
  res=kc;
#endif
#ifdef MPI
  if(parallel == 1) {
    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
    return(mres);
  }
#endif
  return(res);
}
/* Accumulates factor * the derivative of Tr[l^dagger D k] with respect to
 * the gauge links into hf->derivative, looping over all 8 directions per
 * site (software-pipelined: the neighbour spinor and link for the next
 * direction are prefetched while the current tensor product is formed).
 *
 * Fixes relative to the previous revision:
 *  - "icy=[iy];" (direction -0 tail) was a syntax error; now "icy=iy;".
 *  - direction -2 multiplied by ka1; every other +/-mu pair uses ka<mu>,
 *    so it now uses ka2.
 *  - directions -1, -2 and -3 loaded the forward neighbour (*sp) although
 *    the backward neighbour sm had just been prefetched for them
 *    (direction -0 correctly uses (*sm)); they now load (*sm).
 *
 * NOTE(review): ioff2 is computed but unused ("for parallelization" in
 * the original); kept for fidelity.  The "something wrong here" comment
 * before the wrap-around prefetch is inherited from the original. */
void deriv_Sb_D_psi(spinor * const l, spinor * const k, hamiltonian_field_t * const hf, const double factor) {
  int ix,iy, iz;
  int ioff,ioff2,icx,icy, icz;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  su3adj * restrict ddd;
  static su3adj der;
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
  spinor * restrict r ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

  /* We have 32 registers available */
  double _Complex reg00, reg01, reg02, reg03, reg04, reg05;
  double _Complex reg10, reg11, reg12, reg13, reg14, reg15;
  /* For su3 matrix, use reg00 for missing register */
  double _Complex v00, v01, v02, v10, v11, v12, v20, v21;
  /* The following contains the left spinor (12 regs) and the final */
  /* su3 matrix to trace over */
  double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22, r30, r31, r32;

#ifdef _KOJAK_INST
#pragma pomp inst begin(derivSb)
#endif
#pragma disjoint(*r, *sp, *sm, *up, *um, *ddd)
  __alignx(16, l);
  __alignx(16, k);

  if(ieo==0) {
    ioff=0;
  }
  else {
    ioff=(VOLUME+RAND)/2;
  }
  ioff2=(VOLUME+RAND)/2-ioff;  /* for parallelization (currently unused) */

#ifdef MPI
  /* fill the halos of both fields before touching neighbours */
  xchange_field(k, ieo);
  xchange_field(l, (ieo+1)%2);
#endif
  /************** loop over all lattice sites ****************/
  ix=ioff;
  iy=g_iup[ix][0]; icy=iy;
  sp = k + icy;
  _prefetch_spinor(sp);
  up=&hf->gaugefield[ix][0];
  _prefetch_su3(up);

  for(icx = ioff; icx < (VOLUME+ioff); icx++){
    /* load left vector r and multiply with gamma5 */
    r = l + (icx-ioff);
    ix=icx;

    /*********************** direction +0 ********************/
    ddd = &hf->derivative[ix][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_add_to_reg0_reg1();
    _bgl_add_to_reg0_up_reg1_up();
    _bgl_add_r0_to_r2_reg1();
    _bgl_add_r1_to_r3_reg1_up();

    /* prefetch the -0 neighbour while computing */
    iy=g_idn[ix][0]; icy=iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][0];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /************** direction -0 ****************************/
    ddd = &hf->derivative[iy][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_sub_from_reg0_reg1();
    _bgl_sub_from_reg0_up_reg1_up();
    _bgl_sub_from_r0_r2_reg1();
    _bgl_sub_from_r1_r3_reg1_up();

    iy=g_iup[ix][1]; icy=iy;  /* fixed: was "icy=[iy];" */
    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][1];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +1 **************************/
    ddd = &hf->derivative[ix][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_i_mul_add_to_reg0_reg1_up();
    _bgl_i_mul_add_to_reg0_up_reg1();
    _bgl_i_mul_add_r0_to_r3_reg1();
    _bgl_i_mul_add_r1_to_r2_reg1_up();

    iy=g_idn[ix][1]; icy=iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][1];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /**************** direction -1 *************************/
    ddd = &hf->derivative[iy][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);      /* fixed: was (*sp) */
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_i_mul_sub_from_reg0_reg1_up();
    _bgl_i_mul_sub_from_reg0_up_reg1();
    _bgl_i_mul_sub_from_r0_r3_reg1();
    _bgl_i_mul_sub_from_r1_r2_reg1_up();

    iy=g_iup[ix][2]; icy=iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][2];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +2 **************************/
    ddd = &hf->derivative[ix][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_add_to_reg0_reg1_up();
    _bgl_sub_from_reg0_up_reg1();
    _bgl_add_r0_to_r3_reg1();
    _bgl_sub_from_r1_r2_reg1_up();

    iy=g_idn[ix][2]; icy=iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][2];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /***************** direction -2 ************************/
    ddd = &hf->derivative[iy][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);      /* fixed: was (*sp) */
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_sub_from_reg0_reg1_up();
    _bgl_add_to_reg0_up_reg1();
    _bgl_sub_from_r0_r3_reg1();
    _bgl_add_r1_to_r2_reg1_up();

    iy=g_iup[ix][3]; icy=iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][3];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka2);     /* fixed: was ka1 */
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /****************** direction +3 ***********************/
    ddd = &hf->derivative[ix][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_i_mul_add_to_reg0_reg1();
    _bgl_i_mul_sub_from_reg0_up_reg1_up();
    _bgl_i_mul_add_r0_to_r2_reg1();
    _bgl_i_mul_sub_from_r1_r3_reg1_up();

    iy=g_idn[ix][3]; icy=iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][3];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /***************** direction -3 ************************/
    ddd = &hf->derivative[iy][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);      /* fixed: was (*sp) */
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_i_mul_sub_from_reg0_reg1();
    _bgl_i_mul_add_to_reg0_up_reg1_up();
    _bgl_i_mul_sub_from_r0_r2_reg1();
    _bgl_i_mul_add_r1_to_r3_reg1_up();

    /* something wrong here...*/
    /* prefetch for the next loop iteration (wraps around at the end) */
    icz=icx+1;
    if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff;
    iz=icz;
    iy=g_iup[iz][0]; icy=iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[iz][0];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);
    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif
}
/* this is the hopping part only */
/* Applies the Wilson hopping term at a single site: gathers the eight
 * neighbour spinors of s (neighbour offsets taken in order from _idx),
 * multiplies forward links with _bgl_su3_multiply_double and backward
 * links with _bgl_su3_inverse_multiply_double, applies the per-direction
 * boundary phases, and stores the accumulated result in rr.
 * u points at the first of the 8 links for this site; links are
 * consumed consecutively (up/um alternate, advancing by 1 each step).
 * The code is software-pipelined: the link and spinor of the NEXT
 * direction are prefetched while the current one is processed. */
void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) {

  int * idx = _idx;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#pragma disjoint(*s, *sp, *sm, *rr, *up, *um)

  __alignx(16,rr);
  __alignx(16,s);

  /*********************** direction +0 ************************/
  up = u;
  sp = (spinor *) s + (*idx);
  idx++;

  um = up+1;
  _prefetch_su3(um);
  sm = (spinor *) s + (*idx);
  _prefetch_spinor(sm);
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);

  _bgl_vector_add_reg0();
  _bgl_vector_add_reg1();
  /* result is now in regx0, regx1, regx2 x = 0,1 */

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_0);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs3_reg1();

  /*********************** direction -0 ************************/
  up = um+1;
  _prefetch_su3(up);
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp);
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);

  _bgl_vector_sub_reg0();
  _bgl_vector_sub_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_0);
  _bgl_add_to_rs0_reg0();
  _bgl_sub_from_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs3_reg1();

  /*********************** direction +1 ************************/
  um = up+1;
  _prefetch_su3(um);
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm);
  idx++;

  /* note the s2/s3 swap in the upper registers (gamma_1 structure) */
  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s3);
  _bgl_load_reg1_up(sp->s2);

  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_add_reg1();

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_1);
  _bgl_add_to_rs0_reg0();
  _bgl_i_mul_sub_from_rs3_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg1();

  /*********************** direction -1 ************************/
  up = um+1;
  _prefetch_su3(up);
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp);
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s3);
  _bgl_load_reg1_up(sm->s2);

  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_sub_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_1);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_add_to_rs3_reg0();
  _bgl_i_mul_add_to_rs2_reg1();

  /*********************** direction +2 ************************/
  um = up+1;
  _prefetch_su3(um);
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm);
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg1_up(sp->s2);
  _bgl_load_reg0_up(sp->s3);

  _bgl_vector_add_reg0();
  _bgl_vector_sub_reg1();

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_2);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs2_reg1();
  _bgl_add_to_rs3_reg0();

  /*********************** direction -2 ************************/
  up = um+1;
  _prefetch_su3(up);
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp);
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg1_up(sm->s2);
  _bgl_load_reg0_up(sm->s3);

  _bgl_vector_sub_reg0();
  _bgl_vector_add_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_2);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs2_reg1();
  _bgl_sub_from_rs3_reg0();

  /*********************** direction +3 ************************/
  um = up+1;
  _prefetch_su3(um);
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm);
  /* last index read; no further idx++ needed */

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);

  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_sub_reg1();

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_3);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg0();
  _bgl_i_mul_add_to_rs3_reg1();

  /*********************** direction -3 ************************/
  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);

  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_add_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_3);

  /* final accumulation interleaved with the stores into rr */
  _bgl_add_to_rs0_reg0();
  _bgl_store_rs0(rr->s0);
  _bgl_i_mul_add_to_rs2_reg0();
  _bgl_store_rs2(rr->s2);

  _bgl_add_to_rs1_reg1();
  _bgl_store_rs1(rr->s1);
  _bgl_i_mul_sub_from_rs3_reg1();
  _bgl_store_rs3(rr->s3);
}
/* Exchanges the boundary (halo) slices of the spinor field l with the
 * neighbouring MPI processes in every parallelised direction.
 * reqcount = 4 requests per direction (2 sends + 2 receives, one pair
 * each way).  Request slots are laid out as [dir*4 .. dir*4+3]: the
 * first sweep fills slots ireq, ireq+1 and the second sweep (after
 * ireq=2) fills slots ireq, ireq+1 offset by 2.  In the z direction the
 * gather datatype depends on the even/odd parity ieo of the field.
 * NOTE(review): the pomp "inst end" pragma sits after the return
 * statement -- presumably the instrumenter rewrites this; confirm the
 * KOJAK tooling still handles it. */
void xchange_field(spinor * const l, const int ieo) {

#ifdef MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#endif
  int ireq;
#  if ( defined PARALLELT || defined PARALLELX )
  int reqcount = 4;
#  elif ( defined PARALLELXT || defined PARALLELXY )
  int reqcount = 8;
#  elif ( defined PARALLELXYT || defined PARALLELXYZ )
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int ix=0;
  int reqcount = 16;
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangefield)
#endif
#  if (defined BGL && defined XLC)
  __alignx(16, l);
#  endif

#  ifdef MPI
  /* In 4 dimensions there are two processors sharing the */
  /* communication bandwidth. So the first should start */
  /* in forward direction, the second in backward direction */
  /* This might only work if the third direction is */
  /* parallelised only on the node */
  if(g_proc_coords[3]%2 == 0) {
    ireq=0;
#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[ireq]);
    MPI_Irecv( (void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXYZT || defined PARALLELXYZ )
    /* This is now depending on whether the field is even or odd */
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    if(ieo == 1) {
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]);
    }
    else {
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]);
    }
#    endif
    /* second sweep: opposite orientation, request slots offset by 2 */
    ireq=2;
#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if ( defined PARALLELXYZT || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    if(ieo == 1) {
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]);
    }
    else {
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]);
    }
#    endif
  }
  else {
    /* odd z coordinate: same exchanges, opposite ordering */
    ireq=0;
#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXYZT || defined PARALLELXYZ )
    /* This is now depending on whether the field is even or odd */
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    if(ieo == 1) {
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]);
    }
    else {
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]);
    }
#    endif
    /* second sweep: opposite orientation, request slots offset by 2 */
    ireq=2;
#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
#    if ( defined PARALLELXYZT || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    if(ieo == 1) {
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]);
    }
    else {
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]);
    }
#    endif
  }
  /* wait for all sends and receives before anyone touches the halos */
  MPI_Waitall(reqcount, requests, status);
#  endif /* MPI */
  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangefield)
#endif
}
/* Older halo-exchange variant of xchange_field: boundary offsets are
 * computed inline from T, LX, LY, LZ instead of the g_1st_* tables, and
 * request slots are hard-coded ([0..3] t, [4..7] x, [8..11] y,
 * [12..15] z).  For the z direction the non-contiguous sites are first
 * gathered into field_buffer_z / field_buffer_z2 via the even/odd index
 * tables g_field_z_ipt_even/odd, then sent as 12*T*LX*LY raw doubles
 * (= T*LX*LY/2 spinors).
 * NOTE(review): presumably this and the preceding xchange_field are
 * alternative revisions selected at build time -- confirm both are
 * never compiled into the same object. */
void xchange_field(spinor * const l, const int ieo) {

#ifdef MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#endif
#  ifdef PARALLELT
  int reqcount = 4;
#  elif defined PARALLELXT
  int reqcount = 8;
#  elif defined PARALLELXYT
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int ix=0;
  int reqcount = 16;
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangefield)
#endif
#  if (defined BGL && defined XLC)
#    ifdef PARALLELXYZT
  __alignx(16, field_buffer_z);
  __alignx(16, field_buffer_z2);
#    endif
  __alignx(16, l);
#  endif

#  ifdef MPI
  /* In 4 dimensions there are two processors sharing the */
  /* communication bandwidth. So the first should start */
  /* in forward direction, the second in backward direction */
  /* This might only work if the third direction is */
  /* parallelised only on the node */
  if(g_proc_coords[3]%2 == 0) {
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[0]);
    MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[1]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[4]);
    MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[5]);
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[8]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[9]);
#    endif
#    if (defined PARALLELXYZT)
    /* fill buffer ! */
    /* This is now depending on whether the field is */
    /* even or odd */
    if(ieo == 1) {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[12]);
    MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[13]);
#    endif
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[2]);
    MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[3]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[6]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[7]);
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[10]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[11]);
#    endif
#    if defined PARALLELXYZT
    /* gather the second half of the z slice */
    if(ieo == 1) {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[14]);
    MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[15]);
#    endif
  }
  else {
    /* odd z coordinate: same exchanges, opposite ordering */
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[0]);
    MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[1]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[4]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[5]);
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[8]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[9]);
#    endif
#    if (defined PARALLELXYZT)
    /* fill buffer ! */
    /* This is now depending on whether the field is */
    /* even or odd */
    if(ieo == 1) {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[12]);
    MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[13]);
#    endif
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[2]);
    MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[3]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[6]);
    MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[7]);
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[10]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[11]);
#    endif
#    if defined PARALLELXYZT
    /* gather the second half of the z slice */
    if(ieo == 1) {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[14]);
    MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[15]);
#    endif
  }
  /* wait for all sends and receives before anyone touches the halos */
  MPI_Waitall(reqcount, requests, status);
#  endif
  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangefield)
#endif
}
/* Returns ||P||^2, the sum of the squares of all 24 real components of
 * each of the N spinors in P.  BG/L double-hummer version: 12 paired
 * loads (__lfpd) and fused multiply-adds (__fpmadd) per site, with the
 * first and last sites peeled off the loop so the next spinor can be
 * prefetched inside it.  If parallel != 0 (and TM_USE_MPI is defined)
 * the local sum is MPI_Allreduce'd over MPI_COMM_WORLD.
 *
 * Fix relative to the previous revision: for N == 1 the peeled "last
 * spinor" pass read 24 doubles past the end of P (the first site had
 * already been consumed by the prologue).  The loop and the peeled tail
 * are now guarded by N > 1.  Also removed a stray double semicolon. */
double square_norm(spinor * const P, const int N, const int parallel) {
  int ix=0;
  double res, res2;
  double *s ALIGN;
  double *sp ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11;

  __alignx(16, P);
  s = (double*)P;
  sp = s+24;
  _prefetch_spinor(sp);

  /* peeled first spinor: initialises the 12 accumulators */
  x00 = __lfpd(s);    x01 = __lfpd(s+2);  x02 = __lfpd(s+4);
  x03 = __lfpd(s+6);  x04 = __lfpd(s+8);  x05 = __lfpd(s+10);
  x06 = __lfpd(s+12); x07 = __lfpd(s+14); x08 = __lfpd(s+16);
  x09 = __lfpd(s+18); x10 = __lfpd(s+20); x11 = __lfpd(s+22);

  y00 = __fpmul(x00, x00);
  y01 = __fpmul(x01, x01);
  y02 = __fpmul(x02, x02);
  y03 = __fpmul(x03, x03);
  y04 = __fpmul(x04, x04);
  y05 = __fpmul(x05, x05);
  y06 = __fpmul(x06, x06);
  y07 = __fpmul(x07, x07);
  y08 = __fpmul(x08, x08);
  y09 = __fpmul(x09, x09);
  y10 = __fpmul(x10, x10);
  y11 = __fpmul(x11, x11);
  s = sp;

  if(N > 1) {  /* guard: with N == 1 both the loop and the peeled tail
                  below would read past the end of P */
#pragma unroll(12)
    for(ix = 1; ix < N-1; ix++) {
      sp+=24;
      _prefetch_spinor(sp);
      x00 = __lfpd(s);    x01 = __lfpd(s+2);  x02 = __lfpd(s+4);
      x03 = __lfpd(s+6);  x04 = __lfpd(s+8);  x05 = __lfpd(s+10);
      x06 = __lfpd(s+12); x07 = __lfpd(s+14); x08 = __lfpd(s+16);
      x09 = __lfpd(s+18); x10 = __lfpd(s+20); x11 = __lfpd(s+22);
      y00 = __fpmadd(y00, x00, x00);
      y01 = __fpmadd(y01, x01, x01);
      y02 = __fpmadd(y02, x02, x02);
      y03 = __fpmadd(y03, x03, x03);
      y04 = __fpmadd(y04, x04, x04);
      y05 = __fpmadd(y05, x05, x05);
      y06 = __fpmadd(y06, x06, x06);
      y07 = __fpmadd(y07, x07, x07);
      y08 = __fpmadd(y08, x08, x08);
      y09 = __fpmadd(y09, x09, x09);
      y10 = __fpmadd(y10, x10, x10);
      y11 = __fpmadd(y11, x11, x11);
      s=sp;
    }

    /* peeled last spinor (index N-1), no further prefetch needed */
    x00 = __lfpd(s);    x01 = __lfpd(s+2);  x02 = __lfpd(s+4);
    x03 = __lfpd(s+6);  x04 = __lfpd(s+8);  x05 = __lfpd(s+10);
    x06 = __lfpd(s+12); x07 = __lfpd(s+14); x08 = __lfpd(s+16);
    x09 = __lfpd(s+18); x10 = __lfpd(s+20); x11 = __lfpd(s+22);
    y00 = __fpmadd(y00, x00, x00);
    y01 = __fpmadd(y01, x01, x01);
    y02 = __fpmadd(y02, x02, x02);
    y03 = __fpmadd(y03, x03, x03);
    y04 = __fpmadd(y04, x04, x04);
    y05 = __fpmadd(y05, x05, x05);
    y06 = __fpmadd(y06, x06, x06);
    y07 = __fpmadd(y07, x07, x07);
    y08 = __fpmadd(y08, x08, x08);
    y09 = __fpmadd(y09, x09, x09);
    y10 = __fpmadd(y10, x10, x10);
    y11 = __fpmadd(y11, x11, x11);
  }

  /* pairwise tree reduction of the 12 accumulators */
  y00 = __fpadd(y00, y01);
  y02 = __fpadd(y02, y03);
  y04 = __fpadd(y04, y05);
  y06 = __fpadd(y06, y07);
  y08 = __fpadd(y08, y09);
  y10 = __fpadd(y10, y11);
  y00 = __fpadd(y00, y02);
  y04 = __fpadd(y04, y06);
  y08 = __fpadd(y08, y10);
  y00 = __fpadd(y00, y04);
  y00 = __fpadd(y00, y08);
  /* the two halves of the paired register hold the two partial sums */
  res = __creal(y00)+__cimag(y00);

# ifdef TM_USE_MPI
  if(parallel) {
    MPI_Allreduce(&res, &res2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return res2;
  }
# endif
  return res;
}
//------------------------------------------------------------------------------------------------------------------------------
/* Gauss-Seidel Red-Black (GSRB) smoother for one level of the multigrid
 * hierarchy, implemented in flux form: for each k-plane the face fluxes
 * beta*dx/di, beta*dx/dj, beta*dx/dk are computed into thread-private
 * buffers and then combined into the Laplacian Lx.  Each call performs
 * 2*NUM_SMOOTHS sweeps; even sweeps read x_id and write VECTOR_TEMP, odd
 * sweeps read VECTOR_TEMP and write back to x_id, so after the loop the
 * smoothed result is in x_id.
 *
 *   level  - the grid level to smooth
 *   x_id   - vector id of the unknown x
 *   rhs_id - vector id of the right-hand side
 *   a, b   - Helmholtz coefficients in a*alpha*x - b*div(beta*grad(x))
 *            (a/alpha only used when USE_HELMHOLTZ is defined)
 */
void smooth(level_type * level, int x_id, int rhs_id, double a, double b){
  // allocate a buffer to hold fluxes...
  // 4 flux planes per thread (flux_i, flux_j, flux_k ping, flux_k pong),
  // each (BLOCKCOPY_TILE_J+1) pencils of box_jStride doubles, plus slack
  // so the base can be rounded up to BOX_ALIGN_JSTRIDE alignment below.
  if(level->fluxes==NULL)level->fluxes = (double*)MALLOC( ( (4*level->num_threads)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride) + BOX_ALIGN_JSTRIDE)*sizeof(double) );
  // align fluxes to BOX_ALIGN_JSTRIDE
  double * __restrict__ fluxes_aligned = level->fluxes;
  uint64_t unaligned_by = (uint64_t)(fluxes_aligned) & (BOX_ALIGN_JSTRIDE-1)*sizeof(double);
  if(unaligned_by)fluxes_aligned = (double*)( (uint64_t)(fluxes_aligned) + BOX_ALIGN_JSTRIDE*sizeof(double) - unaligned_by );

  int s;
  for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth
    // exchange the ghost zone... (even sweeps operate on x_id, odd on VECTOR_TEMP)
    if((s&1)==0){
      exchange_boundary(level, x_id,stencil_get_shape());
      apply_BCs(level, x_id,stencil_get_shape());
    }else{
      exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());
      apply_BCs(level,VECTOR_TEMP,stencil_get_shape());
    }

    // apply the smoother...
    double _timeStart = getTime();
    double h2inv = 1.0/(level->h*level->h);

    // loop over all block/tiles this process owns...
    #ifdef _OPENMP
    #pragma omp parallel if(level->num_my_blocks>1)
    #endif
    {
      int block;
      int threadID=0;
      #ifdef _OPENMP
      threadID=omp_get_thread_num();
      #endif
      // [thread][flux][ij] layout: each thread gets 4 private flux planes
      double * __restrict__ flux_i    =  fluxes_aligned + (4*threadID + 0)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride);
      double * __restrict__ flux_j    =  fluxes_aligned + (4*threadID + 1)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride);
      double * __restrict__ flux_k[2] = {fluxes_aligned + (4*threadID + 2)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride),
                                         fluxes_aligned + (4*threadID + 3)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride)};

      // loop over (cache) blocks...
      #ifdef _OPENMP
      #pragma omp for schedule(static,1)
      #endif
      for(block=0;block<level->num_my_blocks;block++){
        const int box  = level->my_blocks[block].read.box;
        const int jlo  = level->my_blocks[block].read.j;
        const int klo  = level->my_blocks[block].read.k;
        const int jdim = level->my_blocks[block].dim.j;
        const int kdim = level->my_blocks[block].dim.k;

        const int ghosts  = level->my_boxes[box].ghosts;
        const int jStride = level->my_boxes[box].jStride;
        const int kStride = level->my_boxes[box].kStride;

        // all pointers below are pre-offset to element (0, jlo, klo) of this block's interior
        const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        #ifdef VECTOR_ALPHA
        const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        #else
        const double * __restrict__ alpha  = NULL;
        #endif
        const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        const double * __restrict__ Dinv   = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        // ping-pong between x_id and VECTOR_TEMP on alternating sweeps
        const double * __restrict__ x_n;
              double * __restrict__ x_np1;
        if((s&1)==0){x_n   = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
                     x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);}
               else{x_n   = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
                     x_np1 = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);}

        #ifdef __INTEL_COMPILER
        // superfluous with OMP4 simd (?)
        //__assume_aligned(x_n      ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(x_np1    ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(rhs      ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(alpha    ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(beta_i   ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(beta_j   ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(beta_k   ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(Dinv     ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_i   ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_j   ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_k[0],BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_k[1],BOX_ALIGN_JSTRIDE*sizeof(double));
        __assume( jStride % BOX_ALIGN_JSTRIDE == 0); // e.g. jStride%4==0 or jStride%8==0, hence x+jStride is aligned
        __assume( kStride % BOX_ALIGN_JSTRIDE == 0);
        __assume( jStride >= BOX_ALIGN_JSTRIDE);
        __assume( kStride >= 3*BOX_ALIGN_JSTRIDE);
        __assume( jdim > 0);
        __assume( kdim > 0);
        #elif __xlC__
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), rhs      );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), alpha    );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_i   );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_j   );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_k   );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), Dinv     );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_n      );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_np1    );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_i   );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_j   );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[0]);
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[1]);
        #endif

        int ij,k;
        double * __restrict__ flux_klo = flux_k[0];
        // startup / prolog... calculate flux_klo (bottom of cell)...
        #if (_OPENMP>=201307)
        #pragma omp simd aligned(beta_k,x_n,flux_klo:BOX_ALIGN_JSTRIDE*sizeof(double))
        #endif
        for(ij=0;ij<jdim*jStride;ij++){
          flux_klo[ij] = beta_dxdk(x_n,ij); // k==0
        }

        // wavefront loop... the two flux_k buffers ping-pong so each
        // k-plane's top flux is reused as the next plane's bottom flux
        for(k=0;k<kdim;k++){
          double * __restrict__ flux_klo = flux_k[(k  )&0x1];
          double * __restrict__ flux_khi = flux_k[(k+1)&0x1];
          // calculate flux_i and flux_j together
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(beta_i,beta_j,x_n,flux_i,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          for(ij=0;ij<jdim*jStride;ij++){
            int ijk = ij + k*kStride;
            flux_i[ij] = beta_dxdi(x_n,ijk);
            flux_j[ij] = beta_dxdj(x_n,ijk);
          }
          // calculate flux_jhi (the one extra j-pencil needed for flux_j[ij+jStride])
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(beta_j,x_n,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          for(ij=jdim*jStride;ij<(jdim+1)*jStride;ij++){
            int ijk = ij + k*kStride;
            flux_j[ij] = beta_dxdj(x_n,ijk);
          }
          // calculate flux_khi (top of cell)
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(beta_k,x_n,flux_khi:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          for(ij=0;ij<jdim*jStride;ij++){
            int ijk = ij + k*kStride;
            flux_khi[ij] = beta_dxdk(x_n,ijk+kStride); // k+1
          }

          const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^jlo^klo^s); // is element 000 of this *BLOCK* 000 red or black on this sweep
          const double * __restrict__ RedBlack = level->RedBlack_FP + ghosts*(1+jStride) + jStride*((k^color000)&0x1); // Red/Black pencils... presumes ghost zones were corectly colored
          // GSRB update: x_np1 = x_n + RedBlack*Dinv*(rhs - Ax); RedBlack is
          // 0/1 per cell, so only one color is updated this sweep
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(flux_i,flux_j,flux_klo,flux_khi,alpha,rhs,Dinv,x_n,x_np1,RedBlack:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          #ifdef __INTEL_COMPILER
          #pragma vector nontemporal // generally, we don't expect to reuse x_np1
          #endif
          for(ij=0;ij<jdim*jStride;ij++){
            int ijk = ij + k*kStride;
            // divergence of the face fluxes = Laplacian-like term
            double Lx = - flux_i[  ij] + flux_i[  ij+      1]
                        - flux_j[  ij] + flux_j[  ij+jStride]
                        - flux_klo[ij] + flux_khi[ij        ];
            #ifdef USE_HELMHOLTZ
            double Ax = a*alpha[ijk]*x_n[ijk] - b*Lx;
            #else
            double Ax = -b*Lx;
            #endif
            x_np1[ijk] = x_n[ijk] + RedBlack[ij]*Dinv[ijk]*(rhs[ijk]-Ax);
          }
        } // kdim
      } // block
    } // omp
    level->timers.smooth += (double)(getTime()-_timeStart);
  } // s-loop
}
/* 32-2. */
/* Exchange the single-precision half-spinor boundaries of HalfSpinor32 with
 * the neighbouring MPI ranks in the t (and, depending on the parallelization
 * macros, x/y/z) directions using non-blocking sends/receives, then wait for
 * completion.  Send buffers live at HalfSpinor32 + 4*VOLUME + <dir offset>;
 * receive buffers follow at an additional RAND/2 offset.  Each half-spinor
 * is 12 floats, hence the *12/2 element counts (half the sites per face).
 *
 * Fixes vs. previous revision:
 *  - the "#pragma pomp inst end" instrumentation block was placed AFTER the
 *    return statement and therefore never reached; the return now comes last
 *    so KOJAK begin/end bracket the function body as intended.
 *  - one z-direction count was written 12*T*LX*LY/2; normalized to
 *    T*LX*LY*12/2 (same value) for consistency with every other call.
 */
void xchange_halffield32() {
#  ifdef MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#  ifdef PARALLELT
  int reqcount = 4;
#  elif defined PARALLELXT
  int reqcount = 8;
#  elif defined PARALLELXYT
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int reqcount = 16;
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangehalf32)
#endif
#  if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor32);
#  endif

  /* send the data to the neighbour on the right in t direction */
  /* recieve the data from the neighbour on the left in t direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME), LX*LY*LZ*12/2, MPI_FLOAT,
            g_nb_t_up, 81, g_cart_grid, &requests[0]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_FLOAT,
            g_nb_t_dn, 81, g_cart_grid, &requests[1]);

  /* send the data to the neighbour on the left in t direction */
  /* recieve the data from the neighbour on the right in t direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_FLOAT,
            g_nb_t_dn, 82, g_cart_grid, &requests[2]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2), LX*LY*LZ*12/2, MPI_FLOAT,
            g_nb_t_up, 82, g_cart_grid, &requests[3]);

#  if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in x direction */
  /* recieve the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_up, 91, g_cart_grid, &requests[4]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_dn, 91, g_cart_grid, &requests[5]);

  /* send the data to the neighbour on the left in x direction */
  /* recieve the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_dn, 92, g_cart_grid, &requests[6]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_up, 92, g_cart_grid, &requests[7]);
#  endif

#  if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_FLOAT,
            g_nb_y_up, 101, g_cart_grid, &requests[8]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_FLOAT,
            g_nb_y_dn, 101, g_cart_grid, &requests[9]);

  /* send the data to the neighbour on the left in y direction */
  /* recieve the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_FLOAT,
            g_nb_y_dn, 102, g_cart_grid, &requests[10]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_FLOAT,
            g_nb_y_up, 102, g_cart_grid, &requests[11]);
#  endif

#  if (defined PARALLELXYZT)
  /* send the data to the neighbour on the right in z direction */
  /* recieve the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ), T*LX*LY*12/2, MPI_FLOAT,
            g_nb_z_up, 503, g_cart_grid, &requests[12]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), T*LX*LY*12/2, MPI_FLOAT,
            g_nb_z_dn, 503, g_cart_grid, &requests[13]);

  /* send the data to the neighbour on the left in z direction */
  /* recieve the data from the neighbour on the right in z direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), T*LX*LY*12/2, MPI_FLOAT,
            g_nb_z_dn, 504, g_cart_grid, &requests[14]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ), T*LX*LY*12/2, MPI_FLOAT,
            g_nb_z_up, 504, g_cart_grid, &requests[15]);
#  endif

  MPI_Waitall(reqcount, requests, status);
#  endif /* MPI */
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangehalf32)
#endif
  return;
}
/* Exchange the halo (boundary) regions of the two spinor fields l and k with
 * the neighbouring MPI ranks.  t and x/y faces are exchanged via the derived
 * MPI datatypes field_*_slice_gath / field_*_slice_cont; the z faces (in
 * PARALLELXYZT builds) are first gathered into the contiguous buffers
 * field_buffer_z..z4, selected by the even/odd flag `ieo` via the
 * g_field_z_ipt_even/odd index tables, and sent as raw MPI_DOUBLEs.
 * All transfers are non-blocking and completed by a single MPI_Waitall.
 *
 * Fixes vs. previous revision:
 *  - MPI_Request/MPI_Status arrays were declared outside the "#ifdef MPI"
 *    guard, breaking non-MPI builds (the MPI types are undefined there);
 *    they are now guarded, matching the rest of the function.
 *  - the "#pragma pomp inst end" block was unreachable after the return
 *    statement; the return is now last.
 */
void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {
#ifdef MPI
  MPI_Request requests[32];
  MPI_Status status[32];
  int reqcount = 0;
#endif
#if defined PARALLELXYZT
  int ix=0;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange2fields)
#endif
#  ifdef MPI
#  if (defined BGL && defined XLC)
#  ifdef PARALLELXYZT
  __alignx(16, field_buffer_z);
  __alignx(16, field_buffer_z2);
  __alignx(16, field_buffer_z3);
  __alignx(16, field_buffer_z4);
#  endif
  __alignx(16, l);
#  endif

  /* send the data to the neighbour on the left */
  /* recieve the data from the neighbour on the right */
  MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right */
  /* recieve the data from the neighbour on the left */
  MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the left */
  /* recieve the data from the neighbour on the right */
  MPI_Isend((void*)k, 1, field_time_slice_cont, g_nb_t_dn, 83,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 83,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right */
  /* recieve the data from the neighbour on the left */
  MPI_Isend((void*)(k+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 84,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 84,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

#  if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in x direction */
  /* recieve the data from the neighbour on the right in x direction */
  MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in x direction */
  /* recieve the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the left in x direction */
  /* recieve the data from the neighbour on the right in x direction */
  MPI_Isend((void*)k, 1, field_x_slice_gath, g_nb_x_dn, 93,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 93,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in x direction */
  /* recieve the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(k+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 94,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 94,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;
#  endif

#  if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in y direction */
  /* recieve the data from the neighbour on the right in y direction */
  MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the left in y direction */
  /* recieve the data from the neighbour on the right in y direction */
  MPI_Isend((void*)k, 1, field_y_slice_gath, g_nb_y_dn, 103,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 103,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(k+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 104,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 104,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;
#  endif

#  if (defined PARALLELXYZT)
  /* fill buffer ! */
  /* This is now depending on whether the field is */
  /* even or odd */
  if(ieo == 1) {
    for(ix = 0; ix < T*LX*LY/2; ix++) {
      field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ];
    }
  }
  else {
    for(ix = 0; ix < T*LX*LY/2; ix++) {
      field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ];
    }
  }
  if(ieo == 1) {
    for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
      field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ];
    }
  }
  else {
    for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
      field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ];
    }
  }

  /* send the data to the neighbour on the left in z direction */
  /* recieve the data from the neighbour on the right in z direction */
  MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* fill buffer ! */
  /* This is now depending on whether the field is */
  /* even or odd */
  if(ieo == 0) {
    for(ix = 0; ix < T*LX*LY/2; ix++) {
      field_buffer_z3[ix] = k[ g_field_z_ipt_even[ix] ];
    }
  }
  else {
    for(ix = 0; ix < T*LX*LY/2; ix++) {
      field_buffer_z3[ix] = k[ g_field_z_ipt_odd[ix] ];
    }
  }
  if(ieo == 0) {
    for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
      field_buffer_z4[ix-T*LX*LY/2] = k[ g_field_z_ipt_even[ix] ];
    }
  }
  else {
    for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
      field_buffer_z4[ix-T*LX*LY/2] = k[ g_field_z_ipt_odd[ix] ];
    }
  }

  /* send the data to the neighbour on the left in z direction */
  /* recieve the data from the neighbour on the right in z direction */
  MPI_Isend((void*)field_buffer_z3, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 505,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 505,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)field_buffer_z4, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 506,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 506,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;
#  endif

  MPI_Waitall(reqcount, requests, status);
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange2fields)
#endif
  return;
}
/* Exchange the halo (boundary) regions of the two spinor fields l and k with
 * the neighbouring MPI ranks, index-independent-geometry version: interior
 * boundary slices at offsets g_1st_<dir>_int_{dn,up} are sent, and the
 * matching exterior halo slices g_1st_<dir>_ext_{up,dn} are filled, using
 * the derived MPI datatypes field_*_slice_gath / field_*_slice_cont.  In z,
 * the send datatype additionally depends on the even/odd flag `ieo`
 * (field_z_slice_even_* vs field_z_slice_odd_*).  All transfers are
 * non-blocking and completed by a single MPI_Waitall.
 *
 * Fixes vs. previous revision:
 *  - BUG: the tag-84 receive for k posted into k+g_1st_t_int_dn (an interior
 *    address), overwriting interior data and leaving the t-down halo of k
 *    unfilled; every other exchange here receives into the *_ext_* offset,
 *    so it now correctly receives into k+g_1st_t_ext_dn (mirroring the
 *    tag-82 l exchange).
 *  - the "#pragma pomp inst end" block was unreachable after the return
 *    statement; the return is now last.
 *  - removed the unused `ix` local (this version uses derived datatypes,
 *    not explicit z-buffer copies).
 */
void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {
#ifdef MPI
  MPI_Request requests[32];
  MPI_Status status[32];
#endif
  int reqcount = 0;
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange2fields)
#endif
#  ifdef MPI
#  if (defined BGL && defined XLC)
  __alignx(16, l);
#  endif

#  if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the left */
  /* recieve the data from the neighbour on the right */
  MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right */
  /* recieve the data from the neighbour on the left */
  MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the left */
  /* recieve the data from the neighbour on the right */
  MPI_Isend((void*)(k+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 83,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 83,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right */
  /* recieve the data from the neighbour on the left */
  MPI_Isend((void*)(k+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 84,
            g_cart_grid, &requests[reqcount]);
  /* fixed: receive into the exterior halo, not the interior (was g_1st_t_int_dn) */
  MPI_Irecv((void*)(k+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 84,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;
#  endif

#  if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in x direction */
  /* recieve the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in x direction */
  /* recieve the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the left in x direction */
  /* recieve the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(k+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 93,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 93,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in x direction */
  /* recieve the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(k+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 94,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 94,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;
#  endif

#  if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in y direction */
  /* recieve the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the left in y direction */
  /* recieve the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(k+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 103,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 103,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;

  /* send the data to the neighbour on the right in y direction */
  /* recieve the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(k+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 104,
            g_cart_grid, &requests[reqcount]);
  MPI_Irecv((void*)(k+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 104,
            g_cart_grid, &requests[reqcount+1]);
  reqcount=reqcount+2;
#  endif

#  if (defined PARALLELXYZ || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in z direction */
  /* recieve the data from the neighbour on the right in z direction */
  /* the z gather datatype depends on the even/odd parity of the field */
  if(ieo == 1) {
    MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
  else {
    MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
  if(ieo == 1) {
    MPI_Isend((void*)(k+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,505,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,505,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
  else {
    MPI_Isend((void*)(k+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,505,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,505,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }

  /* send the data to the neighbour on the right in z direction */
  /* recieve the data from the neighbour on the left in z direction */
  if(ieo == 1) {
    MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
  else {
    MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
  if(ieo == 1) {
    MPI_Isend((void*)(k+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,506,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,506,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
  else {
    MPI_Isend((void*)(k+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,506,g_cart_grid,&requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,506,g_cart_grid,&requests[reqcount+1]);
    reqcount=reqcount+2;
  }
#  endif

  MPI_Waitall(reqcount, requests, status);
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange2fields)
#endif
  return;
}
/* Accumulates into hf->derivative the fermionic force contribution
 * d/dU [ l^dag gamma_5 D(U) k ] of the Wilson hopping term: for every lattice
 * site ix and every direction mu = 0..3 it forms the spin-projected outer
 * product of gamma_5*l(ix) with k at the forward/backward neighbour,
 * multiplies by the link (times the direction-dependent hopping parameter
 * ka0..ka3), and adds 2*factor times the traceless-antihermitian projection
 * to the derivative on the corresponding link.
 *
 *   l, k    - the two lexicographic spinor fields ("left" and "right")
 *   hf      - holds the gauge field (read) and the derivative (accumulated)
 *   factor  - overall scale; each link gets 2.*factor times the trace term
 *
 * Halos of both fields are exchanged first in MPI builds.
 */
void deriv_Sb_D_psi(spinor * const l, spinor * const k,
                    hamiltonian_field_t * const hf, const double factor) {
#ifdef BGL
  __alignx(16, l);
  __alignx(16, k);
#endif
  /* for parallelization */
#ifdef MPI
  xchange_lexicfield(k);
  xchange_lexicfield(l);
#endif

#ifdef OMP
  /* the "#define static" trick removes the static qualifier below under
     OpenMP, turning these workspace variables into thread-private
     automatics inside the parallel region */
#define static
#pragma omp parallel
  {
#endif

  int ix,iy;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
  /*   spinor * restrict r ALIGN; */
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#ifdef OMP
#undef static
#endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(derivSb)
#endif
#ifdef XLC
#pragma disjoint(*sp, *sm, *up, *um)
#endif

  /************** loop over all lattice sites ****************/
#ifdef OMP
#pragma omp for
#endif
  for(ix = 0; ix < (VOLUME); ix++){
    rr = (*(l + ix));
    /*     rr=g_spinor_field[l][icx-ioff]; */

    /*multiply the left vector with gamma5*/
    _vector_minus_assign(rr.s2, rr.s2);
    _vector_minus_assign(rr.s3, rr.s3);

    /*********************** direction +0 ********************/
    /* psia/psib: (1+gamma_0)-projected neighbour spinor;
       phia/phib: same projection of gamma_5*l(ix) */
    iy=g_iup[ix][0];
    sp = k + iy;
    up=&hf->gaugefield[ix][0];
    _vector_add(psia,(*sp).s0,(*sp).s2);
    _vector_add(psib,(*sp).s1,(*sp).s3);
    _vector_add(phia,rr.s0,rr.s2);
    _vector_add(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][0], 2.*factor, v1);

    /************** direction -0 ****************************/
    /* backward hop: link and derivative live on the neighbour site iy */
    iy=g_idn[ix][0];
    sm = k + iy;
    um=&hf->gaugefield[iy][0];
    _vector_sub(psia,(*sm).s0,(*sm).s2);
    _vector_sub(psib,(*sm).s1,(*sm).s3);
    _vector_sub(phia,rr.s0,rr.s2);
    _vector_sub(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][0], 2.*factor, v1);

    /*************** direction +1 **************************/
    iy=g_iup[ix][1];
    sp = k + iy;
    up=&hf->gaugefield[ix][1];
    _vector_i_add(psia,(*sp).s0,(*sp).s3);
    _vector_i_add(psib,(*sp).s1,(*sp).s2);
    _vector_i_add(phia,rr.s0,rr.s3);
    _vector_i_add(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][1], 2.*factor, v1);

    /**************** direction -1 *************************/
    iy=g_idn[ix][1];
    sm = k + iy;
    um=&hf->gaugefield[iy][1];
    _vector_i_sub(psia,(*sm).s0,(*sm).s3);
    _vector_i_sub(psib,(*sm).s1,(*sm).s2);
    _vector_i_sub(phia,rr.s0,rr.s3);
    _vector_i_sub(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][1], 2.*factor, v1);

    /*************** direction +2 **************************/
    iy=g_iup[ix][2];
    sp = k + iy;
    up=&hf->gaugefield[ix][2];
    _vector_add(psia,(*sp).s0,(*sp).s3);
    _vector_sub(psib,(*sp).s1,(*sp).s2);
    _vector_add(phia,rr.s0,rr.s3);
    _vector_sub(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][2], 2.*factor, v1);

    /***************** direction -2 ************************/
    iy=g_idn[ix][2];
    sm = k + iy;
    um=&hf->gaugefield[iy][2];
    _vector_sub(psia,(*sm).s0,(*sm).s3);
    _vector_add(psib,(*sm).s1,(*sm).s2);
    _vector_sub(phia,rr.s0,rr.s3);
    _vector_add(phib,rr.s1,rr.s2);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][2], 2.*factor, v1);

    /****************** direction +3 ***********************/
    iy=g_iup[ix][3];
    sp = k + iy;
    up=&hf->gaugefield[ix][3];
    _vector_i_add(psia,(*sp).s0,(*sp).s2);
    _vector_i_sub(psib,(*sp).s1,(*sp).s3);
    _vector_i_add(phia,rr.s0,rr.s2);
    _vector_i_sub(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][3], 2.*factor, v1);

    /***************** direction -3 ************************/
    iy=g_idn[ix][3];
    sm = k + iy;
    um=&hf->gaugefield[iy][3];
    _vector_i_sub(psia,(*sm).s0,(*sm).s2);
    _vector_i_add(psib,(*sm).s1,(*sm).s3);
    _vector_i_sub(phia,rr.s0,rr.s2);
    _vector_i_add(phib,rr.s1,rr.s3);
    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][3], 2.*factor, v1);

    /****************** end of loop ************************/
  }

#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif

#ifdef OMP
  } /*OpenMP closing brace */
#endif
}
/* 4. -IIG */
/* Exchange the boundary half-spinor fields with the neighbouring MPI
 * ranks in every parallelized direction (t, x, y, z as configured by
 * the PARALLEL* macros).  For each direction one forward and one
 * backward Isend/Irecv pair is posted into sendBuffer/recvBuffer at
 * the per-direction offsets g_HS_shift_{t,x,y,z}; all requests are
 * then completed with a single MPI_Waitall.  Each message carries
 * 12/2 doubles per boundary site (one projected half spinor).
 * No-op when TM_USE_MPI is not defined. */
void xchange_halffield() {
#  ifdef TM_USE_MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#  if ((defined PARALLELT) || (defined PARALLELX))
  int reqcount = 4;
#  elif ((defined PARALLELXT) || (defined PARALLELXY))
  int reqcount = 8;
#  elif ((defined PARALLELXYT) || (defined PARALLELXYZ))
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int reqcount = 16;
#  else
  /* no parallel direction configured: nothing to exchange, but keep
   * the code compilable and make MPI_Waitall a no-op */
  int reqcount = 0;
#  endif
#  if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor);
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangehalf)
#endif

#  if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the right in t direction */
  /* receive the data from the neighbour on the left in t direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_t), LX*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_t_up, 81, g_cart_grid, &requests[0]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_t + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_t_dn, 81, g_cart_grid, &requests[1]);

  /* send the data to the neighbour on the left in t direction */
  /* receive the data from the neighbour on the right in t direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_t + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_t_dn, 82, g_cart_grid, &requests[2]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_t), LX*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_t_up, 82, g_cart_grid, &requests[3]);
#  endif

#  if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_x), T*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_x_up, 91, g_cart_grid, &requests[4]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_x + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_x_dn, 91, g_cart_grid, &requests[5]);

  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_x + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_x_dn, 92, g_cart_grid, &requests[6]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_x), T*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_x_up, 92, g_cart_grid, &requests[7]);
#  endif

#  if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_y), T*LX*LZ*12/2, MPI_DOUBLE,
	    g_nb_y_up, 101, g_cart_grid, &requests[8]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_y + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE,
	    g_nb_y_dn, 101, g_cart_grid, &requests[9]);

  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_y + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE,
	    g_nb_y_dn, 102, g_cart_grid, &requests[10]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_y), T*LX*LZ*12/2, MPI_DOUBLE,
	    g_nb_y_up, 102, g_cart_grid, &requests[11]);
#  endif

#  if (defined PARALLELXYZT || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_z), T*LX*LY*12/2, MPI_DOUBLE,
	    g_nb_z_up, 503, g_cart_grid, &requests[12]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_z + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE,
	    g_nb_z_dn, 503, g_cart_grid, &requests[13]);

  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  /* count written as T*LX*LY*12/2 for consistency with the other calls
   * (was 12*T*LX*LY/2, numerically identical) */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_z + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE,
	    g_nb_z_dn, 504, g_cart_grid, &requests[14]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_z), T*LX*LY*12/2, MPI_DOUBLE,
	    g_nb_z_up, 504, g_cart_grid, &requests[15]);
#  endif

  MPI_Waitall(reqcount, requests, status);

  /* close the instrumentation region before returning; the original
   * placed this after the return statement (unreachable) and outside
   * the TM_USE_MPI guard, unbalancing the begin/end pair */
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangehalf)
#endif
#  endif /* MPI */
  return;
}