int main(int argc, char **argv)
{
  QMP_status_t status;
  int this_node;
  QMP_thread_level_t req, prv;

  /* Start QMP */
  req = QMP_THREAD_SINGLE;
  status = QMP_init_msg_passing(&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    QMP_error("QMP_init failed: %s\n", QMP_error_string(status));
    QMP_abort(1);
  }

  /* Get my logical node number */
  this_node = QMP_get_node_number();

  /* Print the result */
  printf("%04d", this_node);

  /* Quit */
  QMP_finalize_msg_passing();
  return 0;
}
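/* A natural next step after this hello-world is a global reduction.  The
 * following is a minimal sketch (not part of the original test suite); it
 * assumes the standard QMP collectives QMP_sum_int() and
 * QMP_is_primary_node() from the same API used throughout this code. */
#include <stdio.h>
#include <qmp.h>

int main(int argc, char **argv)
{
  QMP_thread_level_t req = QMP_THREAD_SINGLE, prv;
  QMP_status_t status = QMP_init_msg_passing(&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    QMP_error("QMP_init failed: %s\n", QMP_error_string(status));
    QMP_abort(1);
  }

  int value = QMP_get_node_number();
  status = QMP_sum_int(&value);          /* global sum over all nodes */
  if (status != QMP_SUCCESS)
    QMP_error("QMP_sum_int failed: %s\n", QMP_error_string(status));

  if (QMP_is_primary_node())
    printf("sum of node numbers = %d\n", value);

  QMP_finalize_msg_passing();
  return 0;
}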
/* creates data array if necessary and destroys pointers */
void QDP_switch_ptr_to_data(QDP_data_common_t *dc)
{
  ENTER;
  if(*(dc->data)==NULL) {
    //*(dc->data) = (char *) malloc(QDP_sites_on_node*dc->size);
    dc->qmpmem = QMP_allocate_aligned_memory(QDP_sites_on_node_L(dc->lat)*dc->size,
                                             QDP_mem_align, QDP_mem_flags);
    if(!dc->qmpmem) {
      QMP_error("QDP error: can't allocate memory\n");
      QDP_abort(1);
    }
    *(dc->data) = QMP_get_memory_pointer(dc->qmpmem);
  } else {
    QDP_clear_valid_shift_dest(dc);
  }
  if(*(dc->ptr)!=NULL) {
    QDP_finish_shifts(dc);
    if(!dc->discarded) QDP_copy_ptr_to_data(dc);
    QDP_clear_shift_src(dc);
    free((void*)*(dc->ptr));
    *(dc->ptr) = NULL;
  }
  LEAVE;
}
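/* The aligned-allocation pattern above is worth isolating: the QMP_mem_t
 * handle owns the storage, the usable pointer comes from
 * QMP_get_memory_pointer(), and it is the handle (not the raw pointer)
 * that must eventually be released with QMP_free_memory().  A minimal
 * sketch; the 64-byte alignment and zero flags are illustrative
 * assumptions, not the QDP_mem_align/QDP_mem_flags values used above. */
#include <string.h>
#include <qmp.h>

static double *alloc_field(size_t nsites, QMP_mem_t **handle_out)
{
  QMP_mem_t *mem = QMP_allocate_aligned_memory(nsites * sizeof(double), 64, 0);
  if (mem == NULL) {
    QMP_error("allocation of %lu sites failed\n", (unsigned long)nsites);
    QMP_abort(1);
  }
  double *p = (double *) QMP_get_memory_pointer(mem);
  memset(p, 0, nsites * sizeof(double));
  *handle_out = mem;   /* keep the handle; pass it to QMP_free_memory() later */
  return p;            /* never pass this pointer to free() */
}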
unsigned int sync()
{
  QMP_status_t sync_status = QMP_barrier();
  if (sync_status != QMP_SUCCESS) {
    QMP_error("Error in QMP sync:%s\n", QMP_error_string(sync_status));
    return 0;
  }
  return 1;
}
QMP_status_t
QMP_comm_binary_reduction_mpi(QMP_comm_t comm, void *lbuffer, size_t count,
                              QMP_binary_func bfunc)
{
  QMP_status_t status = QMP_SUCCESS;
  ENTER;
  QMP_assert(qmp_user_bfunc==NULL);

  /* set up user binary reduction pointer */
  qmp_user_bfunc = bfunc;

  if(!op_inited) {
    status = MPI_Op_create(qmp_bfunc_mpi, 1, &bop);
    if (status != MPI_SUCCESS) {
      QMP_error("Cannot create MPI operator for binary reduction.\n");
      goto leave;
    }
    op_inited = 1;
  }

  char *rbuffer;
  QMP_alloc(rbuffer, char, count);

  int err = MPI_Allreduce(lbuffer, rbuffer, count, MPI_BYTE, bop, comm->mpicomm);
  if(err != MPI_SUCCESS) {
    status = err;
  } else {
    memcpy(lbuffer, rbuffer, count);
  }
  QMP_free(rbuffer);    /* free the scratch buffer on both success and failure */

  /* signal end of the binary reduction session */
  qmp_user_bfunc = NULL;

 leave:
  LEAVE;
  return status;        /* propagate failures instead of always returning QMP_SUCCESS */
}
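/* The MPI backend above sits behind the generic entry point
 * QMP_binary_reduction().  On the caller's side the pattern looks like the
 * following minimal sketch, assuming the standard QMP_binary_func
 * prototype void (*)(void *inout, void *in); the reducer and NVAL are
 * illustrative, not part of QMP. */
#include <qmp.h>

#define NVAL 4

/* Element-wise maximum of a fixed-length double array.  QMP hands the
 * reducer two opaque byte buffers of the size passed to
 * QMP_binary_reduction(); 'inout' accumulates the result. */
static void max_reducer(void *inout, void *in)
{
  double *a = (double *) inout;
  const double *b = (const double *) in;
  for (int i = 0; i < NVAL; i++)
    if (b[i] > a[i]) a[i] = b[i];
}

static void global_max(double local[NVAL])
{
  QMP_status_t status = QMP_binary_reduction(local, NVAL * sizeof(double), max_reducer);
  if (status != QMP_SUCCESS)
    QMP_error("binary reduction failed: %s\n", QMP_error_string(status));
}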
CPS_START_NAMESPACE

#ifndef USE_QMP
#define QMP
#endif

void GlobalDataShift::Shift(int direction, int n_disp)
{
  if (n_disp == 0) return;
  SCUDir s_dir, r_dir;
  int a_disp;
  void *send_p, *recv_p, *temp_p;
#ifndef USE_QMP
  if (n_disp > 0) {
    a_disp = n_disp;
    s_dir = gjp_scu_dir[direction*2];
    r_dir = gjp_scu_dir[direction*2+1];
  } else {
    a_disp = -n_disp;
    s_dir = gjp_scu_dir[direction*2+1];
    r_dir = gjp_scu_dir[direction*2];
  }
#else
  int sflag;
  if (n_disp > 0) { sflag = +1; a_disp = n_disp;  }
  else            { sflag = -1; a_disp = -n_disp; }  /* a_disp was left unset in the QMP branch */
#endif
  send_p = addr;
  recv_p = temp_buf;
#ifndef USE_QMP
  SCUDirArgIR Send(send_p, s_dir, SCU_SEND, data_len);
  SCUDirArgIR Recv(recv_p, r_dir, SCU_REC, data_len);
#else
  QMP_msgmem_t msgmem[2];
  QMP_msghandle_t msghandle[2];
  QMP_status_t status;
  QMP_msghandle_t multiple;
#endif
  // sys_cacheflush(0);

  for (int i = 0; i < a_disp-1; i++) {
#ifndef USE_QMP
    Send.StartTrans(); Recv.StartTrans();
    Send.TransComplete(); Recv.TransComplete();
#else
    msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len);
    msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len);
    msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0);
    msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0);
    multiple = QMP_declare_multiple(msghandle, 2);
    QMP_start(multiple);
    status = QMP_wait(multiple);
    if (status != QMP_SUCCESS)
      QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status));
    QMP_free_msghandle(multiple);
    QMP_free_msgmem(msgmem[0]);
    QMP_free_msgmem(msgmem[1]);
#endif
    temp_p = send_p;
    send_p = recv_p;
    recv_p = temp_p;
#ifndef USE_QMP
    Send.Addr(send_p);
    Recv.Addr(recv_p);
#endif
  }

#ifndef USE_QMP
  Send.StartTrans(); Recv.StartTrans();
  Send.TransComplete(); Recv.TransComplete();
#else
  msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len);
  msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len);
  msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0);
  msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0);
  multiple = QMP_declare_multiple(msghandle, 2);
  QMP_start(multiple);
  status = QMP_wait(multiple);
  if (status != QMP_SUCCESS)
    QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status));
  QMP_free_msghandle(multiple);
  QMP_free_msgmem(msgmem[0]);
  QMP_free_msgmem(msgmem[1]);
#endif
  if (recv_p != addr) memcpy(addr, recv_p, data_len);
}
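/* Because send_p and recv_p simply alternate between two fixed buffers, the
 * per-hop declare/free cycle in the QMP branch above can be hoisted out of
 * the loop: declare one send/receive pair per buffer orientation and
 * alternate between them.  A minimal sketch (not the CPS implementation);
 * buf0/buf1/len/axis/sflag/nhop are stand-ins for addr, temp_buf, data_len
 * and the direction arguments used above. */
#include <qmp.h>

static void *shift_nhop(void *buf0, void *buf1, size_t len,
                        int axis, int sflag, int nhop)
{
  void *buf[2] = { buf0, buf1 };
  QMP_msgmem_t mm[2];
  QMP_msghandle_t pair[2][2], mh[2];

  mm[0] = QMP_declare_msgmem(buf[0], len);
  mm[1] = QMP_declare_msgmem(buf[1], len);
  for (int o = 0; o < 2; o++) {           /* o = index of the sending buffer */
    pair[o][0] = QMP_declare_send_relative(mm[o], axis, sflag, 0);
    pair[o][1] = QMP_declare_receive_relative(mm[1-o], axis, -sflag, 0);
    mh[o] = QMP_declare_multiple(pair[o], 2);
  }

  int o = 0;
  for (int hop = 0; hop < nhop; hop++, o = 1 - o) {
    QMP_start(mh[o]);
    QMP_status_t status = QMP_wait(mh[o]);
    if (status != QMP_SUCCESS)
      QMP_error("shift_nhop: %s\n", QMP_error_string(status));
  }

  QMP_free_msghandle(mh[0]);              /* also frees the constituent handles */
  QMP_free_msghandle(mh[1]);
  QMP_free_msgmem(mm[0]);
  QMP_free_msgmem(mm[1]);
  return buf[o];                          /* the last receiver holds the result */
}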
void PT::mat_cb_norm(int n, IFloat **mout, IFloat **min, const int *dir, int parity, IFloat * gauge) { //List of the different directions int wire[MAX_DIR]; int i; // printf("PT::mat_cb_norm\n"); QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vec_cb_norm", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vec_cb_norm", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t multiple; static int call_num = 0; int vlen = VECT_LEN; int vlen2 = VECT_LEN; call_num++; //Name our function char *fname="pt_mat_cb()"; // VRB.Func("",fname); //Set the transfer directions //If wire[i] is even, then we have communication in the negative direction //If wire[i] is odd, then we have communication in the positive direction for(i=0;i<n;i++) wire[i]=dir[i]; #ifdef PROFILE Float dtime = - dclock(); #endif int non_local_dir=0; //#pragma omp parallel default(shared) { //If wire[i] is odd, then we have parallel transport in the //positive direction. In this case, multiplication by the link matrix is //done before the field is transferred over to the adjacent node // //If we have transfer in the negative T direction (wire[i] = 6), then //we have to copy the appropriate fields to a send buffer //#pragma omp for for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(wire[i]%2) { if(conjugated) pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge); else pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge); } else if((wire[i] == 6)) { for(int j = 0; j < non_local_chi_cb[6];j++) memcpy(snd_buf_t_cb + j*GAUGE_LEN,min[i] + 3 * *(Toffset[parity]+j)*3,GAUGE_LEN*sizeof(IFloat)); } } } //#pragma omp barrier //#pragma omp master { for(i=0;i<n;i++) if(!local[wire[i]/2]) { //Calculate the starting address for the data to be sent IFloat *addr = min[i] + GAUGE_LEN * offset_cb[wire[i]]; msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); //Initialize the msg_mem for sends if(wire[i]%2) msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_cb[wire[i]/2], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); else if(wire[i] == 6) msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_t_cb, 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); else msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen_cb[wire[i]]), numblk_cb[wire[i]], (ptrdiff_t)(3*stride_cb[wire[i]]+3*blklen_cb[wire[i]])); msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0); non_local_dir++; } if(non_local_dir) { multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir); QMP_start(multiple); } } //#pragma omp master { //Do local calculations //#pragma omp for for(i=0;i<n;i++) { if((wire[i]%2 && conjugated) || ((wire[i]%2 == 0) && (conjugated == 0))) pt_cmm_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge); else pt_cmm_dag_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge); } //#pragma omp barrier //#pragma omp master { if(non_local_dir) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != 
QMP_SUCCESS) QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*non_local_dir; i++) QMP_free_msgmem(msg_mem_p[i]); Free(msg_handle_p); Free(msg_mem_p); } } //#pragma omp master { //If wire[i] is even, then we have transport in the negative direction //In this case, the vector field is multiplied by the SU(3) link matrix //after all communication is complete IFloat *fp0,*fp1; //#pragma omp for for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(!(wire[i]%2)) { if(conjugated) pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge); else pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge); } //Otherwise we have parallel transport in the positive direction. //In this case, the received data has already been pre-multiplied //All we need to do is to put the transported field in the correct place else { //int destination, source; //Place the data in the receive buffer into the result vector for(int s=0;s<non_local_chi_cb[wire[i]];s++) { //source = uc_nl_cb[parity][wire[i]][s].src; fp0 = (IFloat *)((long)rcv_buf[wire[i]]+3*uc_nl_cb[parity][wire[i]][s].src); //destination = uc_nl_cb[parity][wire[i]][s].dest; fp1 = (IFloat *)(mout[i]+3*uc_nl_cb[parity][wire[i]][s].dest); memcpy(fp1,fp0,GAUGE_LEN*sizeof(IFloat)); } } } } } //#pragma omp parallel #ifdef PROFILE dtime +=dclock(); print_flops("",fname,99*vol*n,dtime); #endif // ParTrans::PTflops +=99*n*vol; }
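/* The send side of mat_cb_norm() gathers face data with QMP's strided
 * message memory: numblk blocks of blklen bytes, where the stride argument
 * is the start-to-start distance between blocks (which is why the code
 * passes 3*stride_cb + 3*blklen_cb, i.e. gap plus block length).  A
 * minimal, self-contained sketch of the same declare/start/wait cycle;
 * field, rcv, blklen, numblk, gap and axis are hypothetical stand-ins for
 * the class members used above. */
#include <stddef.h>
#include <qmp.h>

static void exchange_face(char *field, char *rcv,
                          size_t blklen, int numblk, size_t gap,
                          int axis, int sflag)
{
  QMP_msgmem_t mm[2];
  QMP_msghandle_t mh[2], both;

  mm[0] = QMP_declare_msgmem(rcv, blklen * numblk);            /* contiguous receive */
  mm[1] = QMP_declare_strided_msgmem(field, blklen, numblk,
                                     (ptrdiff_t)(gap + blklen)); /* start-to-start stride */
  mh[0] = QMP_declare_receive_relative(mm[0], axis, -sflag, 0);
  mh[1] = QMP_declare_send_relative(mm[1], axis, sflag, 0);
  both  = QMP_declare_multiple(mh, 2);

  QMP_start(both);
  QMP_status_t status = QMP_wait(both);
  if (status != QMP_SUCCESS)
    QMP_error("exchange_face: %s\n", QMP_error_string(status));

  QMP_free_msghandle(both);
  QMP_free_msgmem(mm[0]);
  QMP_free_msgmem(mm[1]);
}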
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){ int wire[MAX_DIR]; int i; QMP_msgmem_t msg_mem_p[2*MAX_DIR]; QMP_msghandle_t msg_handle_p[2*MAX_DIR]; QMP_msghandle_t multiple; static double setup=0.,qmp=0.,localt=0.,nonlocal=0.; static int call_num = 0; call_num++; // char *fname="pt_mat()"; // VRB.Func("",fname); // if (call_num%100==1) printf("PT:mat()\n"); for(i=0;i<n;i++) wire[i] = dir[i]; #ifdef PROFILE Float dtime2 = - dclock(); #endif double dtime = -dclock(); int non_local_dir=0; for(i=0;i<n;i++) if (!local[wire[i]/2]) { //Calculate the address for transfer in a particular direction Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]); msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]])); msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0); non_local_dir++; } if (call_num==1 && !QMP_get_node_number()) printf("non_local_dir=%d\n",non_local_dir); if(non_local_dir) { multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir); QMP_start(multiple); } dtime += dclock(); setup +=dtime; dtime = -dclock(); int if_print = 0; // if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1; #define USE_TEST2 #ifdef USE_TEST2 //assume nt > n! static char *cname="mat()"; #pragma omp parallel default(shared) { int iam,nt,ipoints,istart,offset; iam = omp_get_thread_num(); nt = omp_get_num_threads(); int nt_dir = nt/n; int n_t = iam/nt_dir; int i_t = iam%nt_dir; if (n_t >= n ){ n_t = n-1; i_t = iam - (n-1)*nt_dir; nt_dir = nt -(n-1)*nt_dir; } int w_t = wire[n_t]; ipoints = (local_chi[w_t]/2)/nt_dir; offset = ipoints*i_t; if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset; if ( if_print ) printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset); //Interleaving of local computation of matrix multiplication partrans_cmm_agg((uc_l[w_t]+offset*2),min[n_t],mout[n_t],ipoints); if ( if_print ) printf("thread %d of %d done\n",iam,nt); } #else { //Interleaving of local computation of matrix multiplication #pragma omp parallel for default(shared) for(i=0;i<n;i++){ partrans_cmm_agg(uc_l[wire[i]],min[i],mout[i],local_chi[wire[i]]/2); } } #endif dtime += dclock(); localt +=dtime; dtime = -dclock(); //#pragma omp barrier //#pragma omp master { if(non_local_dir) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*non_local_dir; i++) QMP_free_msgmem(msg_mem_p[i]); // Free(msg_handle_p); // Free(msg_mem_p); } } //#pragma omp master { dtime += dclock(); qmp +=dtime; dtime = -dclock(); //Do non-local computations #ifdef USE_TEST2 //assume nt > n! 
#pragma omp parallel default(shared) { int iam,nt,ipoints,istart,offset; iam = omp_get_thread_num(); nt = omp_get_num_threads(); int nt_dir = nt/n; int n_t = iam/nt_dir; int i_t = iam%nt_dir; if (n_t >= n ){ n_t = n-1; i_t = iam - (n-1)*nt_dir; nt_dir = nt -(n-1)*nt_dir; } int w_t = wire[n_t]; ipoints = (non_local_chi[w_t]/2)/nt_dir; offset = ipoints*i_t; if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset; if ( if_print ) printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset); //Non-local computation if (ipoints>0) partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints); if ( if_print ) printf("thread %d of %d done\n",iam,nt); } #else { #pragma omp parallel for for(i=0;i<n;i++) if (!local[wire[i]/2]) { #ifdef USE_OMP if (call_num%10000==1 && !QMP_get_node_number() ) printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i); #endif partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2); } }//#pragma omp parallel #endif dtime += dclock(); nonlocal +=dtime; if (call_num%100==0){ static char *cname="mat()"; if (!QMP_get_node_number() ) { print_flops("mat():local*100",0,localt); print_flops("mat():nonlocal*100",0,nonlocal); print_flops("mat():qmp*100",0,qmp); print_flops("mat():setup*100",0,setup); } localt=nonlocal=qmp=setup=0.; } #ifdef PROFILE dtime2 +=dclock(); print_flops("",fname,198*vol*n,dtime2); #endif // ParTrans::PTflops +=198*n*vol; }
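/* The USE_TEST2 branch above hands the n directions out across the OpenMP
 * team: each direction gets nt/n threads, any remainder is folded onto the
 * last direction, and within a direction the sites are split into
 * contiguous chunks with the last thread absorbing the rounding remainder.
 * The same arithmetic, isolated as a small helper for clarity; the names
 * are illustrative and not part of PT. */
static void split_work(int iam, int nt, int n, const int npoints[/* n */],
                       int *dir_out, int *offset_out, int *count_out)
{
  /* assumes nt >= n, as the original code notes ("assume nt > n!") */
  int nt_dir = nt / n;            /* threads per direction */
  int n_t = iam / nt_dir;         /* which direction this thread serves */
  int i_t = iam % nt_dir;         /* rank of this thread within the direction */
  if (n_t >= n) {                 /* leftover threads go to the last direction */
    n_t = n - 1;
    i_t = iam - (n - 1) * nt_dir;
    nt_dir = nt - (n - 1) * nt_dir;
  }
  int chunk = npoints[n_t] / nt_dir;
  int offset = chunk * i_t;
  if (i_t == nt_dir - 1)          /* last thread takes the remainder */
    chunk = npoints[n_t] - offset;
  *dir_out = n_t;
  *offset_out = offset;
  *count_out = chunk;
}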
/*! Computes sum[x] = vect2[x] vect[x + hop dir]^dagger where the sum is over n_vect vectors and the hop is in a forward direction. */ void PT::vvpd(IFloat **vect2, IFloat ***vect, int n_vect, const int *dir, int n_dir, int hop, IFloat **sum, int overwrite){ char *fname = "pt_vvpd()"; #if 1 // ERR.NotImplemented(cname,fname); QMP_error("%s""%s Not implemented\n"); #else // VRB.Func("",fname); int i, s, v; Float f = 2.0; int wire[MAX_DIR]; for(i=0;i<n_dir;i++) wire[i] = dir[i]; // from (x,y,z,t) to (t,x,y,z) QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msgmem_t *msg_mem_p2 = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t* msg_handle_p2 = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t multiple; //Setup communciation int comms=0; for(i=0;i<n_dir;i++) if( !local[wire[i]/2]) { if ( size[wire[i]/2] <hop) fprintf(stderr, "%s:size(%d) in direction %d is smaller than the hop(%d)\n", fname,size[wire[i]],wire[i],hop); comms++; } for(v=0; v<n_vect; v++){ if (v%2==0) { comms=0; for(i=0;i<n_dir;i++) if( !local[wire[i]/2]){ msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0); msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]])); msg_handle_p[2*comms+1] = QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0); comms++; } // Start communication if(comms) { multiple = QMP_declare_multiple(msg_handle_p, 2*comms); } if (comms) { QMP_start(multiple); QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*comms; i++) QMP_free_msgmem(msg_mem_p[i]); } } else { comms=0; for(i=0;i<n_dir;i++) if( !local[wire[i]/2]){ msg_mem_p2[2*comms] = QMP_declare_msgmem((void *)rcv_buf2[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_handle_p2[2*comms] = QMP_declare_receive_relative(msg_mem_p2[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0); msg_mem_p2[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]])); msg_handle_p2[2*comms+1] = QMP_declare_send_relative(msg_mem_p2[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0); comms++; } // Start communication if(comms) { multiple = QMP_declare_multiple(msg_handle_p2, 2*comms); } if (comms) { QMP_start(multiple); QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*comms; i++) QMP_free_msgmem(msg_mem_p2[i]); } } // Perform non-local calculation for previous v if (v>0) if (v==1 && overwrite==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], 
dest_nl[hop-1][wire[i]]); } else if (v%2==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } // Perform local calculation for current v if (v==0 && overwrite==1) { for(i=0; i<n_dir; i++) if((vol-hop*non_local_chi[wire[i]])>0) cross_over_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]); } else { for(i=0; i<n_dir; i++) if((vol-hop*non_local_chi[wire[i]])>0) cross_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]); } } if (v==1 && overwrite==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else if (v%2==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } #endif // ParTrans::PTflops += 90*n_vect*n_dir*vol; }
//! u[x] = v[x+dir] for n_dir forward or backward directions dir.
void PT::shift_field(IFloat **v, const int *dir, int n_dir, int hop, IFloat **u)
{
  int i, length;
  int wire[n_dir];
  for (i=0; i<n_dir; i++) wire[i] = dir[i];
#ifdef USE_QMP
  QMP_msgmem_t msg_mem_p[20];
  QMP_msghandle_t msg_handle_p[20];
  QMP_msghandle_t multiple;
#else
  SCUDirArgMulti SCUmulti;
  SCUDirArgIR *SCUarg_p[2*n_dir];
#endif
  int comms=0;
  for (i=0; i<n_dir; i++)
    if (!local[wire[i]/2]){
#ifndef USE_QMP
      SCUarg_p[2*comms] = SCUarg_mat[hop-1][2*wire[i]];
      SCUarg_p[2*comms+1] = SCUarg_mat[hop-1][2*wire[i]+1];
      SCUarg_p[2*comms+1]->Addr((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)));
#else
      msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]],
                             3*hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
      msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)),
                               (size_t)(3*hop*blklen[wire[i]]), numblk[wire[i]],
                               (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
      msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms],
                                wire[i]/2, 1-2*(wire[i]%2), 0);
      msg_handle_p[2*comms+1] = QMP_declare_send_relative(msg_mem_p[2*comms+1],
                                  wire[i]/2, 2*(wire[i]%2)-1, 0);
#endif
      comms++;
    }
#ifndef USE_QMP
  if (comms) SCUmulti.Init(SCUarg_p,2*comms);
  if (comms) SCUmulti.SlowStartTrans();
#else
  if(comms) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*comms);
    QMP_start(multiple);
  }
#endif
  // SCUmulti.TransComplete();

  // Local part: sites whose neighbours live on this node
  for (i=0; i<n_dir; i++) {
    length = vol-hop*non_local_chi[wire[i]];
    copy_matrix(u[i],v[i],&length,dest_l[hop-1][wire[i]], src_l[hop-1][wire[i]]);
  }

#ifndef USE_QMP
  if (comms) SCUmulti.TransComplete();
#else
  if(comms) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
      QMP_error("Send failed in shift_field: %s\n",
                QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*comms; i++) QMP_free_msgmem(msg_mem_p[i]);
  }
#endif
  // Non-local part: sites filled from the receive buffers
  for (i=0; i<n_dir; i++) {
    length = hop*non_local_chi[wire[i]];
    copy_matrix(u[i],(IFloat*)rcv_buf[wire[i]],&length,
                dest_nl[hop-1][wire[i]],src_nl[hop-1][wire[i]]);
  }
}
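/* shift_field() follows the same overlap pattern as the other PT routines:
 * post the boundary exchange, do the purely local copies while the network
 * is busy, then wait and finish the boundary sites.  Stripped of the
 * PT-specific tables, the skeleton is the following sketch; the callback
 * names are placeholders, not part of PT. */
#include <qmp.h>

static void overlapped_shift(QMP_msghandle_t exchange,    /* pre-declared multiple handle, may be NULL */
                             void (*do_local_part)(void *),
                             void (*do_boundary_part)(void *),
                             void *ctx)
{
  if (exchange) QMP_start(exchange);     /* post sends/receives */

  do_local_part(ctx);                    /* bulk work needing no remote data */

  if (exchange) {
    QMP_status_t status = QMP_wait(exchange);
    if (status != QMP_SUCCESS)
      QMP_error("overlapped_shift: %s\n", QMP_error_string(status));
  }

  do_boundary_part(ctx);                 /* work consuming the received faces */
}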
void init_qmp(int *argc, char ***argv)
{
#if 0
  printf("init_qmp(%d %p)\n",*argc,*argv);
  for(int i = 0; i<*argc;i++){
    printf("argv[%d](before)=%s\n",i,(*argv)[i]);
  }
#endif
#if 0
  spi_init();
#endif
  QMP_thread_level_t prv;
#ifndef UNIFORM_SEED_NO_COMMS
  QMP_status_t init_status = QMP_init_msg_passing(argc, argv, QMP_THREAD_SINGLE, &prv);
  if (init_status)
    printf("QMP_init_msg_passing returned %d\n",init_status);
  peRank = QMP_get_node_number();
  peNum = QMP_get_number_of_nodes();
  if(!peRank)
    printf("QMP_init_msg_passing returned %d\n",init_status);
  if (init_status != QMP_SUCCESS) {
    QMP_error("%s\n",QMP_error_string(init_status));
  }

  // check QMP thread level
  // Added by Hantao
  if(peRank == 0) {
    switch(prv) {
    case QMP_THREAD_SINGLE:
      printf("QMP thread level = QMP_THREAD_SINGLE\n"); break;
    case QMP_THREAD_FUNNELED:
      printf("QMP thread level = QMP_THREAD_FUNNELED\n"); break;
    case QMP_THREAD_SERIALIZED:
      printf("QMP thread level = QMP_THREAD_SERIALIZED\n"); break;
    case QMP_THREAD_MULTIPLE:
      printf("QMP thread level = QMP_THREAD_MULTIPLE\n"); break;
    default:
      printf("QMP thread level = unknown value (%d)\n", (int)prv);
    }
  }

  // Check to make sure that this machine is a GRID machine
  // Exit if not GRID machine
  QMP_ictype qmp_type = QMP_get_msg_passing_type();

  // Get information about the allocated machine
  peNum = QMP_get_number_of_nodes();
  NDIM = QMP_get_allocated_number_of_dimensions();
  peGrid = QMP_get_allocated_dimensions();
  pePos = QMP_get_allocated_coordinates();

  if(peRank==0){
    for(int i = 0; i<*argc;i++){
      printf("argv[%d](after)=%s\n",i,(*argv)[i]);
    }
  }
#else
  QMP_status_t init_status = QMP_SUCCESS;
  peRank=0;
  peNum=1;
  NDIM=4;
#endif

  //#if (TARGET == BGL) || (TARGET == BGP)
  if (NDIM>5){
    peNum = 1;
    for(int i = 0;i<5;i++) peNum *= peGrid[i];
    peRank = peRank % peNum;
  }

  int if_print=1;
  for(int i = 0;i<NDIM;i++)
    if (pePos[i]>=2) if_print=0;
  if (if_print){
    printf("Rank=%d Num=%d NDIM=%d\n",peRank,peNum,NDIM);
    printf("dim:");
    for(int i = 0;i<NDIM;i++) printf(" %d",peGrid[i]);
    printf("\n");
    printf("pos:");
    for(int i = 0;i<NDIM;i++) printf(" %d",pePos[i]);
    printf("\n");
#if 0
    int rc;
    BGLPersonality pers;
    rts_get_personality(&pers, sizeof(pers));
    printf("from personality: %d %d %d %d\n",pers.xCoord,pers.yCoord,pers.zCoord,rts_get_processor_id());
#endif
  }
  // printf("from personality:\n");

#if 0
  if ( (qmp_type!= QMP_GRID) && (qmp_type !=QMP_MESH) ) {
    QMP_error("CPS on QMP only implemented for GRID or MESH, not (%d) machines\n",qmp_type);
  }
#endif

  // printf("QMP_declare_logical_topology(peGrid, NDIM)\n");
#ifndef UNIFORM_SEED_NO_COMMS
  // Declare the logical topology (Redundant for GRID machines)
  if (QMP_declare_logical_topology(peGrid, NDIM) != QMP_SUCCESS) {
    QMP_error("Node %d: Failed to declare logical topology\n",peRank);
    exit(-4);
  }
#endif
  initialized = true;
  printf("Rank=%d init_qmp() done\n",peRank);
}
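/* Once the logical topology has been declared, a node can locate its
 * neighbours itself (the relative send/receive calls used elsewhere in
 * this code do this internally).  A minimal sketch, assuming the standard
 * topology queries QMP_get_logical_dimensions(),
 * QMP_get_logical_coordinates() and QMP_get_node_number_from(); it is not
 * part of init_qmp(). */
#include <stdio.h>
#include <stdlib.h>
#include <qmp.h>

static void print_forward_neighbours(void)
{
  int ndim = QMP_get_logical_number_of_dimensions();
  const int *dims  = QMP_get_logical_dimensions();
  const int *coord = QMP_get_logical_coordinates();

  int *ncoord = (int *) malloc(ndim * sizeof(int));
  for (int mu = 0; mu < ndim; mu++) {
    for (int i = 0; i < ndim; i++) ncoord[i] = coord[i];
    ncoord[mu] = (coord[mu] + 1) % dims[mu];        /* one hop forward, periodic */
    printf("node %d: +%d neighbour is node %d\n",
           QMP_get_node_number(), mu, QMP_get_node_number_from(ncoord));
  }
  free(ncoord);
}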
int main (int argc, char** argv) { int i, nc; QMP_status_t status; int **smem, **rmem; QMP_msgmem_t *recvmem; QMP_msghandle_t *recvh; QMP_msgmem_t *sendmem; QMP_msghandle_t *sendh; struct perf_argv pargv; QMP_thread_level_t req, prv; /** * Simple point to point topology */ int dims[4] = {2,2,2,2}; int ndims = 1; //if(QMP_get_node_number()==0) //printf("starting init\n"); fflush(stdout); req = QMP_THREAD_SINGLE; status = QMP_init_msg_passing (&argc, &argv, req, &prv); if (status != QMP_SUCCESS) { fprintf (stderr, "QMP_init failed\n"); return -1; } if(QMP_get_node_number()==0) printf("finished init\n"); fflush(stdout); if (parse_options (argc, argv, &pargv) == -1) { if(QMP_get_node_number()==0) usage (argv[0]); exit (1); } { int maxdims = 4; int k=0; int nodes = QMP_get_number_of_nodes(); ndims = 0; while( (nodes&1) == 0 ) { if(ndims<maxdims) ndims++; else { dims[k] *= 2; k++; if(k>=maxdims) k = 0; } nodes /= 2; } if(nodes != 1) { QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes()); QMP_error(" must power of 2"); QMP_abort(1); } pargv.ndims = ndims; } status = QMP_declare_logical_topology (dims, ndims); if (status != QMP_SUCCESS) { fprintf (stderr, "Cannot declare logical grid\n"); return -1; } /* do a broadcast of parameter */ if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) { QMP_printf ("Broadcast parameter failed\n"); exit (1); } { int k=1; const int *lc = QMP_get_logical_coordinates(); for(i=0; i<ndims; i++) k += lc[i]; pargv.sender = k&1; } QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ", argv[0], pargv.num_channels, pargv.verify, pargv.option, pargv.size, pargv.loops, pargv.sender, strided_send, strided_recv, strided_array_send); fflush(stdout); /** * Create memory */ nc = pargv.num_channels; smem = (int **)malloc(nc*sizeof (int *)); rmem = (int **)malloc(nc*sizeof (int *)); sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t)); recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t)); sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); QMP_barrier(); if(QMP_get_node_number()==0) printf("\n"); fflush(stdout); if(pargv.option & TEST_SIMUL) { int opts = pargv.option; pargv.option = TEST_SIMUL; if(QMP_get_node_number()==0) QMP_printf("starting simultaneous sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_simultaneous_send (smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished simultaneous sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_PINGPONG) { int opts = pargv.option; pargv.option = TEST_PINGPONG; if(QMP_get_node_number()==0) QMP_printf("starting ping pong sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); if(pargv.verify) test_pingpong_verify(smem, rmem, sendh, recvh, &pargv); else test_pingpong(smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished ping pong sends\n"); fflush(stdout); pargv.option = opts; } 
if(pargv.option & TEST_ONEWAY) { int opts = pargv.option; pargv.option = TEST_ONEWAY; if(QMP_get_node_number()==0) QMP_printf("starting one way sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_oneway (smem, rmem, sendh, recvh, &pargv); if(!pargv.sender) check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished one way sends"); fflush(stdout); pargv.option = opts; } /** * Free memory */ free (smem); free (rmem); free (sendh); free (recvh); free (sendmem); free (recvmem); QMP_finalize_msg_passing (); return 0; }
void dwf_dslash_5_plus_slice(Vector *out, Vector *in, Float mass, int dag, Dwf *dwf_lib_arg, int s_slice) { int x; int s; // Initializations //------------------------------------------------------------------ #if 0 int local_ls = GJP.SnodeSites(); int s_nodes = GJP.Snodes(); int s_node_coor = GJP.SnodeCoor(); int vol_4d_cb = dwf_lib_arg->vol_4d / 2; int ls_stride = 24 * vol_4d_cb; #endif IFloat *f_in; IFloat *f_out; IFloat *f_temp; IFloat *comm_buf = dwf_lib_arg->comm_buf; IFloat two_over_a5 = 2.0 * GJP.DwfA5Inv(); IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv(); // [1 + gamma_5] term (if dag=1 [1 - gamma_5] term) // // out[s] = [1 + gamma_5] in[s-1] //------------------------------------------------------------------ if (s_slice<0 || s_slice >=local_ls) ERR.General("","dwf_dslash_5_plus_slice","s_slice=%d local_ls=%d!\n",s_slice,local_ls); if(s_slice>0 ){ f_in = (IFloat *) in; f_out = (IFloat *) out; f_in += (s_slice-1)*ls_stride; f_out += (s_slice)*ls_stride; if(dag == 1){ f_in = f_in + 12; f_out = f_out + 12; } FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb); } // [1 + gamma_5] for lower boundary term (if dag=1 [1 - gamma_5] term) // If there's only one node along fifth direction, no communication // is necessary; Otherwise data from adjacent node in minus direction // will be needed. // If the lower boundary is the s=0 term // out[0] = - m_f * [1 + gamma_5] in[ls-1] // else, out[s] = [1 + gamma_5] in[s-1] // //------------------------------------------------------------------ if (s_slice == 0 ){ f_in = (IFloat *) in; f_in = f_in + (local_ls-1)*ls_stride; f_out = (IFloat *) out; if(dag == 1){ f_in = f_in + 12; f_out = f_out + 12; } f_temp = f_in; if (s_nodes > 1 ) { #ifdef USE_GETPLUS getMinusData(comm_buf, f_in, 24*vol_4d_cb, 4); f_temp = comm_buf; #else QMP_status_t send_status = QMP_wait(msghandle_down[0]); if (send_status != QMP_SUCCESS) QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status)); QMP_status_t recv_status = QMP_wait(msghandle_down[1]); if (recv_status != QMP_SUCCESS) QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status)); f_temp = rbuf_down; if(dag == 1) f_temp = f_temp + 12; #endif } if(s_node_coor == 0) { FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb); } else { FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb); } } // [1 - gamma_5] term (if dag=1 [1 + gamma_5] term) // // out[s] = [1 - gamma_5] in[s+1] //------------------------------------------------------------------ if(s_slice > 0 ){ f_in = (IFloat *) in; f_out = (IFloat *) out; f_in += (s_slice)*ls_stride; f_out += (s_slice-1)*ls_stride; if(dag == 0){ f_in = f_in + 12; f_out = f_out + 12; } FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb); } // [1 - gamma_5] for upper boundary term (if dag=1 [1 + gamma_5] term) // If there's only one node along fifth direction, no communication // is necessary; Otherwise data from adjacent node in minus direction // will be needed. 
// If the upper boundary is the s=ls term // out[ls-1] = - m_f * [1 - gamma_5] in[0] // else out[s] = [1 - gamma_5] in[s+1] // //------------------------------------------------------------------ if(s_slice == (local_ls-1) ){ f_in = (IFloat *) in; f_out = (IFloat *) out; if(dag == 0){ f_in = f_in + 12; f_out = f_out + 12; } f_out = f_out + (local_ls-1)*ls_stride; f_temp = f_in; if (s_nodes > 1 ) { #ifdef USE_GETPLUS getPlusData(comm_buf, f_in, 24*vol_4d_cb, 4); f_temp = comm_buf; #else QMP_status_t send_status = QMP_wait(msghandle_up[0]); if (send_status != QMP_SUCCESS) QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status)); QMP_status_t recv_status = QMP_wait(msghandle_up[1]); if (recv_status != QMP_SUCCESS) QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status)); f_temp = rbuf_up; if(dag == 0) f_temp = f_temp + 12; #endif } if(s_node_coor == s_nodes - 1) { FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb); } else { FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb); } } // DiracOp::CGflops+=2*2*vol_4d_cb*local_ls*12; }
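/* The QMP branch here only waits on msghandle_up/msghandle_down; the
 * handles are declared and started elsewhere.  That is the usual QMP
 * pattern for fixed communication buffers: declare once at setup, then
 * QMP_start()/QMP_wait() on every application of the operator, and free
 * only at teardown.  A minimal sketch of that life cycle; sbuf, rbuf,
 * nbytes and axis are hypothetical stand-ins for the static buffers used
 * by the dwf code. */
#include <qmp.h>

static QMP_msgmem_t    bdry_mm[2];
static QMP_msghandle_t bdry_mh[2];

static void boundary_init(void *sbuf, void *rbuf, size_t nbytes, int axis)
{
  bdry_mm[0] = QMP_declare_msgmem(sbuf, nbytes);
  bdry_mm[1] = QMP_declare_msgmem(rbuf, nbytes);
  bdry_mh[0] = QMP_declare_send_relative(bdry_mm[0], axis, +1, 0);    /* send forward  */
  bdry_mh[1] = QMP_declare_receive_relative(bdry_mm[1], axis, -1, 0); /* recv backward */
}

static void boundary_exchange(void)        /* called every operator application */
{
  QMP_start(bdry_mh[0]);
  QMP_start(bdry_mh[1]);
  for (int i = 0; i < 2; i++) {
    QMP_status_t status = QMP_wait(bdry_mh[i]);
    if (status != QMP_SUCCESS)
      QMP_error("boundary_exchange: %s\n", QMP_error_string(status));
  }
}

static void boundary_fini(void)
{
  QMP_free_msghandle(bdry_mh[0]);
  QMP_free_msghandle(bdry_mh[1]);
  QMP_free_msgmem(bdry_mm[0]);
  QMP_free_msgmem(bdry_mm[1]);
}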
void make_shift_tables(int bound[2][4][4], halfspinor_array* chi1, halfspinor_array* chi2, halfspinor_array* recv_bufs[2][4], halfspinor_array* send_bufs[2][4], void (*QDP_getSiteCoords)(int coord[], int node, int linearsite), int (*QDP_getLinearSiteIndex)(const int coord[]), int (*QDP_getNodeNumber)(const int coord[])) { volatile int dir,i; const int my_node = QMP_get_node_number(); int coord[4]; int gcoord[4]; int gcoord2[4]; int linear; int **shift_table; int x,y,z,t; int *subgrid_size = getSubgridSize(); int mu; int offset; int cb; const int *node_coord = QMP_get_logical_coordinates(); int p; int site, index; InvTab4 *xinvtab; InvTab4 *invtab; int qdp_index; int my_index; int num; int offsite_found; /* Setup the subgrid volume for ever after */ subgrid_vol = 1; for(i=0; i < getNumDim(); ++i) { subgrid_vol *= getSubgridSize()[i]; } /* Get the checkerboard size for ever after */ subgrid_vol_cb = subgrid_vol / 2; /* Now I want to build the site table */ /* I want it cache line aligned? */ xsite_table = (int *)malloc(sizeof(int)*subgrid_vol+63L); if(xsite_table == 0x0 ) { QMP_error("Couldnt allocate site table"); QMP_abort(1); } site_table = (int *)((((ptrdiff_t)(xsite_table))+63L)&(-64L)); xinvtab = (InvTab4 *)malloc(sizeof(InvTab4)*subgrid_vol + 63L); if(xinvtab == 0x0 ) { QMP_error("Couldnt allocate site table"); QMP_abort(1); } invtab = (InvTab4 *)((((ptrdiff_t)(xinvtab))+63L)&(-64L)); /* Inversity of functions check: Check that myLinearSiteIndex3D is in fact the inverse of mySiteCoords3D, and that QDP_getSiteCoords is the inverse of QDP_linearSiteIndex() */ for(p=0; p < 2; p++) { for(site=0; site < subgrid_vol_cb; site++) { /* Linear site index */ my_index = site + subgrid_vol_cb*p; QDP_getSiteCoords(gcoord, my_node, my_index); linear=QDP_getLinearSiteIndex(gcoord); if( linear != my_index ) { printf("P%d cb=%d site=%d : QDP_getSiteCoords not inverse of QDP_getLinearSiteIndex(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear); } mySiteCoords4D(gcoord, my_node, my_index); linear=myLinearSiteIndex4D(gcoord); if( linear != my_index ) { printf("P%d cb=%d site=%d : mySiteCoords3D not inverse of myLinearSiteIndex3D(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear); } } } /* Loop through sites - you can choose your path below */ /* This is a checkerboarded order which is identical hopefully to QDP++'s rb2 subset when QDP++ is in a CB2 layout */ for(p=0; p < 2; p++) { for(t=0; t < subgrid_size[3]; t++) { for(z=0; z < subgrid_size[2]; z++) { for(y=0; y < subgrid_size[1]; y++) { for(x=0; x < subgrid_size[0]/2; x++) { coord[0] = 2*x + p; coord[1] = y; coord[2] = z; coord[3] = t; /* Make global */ for(i=0; i < 4; i++) { coord[i] += subgrid_size[i]*node_coord[i]; } /* Index of coordinate -- NB this is not lexicographic but takes into account checkerboarding in QDP++ */ qdp_index = QDP_getLinearSiteIndex(coord); /* Index of coordinate in my layout. 
-- NB this is not lexicographic but takes into account my 3D checkerbaording */ my_index = myLinearSiteIndex4D(coord); site_table[my_index] = qdp_index; cb=parity(coord); linear = my_index%subgrid_vol_cb; invtab[qdp_index].cb=cb; invtab[qdp_index].linearcb=linear; } } } } } /* Site table transitivity check: for each site, convert to index in cb3d, convert to qdp index convert qdp_index to coordinate convert coordinate to back index in cb3d Check that your cb3d at the end is the same as you started with */ for(p=0; p < 2; p++) { for(site=0; site < subgrid_vol_cb; site++) { /* My local index */ my_index = site + subgrid_vol_cb*p; /* Convert to QDP index */ qdp_index = site_table[ my_index ]; /* Switch QDP index to coordinates */ QDP_getSiteCoords(gcoord, my_node,qdp_index); /* Convert back to cb3d index */ linear = myLinearSiteIndex4D(gcoord); /* Check new cb,cbsite index matches the old cb index */ if (linear != my_index) { printf("P%d The Circle is broken. My index=%d qdp_index=%d coords=%d,%d,%d,%d linear(=my_index?)=%d\n", my_node, my_index, qdp_index, gcoord[0],gcoord[1],gcoord[2],gcoord[3],linear); } } } /* Consistency check 2: Test mySiteCoords 3D for all 3d cb,cb3index convert to cb3d linear index (my_index) convert to qdp_index (lookup in site table) Now convert qdp_index and my_index both to coordinates. They should produce the same coordinates */ for(p=0; p < 2; p++) { for(site=0; site < subgrid_vol_cb; site++) { /* My local index */ my_index = site + subgrid_vol_cb*p; mySiteCoords4D(gcoord, my_node, my_index); qdp_index = site_table[ my_index ]; QDP_getSiteCoords(gcoord2, my_node,qdp_index); for(mu=0 ; mu < 4; mu++) { if( gcoord2[mu] != gcoord[mu] ) { printf("P%d: my_index=%d qdp_index=%d mySiteCoords=(%d,%d,%d,%d) QDPsiteCoords=(%d,%d,%d,%d)\n", my_node, my_index, qdp_index, gcoord[0], gcoord[1], gcoord[2], gcoord[3], gcoord2[0], gcoord2[1], gcoord2[2], gcoord2[3]); continue; } } } } /* Allocate the shift table */ /* The structure is as follows: There are 4 shift tables in order: [ Table 1 | Table 2 | Table 3 | Table 4 ] Table 1: decomp_scatter_index[mu][site] Table 2: decomp_hvv_scatter_index[mu][site] Table 3: recons_mvv_gather_index[mu][site] Table 4: recons_gather_index[mu][site] */ /* This 4 is for the 4 tables: Table 1-4*/ if ((shift_table = (int **)malloc(4*sizeof(int*))) == 0 ) { QMP_error("init_wnxtsu3dslash: could not initialize shift_table"); QMP_abort(1); } for(i=0; i < 4; i++) { /* This 4 is for the 4 comms dierctions: */ if ((shift_table[i] = (int *)malloc(4*subgrid_vol*sizeof(int))) == 0) { QMP_error("init_wnxtsu3dslash: could not initialize shift_table"); QMP_abort(1); } } /* Initialize the boundary counters */ for(cb=0; cb < 2; cb++) { for(dir=0; dir < 4; dir++) { bound[cb][0][dir] = 0; bound[cb][1][dir] = 0; bound[cb][2][dir] = 0; bound[cb][3][dir] = 0; } } for(cb=0; cb < 2; cb++) { for(site=0; site < subgrid_vol_cb; ++site) { index = cb*subgrid_vol_cb + site; /* Fetch site from site table */ qdp_index = site_table[index]; /* Get its coords */ QDP_getSiteCoords(coord, my_node, qdp_index); /* Loop over directions building up shift tables */ for(dir=0; dir < 4; dir++) { int fcoord[4], bcoord[4]; int fnode, bnode; int blinear, flinear; /* Backwards displacement*/ offs(bcoord, coord, dir, -1); bnode = QDP_getNodeNumber(bcoord); blinear = QDP_getLinearSiteIndex(bcoord); /* Forward displacement */ offs(fcoord, coord, dir, +1); fnode = QDP_getNodeNumber(fcoord); flinear = QDP_getLinearSiteIndex(fcoord); /* Scatter: decomp_{plus,minus} */ /* Operation: 
a^F(shift(x,type=0),dir) <- decomp(psi(x),dir) */ /* Send backwards - also called a receive from forward */ if (bnode != my_node) { /* Offnode */ /* Append to Tail 1, increase boundary count */ /* This is the correct code */ shift_table[DECOMP_SCATTER][dir+4*index] = subgrid_vol_cb + bound[1-cb][DECOMP_SCATTER][dir]; bound[1-cb][DECOMP_SCATTER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup */ shift_table[DECOMP_SCATTER][dir+4*index] = invtab[blinear].linearcb; } /* Scatter: decomp_hvv_{plus,minus} */ /* Operation: a^B(shift(x,type=1),dir) <- U^dag(x,dir)*decomp(psi(x),dir) */ /* Send forwards - also called a receive from backward */ if (fnode != my_node) { /* Offnode */ /* Append to Tail 1, increase boundary count */ shift_table[DECOMP_HVV_SCATTER][dir+4*index] = subgrid_vol_cb + bound[1-cb][DECOMP_HVV_SCATTER][dir]; bound[1-cb][DECOMP_HVV_SCATTER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup */ shift_table[DECOMP_HVV_SCATTER][dir+4*index] /* Onnode */ = invtab[flinear].linearcb ; } /* Gather: mvv_recons_{plus,minus} */ /* Operation: chi(x) <- \sum_dir U(x,dir)*a^F(shift(x,type=2),dir) */ /* Receive from forward */ if (fnode != my_node) { /* Offnode */ /* Append to Tail 2, increase boundary count */ shift_table[RECONS_MVV_GATHER][dir+4*index] = 2*subgrid_vol_cb + (bound[cb][RECONS_MVV_GATHER][dir]); bound[cb][RECONS_MVV_GATHER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup. Note this is a recons post shift, so the linear coordinate to invert is mine rather than the neighbours */ shift_table[RECONS_MVV_GATHER][dir+4*index] = invtab[qdp_index].linearcb ; } /* Gather: recons_{plus,minus} */ /* Operation: chi(x) += \sum_dir recons(a^B(shift(x,type=3),dir),dir) */ /* Receive from backward */ if (bnode != my_node) { shift_table[RECONS_GATHER][dir+4*index] = 2*subgrid_vol_cb + bound[cb][RECONS_GATHER][dir]; bound[cb][RECONS_GATHER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup. Note this is a recons post shift, so the linear coordinate to invert is mine rather than the neighbours */ shift_table[RECONS_GATHER][dir+4*index] = invtab[qdp_index].linearcb ; } } } } /* Sanity check - make sure the sending and receiving counters match */ for(cb=0; cb < 2; cb++) { for(dir=0; dir < 4; dir++) { /* Sanity 1: Must have same number of boundary sites on each cb for a given operation */ for(i = 0; i < 4; i++) { if (bound[1-cb][i][dir] != bound[cb][i][dir]) { QMP_error("SSE Wilson dslash - make_shift_tables: type 0 diff. cb send/recv counts do not match: %d %d", bound[1-cb][i][dir],bound[cb][i][dir]); QMP_abort(1); } } } } /* Now I want to make the offset table into the half spinor temporaries */ /* The half spinor temporaries will look like this: dir=0 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ] dir=1 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ] ... And each of these blocks of half spinors will be sized to vol_cb sites (ie half volume only). The shift_table() for a given site and direction indexes into one of these lines. So the offset table essentially delineates which line one picks, by adding an offset of 3*subgrid_vol_cb*dir To the shift. The result from offset table, can be used directly as a pointer displacement on the temporaries. Perhaps the best way to condsider this is to consider a value of shift_table[type][dir/site] that lands in the body. 
The shift table merely gives me a site index. But the data needs to be different for each direction for that site index. Hence we need to replicate the body, for each dir. The 3xsubgrid_vol_cb is just there to take care of the buffers. Or another way to think of it is that there is a 'body element' index specified by the shift table lookup, and that dir is just the slowest varying index. */ /* 4 dims, 4 types, rest of the magic is to align the thingie */ xoffset_table = (halfspinor_array **)malloc(4*4*subgrid_vol*sizeof(halfspinor_array*)+63L); if( xoffset_table == 0 ) { QMP_error("init_wnxtsu3dslash: could not initialize offset_table[i]"); QMP_abort(1); } /* This is the bit what aligns straight from AMD Manual */ offset_table = (halfspinor_array**)((((ptrdiff_t)(xoffset_table)) + 63L) & (-64L)); /* Walk through the shift_table and remap the offsets into actual pointers */ /* DECOMP_SCATTER */ num=0; for(dir =0; dir < Nd; dir++) { /* Loop through all the sites. Remap the offsets either to local arrays or pointers */ offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[DECOMP_SCATTER][dir+4*site]; if( offset >= subgrid_vol_cb ) { /* Found an offsite guy. It's address must be to the send back buffer */ /* send to back index = recv from forward index = 0 */ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] = send_bufs[0][num]+(offset - subgrid_vol_cb); } else { /* Guy is onsite: This is DECOMP_SCATTER so offset to chi1 */ offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] = chi1+shift_table[DECOMP_SCATTER][dir+4*site]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { /* If we found an offsite guy, next direction has to go into the next dir part of the send bufs */ num++; } } /* DECOMP_HVV_SCATTER */ /* Restart num-s */ num=0; for(dir =0; dir <Nd; dir++) { offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[DECOMP_HVV_SCATTER][dir+4*site]; if( offset >= subgrid_vol_cb ) { /* Found an offsite guy. It's address must be to the send forw buffer */ /* send to forward / receive from backward index = 1 */ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] = send_bufs[1][num]+(offset - subgrid_vol_cb); } else { /* Guy is onsite. This is DECOMP_HVV_SCATTER so offset to chi2 */ offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] = chi2+shift_table[DECOMP_HVV_SCATTER][dir+4*site ]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { num++; } } /* RECONS_MVV_GATHER */ num=0; for(dir =0; dir <Nd; dir++) { offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[RECONS_MVV_GATHER][dir+4*site]; if( offset >= 2*subgrid_vol_cb ) { /* Found an offsite guy. It's address must be to the recv from front buffer */ /* recv_from front index = send to back index = 0 */ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] = recv_bufs[0][num]+(offset - 2*subgrid_vol_cb); } else { /* Guy is onsite */ /* This is RECONS_MVV_GATHER so offset with respect to chi1 */ offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] = chi1+shift_table[RECONS_MVV_GATHER][dir+4*site ]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { num++; } } /* RECONS_GATHER */ num=0; for(dir =0; dir <Nd; dir++) { offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[RECONS_GATHER][dir+4*site]; if( offset >= 2*subgrid_vol_cb ) { /* Found an offsite guy. 
It's address must be to the recv from back buffer */ /* receive from back = send to forward index = 1*/ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER) ] = recv_bufs[1][num]+(offset - 2*subgrid_vol_cb); } else { /* Guy is onsite */ /* This is RECONS_GATHER so offset with respect to chi2 */ offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER ) ] = chi2+shift_table[RECONS_GATHER][dir+4*site ]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { num++; } } /* Free shift table - it is no longer needed. We deal solely with offsets */ for(i=0; i < 4; i++) { free( (shift_table)[i] ); } free( shift_table ); free( xinvtab ); }
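/* make_shift_tables() repeatedly over-allocates by 63 bytes and rounds the
 * pointer up to the next 64-byte boundary, ((p + 63) & -64).  Isolated,
 * the idiom looks like the sketch below; note that it is the original,
 * unaligned pointer that must be kept for free().  The helper name is
 * illustrative and not part of the dslash code. */
#include <stdlib.h>
#include <stdint.h>

static void *alloc_aligned64(size_t nbytes, void **raw_out)
{
  void *raw = malloc(nbytes + 63);
  if (raw == NULL) return NULL;
  uintptr_t aligned = ((uintptr_t)raw + 63u) & ~(uintptr_t)63u;
  *raw_out = raw;                 /* pass this to free(), not the aligned pointer */
  return (void *)aligned;
}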