/**
 * Time nloops rounds in which every channel has a receive and a send in
 * flight at the same time, and report the result on node 0.
 *
 * Each round starts all receives, then starts all sends, then waits on all
 * sends and finally on all receives.  Failures are reported with QMP_printf
 * and the loop carries on.  smem and rmem are not touched here.
 *
 * The elapsed time is treated as milliseconds (per the original comment on
 * get_current_time's result) and the payload as 4 bytes per element.
 */
static void
test_simultaneous_send (int** smem, int** rmem,
                        QMP_msghandle_t* sendh, QMP_msghandle_t* recvh,
                        struct perf_argv* pargv)
{
  const int channels = pargv->num_channels;
  const int rounds   = pargv->loops;
  const int words    = pargv->size;
  const int dims     = pargv->ndims;
  double t_start, t_stop;
  QMP_status_t status;
  int round, ch;

  /* line every node up before the clock starts */
  QMP_barrier();
  t_start = get_current_time ();

  for (round = 0; round < rounds; round++) {
    /* post the receives first ... */
    for (ch = 0; ch < channels; ch++) {
      status = QMP_start (recvh[ch]);
      if (status != QMP_SUCCESS)
        QMP_printf ("Start receiving failed: %s\n", QMP_error_string(status));
    }

    /* ... then fire the sends */
    for (ch = 0; ch < channels; ch++) {
      status = QMP_start (sendh[ch]);
      if (status != QMP_SUCCESS)
        QMP_printf ("Start sending failed: %s\n", QMP_error_string(status));
    }

    /* drain sends before receives, as the original did */
    for (ch = 0; ch < channels; ch++) {
      if (QMP_wait (sendh[ch]) != QMP_SUCCESS)
        QMP_printf ("Error in sending %d\n", ch );
    }

    for (ch = 0; ch < channels; ch++) {
      if (QMP_wait (recvh[ch]) != QMP_SUCCESS)
        QMP_printf ("Error in receiving %d\n", ch);
    }
  }

  t_stop = get_current_time ();

  if (QMP_get_node_number() == 0) {
    /* in milli seconds */
    const double elapsed = (t_stop - t_start);
    const double bandwidth =
      words/(double)1000.0 * 4.0 * rounds * channels*dims/elapsed;

    QMP_printf ("Simultaneous send B/W for datasize %d is %g (MB/s)",
                words * 4, bandwidth);
    QMP_printf ("Time difference is %lf micro seconds",
                elapsed*1000.0/rounds);
  }

  QMP_barrier();
}
/*
 * qx(op_CmB): run the qx(do_CmB) kernel on r_x while a halo exchange on
 * xy->handle is in flight.
 *
 * Sequence:
 *   1. qx(boundary) is called with the up/down projection routines on a_y
 *      (presumably to fill the send buffers -- confirm against its
 *      definition); it is given `flops` so it can add its own count.
 *   2. If xy->h_valid is set, the exchange on xy->handle is started.
 *   3. qx(do_CmB) is run with arguments (0, xy->body_size) and a NULL
 *      receive buffer, i.e. the part that needs no remote data.
 *   4. If xy->h_valid is set, the exchange is waited for.
 *   5. qx(do_CmB) is run with arguments (xy->body_size, xy->face_size) and
 *      xy->receive_buf.
 *
 * Outputs: *flops is increased by the value returned from each kernel call;
 * *sent and *received are increased by xy->total_send / xy->total_receive.
 *
 * NOTE(review): the return values of QMP_start and QMP_wait are ignored.
 */
void qx(op_CmB)(struct FermionX *r_x,
                struct eo_lattice *xy,
                const struct SUn *U,
                const struct FermionX *a_x,
                const struct FermionX *a_y,
                long long *flops,
                long long *sent,
                long long *received)
{
  qx(boundary)(xy, qx(up_project_n), qx(down_project_n), U, a_y, flops);

  /* start communication, then work on sites that need no remote data */
  if (xy->h_valid)
    QMP_start(xy->handle);

  *flops += qx(do_CmB)(r_x, 0, xy->body_size, xy->neighbor, U, a_x, a_y, NULL);

  /* the remaining sites read xy->receive_buf, so the exchange must be done */
  if (xy->h_valid)
    QMP_wait(xy->handle);

  *flops += qx(do_CmB)(r_x, xy->body_size, xy->face_size, xy->neighbor, U, a_x, a_y, xy->receive_buf);

  *sent += xy->total_send;
  *received += xy->total_receive;
}
/*
 * Blocking receive of `size` bytes from node `fromnode` into `buf`.
 *
 * Declares a QMP message memory over buf, posts a receive from `fromnode`,
 * waits for it to complete and releases the QMP objects again.
 *
 * Returns 0 on success.  Returns 1 if the message memory or handle could
 * not be declared, or if QMP_start / QMP_wait did not report QMP_SUCCESS
 * (previously every failure was ignored and 0 was returned regardless).
 */
int DML_get_bytes(char *buf, size_t size, int fromnode){
  QMP_msgmem_t mm;
  QMP_msghandle_t mh;
  int status = 1;

  mm = QMP_declare_msgmem(buf, size);
  if (mm == NULL)
    return 1;

  mh = QMP_declare_receive_from(mm, fromnode, 0);
  if (mh != NULL) {
    /* only wait on a receive that was actually started */
    if (QMP_start(mh) == QMP_SUCCESS && QMP_wait(mh) == QMP_SUCCESS)
      status = 0;
    QMP_free_msghandle(mh);
  }

  QMP_free_msgmem(mm);
  return status;
}
/*
 * Blocking send of `size` bytes from `buf` to node `tonode`.
 *
 * Declares a QMP message memory over buf, posts a send to `tonode`, waits
 * for it to complete and releases the QMP objects again.
 *
 * Returns 0 on success.  Returns 1 if the message memory or handle could
 * not be declared, or if QMP_start / QMP_wait did not report QMP_SUCCESS
 * (previously every failure was ignored and 0 was returned regardless).
 */
int DML_send_bytes(char *buf, size_t size, int tonode){
  QMP_msgmem_t mm;
  QMP_msghandle_t mh;
  int status = 1;

  mm = QMP_declare_msgmem(buf, size);
  if (mm == NULL)
    return 1;

  mh = QMP_declare_send_to(mm, tonode, 0);
  if (mh != NULL) {
    /* only wait on a send that was actually started */
    if (QMP_start(mh) == QMP_SUCCESS && QMP_wait(mh) == QMP_SUCCESS)
      status = 0;
    QMP_free_msghandle(mh);
  }

  QMP_free_msgmem(mm);
  return status;
}
void stupid_broadcast(void *send_buf, int count) { int node; int num_nodes = QMP_get_number_of_nodes(); QMP_msgmem_t request_msg = QMP_declare_msgmem(send_buf, count); QMP_msghandle_t request_mh; // Send to each node for(node=1; node < num_nodes; ++node) { if (QMP_get_node_number() == node) { request_mh = QMP_declare_receive_from(request_msg, 0, 0); if (QMP_start(request_mh) != QMP_SUCCESS) QMP_abort_string(1, "recvFromWait failed\n"); QMP_wait(request_mh); QMP_free_msghandle(request_mh); } if (QMP_is_primary_node()) { request_mh = QMP_declare_send_to(request_msg, node, 0); if (QMP_start(request_mh) != QMP_SUCCESS) QMP_abort_string(1, "sendToWait failed\n"); QMP_wait(request_mh); QMP_free_msghandle(request_mh); } } QMP_free_msgmem(request_msg); }
CPS_START_NAMESPACE #ifndef USE_QMP #define QMP #endif void GlobalDataShift::Shift(int direction, int n_disp){ if (n_disp==0) return; SCUDir s_dir,r_dir; int a_disp; void *send_p,*recv_p,*temp_p; #ifndef USE_QMP if (n_disp>0){ a_disp = n_disp; s_dir = gjp_scu_dir[i*2]; r_dir = gjp_scu_dir[i*2+1]; } else { a_disp = -n_disp; s_dir = gjp_scu_dir[i*2+1]; r_dir = gjp_scu_dir[i*2]; } #else // int direction = i; int sflag; if (n_disp > 0) sflag = +1; else sflag = -1; #endif send_p = addr; recv_p = temp_buf; #ifndef USE_QMP SCUDirArgIR Send(send_p,s_dir,SCU_SEND,data_len); SCUDirArgIR Recv(recv_p,r_dir,SCU_REC,data_len); #else QMP_msgmem_t msgmem[2]; QMP_msghandle_t msghandle[2]; QMP_status_t status; QMP_msghandle_t multiple; #endif // sys_cacheflush(0); for(int i = 0;i<a_disp-1;i++){ #ifndef USE_QMP Send.StartTrans();Recv.StartTrans(); Send.TransComplete();Recv.TransComplete(); #else msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len); msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len); msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0); msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0); multiple = QMP_declare_multiple(msghandle, 2); QMP_start(multiple); status = QMP_wait(multiple); if (status != QMP_SUCCESS) QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status)); QMP_free_msghandle(multiple); QMP_free_msgmem(msgmem[0]); QMP_free_msgmem(msgmem[1]); #endif temp_p = send_p; send_p = recv_p; recv_p = temp_p; #ifndef USE_QMP Send.Addr(send_p); Recv.Addr(recv_p); #endif } #ifndef USE_QMP Send.StartTrans();Recv.StartTrans(); Send.TransComplete();Recv.TransComplete(); #else msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len); msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len); msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0); msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0); multiple = QMP_declare_multiple(msghandle, 2); 
QMP_start(multiple); status = QMP_wait(multiple); if (status != QMP_SUCCESS) QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status)); QMP_free_msghandle(multiple); QMP_free_msgmem(msgmem[0]); QMP_free_msgmem(msgmem[1]); #endif if (recv_p != addr) memcpy(addr,recv_p,data_len); }
/**
 * Wait on the QMP message handle stored in `mh`.
 *
 * Calls QMP_wait(mh->handle) and hands the returned status to QMP_CHECK.
 * NOTE(review): QMP_CHECK is defined elsewhere; presumably it reports or
 * aborts on a non-success status -- confirm against its definition.
 *
 * @param mh  Message handle wrapper; must be non-null (it is dereferenced
 *            unconditionally).
 */
void comm_wait(MsgHandle *mh)
{
  QMP_CHECK( QMP_wait(mh->handle) );
}
void PT::mat_cb_norm(int n, IFloat **mout, IFloat **min, const int *dir, int parity, IFloat * gauge) { //List of the different directions int wire[MAX_DIR]; int i; // printf("PT::mat_cb_norm\n"); QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vec_cb_norm", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vec_cb_norm", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t multiple; static int call_num = 0; int vlen = VECT_LEN; int vlen2 = VECT_LEN; call_num++; //Name our function char *fname="pt_mat_cb()"; // VRB.Func("",fname); //Set the transfer directions //If wire[i] is even, then we have communication in the negative direction //If wire[i] is odd, then we have communication in the positive direction for(i=0;i<n;i++) wire[i]=dir[i]; #ifdef PROFILE Float dtime = - dclock(); #endif int non_local_dir=0; //#pragma omp parallel default(shared) { //If wire[i] is odd, then we have parallel transport in the //positive direction. 
In this case, multiplication by the link matrix is //done before the field is transferred over to the adjacent node // //If we have transfer in the negative T direction (wire[i] = 6), then //we have to copy the appropriate fields to a send buffer //#pragma omp for for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(wire[i]%2) { if(conjugated) pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge); else pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge); } else if((wire[i] == 6)) { for(int j = 0; j < non_local_chi_cb[6];j++) memcpy(snd_buf_t_cb + j*GAUGE_LEN,min[i] + 3 * *(Toffset[parity]+j)*3,GAUGE_LEN*sizeof(IFloat)); } } } //#pragma omp barrier //#pragma omp master { for(i=0;i<n;i++) if(!local[wire[i]/2]) { //Calculate the starting address for the data to be sent IFloat *addr = min[i] + GAUGE_LEN * offset_cb[wire[i]]; msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); //Initialize the msg_mem for sends if(wire[i]%2) msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_cb[wire[i]/2], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); else if(wire[i] == 6) msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_t_cb, 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); else msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen_cb[wire[i]]), numblk_cb[wire[i]], (ptrdiff_t)(3*stride_cb[wire[i]]+3*blklen_cb[wire[i]])); msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0); non_local_dir++; } if(non_local_dir) { multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir); QMP_start(multiple); } } 
//#pragma omp master { //Do local calculations //#pragma omp for for(i=0;i<n;i++) { if((wire[i]%2 && conjugated) || ((wire[i]%2 == 0) && (conjugated == 0))) pt_cmm_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge); else pt_cmm_dag_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge); } //#pragma omp barrier //#pragma omp master { if(non_local_dir) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*non_local_dir; i++) QMP_free_msgmem(msg_mem_p[i]); Free(msg_handle_p); Free(msg_mem_p); } } //#pragma omp master { //If wire[i] is even, then we have transport in the negative direction //In this case, the vector field is multiplied by the SU(3) link matrix //after all communication is complete IFloat *fp0,*fp1; //#pragma omp for for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(!(wire[i]%2)) { if(conjugated) pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge); else pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge); } //Otherwise we have parallel transport in the positive direction. 
//In this case, the received data has already been pre-multiplied //All we need to do is to put the transported field in the correct place else { //int destination, source; //Place the data in the receive buffer into the result vector for(int s=0;s<non_local_chi_cb[wire[i]];s++) { //source = uc_nl_cb[parity][wire[i]][s].src; fp0 = (IFloat *)((long)rcv_buf[wire[i]]+3*uc_nl_cb[parity][wire[i]][s].src); //destination = uc_nl_cb[parity][wire[i]][s].dest; fp1 = (IFloat *)(mout[i]+3*uc_nl_cb[parity][wire[i]][s].dest); memcpy(fp1,fp0,GAUGE_LEN*sizeof(IFloat)); } } } } } //#pragma omp parallel #ifdef PROFILE dtime +=dclock(); print_flops("",fname,99*vol*n,dtime); #endif // ParTrans::PTflops +=99*n*vol; }
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){ int wire[MAX_DIR]; int i; QMP_msgmem_t msg_mem_p[2*MAX_DIR]; QMP_msghandle_t msg_handle_p[2*MAX_DIR]; QMP_msghandle_t multiple; static double setup=0.,qmp=0.,localt=0.,nonlocal=0.; static int call_num = 0; call_num++; // char *fname="pt_mat()"; // VRB.Func("",fname); // if (call_num%100==1) printf("PT:mat()\n"); for(i=0;i<n;i++) wire[i] = dir[i]; #ifdef PROFILE Float dtime2 = - dclock(); #endif double dtime = -dclock(); int non_local_dir=0; for(i=0;i<n;i++) if (!local[wire[i]/2]) { //Calculate the address for transfer in a particular direction Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]); msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]])); msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0); non_local_dir++; } if (call_num==1 && !QMP_get_node_number()) printf("non_local_dir=%d\n",non_local_dir); if(non_local_dir) { multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir); QMP_start(multiple); } dtime += dclock(); setup +=dtime; dtime = -dclock(); int if_print = 0; // if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1; #define USE_TEST2 #ifdef USE_TEST2 //assume nt > n! 
static char *cname="mat()"; #pragma omp parallel default(shared) { int iam,nt,ipoints,istart,offset; iam = omp_get_thread_num(); nt = omp_get_num_threads(); int nt_dir = nt/n; int n_t = iam/nt_dir; int i_t = iam%nt_dir; if (n_t >= n ){ n_t = n-1; i_t = iam - (n-1)*nt_dir; nt_dir = nt -(n-1)*nt_dir; } int w_t = wire[n_t]; ipoints = (local_chi[w_t]/2)/nt_dir; offset = ipoints*i_t; if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset; if ( if_print ) printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset); //Interleaving of local computation of matrix multiplication partrans_cmm_agg((uc_l[w_t]+offset*2),min[n_t],mout[n_t],ipoints); if ( if_print ) printf("thread %d of %d done\n",iam,nt); } #else { //Interleaving of local computation of matrix multiplication #pragma omp parallel for default(shared) for(i=0;i<n;i++){ partrans_cmm_agg(uc_l[wire[i]],min[i],mout[i],local_chi[wire[i]]/2); } } #endif dtime += dclock(); localt +=dtime; dtime = -dclock(); //#pragma omp barrier //#pragma omp master { if(non_local_dir) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*non_local_dir; i++) QMP_free_msgmem(msg_mem_p[i]); // Free(msg_handle_p); // Free(msg_mem_p); } } //#pragma omp master { dtime += dclock(); qmp +=dtime; dtime = -dclock(); //Do non-local computations #ifdef USE_TEST2 //assume nt > n! 
#pragma omp parallel default(shared) { int iam,nt,ipoints,istart,offset; iam = omp_get_thread_num(); nt = omp_get_num_threads(); int nt_dir = nt/n; int n_t = iam/nt_dir; int i_t = iam%nt_dir; if (n_t >= n ){ n_t = n-1; i_t = iam - (n-1)*nt_dir; nt_dir = nt -(n-1)*nt_dir; } int w_t = wire[n_t]; ipoints = (non_local_chi[w_t]/2)/nt_dir; offset = ipoints*i_t; if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset; if ( if_print ) printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset); //Non-local computation if (ipoints>0) partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints); if ( if_print ) printf("thread %d of %d done\n",iam,nt); } #else { #pragma omp parallel for for(i=0;i<n;i++) if (!local[wire[i]/2]) { #ifdef USE_OMP if (call_num%10000==1 && !QMP_get_node_number() ) printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i); #endif partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2); } }//#pragma omp parallel #endif dtime += dclock(); nonlocal +=dtime; if (call_num%100==0){ static char *cname="mat()"; if (!QMP_get_node_number() ) { print_flops("mat():local*100",0,localt); print_flops("mat():nonlocal*100",0,nonlocal); print_flops("mat():qmp*100",0,qmp); print_flops("mat():setup*100",0,setup); } localt=nonlocal=qmp=setup=0.; } #ifdef PROFILE dtime2 +=dclock(); print_flops("",fname,198*vol*n,dtime2); #endif // ParTrans::PTflops +=198*n*vol; }
/*! Computes sum[x] = vect2[x] vect[x + hop dir]^dagger where the sum is over n_vect vectors and the hop is in a forward direction. */ void PT::vvpd(IFloat **vect2, IFloat ***vect, int n_vect, const int *dir, int n_dir, int hop, IFloat **sum, int overwrite){ char *fname = "pt_vvpd()"; #if 1 // ERR.NotImplemented(cname,fname); QMP_error("%s""%s Not implemented\n"); #else // VRB.Func("",fname); int i, s, v; Float f = 2.0; int wire[MAX_DIR]; for(i=0;i<n_dir;i++) wire[i] = dir[i]; // from (x,y,z,t) to (t,x,y,z) QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msgmem_t *msg_mem_p2 = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t* msg_handle_p2 = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t multiple; //Setup communciation int comms=0; for(i=0;i<n_dir;i++) if( !local[wire[i]/2]) { if ( size[wire[i]/2] <hop) fprintf(stderr, "%s:size(%d) in direction %d is smaller than the hop(%d)\n", fname,size[wire[i]],wire[i],hop); comms++; } for(v=0; v<n_vect; v++){ if (v%2==0) { comms=0; for(i=0;i<n_dir;i++) if( !local[wire[i]/2]){ msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0); msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]])); msg_handle_p[2*comms+1] = QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0); comms++; } // Start communication if(comms) { multiple = QMP_declare_multiple(msg_handle_p, 2*comms); } if (comms) { 
QMP_start(multiple); QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*comms; i++) QMP_free_msgmem(msg_mem_p[i]); } } else { comms=0; for(i=0;i<n_dir;i++) if( !local[wire[i]/2]){ msg_mem_p2[2*comms] = QMP_declare_msgmem((void *)rcv_buf2[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_handle_p2[2*comms] = QMP_declare_receive_relative(msg_mem_p2[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0); msg_mem_p2[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]])); msg_handle_p2[2*comms+1] = QMP_declare_send_relative(msg_mem_p2[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0); comms++; } // Start communication if(comms) { multiple = QMP_declare_multiple(msg_handle_p2, 2*comms); } if (comms) { QMP_start(multiple); QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*comms; i++) QMP_free_msgmem(msg_mem_p2[i]); } } // Perform non-local calculation for previous v if (v>0) if (v==1 && overwrite==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else if (v%2==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } // Perform local calculation 
for current v if (v==0 && overwrite==1) { for(i=0; i<n_dir; i++) if((vol-hop*non_local_chi[wire[i]])>0) cross_over_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]); } else { for(i=0; i<n_dir; i++) if((vol-hop*non_local_chi[wire[i]])>0) cross_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]); } } if (v==1 && overwrite==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else if (v%2==1) { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } else { for(i=0; i<n_dir; i++) if(non_local_chi[wire[i]]>0) cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]], src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]); } #endif // ParTrans::PTflops += 90*n_vect*n_dir*vol; }
//! u[x] = v[x+dir] for n_dir forward or backward directions dir. void PT::shift_field(IFloat **v, const int *dir, int n_dir, int hop, IFloat **u){ int i, length; int wire[n_dir]; for (i=0; i<n_dir;i++) wire[i] = dir[i]; #ifdef USE_QMP QMP_msgmem_t msg_mem_p[20]; QMP_msghandle_t msg_handle_p[20]; QMP_msghandle_t multiple; #else SCUDirArgMulti SCUmulti; SCUDirArgIR *SCUarg_p[2*n_dir]; #endif int comms=0; for (i=0; i<n_dir; i++) if (!local[wire[i]/2]){ #ifndef USE_QMP SCUarg_p[2*comms] = SCUarg_mat[hop-1][2*wire[i]]; SCUarg_p[2*comms+1] = SCUarg_mat[hop-1][2*wire[i]+1]; SCUarg_p[2*comms+1]->Addr((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop))); #else msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)), (size_t)(3*hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]])); msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*comms+1] = QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0); #endif comms++; } #ifndef USE_QMP if (comms) SCUmulti.Init(SCUarg_p,2*comms); if (comms) SCUmulti.SlowStartTrans(); #else if(comms) { multiple = QMP_declare_multiple(msg_handle_p, 2*comms); QMP_start(multiple); } #endif // SCUmulti.TransComplete(); for (i=0; i<n_dir; i++) { length = vol-hop*non_local_chi[wire[i]]; copy_matrix(u[i],v[i],&length,dest_l[hop-1][wire[i]], src_l[hop-1][wire[i]]); } #ifndef USE_QMP if (comms) SCUmulti.TransComplete(); #else if(comms) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in shift_field: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*comms; i++) QMP_free_msgmem(msg_mem_p[i]); } #endif for (i=0; i<n_dir; i++) { length = 
hop*non_local_chi[wire[i]]; copy_matrix(u[i],(IFloat*)rcv_buf[wire[i]],&length, dest_nl[hop-1][wire[i]],src_nl[hop-1][wire[i]]); } }
/**
 * One-way blast test.
 *
 * A node with pargv->sender set starts and completes a send on every
 * channel, nloops times; every other node does the same with its receive
 * handles.  Each side times its own loop.  The sender side reports on
 * node 0 before an extra barrier, the receiver side reports on node 1
 * after an extra barrier.  smem and rmem are not used.
 */
static void
test_oneway (int** smem, int** rmem,
             QMP_msghandle_t* sendh, QMP_msghandle_t* recvh,
             struct perf_argv* pargv)
{
  const int channels = pargv->num_channels;
  const int rounds   = pargv->loops;
  const int words    = pargv->size;
  const int dims     = pargv->ndims;
  const QMP_bool_t sender = pargv->sender;

  /* Everything that differs between the two roles. */
  QMP_msghandle_t* const handles = sender ? sendh : recvh;
  const char* const start_msg = sender ? "Start sending failed: %s\n"
                                       : "Start receiving failed: %s\n";
  const char* const wait_msg  = sender ? "Error in sending %d\n"
                                       : "Error in receiving %d\n";
  const int reporter = sender ? 0 : 1;

  double t_start, t_stop, elapsed, bandwidth;
  QMP_status_t status;
  int round, ch;

  QMP_barrier();

  t_start = get_current_time ();
  for (round = 0; round < rounds; round++) {
    for (ch = 0; ch < channels; ch++) {
      status = QMP_start (handles[ch]);
      if (status != QMP_SUCCESS)
        QMP_printf (start_msg, QMP_error_string(status));
    }

    for (ch = 0; ch < channels; ch++) {
      if (QMP_wait (handles[ch]) != QMP_SUCCESS)
        QMP_printf (wait_msg, ch);
    }
  }
  t_stop = get_current_time ();

  /* In milli seconds */
  elapsed = (t_stop - t_start);
  bandwidth = words/(double)1000.0 * 4 * rounds*channels*dims/elapsed;

  /* receivers synchronise before reporting, senders after */
  if (!sender)
    QMP_barrier();

  if (QMP_get_node_number() == reporter) {
    QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)",
                words * 4, bandwidth);
    QMP_printf ("time is %lf micro seconds", elapsed*1000.0/rounds);
    fflush(stdout);
  }

  if (sender)
    QMP_barrier();

  QMP_barrier();
}
/**
 * Ping-pong test with payload verification.
 *
 * Each iteration refills every send buffer with the pattern
 * i + k*j + nc*nc and zeroes every receive buffer.  The sender then sends
 * on all channels and afterwards receives; the other node receives first
 * and then sends.  Finally every receive buffer is checked against the
 * same pattern and mismatches are printed.
 *
 * The timing covers the fill and verify work as well; every node prints
 * the result.
 */
static void
test_pingpong_verify (int** smem, int** rmem,
                      QMP_msghandle_t* sendh, QMP_msghandle_t* recvh,
                      struct perf_argv* pargv)
{
  const int channels = pargv->num_channels;
  const int rounds   = pargv->loops;
  const int words    = pargv->size;
  const int dims     = pargv->ndims;
  const QMP_bool_t sender = pargv->sender;

  double t_start, t_stop, elapsed, bandwidth;
  QMP_status_t status;
  int round, ch, w, phase;

  QMP_barrier();
  t_start = get_current_time ();

  for (round = 0; round < rounds; round++) {
    /* fresh pattern out, clean slate in */
    for (ch = 0; ch < channels; ch++) {
      for (w = 0; w < words; w++) {
        rmem[ch][w] = 0;
        smem[ch][w] = round + w * ch + channels*channels;
      }
    }

    /* phase 0 / phase 1: sender does send,recv -- the peer recv,send */
    for (phase = 0; phase < 2; phase++) {
      const int do_send = sender ? (phase == 0) : (phase == 1);

      if (do_send) {
        for (ch = 0; ch < channels; ch++) {
          status = QMP_start (sendh[ch]);
          if (status != QMP_SUCCESS)
            QMP_printf ("Start sending failed: %s\n", QMP_error_string(status));
        }
        for (ch = 0; ch < channels; ch++) {
          if (QMP_wait (sendh[ch]) != QMP_SUCCESS)
            QMP_printf ("Error in sending %d\n", ch );
        }
      }
      else {
        for (ch = 0; ch < channels; ch++) {
          status = QMP_start (recvh[ch]);
          if (status != QMP_SUCCESS)
            QMP_printf ("Start receiving failed: %s\n", QMP_error_string(status));
        }
        for (ch = 0; ch < channels; ch++) {
          if (QMP_wait (recvh[ch]) != QMP_SUCCESS)
            QMP_printf ("Error in receiving %d\n", ch);
        }
      }
    }

    /* verify memory */
    for (ch = 0; ch < channels; ch++) {
      for (w = 0; w < words; w++) {
        if (rmem[ch][w] != round + w * ch + channels* channels)
          QMP_printf ("Receiving memory error for memory %d %d %d\n",
                      ch, w, rmem[ch][w]);
      }
    }
  }

  t_stop = get_current_time ();

  /* In milli seconds */
  elapsed = (t_stop - t_start);
  bandwidth = 2 * words/(double)1000.0 * 4 * rounds*channels*dims/elapsed;

  QMP_printf ("Ping Pong Bandwidth for datasize %d is %g (MB/s)",
              words * 4, bandwidth);
  QMP_printf ("RTT/2 is %lf micro seconds", elapsed*1000.0/rounds/2);
}
void dwf_dslash_5_plus_slice(Vector *out, Vector *in, Float mass, int dag, Dwf *dwf_lib_arg, int s_slice) { int x; int s; // Initializations //------------------------------------------------------------------ #if 0 int local_ls = GJP.SnodeSites(); int s_nodes = GJP.Snodes(); int s_node_coor = GJP.SnodeCoor(); int vol_4d_cb = dwf_lib_arg->vol_4d / 2; int ls_stride = 24 * vol_4d_cb; #endif IFloat *f_in; IFloat *f_out; IFloat *f_temp; IFloat *comm_buf = dwf_lib_arg->comm_buf; IFloat two_over_a5 = 2.0 * GJP.DwfA5Inv(); IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv(); // [1 + gamma_5] term (if dag=1 [1 - gamma_5] term) // // out[s] = [1 + gamma_5] in[s-1] //------------------------------------------------------------------ if (s_slice<0 || s_slice >=local_ls) ERR.General("","dwf_dslash_5_plus_slice","s_slice=%d local_ls=%d!\n",s_slice,local_ls); if(s_slice>0 ){ f_in = (IFloat *) in; f_out = (IFloat *) out; f_in += (s_slice-1)*ls_stride; f_out += (s_slice)*ls_stride; if(dag == 1){ f_in = f_in + 12; f_out = f_out + 12; } FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb); } // [1 + gamma_5] for lower boundary term (if dag=1 [1 - gamma_5] term) // If there's only one node along fifth direction, no communication // is necessary; Otherwise data from adjacent node in minus direction // will be needed. 
// If the lower boundary is the s=0 term // out[0] = - m_f * [1 + gamma_5] in[ls-1] // else, out[s] = [1 + gamma_5] in[s-1] // //------------------------------------------------------------------ if (s_slice == 0 ){ f_in = (IFloat *) in; f_in = f_in + (local_ls-1)*ls_stride; f_out = (IFloat *) out; if(dag == 1){ f_in = f_in + 12; f_out = f_out + 12; } f_temp = f_in; if (s_nodes > 1 ) { #ifdef USE_GETPLUS getMinusData(comm_buf, f_in, 24*vol_4d_cb, 4); f_temp = comm_buf; #else QMP_status_t send_status = QMP_wait(msghandle_down[0]); if (send_status != QMP_SUCCESS) QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status)); QMP_status_t recv_status = QMP_wait(msghandle_down[1]); if (recv_status != QMP_SUCCESS) QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status)); f_temp = rbuf_down; if(dag == 1) f_temp = f_temp + 12; #endif } if(s_node_coor == 0) { FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb); } else { FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb); } } // [1 - gamma_5] term (if dag=1 [1 + gamma_5] term) // // out[s] = [1 - gamma_5] in[s+1] //------------------------------------------------------------------ if(s_slice > 0 ){ f_in = (IFloat *) in; f_out = (IFloat *) out; f_in += (s_slice)*ls_stride; f_out += (s_slice-1)*ls_stride; if(dag == 0){ f_in = f_in + 12; f_out = f_out + 12; } FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb); } // [1 - gamma_5] for upper boundary term (if dag=1 [1 + gamma_5] term) // If there's only one node along fifth direction, no communication // is necessary; Otherwise data from adjacent node in minus direction // will be needed. 
// If the upper boundary is the s=ls term // out[ls-1] = - m_f * [1 - gamma_5] in[0] // else out[s] = [1 - gamma_5] in[s+1] // //------------------------------------------------------------------ if(s_slice == (local_ls-1) ){ f_in = (IFloat *) in; f_out = (IFloat *) out; if(dag == 0){ f_in = f_in + 12; f_out = f_out + 12; } f_out = f_out + (local_ls-1)*ls_stride; f_temp = f_in; if (s_nodes > 1 ) { #ifdef USE_GETPLUS getPlusData(comm_buf, f_in, 24*vol_4d_cb, 4); f_temp = comm_buf; #else QMP_status_t send_status = QMP_wait(msghandle_up[0]); if (send_status != QMP_SUCCESS) QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status)); QMP_status_t recv_status = QMP_wait(msghandle_up[1]); if (recv_status != QMP_SUCCESS) QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status)); f_temp = rbuf_up; if(dag == 0) f_temp = f_temp + 12; #endif } if(s_node_coor == s_nodes - 1) { FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb); } else { FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb); } } // DiracOp::CGflops+=2*2*vol_4d_cb*local_ls*12; }
void wfm_comm(){ char *fname="wfm_comm()"; const int group = 16; void *addr[group]; size_t blksize[group]; int numblk[group]; ptrdiff_t stride[group]; int index; const int MAX_MSGHANDLE=20; if (wfm_max_numchunk/group+1 >MAX_MSGHANDLE) ERR.General("",fname,"wfm_max_numchunk(%d)/group+1 >MAX_MSGHANDLE",wfm_max_numchunk); static QMP_msgmem_t send_mem[8][MAX_MSGHANDLE]; static QMP_msgmem_t recv_mem[8][MAX_MSGHANDLE]; static QMP_msghandle_t send_h[8][MAX_MSGHANDLE]; static QMP_msghandle_t recv_h[8][MAX_MSGHANDLE]; static int pir=0; static int wfm_blocks[8]; if (wilson_initted || !initted){ VRB.Flow("",fname,"wilson_initted=%d initted=%d\n",wilson_initted,initted); for(int dir=0;dir<8;dir++) wfm_blocks[dir]=1; for(int ig=0; ig<1; ig++){ // VRB.Flow("",fname,"ig=%d",ig); for(int dir=0;dir<8;dir++){ int sign=1; if(dir>3) sign = -1; // VRB.Flow("",fname,"dir=%d",dir); int n_site=0; addr[n_site] = wfm_s_start[dir]; blksize[n_site] = wfm_blklen[dir]; numblk[n_site] = wfm_numblk[dir]; stride[n_site] = wfm_stride[dir]; n_site++; if (initted){ QMP_free_msghandle(send_h[dir][ig]); QMP_free_msghandle(recv_h[dir][ig]); QMP_free_msgmem(send_mem[dir][ig]); QMP_free_msgmem(recv_mem[dir][ig]); } if(n_site>0){ send_mem[dir][ig] = QMP_declare_strided_array_msgmem(addr,blksize,numblk,stride,n_site); send_h[dir][ig] = QMP_declare_send_relative(send_mem[dir][ig],dir%4,sign,0); wfm_blocks[dir]=ig+1; } int r_site=0; addr[r_site] = wfm_r_start[dir]; blksize[r_site] = wfm_blklen[dir]; numblk[r_site] = wfm_numblk[dir]; stride[r_site] = wfm_stride[dir]; r_site++; if (n_site!=r_site) ERR.General("",fname,"n_site(%d)!=r_site(%d)\n",n_site,r_site); // VRB.Flow("",fname,"n_site=%d r_site=%d",n_site,r_site); if(r_site>0){ recv_mem[dir][ig] = QMP_declare_strided_array_msgmem(addr,blksize,numblk,stride,r_site); recv_h[dir][ig] = QMP_declare_receive_relative(recv_mem[dir][ig],dir%4,-sign,0); } } // dir } // ig for(int dir=0;dir<8;dir++) VRB.Flow("",fname,"wfm_blocks[%d]=%d",dir,wfm_blocks[dir]); pir = 
CoorT()%2; // pir = 0; initted=1; wilson_initted=0; } #if 0 for(int dir=0;dir<8;dir++) for ( index=0; index <wfm_numchunk[dir];index++){ Float *tmp_p = wfm_send_ad[dir+8*index]; if ( (*tmp_p)*(*tmp_p) >0.0001) printf("Node %d: wfm_send_ad[%d][%d]=%e\n",UniqueID(),dir,index,*tmp_p); } #endif int dir_g=4; for(int ig=0; ig<1; ig++){ for(index=0;index<8;index++){ QMP_start(send_h[index][ig]); QMP_start(recv_h[index][ig]); } for(index=0;index<8;index++){ QMP_wait(send_h[index][ig]); QMP_wait(recv_h[index][ig]); } } #if 0 for(int dir=0;dir<8;dir++) for ( index=0; index <wfm_numchunk[dir];index++){ Float *tmp_p = wfm_recv_ad[dir+8*index]; if ( (*tmp_p)*(*tmp_p) >1e-10) printf("Node %d: wfm_recv_ad[%d][%d]=%e\n",UniqueID(),dir,index,*tmp_p); } #endif }