/**
 * Time a batch of simultaneous exchanges.
 *
 * For each of pargv->loops iterations, every receive handle is started,
 * then every send handle, after which the sends and then the receives are
 * waited on.  Failures from QMP_start/QMP_wait are only reported, the run
 * continues.  Node 0 prints the resulting bandwidth and per-iteration time.
 *
 * smem and rmem are accepted to keep the common test signature; this
 * routine does not touch them.
 *
 * Per the original comment the elapsed time is taken to be in milliseconds
 * (get_current_time() is defined elsewhere -- confirm there).
 */
static void
test_simultaneous_send (int** smem, int** rmem,
                        QMP_msghandle_t* sendh, QMP_msghandle_t* recvh,
                        struct perf_argv* pargv)
{
  const int nc     = pargv->num_channels;
  const int nloops = pargv->loops;
  const int dsize  = pargv->size;
  const int ndims  = pargv->ndims;
  double t_begin, t_end;
  int pass, chan;
  QMP_status_t rc;

  /* line every node up before the clock starts */
  QMP_barrier();
  t_begin = get_current_time ();

  for (pass = 0; pass < nloops; pass++) {
    /* post all receives first ... */
    for (chan = 0; chan < nc; chan++) {
      rc = QMP_start (recvh[chan]);
      if (rc != QMP_SUCCESS) {
        QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
      }
    }

    /* ... then fire all sends */
    for (chan = 0; chan < nc; chan++) {
      rc = QMP_start (sendh[chan]);
      if (rc != QMP_SUCCESS) {
        QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
      }
    }

    /* complete sends before receives, as the original does */
    for (chan = 0; chan < nc; chan++) {
      if (QMP_wait (sendh[chan]) != QMP_SUCCESS) {
        QMP_printf ("Error in sending %d\n", chan );
      }
    }

    for (chan = 0; chan < nc; chan++) {
      if (QMP_wait (recvh[chan]) != QMP_SUCCESS) {
        QMP_printf ("Error in receiving %d\n", chan);
      }
    }
  }

  t_end = get_current_time ();

  /* only node 0 reports */
  if (QMP_get_node_number() == 0) {
    const double elapsed = (t_end - t_begin);   /* in milli seconds */
    const double rate = dsize/(double)1000.0 * 4.0 * nloops * nc*ndims/elapsed;

    QMP_printf ("Simultaneous send B/W for datasize %d is %g (MB/s)", dsize * 4, rate);
    QMP_printf ("Time difference is %lf micro seconds", elapsed*1000.0/nloops);
  }

  QMP_barrier();
}
unsigned int sync() { QMP_status_t sync_status = QMP_barrier(); if (sync_status != QMP_SUCCESS) { QMP_error("Error in QMP sync:%s\n", QMP_error_string(sync_status)); return 0; } return 1; }
/**
 * Benchmark QOP_wilson_invert.
 *
 * The inverter is called nit+1 times; pass 0 is printed but left out of the
 * accumulated totals.  After the loop the even-subset norm of `out` is
 * compared with the norm remembered from the first call of this function
 * (kept in a function-local static) and a message is printed when they differ
 * by more than 1e-3 relative.
 *
 * On return res_arg->final_iter, info->final_sec and info->final_flop hold
 * averages over the nit counted passes; seconds and flops are additionally
 * divided by the QMP sum of 1.0 (presumably the node count -- confirm against
 * QMP_sum_double).
 *
 * nit, flw and kappa are names defined outside this block.
 *
 * @return info->final_flop / (1e6 * info->final_sec) computed from the
 *         averaged values.
 */
double
bench_inv(QOP_info_t *info, QOP_invert_arg_t *inv_arg,
          QOP_resid_arg_t *res_arg, QDP_DiracFermion *out, QDP_DiracFermion *in)
{
  static QLA_Real first_norm2 = -1, this_norm2;
  double total_sec = 0, total_flop = 0;
  double node_sum = 1;            /* becomes the global sum of 1.0 below */
  int total_iter = 0;
  int pass;
  QOP_DiracFermion *qop_sol, *qop_src;

  QDP_D_eq_zero(out, QDP_all);
  qop_sol = QOP_create_D_from_qdp(out);
  qop_src = QOP_create_D_from_qdp(in);

  for (pass = 0; pass <= nit; pass++) {
    QMP_barrier();
    QOP_wilson_invert(info, flw, inv_arg, res_arg, kappa, qop_sol, qop_src);
    QMP_barrier();

    printf("%i\t%i\t%g\t%i\n", pass, res_arg->final_iter, info->final_sec, (int)info->final_flop);

    /* pass 0 is reported but not counted */
    if (pass == 0) {
      continue;
    }
    total_iter += res_arg->final_iter;
    total_sec  += info->final_sec;
    total_flop += info->final_flop;
  }

  QOP_destroy_D(qop_sol);
  QOP_destroy_D(qop_src);

  /* compare against the norm seen on the first call of this function */
  QDP_r_eq_norm2_D(&this_norm2, out, QDP_even);
  if (first_norm2 < 0) {
    first_norm2 = this_norm2;
  }
  if (fabs(1-this_norm2/first_norm2) > 1e-3) {
    printf0("first norm = %g this norn = %g\n", first_norm2, this_norm2);
  }

  QMP_sum_double(&node_sum);
  QMP_sum_double(&total_sec);
  QMP_sum_double(&total_flop);

  res_arg->final_iter = total_iter/nit;
  info->final_sec  = total_sec/(node_sum*nit);
  info->final_flop = total_flop/(node_sum*nit);

  return info->final_flop/(1e6*info->final_sec);
}
/**
 * Barrier across the communicator: invokes QMP_barrier() and hands its
 * status to the QMP_CHECK macro (defined elsewhere) for error handling.
 */
void comm_barrier(void)
{
  QMP_CHECK(QMP_barrier());
}
void test_solver(BfmSolver solver) { g5dParams parms; int Ls=16; double M5=1.8; double mq=0.0001; double wilson_lo = 0.05; double wilson_hi = 6.8; double shamir_lo = 0.025; double shamir_hi = 1.7; double ht_scale=1.7; double hw_scale=1.0; if ( solver != DWF ) { exit(0); Printf("Should be testing HtCayleyTanh aka DWF\n"); } parms.pDWF(mq,M5,Ls); multi1d<LatticeColorMatrix> u(4); HotSt(u); // ArchivGauge_t Header ; readArchiv(Header,u,"ckpoint_lat.3000"); multi1d<LatticeFermion> src(Ls); /* Rudy calculate some eigenvectors */ BfmWrapperParams BWP; BWP.BfmInverter = BfmInv_CG; BWP.BfmMatrix = BfmMat_M; BWP.BfmPrecision= Bfm64bit; BWP.MaxIter = 10000; BWP.RsdTarget.resize(1); BWP.RsdTarget[0]= 1.0e-9; BWP.Delta = 1.0e-4; BWP.BAP = parms; BfmWrapper bfm(BWP); bfmarg bfma; #if defined(QDP_USE_OMP_THREADS) bfma.Threads(omp_get_max_threads()); #else bfma.Threads(16); #endif bfma.Verbose(0); //Physics parameters bfmActionParams *bfmap = (bfmActionParams *) &bfma; *bfmap = bfm.invParam.BAP; // Algorithm & code control bfma.time_report_iter=-100; bfma.max_iter = bfm.invParam.MaxIter; bfma.residual = toDouble(bfm.invParam.RsdTarget[0]); int lx = QDP::Layout::subgridLattSize()[0]; int ly = QDP::Layout::subgridLattSize()[1]; int lz = QDP::Layout::subgridLattSize()[2]; int lt = QDP::Layout::subgridLattSize()[3]; //Geometry bfma.node_latt[0] = lx; bfma.node_latt[1] = ly; bfma.node_latt[2] = lz; bfma.node_latt[3] = lt; multi1d<int> procs = QDP::Layout::logicalSize(); for(int mu=0;mu<4;mu++){ if (procs[mu]>1) bfma.local_comm[mu] = 0; else bfma.local_comm[mu] = 1; } // Bfm object bfm_qdp<double> bfm_eig; bfm_eig.init(bfma); //Gauge field import bfm_eig.importGauge(u); //Subspace #define NumberGaussian (1) Fermion_t subspace[NumberGaussian]; Fermion_t check; Fermion_t mp; Fermion_t mmp; Fermion_t tmp_t; check = bfm_eig.allocFermion(); mp = bfm_eig.allocFermion(); mmp = bfm_eig.allocFermion(); tmp_t = bfm_eig.allocFermion(); bfm_eig.importFermion(src,check,1); QDPIO::cout << "Ls = 
"<<Ls<<endl; for(int g=0;g<NumberGaussian;g++){ for(int s=0;s<Ls;s++){ gaussian(src[s]); } subspace[g]=bfm_eig.allocFermion(); bfm_eig.importFermion(src,subspace[g],1); // Half parity gaussian if ( g==0) { bfm_eig.importFermion(src,check,1); } for(int s=0;s<Ls;s++){ src[s]=zero; } bfm_eig.exportFermion(src,subspace[g],1); QDPIO::cout << "Subspace norm " << norm2(src)<<endl; } for(int s=0;s<Ls;s++){ gaussian(src[s]); } QDPIO::cout << "Got here " << endl; // Handle< LinearOperatorArray<T> > linop =GetLinOp(u, parms); int block[5]; for(int i=0;i<5;i++) block[i]=4; QDPIO::cout << "Initialised dirac op"<<endl; BfmLittleDiracOperator ldop(Ls,NumberGaussian,block,subspace,&bfm_eig); int ns = ldop.SubspaceDimension(); QDPIO::cout << "subspace dimension is "<< ns<<endl; ns = ldop.SubspaceLocalDimension(); QDPIO::cout << "subspace dimension per node is "<< ns<<endl; std::vector<std::complex<double> > decomp(ns); ldop.ProjectToSubspace(check,decomp); if (QMP_is_primary_node()){ FILE * fp = fopen("coeff.dat","w"); for(int s=0;s<ns;s++){ fprintf(fp,"coeff %d %le %le\n",s,real(decomp[s]),imag(decomp[s])); } fclose(fp); } for(int s=0;s<ns;s++){ QDPIO::cout << "coeff "<<s<<" " << real(decomp[s]) << " " << imag(decomp[s])<<endl; } ldop.PromoteFromSubspace(decomp,mp); double n; #pragma omp parallel { omp_set_num_threads(bfm_eig.nthread); #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.axpy(check,mp,check,-1); n = bfm_eig.norm(check); } } QDPIO::cout << "project/promote n2diff "<< n<<endl; QMP_barrier(); QDPIO::cout << "Computing little dirac matrix"<<endl; ldop.ComputeLittleMatrixColored(); QDPIO::cout << "Done"<<endl; std::vector<std::complex<double> > Aphi(ns); // phi^dag DdagD phi = |Dphi|^2 with phi a subspace vector // should be equal to Project/Apply/Promote + inner product #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.Mprec(subspace[0],mp,tmp_t,0); } } QDPIO::cout << "Applied BFM matrix "<<endl; double n2; #pragma omp 
parallel { omp_set_num_threads(bfm_eig.nthread); #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { n2 = bfm_eig.norm(mp); } } QDPIO::cout << "Applied BFM matrix "<<n2<<endl; ldop.ProjectToSubspace(subspace[0],decomp); QDPIO::cout << "Projected to subspace "<<endl; ldop.Apply(decomp,Aphi); QDPIO::cout << "Applied A "<<endl; ldop.PromoteFromSubspace(Aphi,check); QDPIO::cout << "Promoted "<<endl; complex<double> inn; #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { inn = bfm_eig.inner(subspace[0],check); } } QDPIO::cout << "phi^dag Ddag D phi check " << n2 << " " <<real(inn) << imag(inn) <<endl; std::vector<std::complex<double> > AinvAphi(ns); ldop.ProjectToSubspace(subspace[0],decomp); ldop.Apply(decomp,Aphi); for(int s=0;s<ns;s++){ QDPIO::cout << "Aphi "<<s<<" " << real(Aphi[s]) <<" " << imag(Aphi[s])<<endl; } ldop.PromoteFromSubspace(Aphi,check); #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.Mprec(subspace[0],mp,tmp_t,0); bfm_eig.Mprec(mp,mmp,tmp_t,1); } } ldop.ProjectToSubspace(mmp,decomp); ldop.PromoteFromSubspace(decomp,mmp); #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.axpy(check,mmp,check,-1.0); n2 = bfm_eig.norm(check); } } QDPIO::cout << "PMdagMP check n2diff "<< n2<<endl; QMP_barrier(); QDPIO::cout << "Applying inverse"<<endl; ldop.ApplyInverse(Aphi,AinvAphi); QMP_barrier(); for(int s=0;s<ns;s++){ QDPIO::cout << "AinvAphi "<<s<<" " << real(AinvAphi[s]) << " " << imag(AinvAphi[s])<<endl; } ldop.PromoteFromSubspace(AinvAphi,check); #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.axpy(check,subspace[0],check,-1.0); n2 = bfm_eig.norm(check); } } QDPIO::cout << "AinvA check n2diff "<< n2<<endl; }
int main (int argc, char** argv) { int i, nc; QMP_status_t status; int **smem, **rmem; QMP_msgmem_t *recvmem; QMP_msghandle_t *recvh; QMP_msgmem_t *sendmem; QMP_msghandle_t *sendh; struct perf_argv pargv; QMP_thread_level_t req, prv; /** * Simple point to point topology */ int dims[4] = {2,2,2,2}; int ndims = 1; //if(QMP_get_node_number()==0) //printf("starting init\n"); fflush(stdout); req = QMP_THREAD_SINGLE; status = QMP_init_msg_passing (&argc, &argv, req, &prv); if (status != QMP_SUCCESS) { fprintf (stderr, "QMP_init failed\n"); return -1; } if(QMP_get_node_number()==0) printf("finished init\n"); fflush(stdout); if (parse_options (argc, argv, &pargv) == -1) { if(QMP_get_node_number()==0) usage (argv[0]); exit (1); } { int maxdims = 4; int k=0; int nodes = QMP_get_number_of_nodes(); ndims = 0; while( (nodes&1) == 0 ) { if(ndims<maxdims) ndims++; else { dims[k] *= 2; k++; if(k>=maxdims) k = 0; } nodes /= 2; } if(nodes != 1) { QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes()); QMP_error(" must power of 2"); QMP_abort(1); } pargv.ndims = ndims; } status = QMP_declare_logical_topology (dims, ndims); if (status != QMP_SUCCESS) { fprintf (stderr, "Cannot declare logical grid\n"); return -1; } /* do a broadcast of parameter */ if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) { QMP_printf ("Broadcast parameter failed\n"); exit (1); } { int k=1; const int *lc = QMP_get_logical_coordinates(); for(i=0; i<ndims; i++) k += lc[i]; pargv.sender = k&1; } QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ", argv[0], pargv.num_channels, pargv.verify, pargv.option, pargv.size, pargv.loops, pargv.sender, strided_send, strided_recv, strided_array_send); fflush(stdout); /** * Create memory */ nc = pargv.num_channels; smem = (int **)malloc(nc*sizeof (int *)); rmem = (int **)malloc(nc*sizeof (int *)); sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof 
(QMP_msgmem_t)); recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t)); sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); QMP_barrier(); if(QMP_get_node_number()==0) printf("\n"); fflush(stdout); if(pargv.option & TEST_SIMUL) { int opts = pargv.option; pargv.option = TEST_SIMUL; if(QMP_get_node_number()==0) QMP_printf("starting simultaneous sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_simultaneous_send (smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished simultaneous sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_PINGPONG) { int opts = pargv.option; pargv.option = TEST_PINGPONG; if(QMP_get_node_number()==0) QMP_printf("starting ping pong sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); if(pargv.verify) test_pingpong_verify(smem, rmem, sendh, recvh, &pargv); else test_pingpong(smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished ping pong sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_ONEWAY) { int opts = pargv.option; pargv.option = TEST_ONEWAY; if(QMP_get_node_number()==0) QMP_printf("starting one way sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_oneway (smem, rmem, sendh, recvh, &pargv); if(!pargv.sender) check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, 
recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished one way sends"); fflush(stdout); pargv.option = opts; } /** * Free memory */ free (smem); free (rmem); free (sendh); free (recvh); free (sendmem); free (recvmem); QMP_finalize_msg_passing (); return 0; }
/**
 * One-way blast test.
 *
 * A node whose pargv->sender flag is set repeatedly starts and waits on all
 * send handles; every other node does the same with its receive handles.
 * Each side times its own loop.  Node 0 prints from the sender branch and
 * node 1 from the receiver branch; the sender prints before its extra
 * barrier and the receiver after it.  QMP_start/QMP_wait failures are
 * reported and otherwise ignored.
 *
 * smem and rmem are not used here.  Times are taken to be in milliseconds
 * per the original comments (get_current_time() is defined elsewhere).
 */
static void
test_oneway (int** smem, int** rmem,
             QMP_msghandle_t* sendh, QMP_msghandle_t* recvh,
             struct perf_argv* pargv)
{
  const int nc     = pargv->num_channels;
  const int nloops = pargv->loops;
  const int dsize  = pargv->size;
  const int ndims  = pargv->ndims;
  const QMP_bool_t sender = pargv->sender;
  double t_begin, t_end, elapsed, rate;
  int pass, chan;
  QMP_status_t rc;

  QMP_barrier();

  if (sender) {
    t_begin = get_current_time ();
    for (pass = 0; pass < nloops; pass++) {
      /* start every send, then wait for every send */
      for (chan = 0; chan < nc; chan++) {
        rc = QMP_start (sendh[chan]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
        }
      }
      for (chan = 0; chan < nc; chan++) {
        if (QMP_wait (sendh[chan]) != QMP_SUCCESS) {
          QMP_printf ("Error in sending %d\n", chan );
        }
      }
    }
    t_end = get_current_time ();   /* In milli seconds */

    elapsed = (t_end - t_begin);   /* actual send time milli seconds */
    rate = dsize/(double)1000.0 * 4 * nloops*nc*ndims/elapsed;

    if (QMP_get_node_number() == 0) {
      QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", dsize * 4, rate);
      QMP_printf ("time is %lf micro seconds", elapsed*1000.0/nloops);
      fflush(stdout);
    }
    QMP_barrier();
  }
  else {
    t_begin = get_current_time ();
    for (pass = 0; pass < nloops; pass++) {
      /* start every receive, then wait for every receive */
      for (chan = 0; chan < nc; chan++) {
        rc = QMP_start (recvh[chan]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
        }
      }
      for (chan = 0; chan < nc; chan++) {
        if (QMP_wait (recvh[chan]) != QMP_SUCCESS) {
          QMP_printf ("Error in receiving %d\n", chan);
        }
      }
    }
    t_end = get_current_time ();   /* In milli seconds */

    elapsed = (t_end - t_begin);   /* actual receive-loop time milli seconds */
    rate = dsize/(double)1000.0 * 4 * nloops*nc*ndims/elapsed;

    /* the receiver reports only after the barrier the sender hits post-print */
    QMP_barrier();
    if (QMP_get_node_number() == 1) {
      QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", dsize * 4, rate);
      QMP_printf ("time is %lf micro seconds", elapsed*1000.0/nloops);
      fflush(stdout);
    }
  }

  QMP_barrier();
}
/**
 * Ping-pong test that also verifies the received data.
 *
 * On every iteration each channel's receive buffer is zeroed and its send
 * buffer filled with the pattern  i + k*j + nc*nc  (i = iteration,
 * j = channel, k = element).  A sender node then sends on all channels and
 * receives the echo; a non-sender receives first and then sends.  Afterwards
 * every received element is compared with the same pattern and mismatches
 * are printed.  The buffer fill and the verification are inside the timed
 * section.  Every node prints the bandwidth and RTT/2 lines.
 *
 * Times are taken to be in milliseconds per the original comments
 * (get_current_time() is defined elsewhere).
 */
static void
test_pingpong_verify (int** smem, int** rmem,
                      QMP_msghandle_t* sendh, QMP_msghandle_t* recvh,
                      struct perf_argv* pargv)
{
  const int nc     = pargv->num_channels;
  const int nloops = pargv->loops;
  const int dsize  = pargv->size;
  const int ndims  = pargv->ndims;
  const QMP_bool_t sender = pargv->sender;
  double t_begin, t_end, elapsed, rate;
  int pass, chan, idx;
  QMP_status_t rc;

  QMP_barrier();
  t_begin = get_current_time ();

  for (pass = 0; pass < nloops; pass++) {
    /* reset receive buffers and lay down this iteration's pattern */
    for (chan = 0; chan < nc; chan++) {
      for (idx = 0; idx < dsize; idx++) {
        rmem[chan][idx] = 0;
        smem[chan][idx] = pass + idx * chan + nc*nc;
      }
    }

    if (sender) {
      /* ping: send everything out ... */
      for (chan = 0; chan < nc; chan++) {
        rc = QMP_start (sendh[chan]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
        }
      }
      for (chan = 0; chan < nc; chan++) {
        if (QMP_wait (sendh[chan]) != QMP_SUCCESS) {
          QMP_printf ("Error in sending %d\n", chan );
        }
      }

      /* ... then collect the pong */
      for (chan = 0; chan < nc; chan++) {
        rc = QMP_start (recvh[chan]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
        }
      }
      for (chan = 0; chan < nc; chan++) {
        if (QMP_wait (recvh[chan]) != QMP_SUCCESS) {
          QMP_printf ("Error in receiving %d\n", chan);
        }
      }
    }
    else {
      /* receive the ping first ... */
      for (chan = 0; chan < nc; chan++) {
        rc = QMP_start (recvh[chan]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
        }
      }
      for (chan = 0; chan < nc; chan++) {
        if (QMP_wait (recvh[chan]) != QMP_SUCCESS) {
          QMP_printf ("Error in receiving %d\n", chan);
        }
      }

      /* ... then answer with the pong */
      for (chan = 0; chan < nc; chan++) {
        rc = QMP_start (sendh[chan]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
        }
      }
      for (chan = 0; chan < nc; chan++) {
        if (QMP_wait (sendh[chan]) != QMP_SUCCESS) {
          QMP_printf ("Error in sending %d\n", chan );
        }
      }
    }

    /* verify memory against the pattern written above */
    for (chan = 0; chan < nc; chan++) {
      for (idx = 0; idx < dsize; idx++) {
        const int expected = pass + idx * chan + nc* nc;
        if (rmem[chan][idx] != expected) {
          QMP_printf ("Receiving memory error for memory %d %d %d\n",
                      chan, idx, rmem[chan][idx]);
        }
      }
    }
  }

  t_end = get_current_time ();   /* In milli seconds */

  elapsed = (t_end - t_begin);   /* actual send time milli seconds */
  rate = 2 * dsize/(double)1000.0 * 4 * nloops*nc*ndims/elapsed;

  QMP_printf ("Ping Pong Bandwidth for datasize %d is %g (MB/s)", dsize * 4, rate);
  QMP_printf ("RTT/2 is %lf micro seconds", elapsed*1000.0/nloops/2);
}