int main(int argc, char *argv[]) { const char *msg; int status = 1; int mu, i; struct QOP_CLOVER_State *clover_state; QDP_Int *I_seed; int i_seed; QDP_RandomState *state; QLA_Real plaq; QLA_Real n[NELEMS(F)]; struct QOP_CLOVER_Gauge *c_g; struct QOP_CLOVER_Fermion *c_f[NELEMS(F)]; double kappa; double c_sw; double in_eps; int in_iter; int log_flag; double out_eps; int out_iter; int cg_status; double run_time; long long flops, sent, received; /* start QDP */ QDP_initialize(&argc, &argv); if (argc != 1 + NDIM + 6) { printf0("ERROR: usage: %s Lx ... seed kappa c_sw iter eps log?\n", argv[0]); goto end; } for (mu = 0; mu < NDIM; mu++) { lattice[mu] = atoi(argv[1 + mu]); } i_seed = atoi(argv[1 + NDIM]); kappa = atof(argv[2 + NDIM]); c_sw = atof(argv[3 + NDIM]); in_iter = atoi(argv[4 + NDIM]); in_eps = atof(argv[5 + NDIM]); log_flag = atoi(argv[6 + NDIM]) == 0? 0: QOP_CLOVER_LOG_EVERYTHING; /* set lattice size and create layout */ QDP_set_latsize(NDIM, lattice); QDP_create_layout(); primary = QMP_is_primary_node(); self = QMP_get_node_number(); get_vector(network, 1, QMP_get_logical_number_of_dimensions(), QMP_get_logical_dimensions()); get_vector(node, 0, QMP_get_logical_number_of_dimensions(), QMP_get_logical_coordinates()); printf0("network: "); for (i = 0; i < NDIM; i++) printf0(" %d", network[i]); printf0("\n"); printf0("node: "); for (i = 0; i < NDIM; i++) printf0(" %d", node[i]); printf0("\n"); printf0("kappa: %20.15f\n", kappa); printf0("c_sw: %20.15f\n", c_sw); printf0("in_iter: %d\n", in_iter); printf0("in_eps: %15.2e\n", in_eps); /* allocate the gauge field */ create_Mvector(U, NELEMS(U)); create_Mvector(C, NELEMS(C)); create_Dvector(F, NELEMS(F)); I_seed = QDP_create_I(); QDP_I_eq_funci(I_seed, icoord, QDP_all); state = QDP_create_S(); QDP_S_eq_seed_i_I(state, i_seed, I_seed, QDP_all); for (mu = 0; mu < NELEMS(U); mu++) { QDP_M_eq_gaussian_S(U[mu], state, QDP_all); } for (i = 0; i < NELEMS(F); i++) { QDP_D_eq_gaussian_S(F[i], state, QDP_all); } /* build the 
clovers */ clover(C, U); /* initialize CLOVER */ if (QOP_CLOVER_init(&clover_state, lattice, network, node, primary, sublattice, NULL)) { printf0("CLOVER_init() failed\n"); goto end; } if (QOP_CLOVER_import_fermion(&c_f[0], clover_state, f_reader, F[0])) { printf0("CLOVER_import_fermion(0) failed\n"); goto end; } if (QOP_CLOVER_allocate_fermion(&c_f[1], clover_state)) { printf0("CLOVER_allocate_fermion(1) failed\n"); goto end; } if (QOP_CLOVER_allocate_fermion(&c_f[2], clover_state)) { printf0("CLOVER_allocate_fermion(2) failed\n"); goto end; } if (QOP_CLOVER_allocate_fermion(&c_f[3], clover_state)) { printf0("CLOVER_allocate_fermion(3) failed\n"); goto end; } if (QOP_CLOVER_import_gauge(&c_g, clover_state, kappa, c_sw, u_reader, c_reader, NULL)) { printf("CLOVER_import_gauge() failed\n"); goto end; } QOP_CLOVER_D_operator(c_f[2], c_g, c_f[0]); cg_status = QOP_CLOVER_D_CG(c_f[3], &out_iter, &out_eps, c_f[2], c_g, c_f[2], in_iter, in_eps, log_flag); msg = QOP_CLOVER_error(clover_state); QOP_CLOVER_performance(&run_time, &flops, &sent, &received, clover_state); QOP_CLOVER_export_fermion(f_writer, F[3], c_f[3]); printf0("CG status: %d\n", cg_status); printf0("CG error message: %s\n", msg? 
msg: "<NONE>"); printf0("CG iter: %d\n", out_iter); printf0("CG eps: %20.10e\n", out_eps); printf0("CG performance: runtime %e sec\n", run_time); printf0("CG performance: flops %.3e MFlop/s (%lld)\n", flops * 1e-6 / run_time, flops); printf0("CG performance: snd %.3e MB/s (%lld)\n", sent * 1e-6 / run_time, sent); printf0("CG performance: rcv %.3e MB (%lld)/s\n", received * 1e-6 / run_time, received); /* free CLOVER */ QOP_CLOVER_free_gauge(&c_g); for (i = 0; i < NELEMS(c_f); i++) QOP_CLOVER_free_fermion(&c_f[i]); QOP_CLOVER_fini(&clover_state); /* Compute plaquette */ plaq = plaquette(U); /* field norms */ for (i = 0; i < NELEMS(F); i++) QDP_r_eq_norm2_D(&n[i], F[i], QDP_all); /* Display the values */ printf0("plaquette = %g\n", plaq / (QDP_volume() * QDP_Nc * NDIM * (NDIM - 1) / 2 )); for (i = 0; i < NELEMS(F); i++) printf0(" |f|^2 [%d] = %20.10e\n", i, (double)(n[i])); /* Compute and display <f[1] f[0]> */ show_dot("1|orig", F[1], F[0]); /* Compute and display <f[1] f[3]> */ show_dot("1|solv", F[1], F[3]); QDP_destroy_S(state); QDP_destroy_I(I_seed); destroy_Mvector(U, NELEMS(U)); destroy_Mvector(C, NELEMS(C)); destroy_Dvector(F, NELEMS(F)); status = 0; end: /* shutdown QDP */ printf0("end\n"); QDP_finalize(); return status; }
/*
 * Accessor for this node's logical coordinates.
 *
 * Forwards to QMP_get_logical_coordinates() and hands back its result
 * unchanged; the returned array is not copied here.
 */
const int *get_logical_coordinate()
{
    const int *coords = QMP_get_logical_coordinates();
    return coords;
}
void wfm::init(WilsonArg *wilson_p) /* pointer to Wilson type structure */ { int spinor_words; /* size of the spinor field on the */ /* sublattice checkerboard */ int half_spinor_words; /* size of the spin-projected "half_spinors*/ /* on the sublattice checkerboard including*/ /* the communications padding */ int slx; /* x-direction size of node sublattice */ int sly; /* y-direction size of node sublattice */ int slz; /* z-direction size of node sublattice */ int slt; /* t-direction size of node sublattice */ int i; int mu; SloppyPrecision = wilson_p->SloppyPrecision; WFM_BGL = wilson_p->WFM_BGL; // if ( isBoss() ) printf("wfm::init setting up BG/L MMU state\n"); mmu_optimise(); mmu_print(); // CoreCount( wilson_p->CoreCount ); CoreCount( 1 ); if ( WFM_BGL ) PAD_HALF_SPINOR_SIZE = 12; else PAD_HALF_SPINOR_SIZE = 16; if ( WFM_BGL && (nthread > 1) && SloppyPrecision ) { if ( isBoss() ) printf("Bagel does not maintain L1 coherence in dual core + single precision mode on BlueGene\n"); if ( isBoss() ) printf("Get on to IBM to give me access to SWOA MMU options, or even better a non-cache image of DRAM\n"); if ( isBoss() ) printf("If they give me the tools, I'm happy to do the heroics of mainting sfw coherence\n"); if ( isBoss() ) printf("Bagel insanity check exiting\n"); exit(-1); } IR = wilson_p->instruction_reg_num; /*--------------------------------------------------------------------------*/ /* Set sublattice direction sizes */ /*--------------------------------------------------------------------------*/ local_latt[0] = wilson_p->local_latt[0]; local_latt[1] = wilson_p->local_latt[1]; local_latt[2] = wilson_p->local_latt[2]; local_latt[3] = wilson_p->local_latt[3]; slx = local_latt[0]; sly = local_latt[1]; slz = local_latt[2]; slt = local_latt[3]; #if (defined USE_COMMS_QMP) && (!defined UNIFORM_SEED_NO_COMMS) QMP_bool_t qmp_inited=QMP_is_initialized(); if( !qmp_inited ) { if ( isBoss() ) printf("QMP_not_initialized\n"); exit(-1); } const int *ncoor = 
QMP_get_logical_coordinates(); base_parity =(ncoor[0]*local_latt[0] + ncoor[1]*local_latt[1] + ncoor[2]*local_latt[2] + ncoor[3]*local_latt[3])&0x1; #else base_parity = 0; #endif /*--------------------------------------------------------------------------*/ /* Set periodic wrap back or not */ /*--------------------------------------------------------------------------*/ local_comm[0] = wilson_p->local_comm[0]; local_comm[1] = wilson_p->local_comm[1]; local_comm[2] = wilson_p->local_comm[2]; local_comm[3] = wilson_p->local_comm[3]; #ifdef UNIFORM_SEED_NO_COMMS for(int i=0;i<4;i++) if(local_comm[0]!=1){ fprintf(stderr,"wfm::local_comm[%d]=%d!\n",i,local_comm[i]); exit(-33); } #endif /*-----------------------------------------------------------------------*/ /* compute the subgrd volume of each chkbd ... at least two local dims */ /* must be even for this code to be correct. */ /*-----------------------------------------------------------------------*/ vol = (slx * sly * slz * slt)/2; nbound[0] = (sly * slz * slt)/2; nbound[1] = (slx * slz * slt)/2; nbound[2] = (slx * sly * slt)/2; nbound[3] = (slx * sly * slz)/2; allbound = nbound[0] + nbound[1] + nbound[2] + nbound[3]; if ( nbound[0] * slx * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even x logic bomb\n"); exit(-1); } if ( nbound[1] * sly * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even y logic bomb\n"); exit(-1); } if ( nbound[2] * slz * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even z logic bomb\n"); exit(-1); } if ( nbound[3] * slt * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even t logic bomb\n"); exit(-1); } /*------------------------------------------------------------------------*/ /* Check shape */ /*------------------------------------------------------------------------*/ if ( (slx&1) ) { if ( isBoss() ) printf("Bagel is refusing to run as x-sub latt is odd\n"); exit(-1); } if ( (sly&1) &&(slz&1)&&(slt&1) ) { if ( isBoss() ) 
printf("Bagel is refusing to run as y,z,t sub latts are all odd\n"); exit(-1); } /*--------------------------------------------------------------------------*/ /* Reserve memory for 1 temporary spinor (needed by mdagm) */ /*--------------------------------------------------------------------------*/ spinor_words = SPINOR_SIZE * vol; spinor_tmp = (Float *)ALLOC(spinor_words*sizeof(Float)*2); //printf("wfm_init::spinor_tmp=%p\n",spinor_tmp); // VRB.Flow(cname,fname,"spinor_tmp=%p\n",spinor_tmp); #ifdef USE_QALLOC // If we used QALLOC, and the ALLOC macro failed we can try // qalloc but without the QFAST flag. Even tho the spinor_tmp is // not communicated we leave the QCOMMS bit on in case it puts // spinor tmp into a better place in the memory map if(spinor_tmp == 0) { if ( isBoss() ) printf("BAGEL: Warning spinor_tmp has spilled out of Edram\n"); spinor_tmp = (Float *) qalloc(QCOMMS,spinor_words*sizeof(Float)*2); } #endif // USE QALLOC if(spinor_tmp == 0){ if ( isBoss() ) printf("wfm::spinor_tmp allocate\n"); exit(-1); } //~~ //~~ twisted mass fermions: sets WilsonArg.spinor_tmp tp //~~ address of temporary spinor in wfm class //~~ wilson_p->spinor_tmp = spinor_tmp; //~~ /*--------------------------------------------------------------------------*/ /* Reserve memory for the 4 forward and 4 backward spin projected half */ /* spinors. */ /*--------------------------------------------------------------------------*/ /*PAB 10/1/2001 */ half_spinor_words = NMinusPlus * ND * PAD_HALF_SPINOR_SIZE * vol; #ifndef USE_COMMS_QMP two_spinor = (Float *)ALLOC(half_spinor_words*sizeof(Float)); #ifdef USE_QALLOC // If we are using QALLOC and the ALLOC macro failed we can still // try to get slow memory. 
Leave on the QCOMMS bit for good memory map // placement if(two_spinor == 0) { if ( isBoss() ) printf("BAGEL : warning two spinors have spilled out of Edram\n"); two_spinor = (Float *)qalloc(QCOMMS,half_spinor_words*sizeof(Float)); } #endif // USE_QALLOC if(two_spinor == 0){ if ( isBoss() ) printf("wfm::two_spinor allocate\n"); exit(-1); } #else // Since two spinor is now communicated because of the Tface // receive I have to allocate it in the style of QMP two_spinor_mem_t = QMP_allocate_aligned_memory( half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, (QMP_MEM_COMMS|QMP_MEM_FAST)); if( two_spinor_mem_t == 0x0 ) { // Try slow allocation two_spinor_mem_t = QMP_allocate_aligned_memory( half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, QMP_MEM_COMMS); if( two_spinor_mem_t == 0x0 ) { if ( isBoss() ) printf("wfm_init::could not allocate two spinor_mem_t\n"); exit(-1); } } two_spinor = (Float *)QMP_get_memory_pointer(two_spinor_mem_t); if (two_spinor == 0x0) { if ( isBoss() ) printf("wfm::init QMP_get_memory_pointer returned NULL pointer from non NULL QMP_mem_t\n"); exit(-1); } #endif /*--------------------------------------------------------------------------*/ /* Reserve memory for the 4 forward and 4 backward spin projected half */ /* spinors. 
*/ /*--------------------------------------------------------------------------*/ for ( int pm = 0;pm<2;pm++ ) { for ( mu = 0 ; mu < 4 ; mu++) if (local_comm[mu]==0) { half_spinor_words = PAD_HALF_SPINOR_SIZE * nbound[mu]; // These things are (potentially) communicated so need QMP Style // allocation if using QMP // // Note: I am allocating the buffers in all directions regardless // of whether we are communicating in that dir or not (Copying CPS) #ifndef USE_COMMS_QMP // Not using QMP recv_bufs[pm][mu] = (Float *)ALLOC(half_spinor_words*sizeof(Float)); #ifdef USE_QALLOC // If ALLOC fails try slow memory but with QCOMMS bit still set if( recv_bufs[pm][mu] == 0x0 ) recv_bufs[pm][mu] = (Float *)qalloc(QCOMMS, half_spinor_words*sizeof(Float)); #endif if(recv_bufs[pm][mu] == 0){ if ( isBoss() ) printf("wfm::recv_bufs allocate\n"); exit(-1); } send_bufs[pm][mu]=(Float *)SEND_ALLOC(half_spinor_words*sizeof(Float)); #ifdef USE_QALLOC // If SEND ALLOC macro fails try slow memory but with QNONCACHE bit // still set if( send_bufs[pm][mu] == 0 ) send_bufs[pm][mu]=(Float *)qalloc(QNONCACHE, half_spinor_words*sizeof(Float)); #endif if(send_bufs[pm][mu] == 0){ if ( isBoss() ) printf("wfm::send_bufs allocate\n"); exit(-1); } #else /* QMP memory allocation: A little involved */ /* Must allocate "opaque" QMP_mem_t first and then get aligned pointer out of it. 
It's either what is below or a very complicated send alloc */ /* Peter in the CPS allocs recv_bufs with ALLOC = QCOMMS|FAST */ recv_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, (QMP_MEM_COMMS|QMP_MEM_FAST)); if( recv_bufs_mem_t[pm][mu] == 0x0 ) { // If QMP_allocate memory fails with FAST, try SLOW but keep COMMS recv_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, QMP_MEM_COMMS); if( recv_bufs_mem_t[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init recv_bufs_mem_t[%d][%d]: QMP_allocate_aligned_memory returned NULL\n", pm, mu); exit(-1); } } /* Now get the aligned pointer */ recv_bufs[pm][mu] =(Float *)QMP_get_memory_pointer(recv_bufs_mem_t[pm][mu]); if( recv_bufs[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init recv_bufs[%d][%d]: NULL aligned pointer in non NULL QMP_mem_t struct \n", pm, mu); exit(-1); } /* Now do the same for the send bufs */ /* In CPS Peter allocates as SEND_ALLOC = QNONCACHE | QFAST */ send_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, (QMP_MEM_NONCACHE|QMP_MEM_FAST)); if( send_bufs_mem_t[pm][mu] == 0x0 ) { // if allocator fails, try slow but still NONCACHE send_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, QMP_MEM_NONCACHE); if( send_bufs_mem_t[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init: send_bufs_mem_t[%d][%d]: QMP_allocate_aligned_memory returned NULL\n", pm, mu); exit(-1); } } /* Now get the aligned pointer */ send_bufs[pm][mu] =(Float *)QMP_get_memory_pointer(send_bufs_mem_t[pm][mu]); if( send_bufs[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init send_bufs[%d][%d]: NULL aligned pointer in non NULL QMP_mem_t struct \n", pm, mu); exit(-1); } #endif } } /*----------------------------------------------------------------------*/ /* Build the pointer table */ 
/*----------------------------------------------------------------------*/ pointers_init(); /*----------------------------------------------------------------------*/ /* Initialise the comms */ /*----------------------------------------------------------------------*/ comm_init(); }
int main(int argc, char *argv[]) { int status = 1; int mu, i; struct QOP_CLOVER_State *clover_state; QDP_Int *I_seed; int i_seed; QDP_RandomState *state; QLA_Real plaq; QLA_Real n[NELEMS(F)]; struct QOP_CLOVER_Gauge *c_g; struct QOP_CLOVER_Fermion *c_f[NELEMS(F)]; double kappa; double c_sw; /* start QDP */ QDP_initialize(&argc, &argv); if (argc != 1 + NDIM + 3) { printf0("ERROR: usage: %s Lx ... seed kappa c_sw\n", argv[0]); goto end; } for (mu = 0; mu < NDIM; mu++) { lattice[mu] = atoi(argv[1 + mu]); } i_seed = atoi(argv[1 + NDIM]); kappa = atof(argv[2 + NDIM]); c_sw = atof(argv[3 + NDIM]); /* set lattice size and create layout */ QDP_set_latsize(NDIM, lattice); QDP_create_layout(); primary = QMP_is_primary_node(); self = QMP_get_node_number(); get_vector(network, 1, QMP_get_logical_number_of_dimensions(), QMP_get_logical_dimensions()); get_vector(node, 0, QMP_get_logical_number_of_dimensions(), QMP_get_logical_coordinates()); printf0("network: "); for (i = 0; i < NDIM; i++) printf0(" %d", network[i]); printf0("\n"); printf0("node: "); for (i = 0; i < NDIM; i++) printf0(" %d", node[i]); printf0("\n"); printf0("kappa: %20.15f\n", kappa); printf0("c_sw: %20.15f\n", c_sw); /* allocate the gauge field */ create_Mvector(U, NELEMS(U)); create_Mvector(C, NELEMS(C)); create_Dvector(F, NELEMS(F)); I_seed = QDP_create_I(); QDP_I_eq_funci(I_seed, icoord, QDP_all); state = QDP_create_S(); QDP_S_eq_seed_i_I(state, i_seed, I_seed, QDP_all); for (mu = 0; mu < NELEMS(U); mu++) { QDP_M_eq_gaussian_S(U[mu], state, QDP_all); } for (i = 0; i < NELEMS(F); i++) { QDP_D_eq_gaussian_S(F[i], state, QDP_all); } /* build the clovers */ clover(C, U); /* initialize CLOVER */ if (QOP_CLOVER_init(&clover_state, lattice, network, node, primary, sublattice, NULL)) { printf0("CLOVER_init() failed\n"); goto end; } if (QOP_CLOVER_import_fermion(&c_f[0], clover_state, f_reader, F[0])) { printf0("CLOVER_import_fermion(0) failed\n"); goto end; } if (QOP_CLOVER_import_fermion(&c_f[1], clover_state, 
f_reader, F[1])) { printf0("CLOVER_import_fermion(1) failed\n"); goto end; } if (QOP_CLOVER_allocate_fermion(&c_f[2], clover_state)) { printf0("CLOVER_allocate_fermion(2) failed\n"); goto end; } if (QOP_CLOVER_allocate_fermion(&c_f[3], clover_state)) { printf0("CLOVER_allocate_fermion(3) failed\n"); goto end; } if (QOP_CLOVER_import_gauge(&c_g, clover_state, kappa, c_sw, u_reader, c_reader, NULL)) { printf("CLOVER_import_gauge() failed\n"); goto end; } QOP_CLOVER_D_operator(c_f[2], c_g, c_f[0]); QOP_CLOVER_export_fermion(f_writer, F[2], c_f[2]); QOP_CLOVER_D_operator_conjugated(c_f[3], c_g, c_f[1]); QOP_CLOVER_export_fermion(f_writer, F[3], c_f[3]); /* free CLOVER */ QOP_CLOVER_free_gauge(&c_g); for (i = 0; i < NELEMS(c_f); i++) QOP_CLOVER_free_fermion(&c_f[i]); QOP_CLOVER_fini(&clover_state); /* Compute plaquette */ plaq = plaquette(U); /* field norms */ for (i = 0; i < NELEMS(F); i++) QDP_r_eq_norm2_D(&n[i], F[i], QDP_all); /* Display the values */ printf0("plaquette = %g\n", plaq / (QDP_volume() * QDP_Nc * NDIM * (NDIM - 1) / 2 )); for (i = 0; i < NELEMS(F); i++) printf0(" |f|^2 [%d] = %20.10e\n", i, (double)(n[i])); /* Compute and display <f[1] f[2]> */ show_dot("1|D0", F[1], F[2]); /* Compute and display <f[3] f[0]> */ show_dot("X1|0", F[3], F[0]); QDP_destroy_S(state); QDP_destroy_I(I_seed); destroy_Mvector(U, NELEMS(U)); destroy_Mvector(C, NELEMS(C)); destroy_Dvector(F, NELEMS(F)); status = 0; end: /* shutdown QDP */ printf0("end\n"); QDP_finalize(); return status; }
int main(int argc, char *argv[]) { struct QOP_MDWF_State *mdwf_state = NULL; struct QOP_MDWF_Parameters *mdwf_params = NULL; QMP_thread_level_t qt = QMP_THREAD_SINGLE; int status = 1; int i; if (QMP_init_msg_passing(&argc, &argv, qt, &qt) != QMP_SUCCESS) { fprintf(stderr, "QMP_init() failed\n"); return 1; } for (i = 0; i < NELEM(b5); i++) { b5[i] = 0.1 * i * (NELEM(b5) - i); c5[i] = 0.1 * i * i * (NELEM(b5) - i); } self = QMP_get_node_number(); primary = QMP_is_primary_node(); if (argc != 7) { zprint("7 arguments expected, found %d", argc); zprint("usage: localheat Lx Ly Lz Lt Ls time"); QMP_finalize_msg_passing(); return 1; } for (i = 0; i < 4; i++) { mynetwork[i] = 1; mylocal[i] = atoi(argv[i+1]); mylattice[i] = mylocal[i] * mynetwork[i]; } mylocal[4] = mylattice[4] = atoi(argv[5]); total_sec = atoi(argv[6]); zshowv4("network", mynetwork); zshowv5("local lattice", mylocal); zshowv5("lattice", mylattice); zprint("total requested runtime %.0f sec", total_sec); #if 0 if (QMP_declare_logical_topology(mynetwork, 4) != QMP_SUCCESS) { zprint("declare_logical_top failed"); goto end; } getv(mynode, 0, QMP_get_logical_number_of_dimensions(), QMP_get_logical_coordinates()); #else { int i; for (i = 0; i < 4; i++) mynode[i] = 0; } #endif if (QOP_MDWF_init(&mdwf_state, mylattice, mynetwork, mynode, primary, getsub, NULL)) { zprint("MDWF_init() failed"); goto end; } zprint("MDWF_init() done"); if (QOP_MDWF_set_generic(&mdwf_params, mdwf_state, b5, c5, 0.123, 0.05)) { zprint("MDW_set_generic() failed"); goto end; } zprint("MDWF_set_generic() done"); if (do_run(mdwf_state, mdwf_params)) { zprint("float test failed"); goto end; } QOP_MDWF_fini(&mdwf_state); zprint("Heater test finished"); status = 0; end: QMP_finalize_msg_passing(); return status; }
int main (int argc, char** argv) { int i, nc; QMP_status_t status; int **smem, **rmem; QMP_msgmem_t *recvmem; QMP_msghandle_t *recvh; QMP_msgmem_t *sendmem; QMP_msghandle_t *sendh; struct perf_argv pargv; QMP_thread_level_t req, prv; /** * Simple point to point topology */ int dims[4] = {2,2,2,2}; int ndims = 1; //if(QMP_get_node_number()==0) //printf("starting init\n"); fflush(stdout); req = QMP_THREAD_SINGLE; status = QMP_init_msg_passing (&argc, &argv, req, &prv); if (status != QMP_SUCCESS) { fprintf (stderr, "QMP_init failed\n"); return -1; } if(QMP_get_node_number()==0) printf("finished init\n"); fflush(stdout); if (parse_options (argc, argv, &pargv) == -1) { if(QMP_get_node_number()==0) usage (argv[0]); exit (1); } { int maxdims = 4; int k=0; int nodes = QMP_get_number_of_nodes(); ndims = 0; while( (nodes&1) == 0 ) { if(ndims<maxdims) ndims++; else { dims[k] *= 2; k++; if(k>=maxdims) k = 0; } nodes /= 2; } if(nodes != 1) { QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes()); QMP_error(" must power of 2"); QMP_abort(1); } pargv.ndims = ndims; } status = QMP_declare_logical_topology (dims, ndims); if (status != QMP_SUCCESS) { fprintf (stderr, "Cannot declare logical grid\n"); return -1; } /* do a broadcast of parameter */ if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) { QMP_printf ("Broadcast parameter failed\n"); exit (1); } { int k=1; const int *lc = QMP_get_logical_coordinates(); for(i=0; i<ndims; i++) k += lc[i]; pargv.sender = k&1; } QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ", argv[0], pargv.num_channels, pargv.verify, pargv.option, pargv.size, pargv.loops, pargv.sender, strided_send, strided_recv, strided_array_send); fflush(stdout); /** * Create memory */ nc = pargv.num_channels; smem = (int **)malloc(nc*sizeof (int *)); rmem = (int **)malloc(nc*sizeof (int *)); sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof 
(QMP_msgmem_t)); recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t)); sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); QMP_barrier(); if(QMP_get_node_number()==0) printf("\n"); fflush(stdout); if(pargv.option & TEST_SIMUL) { int opts = pargv.option; pargv.option = TEST_SIMUL; if(QMP_get_node_number()==0) QMP_printf("starting simultaneous sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_simultaneous_send (smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished simultaneous sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_PINGPONG) { int opts = pargv.option; pargv.option = TEST_PINGPONG; if(QMP_get_node_number()==0) QMP_printf("starting ping pong sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); if(pargv.verify) test_pingpong_verify(smem, rmem, sendh, recvh, &pargv); else test_pingpong(smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished ping pong sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_ONEWAY) { int opts = pargv.option; pargv.option = TEST_ONEWAY; if(QMP_get_node_number()==0) QMP_printf("starting one way sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_oneway (smem, rmem, sendh, recvh, &pargv); if(!pargv.sender) check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, 
recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished one way sends"); fflush(stdout); pargv.option = opts; } /** * Free memory */ free (smem); free (rmem); free (sendh); free (recvh); free (sendmem); free (recvmem); QMP_finalize_msg_passing (); return 0; }
void make_shift_tables(int bound[2][4][4], halfspinor_array* chi1, halfspinor_array* chi2, halfspinor_array* recv_bufs[2][4], halfspinor_array* send_bufs[2][4], void (*QDP_getSiteCoords)(int coord[], int node, int linearsite), int (*QDP_getLinearSiteIndex)(const int coord[]), int (*QDP_getNodeNumber)(const int coord[])) { volatile int dir,i; const int my_node = QMP_get_node_number(); int coord[4]; int gcoord[4]; int gcoord2[4]; int linear; int **shift_table; int x,y,z,t; int *subgrid_size = getSubgridSize(); int mu; int offset; int cb; const int *node_coord = QMP_get_logical_coordinates(); int p; int site, index; InvTab4 *xinvtab; InvTab4 *invtab; int qdp_index; int my_index; int num; int offsite_found; /* Setup the subgrid volume for ever after */ subgrid_vol = 1; for(i=0; i < getNumDim(); ++i) { subgrid_vol *= getSubgridSize()[i]; } /* Get the checkerboard size for ever after */ subgrid_vol_cb = subgrid_vol / 2; /* Now I want to build the site table */ /* I want it cache line aligned? 
*/ xsite_table = (int *)malloc(sizeof(int)*subgrid_vol+63L); if(xsite_table == 0x0 ) { QMP_error("Couldnt allocate site table"); QMP_abort(1); } site_table = (int *)((((ptrdiff_t)(xsite_table))+63L)&(-64L)); xinvtab = (InvTab4 *)malloc(sizeof(InvTab4)*subgrid_vol + 63L); if(xinvtab == 0x0 ) { QMP_error("Couldnt allocate site table"); QMP_abort(1); } invtab = (InvTab4 *)((((ptrdiff_t)(xinvtab))+63L)&(-64L)); /* Inversity of functions check: Check that myLinearSiteIndex3D is in fact the inverse of mySiteCoords3D, and that QDP_getSiteCoords is the inverse of QDP_linearSiteIndex() */ for(p=0; p < 2; p++) { for(site=0; site < subgrid_vol_cb; site++) { /* Linear site index */ my_index = site + subgrid_vol_cb*p; QDP_getSiteCoords(gcoord, my_node, my_index); linear=QDP_getLinearSiteIndex(gcoord); if( linear != my_index ) { printf("P%d cb=%d site=%d : QDP_getSiteCoords not inverse of QDP_getLinearSiteIndex(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear); } mySiteCoords4D(gcoord, my_node, my_index); linear=myLinearSiteIndex4D(gcoord); if( linear != my_index ) { printf("P%d cb=%d site=%d : mySiteCoords3D not inverse of myLinearSiteIndex3D(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear); } } } /* Loop through sites - you can choose your path below */ /* This is a checkerboarded order which is identical hopefully to QDP++'s rb2 subset when QDP++ is in a CB2 layout */ for(p=0; p < 2; p++) { for(t=0; t < subgrid_size[3]; t++) { for(z=0; z < subgrid_size[2]; z++) { for(y=0; y < subgrid_size[1]; y++) { for(x=0; x < subgrid_size[0]/2; x++) { coord[0] = 2*x + p; coord[1] = y; coord[2] = z; coord[3] = t; /* Make global */ for(i=0; i < 4; i++) { coord[i] += subgrid_size[i]*node_coord[i]; } /* Index of coordinate -- NB this is not lexicographic but takes into account checkerboarding in QDP++ */ qdp_index = QDP_getLinearSiteIndex(coord); /* Index of coordinate in my layout. 
-- NB this is not lexicographic but takes into account my 3D checkerbaording */ my_index = myLinearSiteIndex4D(coord); site_table[my_index] = qdp_index; cb=parity(coord); linear = my_index%subgrid_vol_cb; invtab[qdp_index].cb=cb; invtab[qdp_index].linearcb=linear; } } } } } /* Site table transitivity check: for each site, convert to index in cb3d, convert to qdp index convert qdp_index to coordinate convert coordinate to back index in cb3d Check that your cb3d at the end is the same as you started with */ for(p=0; p < 2; p++) { for(site=0; site < subgrid_vol_cb; site++) { /* My local index */ my_index = site + subgrid_vol_cb*p; /* Convert to QDP index */ qdp_index = site_table[ my_index ]; /* Switch QDP index to coordinates */ QDP_getSiteCoords(gcoord, my_node,qdp_index); /* Convert back to cb3d index */ linear = myLinearSiteIndex4D(gcoord); /* Check new cb,cbsite index matches the old cb index */ if (linear != my_index) { printf("P%d The Circle is broken. My index=%d qdp_index=%d coords=%d,%d,%d,%d linear(=my_index?)=%d\n", my_node, my_index, qdp_index, gcoord[0],gcoord[1],gcoord[2],gcoord[3],linear); } } } /* Consistency check 2: Test mySiteCoords 3D for all 3d cb,cb3index convert to cb3d linear index (my_index) convert to qdp_index (lookup in site table) Now convert qdp_index and my_index both to coordinates. 
They should produce the same coordinates */ for(p=0; p < 2; p++) { for(site=0; site < subgrid_vol_cb; site++) { /* My local index */ my_index = site + subgrid_vol_cb*p; mySiteCoords4D(gcoord, my_node, my_index); qdp_index = site_table[ my_index ]; QDP_getSiteCoords(gcoord2, my_node,qdp_index); for(mu=0 ; mu < 4; mu++) { if( gcoord2[mu] != gcoord[mu] ) { printf("P%d: my_index=%d qdp_index=%d mySiteCoords=(%d,%d,%d,%d) QDPsiteCoords=(%d,%d,%d,%d)\n", my_node, my_index, qdp_index, gcoord[0], gcoord[1], gcoord[2], gcoord[3], gcoord2[0], gcoord2[1], gcoord2[2], gcoord2[3]); continue; } } } } /* Allocate the shift table */ /* The structure is as follows: There are 4 shift tables in order: [ Table 1 | Table 2 | Table 3 | Table 4 ] Table 1: decomp_scatter_index[mu][site] Table 2: decomp_hvv_scatter_index[mu][site] Table 3: recons_mvv_gather_index[mu][site] Table 4: recons_gather_index[mu][site] */ /* This 4 is for the 4 tables: Table 1-4*/ if ((shift_table = (int **)malloc(4*sizeof(int*))) == 0 ) { QMP_error("init_wnxtsu3dslash: could not initialize shift_table"); QMP_abort(1); } for(i=0; i < 4; i++) { /* This 4 is for the 4 comms dierctions: */ if ((shift_table[i] = (int *)malloc(4*subgrid_vol*sizeof(int))) == 0) { QMP_error("init_wnxtsu3dslash: could not initialize shift_table"); QMP_abort(1); } } /* Initialize the boundary counters */ for(cb=0; cb < 2; cb++) { for(dir=0; dir < 4; dir++) { bound[cb][0][dir] = 0; bound[cb][1][dir] = 0; bound[cb][2][dir] = 0; bound[cb][3][dir] = 0; } } for(cb=0; cb < 2; cb++) { for(site=0; site < subgrid_vol_cb; ++site) { index = cb*subgrid_vol_cb + site; /* Fetch site from site table */ qdp_index = site_table[index]; /* Get its coords */ QDP_getSiteCoords(coord, my_node, qdp_index); /* Loop over directions building up shift tables */ for(dir=0; dir < 4; dir++) { int fcoord[4], bcoord[4]; int fnode, bnode; int blinear, flinear; /* Backwards displacement*/ offs(bcoord, coord, dir, -1); bnode = QDP_getNodeNumber(bcoord); blinear = 
QDP_getLinearSiteIndex(bcoord); /* Forward displacement */ offs(fcoord, coord, dir, +1); fnode = QDP_getNodeNumber(fcoord); flinear = QDP_getLinearSiteIndex(fcoord); /* Scatter: decomp_{plus,minus} */ /* Operation: a^F(shift(x,type=0),dir) <- decomp(psi(x),dir) */ /* Send backwards - also called a receive from forward */ if (bnode != my_node) { /* Offnode */ /* Append to Tail 1, increase boundary count */ /* This is the correct code */ shift_table[DECOMP_SCATTER][dir+4*index] = subgrid_vol_cb + bound[1-cb][DECOMP_SCATTER][dir]; bound[1-cb][DECOMP_SCATTER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup */ shift_table[DECOMP_SCATTER][dir+4*index] = invtab[blinear].linearcb; } /* Scatter: decomp_hvv_{plus,minus} */ /* Operation: a^B(shift(x,type=1),dir) <- U^dag(x,dir)*decomp(psi(x),dir) */ /* Send forwards - also called a receive from backward */ if (fnode != my_node) { /* Offnode */ /* Append to Tail 1, increase boundary count */ shift_table[DECOMP_HVV_SCATTER][dir+4*index] = subgrid_vol_cb + bound[1-cb][DECOMP_HVV_SCATTER][dir]; bound[1-cb][DECOMP_HVV_SCATTER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup */ shift_table[DECOMP_HVV_SCATTER][dir+4*index] /* Onnode */ = invtab[flinear].linearcb ; } /* Gather: mvv_recons_{plus,minus} */ /* Operation: chi(x) <- \sum_dir U(x,dir)*a^F(shift(x,type=2),dir) */ /* Receive from forward */ if (fnode != my_node) { /* Offnode */ /* Append to Tail 2, increase boundary count */ shift_table[RECONS_MVV_GATHER][dir+4*index] = 2*subgrid_vol_cb + (bound[cb][RECONS_MVV_GATHER][dir]); bound[cb][RECONS_MVV_GATHER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup. 
Note this is a recons post shift, so the linear coordinate to invert is mine rather than the neighbours */ shift_table[RECONS_MVV_GATHER][dir+4*index] = invtab[qdp_index].linearcb ; } /* Gather: recons_{plus,minus} */ /* Operation: chi(x) += \sum_dir recons(a^B(shift(x,type=3),dir),dir) */ /* Receive from backward */ if (bnode != my_node) { shift_table[RECONS_GATHER][dir+4*index] = 2*subgrid_vol_cb + bound[cb][RECONS_GATHER][dir]; bound[cb][RECONS_GATHER][dir]++; } else { /* On node. Note the linear part of its (cb3, linear) bit, using a reverse lookup. Note this is a recons post shift, so the linear coordinate to invert is mine rather than the neighbours */ shift_table[RECONS_GATHER][dir+4*index] = invtab[qdp_index].linearcb ; } } } } /* Sanity check - make sure the sending and receiving counters match */ for(cb=0; cb < 2; cb++) { for(dir=0; dir < 4; dir++) { /* Sanity 1: Must have same number of boundary sites on each cb for a given operation */ for(i = 0; i < 4; i++) { if (bound[1-cb][i][dir] != bound[cb][i][dir]) { QMP_error("SSE Wilson dslash - make_shift_tables: type 0 diff. cb send/recv counts do not match: %d %d", bound[1-cb][i][dir],bound[cb][i][dir]); QMP_abort(1); } } } } /* Now I want to make the offset table into the half spinor temporaries */ /* The half spinor temporaries will look like this: dir=0 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ] dir=1 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ] ... And each of these blocks of half spinors will be sized to vol_cb sites (ie half volume only). The shift_table() for a given site and direction indexes into one of these lines. So the offset table essentially delineates which line one picks, by adding an offset of 3*subgrid_vol_cb*dir To the shift. The result from offset table, can be used directly as a pointer displacement on the temporaries. Perhaps the best way to condsider this is to consider a value of shift_table[type][dir/site] that lands in the body. 
The shift table merely gives me a site index. But the data needs to be different for each direction for that site index. Hence we need to replicate the body, for each dir. The 3xsubgrid_vol_cb is just there to take care of the buffers. Or another way to think of it is that there is a 'body element' index specified by the shift table lookup, and that dir is just the slowest varying index. */ /* 4 dims, 4 types, rest of the magic is to align the thingie */ xoffset_table = (halfspinor_array **)malloc(4*4*subgrid_vol*sizeof(halfspinor_array*)+63L); if( xoffset_table == 0 ) { QMP_error("init_wnxtsu3dslash: could not initialize offset_table[i]"); QMP_abort(1); } /* This is the bit what aligns straight from AMD Manual */ offset_table = (halfspinor_array**)((((ptrdiff_t)(xoffset_table)) + 63L) & (-64L)); /* Walk through the shift_table and remap the offsets into actual pointers */ /* DECOMP_SCATTER */ num=0; for(dir =0; dir < Nd; dir++) { /* Loop through all the sites. Remap the offsets either to local arrays or pointers */ offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[DECOMP_SCATTER][dir+4*site]; if( offset >= subgrid_vol_cb ) { /* Found an offsite guy. 
It's address must be to the send back buffer */ /* send to back index = recv from forward index = 0 */ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] = send_bufs[0][num]+(offset - subgrid_vol_cb); } else { /* Guy is onsite: This is DECOMP_SCATTER so offset to chi1 */ offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] = chi1+shift_table[DECOMP_SCATTER][dir+4*site]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { /* If we found an offsite guy, next direction has to go into the next dir part of the send bufs */ num++; } } /* DECOMP_HVV_SCATTER */ /* Restart num-s */ num=0; for(dir =0; dir <Nd; dir++) { offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[DECOMP_HVV_SCATTER][dir+4*site]; if( offset >= subgrid_vol_cb ) { /* Found an offsite guy. It's address must be to the send forw buffer */ /* send to forward / receive from backward index = 1 */ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] = send_bufs[1][num]+(offset - subgrid_vol_cb); } else { /* Guy is onsite. This is DECOMP_HVV_SCATTER so offset to chi2 */ offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] = chi2+shift_table[DECOMP_HVV_SCATTER][dir+4*site ]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { num++; } } /* RECONS_MVV_GATHER */ num=0; for(dir =0; dir <Nd; dir++) { offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[RECONS_MVV_GATHER][dir+4*site]; if( offset >= 2*subgrid_vol_cb ) { /* Found an offsite guy. 
It's address must be to the recv from front buffer */ /* recv_from front index = send to back index = 0 */ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] = recv_bufs[0][num]+(offset - 2*subgrid_vol_cb); } else { /* Guy is onsite */ /* This is RECONS_MVV_GATHER so offset with respect to chi1 */ offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] = chi1+shift_table[RECONS_MVV_GATHER][dir+4*site ]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { num++; } } /* RECONS_GATHER */ num=0; for(dir =0; dir <Nd; dir++) { offsite_found=0; for(site=0; site < subgrid_vol; site++) { offset = shift_table[RECONS_GATHER][dir+4*site]; if( offset >= 2*subgrid_vol_cb ) { /* Found an offsite guy. It's address must be to the recv from back buffer */ /* receive from back = send to forward index = 1*/ offsite_found++; offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER) ] = recv_bufs[1][num]+(offset - 2*subgrid_vol_cb); } else { /* Guy is onsite */ /* This is RECONS_GATHER so offset with respect to chi2 */ offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER ) ] = chi2+shift_table[RECONS_GATHER][dir+4*site ]+subgrid_vol_cb*dir; } } if( offsite_found > 0 ) { num++; } } /* Free shift table - it is no longer needed. We deal solely with offsets */ for(i=0; i < 4; i++) { free( (shift_table)[i] ); } free( shift_table ); free( xinvtab ); }