void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data) { if ( QMP_is_initialized() != QMP_TRUE ) { errorQuda("QMP has not been initialized"); } int grid_size = 1; for (int i = 0; i < ndim; i++) { grid_size *= dims[i]; } if (grid_size != QMP_get_number_of_nodes()) { errorQuda("Communication grid size declared via initCommsGridQuda() does not match" " total number of QMP nodes (%d != %d)", grid_size, QMP_get_number_of_nodes()); } Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data); comm_set_default_topology(topo); // determine which GPU this process will use (FIXME: adopt the scheme in comm_mpi.cpp) int device_count; cudaGetDeviceCount(&device_count); if (device_count == 0) { errorQuda("No CUDA devices found"); } gpuid = (comm_rank() % device_count); }
/*
 * Test driver: when built with QMP_COMMS, start QMP, declare a 4-d logical
 * topology of shape 1 x 1 x 1 x (number of nodes), run SU3Test(), and shut
 * QMP down again. Without QMP_COMMS it simply runs SU3Test().
 *
 * Returns 0 on completion, 1 if QMP could not be initialized.
 */
int main(int argc, char **argv)
{
#ifdef QMP_COMMS
  int ndim=4, dims[4];
  QMP_thread_level_t tl;
  QMP_status_t status;

  /* Do not touch any other QMP call if start-up itself failed. */
  status = QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
  if (status != QMP_SUCCESS) {
    QMP_error("%s\n", QMP_error_string(status));
    return 1;
  }

  /* All nodes are laid out along the last dimension. */
  dims[0] = dims[1] = dims[2] = 1;
  dims[3] = QMP_get_number_of_nodes();

  status = QMP_declare_logical_topology(dims, ndim);
  if (status != QMP_SUCCESS) {
    QMP_error("%s\n", QMP_error_string(status));
    QMP_abort(1);
  }
#endif

  SU3Test();

#ifdef QMP_COMMS
  QMP_finalize_msg_passing();
#endif

  return 0;
}
void stupid_broadcast(void *send_buf, int count) { int node; int num_nodes = QMP_get_number_of_nodes(); QMP_msgmem_t request_msg = QMP_declare_msgmem(send_buf, count); QMP_msghandle_t request_mh; // Send to each node for(node=1; node < num_nodes; ++node) { if (QMP_get_node_number() == node) { request_mh = QMP_declare_receive_from(request_msg, 0, 0); if (QMP_start(request_mh) != QMP_SUCCESS) QMP_abort_string(1, "recvFromWait failed\n"); QMP_wait(request_mh); QMP_free_msghandle(request_mh); } if (QMP_is_primary_node()) { request_mh = QMP_declare_send_to(request_msg, node, 0); if (QMP_start(request_mh) != QMP_SUCCESS) QMP_abort_string(1, "sendToWait failed\n"); QMP_wait(request_mh); QMP_free_msghandle(request_mh); } } QMP_free_msgmem(request_msg); }
/**
 * Report how many nodes take part in the current QMP job.
 *
 * @return the value of QMP_get_number_of_nodes()
 */
int comm_size(void)
{
  const int node_count = QMP_get_number_of_nodes();
  return node_count;
}
void initQuda(int dev) { static int initialized = 0; if (initialized) { return; } initialized = 1; #if (CUDA_VERSION >= 4000) && defined(MULTI_GPU) //check if CUDA_NIC_INTEROP is set to 1 in the enviroment char* cni_str = getenv("CUDA_NIC_INTEROP"); if(cni_str == NULL){ errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n"); } int cni_int = atoi(cni_str); if (cni_int != 1){ errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n"); } #endif int deviceCount; cudaGetDeviceCount(&deviceCount); if (deviceCount == 0) { errorQuda("No devices supporting CUDA"); } for(int i=0; i<deviceCount; i++) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, i); printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name); } #ifdef QMP_COMMS int ndim; const int *dim; if ( QMP_is_initialized() != QMP_TRUE ) { errorQuda("QMP is not initialized"); } num_QMP=QMP_get_number_of_nodes(); rank_QMP=QMP_get_node_number(); dev += rank_QMP % deviceCount; ndim = QMP_get_logical_number_of_dimensions(); dim = QMP_get_logical_dimensions(); #elif defined(MPI_COMMS) comm_init(); dev=comm_gpuid(); #else if (dev < 0) dev = deviceCount - 1; #endif // Used for applying the gauge field boundary condition if( commCoords(3) == 0 ) qudaPt0=true; else qudaPt0=false; if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true; else qudaPtNm1=false; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (deviceProp.major < 1) { errorQuda("Device %d does not support CUDA", dev); } printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name); cudaSetDevice(dev); #ifdef HAVE_NUMA if(numa_config_set){ if(gpu_affinity[dev] >=0){ printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]); if(numa_run_on_node(gpu_affinity[dev]) != 0){ printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]); } } } #endif initCache(); quda::initBlas(); }
int qio_test(int output_volfmt, int output_serpar, int ildgstyle, int input_volfmt, int input_serpar, int argc, char *argv[]){ float array_in[NARRAY], array_out[NARRAY]; float *field_in[NREAL], *subset_in[NREAL], *field_out[NREAL], *subset_out[NREAL]; suN_matrix *field_su3_out[NMATRIX], *field_su3_in[NMATRIX]; QIO_Writer *outfile; QIO_Reader *infile; float diff_field = 0, diff_array = 0, diff_su3 = 0, diff_subset = 0; QMP_thread_level_t provided; int status; int sites_on_node = 0; int i,volume; char filename[] = "binary_test"; int dim = 4; int lower[4] = {1, 0, 0, 2}; int upper[4] = {2, 3, 3, 2}; char myname[] = "qio_test"; /* Start message passing */ QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &provided); this_node = mynode(); printf("%s(%d) QMP_init_msg_passing done\n",myname,this_node); /* Lattice dimensions */ lattice_dim = 4; lattice_size[0] = 8; lattice_size[1] = 4; lattice_size[2] = 4; lattice_size[3] = 4; volume = 1; for(i = 0; i < lattice_dim; i++){ volume *= lattice_size[i]; } /* Set the mapping of coordinates to nodes */ if(setup_layout(lattice_size, 4, QMP_get_number_of_nodes())!=0) return 1; printf("%s(%d) layout set for %d nodes\n",myname,this_node, QMP_get_number_of_nodes()); sites_on_node = num_sites(this_node); /* Build the layout structure */ layout.node_number = node_number; layout.node_index = node_index; layout.get_coords = get_coords; layout.num_sites = num_sites; layout.latsize = lattice_size; layout.latdim = lattice_dim; layout.volume = volume; layout.sites_on_node = sites_on_node; layout.this_node = this_node; layout.number_of_nodes = QMP_get_number_of_nodes(); /* Open the test output file */ outfile = open_test_output(filename, output_volfmt, output_serpar, ildgstyle, myname); if(outfile == NULL)return 1; /* If this is not the ILDG file test */ if(ildgstyle == QIO_ILDGNO){ /* Create the test output field */ status = vcreate_R(field_out, NREAL); if(status)return status; /* Set some values for the field */ vset_R(field_out, NREAL); 
/* Write the real test field */ status = write_real_field(outfile, NREAL, field_out, myname); if(status)return status; /* Write a subset of the real test field */ status = write_real_field_subset(outfile, NREAL, field_out, lower, upper, dim, myname); if(status)return status; /* Set some values for the global array */ for(i = 0; i < NARRAY; i++) array_out[i] = i; /* Write the real global array */ status = write_real_global(outfile, NARRAY, array_out, myname); if(status)return status; } /* Create the test output su3 field */ status = vcreate_M(field_su3_out, NMATRIX); if(status)return status; /* Set some values for the su3 field */ vset_M(field_su3_out, NMATRIX); /* Write the su3 test field */ status = write_su3_field(outfile, NMATRIX, field_su3_out, myname); if(status)return status; /* Close the file */ QIO_close_write(outfile); printf("%s(%d): Closed file for writing\n",myname,this_node); /* Set up a dummy input field */ status = vcreate_R(field_in, NREAL); if(status)return status; /* Set up a dummy input field for subset */ status = vcreate_R(subset_in, NREAL); if(status)return status; /* Set up a dummy input SU(N) field */ status = vcreate_M(field_su3_in, NMATRIX); if(status)return status; /* Open the test file for reading */ infile = open_test_input(filename, input_volfmt, input_serpar, myname); if(infile == NULL)return 1; if(ildgstyle == QIO_ILDGNO){ /* Peek at the field record */ status = peek_record_info(infile, myname); if(status != QIO_SUCCESS)return status; /* Skip the record */ #if(0) /* Skip the field */ status = QIO_next_record(infile); if(status != QIO_SUCCESS)return status; #else /* Read the field record */ printf("%s(%d) reading real field\n",myname,this_node); fflush(stdout); status = read_real_field(infile, NREAL, field_in, myname); if(status)return status; #endif /* Read the subset of the field */ printf("%s(%d) reading subset of real field\n", myname,this_node); fflush(stdout); status = read_real_field_subset(infile, NREAL, subset_in, myname); 
if(status)return status; /* Read the global array record */ printf("%s(%d) reading global field\n",myname,this_node); fflush(stdout); status = read_real_global(infile, NARRAY, array_in, myname); if(status)return status; } /* Read the su3 field record */ printf("%s(%d) reading su3 field\n",myname,this_node); fflush(stdout); status = read_su3_field(infile, NMATRIX, field_su3_in, myname); if(status)return status; /* Close the file */ QIO_close_read(infile); printf("%s(%d): Closed file for reading\n",myname,this_node); if(ildgstyle == QIO_ILDGNO){ /* Compare the input and output fields */ diff_field = vcompare_R(field_out, field_in, NREAL); if(this_node == 0){ printf("%s(%d): Comparison of in and out real fields |in - out|^2 = %e\n", myname,this_node,diff_field); } /* Create the subset output field */ status = vcreate_R(subset_out, NREAL); if(status)return status; /* Copy the subset */ vsubset_R(subset_out, field_out, lower, upper, NREAL); /* Compare the input and output subsets */ diff_subset = vcompare_R(subset_out, subset_in, NREAL); if(this_node == 0){ printf("%s(%d): Comparison of subsets of in and out real fields |in - out|^2 = %e\n", myname,this_node,diff_subset); } /* Compare the input and output global arrays */ diff_array = vcompare_r(array_out, array_in, NREAL); if(this_node == 0){ printf("%s(%d): Comparison of in and out real global arrays |in - out|^2 = %e\n", myname, this_node, diff_array); } } /* Compare the input and output suN fields */ diff_su3 = vcompare_M(field_su3_out, field_su3_in, NMATRIX); if(this_node == 0){ printf("%s(%d): Comparison of in and out suN fields |in - out|^2 = %e\n", myname, this_node, diff_field); } /* Clean up */ if(ildgstyle == QIO_ILDGNO){ vdestroy_R(field_out, NREAL); vdestroy_R(field_in, NREAL); vdestroy_R(subset_in, NREAL); vdestroy_R(subset_out, NREAL); } vdestroy_M(field_su3_in, NMATRIX); vdestroy_M(field_su3_out, NMATRIX); /* Shut down QMP */ QMP_finalize_msg_passing(); /* Report result */ if(diff_field + diff_subset + 
diff_su3 + diff_array > 0){ printf("%s(%d): Test failed\n",myname,this_node); return 1; } printf("%s(%d): Test passed\n",myname,this_node); return 0; }
static void eo_setup(QDP_Lattice *lat, void *args) { mLattice *S = args; QDP_allocate_lattice_params(lat, sizeof (params)); params *p = QDP_get_lattice_params(lat); p->S = S; if (QMP_get_msg_passing_type() != QMP_SWITCH) { int nd2 = QMP_get_allocated_number_of_dimensions(); const int *nsquares2 = QMP_get_allocated_dimensions(); int i; for (i = 0; i < S->rank; i++) { S->net[i] = (i < nd2) ? nsquares2[i] : 1; } } else { /* not QMP_GRID */ int squaresize[QLUA_MAX_LATTICE_RANK]; int extrafactors[QLUA_MAX_LATTICE_RANK]; int i; for (i = 0; i < S->rank; i++) { squaresize[i] = S->dim[i]; extrafactors[i] = 1; S->net[i] = 1; } /* Figure out dimensions of rectangle */ int n = QMP_get_number_of_nodes(); /* nodes to factor */ int k = MAXPRIMES-1; while (n > 1) { /* figure out which prime to divide by starting with largest */ /* if no factor found, assume n is prime */ while ((k >= 0) && (n % prime[k] != 0)) --k; int pfac = (k>=0) ? prime[k] : n; /* figure out which direction to divide */ /* find largest divisible dimension of h-cubes */ /* if one direction with largest dimension has already been divided, divide it again. Otherwise divide first direction with largest dimension. 
*/ int j = -1; int i; for (i = 0; i < S->rank; i++) { if (squaresize[i] % pfac == 0) { if ((j<0) || (extrafactors[j] * squaresize[i] > extrafactors[i] * squaresize[j])) { j = i; } else if (extrafactors[j] * squaresize[i] == extrafactors[i] * squaresize[j]) { if ((S->net[j] == 1) || (S->net[i] != 1)) j = i; } } } /* This can fail if we run out of prime factors in the dimensions */ /* then just choose largest dimension */ if (j < 0) { int i; for (i = 0; i < S->rank; i++) { if ((j<0) || (extrafactors[j] * squaresize[i] > extrafactors[i] * squaresize[j]) ) { j = i; } else if (extrafactors[j] * squaresize[i] == extrafactors[i] * squaresize[j]) { if((S->net[j] == 1) || (S->net[i] != 1)) j = i; } } n /= pfac; extrafactors[j] *= pfac; S->net[j] *= pfac; } else { n /= pfac; squaresize[j] /= pfac; S->net[j] *= pfac; } } } /* not QMP_GRID */ int mc[QLUA_MAX_LATTICE_RANK]; int i; S->node = QDP_this_node; node2coord(mc, QDP_this_node, S); for (i = 0; i < S->rank; i++) { int x = mc[i]; mc[i] = x + 1; if (mc[i] == S->net[i]) mc[i] = 0; S->neighbor_up[i] = coord2node(mc, S); mc[i] = x - 1; if (mc[i] < 0) mc[i] = S->net[i] - 1; S->neighbor_down[i] = coord2node(mc, S); mc[i] = x; } }
void init_qmp(int * argc, char ***argv) { #if 0 printf("init_qmp(%d %p)\n",*argc,*argv); for(int i = 0; i<*argc;i++){ printf("argv[%d](before)=%s\n",i,(*argv)[i]); } #endif #if 0 spi_init(); #endif QMP_thread_level_t prv; #ifndef UNIFORM_SEED_NO_COMMS QMP_status_t init_status = QMP_init_msg_passing(argc, argv, QMP_THREAD_SINGLE, &prv); if (init_status) printf("QMP_init_msg_passing returned %d\n",init_status); peRank = QMP_get_node_number(); peNum = QMP_get_number_of_nodes(); if(!peRank)printf("QMP_init_msg_passing returned %d\n",init_status); if (init_status != QMP_SUCCESS) { QMP_error("%s\n",QMP_error_string(init_status)); } // check QMP thread level // Added by Hantao if(peRank == 0) { switch(prv) { case QMP_THREAD_SINGLE: printf("QMP thread level = QMP_THREAD_SINGLE\n"); break; case QMP_THREAD_FUNNELED: printf("QMP thread level = QMP_THREAD_FUNNELED\n"); break; case QMP_THREAD_SERIALIZED: printf("QMP thread level = QMP_THREAD_SERIALIZED\n"); break; case QMP_THREAD_MULTIPLE: printf("QMP thread level = QMP_THREAD_MULTIPLE\n"); break; default: printf("QMP thread level = no idea what this is, boom!\n"); } } //Check to make sure that this machine is a GRID machine //Exit if not GRID machine QMP_ictype qmp_type = QMP_get_msg_passing_type(); //Get information about the allocated machine peNum = QMP_get_number_of_nodes(); NDIM = QMP_get_allocated_number_of_dimensions(); peGrid = QMP_get_allocated_dimensions(); pePos = QMP_get_allocated_coordinates(); if(peRank==0){ for(int i = 0; i<*argc;i++){ printf("argv[%d])(after)=%s\n",i,(*argv)[i]); } } #else QMP_status_t init_status = QMP_SUCCESS; peRank=0; peNum=1; NDIM=4; #endif //#if (TARGET == BGL) || (TARGET == BGP) if (NDIM>5){ peNum = 1; for(int i = 0;i<5;i++) peNum *= peGrid[i]; peRank = peRank % peNum; } int if_print=1; for(int i = 0;i<NDIM;i++) if (pePos[i]>=2) if_print=0; if (if_print){ printf("Rank=%d Num=%d NDIM=%d\n",peRank,peNum,NDIM); printf("dim:"); for(int i = 0;i<NDIM;i++) printf(" %d",peGrid[i]); printf("\n"); 
printf("pos:"); for(int i = 0;i<NDIM;i++) printf(" %d",pePos[i]); printf("\n"); #if 0 int rc; BGLPersonality pers; rts_get_personality(&pers, sizeof(pers)); printf("from personality: %d %d %d %d\n",pers.xCoord,pers.yCoord,pers.zCoord,rts_get_processor_id()); #endif } // printf("from personality:\n"); #if 0 if ( (qmp_type!= QMP_GRID) && (qmp_type !=QMP_MESH) ) { QMP_error("CPS on QMP only implemented for GRID or MESH, not (%d) machines\n",qmp_type); } #endif // printf("QMP_declare_logical_topology(peGrid, NDIM)\n"); #ifndef UNIFORM_SEED_NO_COMMS //Declare the logical topology (Redundant for GRID machines) if (QMP_declare_logical_topology(peGrid, NDIM) != QMP_SUCCESS) { QMP_error("Node %d: Failed to declare logical topology\n",peRank); exit(-4); } #endif initialized = true; printf("Rank=%d init_qmp() done\n",peRank); }
int main (int argc, char** argv) { int i, nc; QMP_status_t status; int **smem, **rmem; QMP_msgmem_t *recvmem; QMP_msghandle_t *recvh; QMP_msgmem_t *sendmem; QMP_msghandle_t *sendh; struct perf_argv pargv; QMP_thread_level_t req, prv; /** * Simple point to point topology */ int dims[4] = {2,2,2,2}; int ndims = 1; //if(QMP_get_node_number()==0) //printf("starting init\n"); fflush(stdout); req = QMP_THREAD_SINGLE; status = QMP_init_msg_passing (&argc, &argv, req, &prv); if (status != QMP_SUCCESS) { fprintf (stderr, "QMP_init failed\n"); return -1; } if(QMP_get_node_number()==0) printf("finished init\n"); fflush(stdout); if (parse_options (argc, argv, &pargv) == -1) { if(QMP_get_node_number()==0) usage (argv[0]); exit (1); } { int maxdims = 4; int k=0; int nodes = QMP_get_number_of_nodes(); ndims = 0; while( (nodes&1) == 0 ) { if(ndims<maxdims) ndims++; else { dims[k] *= 2; k++; if(k>=maxdims) k = 0; } nodes /= 2; } if(nodes != 1) { QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes()); QMP_error(" must power of 2"); QMP_abort(1); } pargv.ndims = ndims; } status = QMP_declare_logical_topology (dims, ndims); if (status != QMP_SUCCESS) { fprintf (stderr, "Cannot declare logical grid\n"); return -1; } /* do a broadcast of parameter */ if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) { QMP_printf ("Broadcast parameter failed\n"); exit (1); } { int k=1; const int *lc = QMP_get_logical_coordinates(); for(i=0; i<ndims; i++) k += lc[i]; pargv.sender = k&1; } QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ", argv[0], pargv.num_channels, pargv.verify, pargv.option, pargv.size, pargv.loops, pargv.sender, strided_send, strided_recv, strided_array_send); fflush(stdout); /** * Create memory */ nc = pargv.num_channels; smem = (int **)malloc(nc*sizeof (int *)); rmem = (int **)malloc(nc*sizeof (int *)); sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof 
(QMP_msgmem_t)); recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t)); sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t)); QMP_barrier(); if(QMP_get_node_number()==0) printf("\n"); fflush(stdout); if(pargv.option & TEST_SIMUL) { int opts = pargv.option; pargv.option = TEST_SIMUL; if(QMP_get_node_number()==0) QMP_printf("starting simultaneous sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_simultaneous_send (smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished simultaneous sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_PINGPONG) { int opts = pargv.option; pargv.option = TEST_PINGPONG; if(QMP_get_node_number()==0) QMP_printf("starting ping pong sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); if(pargv.verify) test_pingpong_verify(smem, rmem, sendh, recvh, &pargv); else test_pingpong(smem, rmem, sendh, recvh, &pargv); check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished ping pong sends\n"); fflush(stdout); pargv.option = opts; } if(pargv.option & TEST_ONEWAY) { int opts = pargv.option; pargv.option = TEST_ONEWAY; if(QMP_get_node_number()==0) QMP_printf("starting one way sends"); fflush(stdout); for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) { pargv.size = i; create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv); test_oneway (smem, rmem, sendh, recvh, &pargv); if(!pargv.sender) check_mem(rmem, ndims, nc, i); free_msgs(smem, rmem, sendmem, 
recvmem, sendh, recvh, ndims, nc); } if(QMP_get_node_number()==0) QMP_printf("finished one way sends"); fflush(stdout); pargv.option = opts; } /** * Free memory */ free (smem); free (rmem); free (sendh); free (recvh); free (sendmem); free (recvmem); QMP_finalize_msg_passing (); return 0; }