/**
 * Initialise the QMP communications layer.
 *
 * Checks that QMP is initialised, checks that the product of the declared
 * grid dimensions equals the number of QMP nodes, creates the topology and
 * installs it as the default, and finally picks the GPU this process will
 * use (stored in gpuid).
 *
 * @param ndim              number of entries in dims
 * @param dims              extent of the process grid in each dimension
 * @param rank_from_coords  mapping callback forwarded to comm_create_topology()
 * @param map_data          opaque user data forwarded to comm_create_topology()
 *
 * NOTE(review): errorQuda() is assumed not to return -- confirm against its
 * definition.
 */
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
{
  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP has not been initialized");
  }

  // the declared process grid must account for every QMP node
  int grid_size = 1;
  for (int i = 0; i < ndim; i++) {
    grid_size *= dims[i];
  }
  if (grid_size != QMP_get_number_of_nodes()) {
    errorQuda("Communication grid size declared via initCommsGridQuda() does not match"
              " total number of QMP nodes (%d != %d)", grid_size, QMP_get_number_of_nodes());
  }

  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data);
  comm_set_default_topology(topo);

  // determine which GPU this process will use (FIXME: adopt the scheme in comm_mpi.cpp)
  // device_count is initialised because cudaGetDeviceCount() may leave it
  // untouched on failure, and it is used as a modulus below.
  int device_count = 0;
  cudaError_t err = cudaGetDeviceCount(&device_count);
  if (err != cudaSuccess) {
    errorQuda("cudaGetDeviceCount failed: %s", cudaGetErrorString(err));
  }
  if (device_count <= 0) {
    errorQuda("No CUDA devices found");
  }

  gpuid = (comm_rank() % device_count);
}
/**
 * One-time initialisation of the library for this process.
 *
 * Enumerates the CUDA devices, selects one (depending on the comms back end
 * chosen at compile time), records whether this process sits on the first
 * and/or last slice of dimension 3 of the communication grid, binds the
 * process to the device, optionally applies a NUMA affinity, and initialises
 * the cache and BLAS layers.  Calls after the first return immediately.
 *
 * @param dev  requested device ordinal.
 *             - QMP_COMMS: (rank % deviceCount) is added to it.
 *             - MPI_COMMS: ignored, comm_gpuid() is used instead.
 *             - otherwise: a negative value selects the last device.
 *
 * NOTE(review): errorQuda() is assumed not to return -- confirm against its
 * definition.
 */
void initQuda(int dev)
{
  static int initialized = 0;
  if (initialized) {
    return;
  }
  initialized = 1;

#if (CUDA_VERSION >= 4000) && defined(MULTI_GPU)
  // check if CUDA_NIC_INTEROP is set to 1 in the environment
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if (cni_str == NULL) {
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
  } else if (atoi(cni_str) != 1) {
    // 'else' keeps atoi() away from a NULL pointer even if errorQuda() returns
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");
  }
#endif

  // initialised because a failing cudaGetDeviceCount() may not write to it
  int deviceCount = 0;
  cudaError_t err = cudaGetDeviceCount(&deviceCount);
  if (err != cudaSuccess) {
    errorQuda("cudaGetDeviceCount failed: %s", cudaGetErrorString(err));
  }
  if (deviceCount <= 0) {
    errorQuda("No devices supporting CUDA");
  }

  for (int i = 0; i < deviceCount; i++) {
    cudaDeviceProp deviceProp;
    err = cudaGetDeviceProperties(&deviceProp, i);
    if (err != cudaSuccess) {
      errorQuda("cudaGetDeviceProperties failed for device %d: %s", i, cudaGetErrorString(err));
    }
    printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);
  }

#ifdef QMP_COMMS
  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP is not initialized");
  }
  num_QMP = QMP_get_number_of_nodes();
  rank_QMP = QMP_get_node_number();

  // spread the ranks over the visible devices, offset by the caller's request
  dev += rank_QMP % deviceCount;
#elif defined(MPI_COMMS)
  comm_init();
  dev = comm_gpuid();
#else
  if (dev < 0) dev = deviceCount - 1;
#endif

  // Reject an ordinal that does not name a visible device before it is used
  // for the property query, cudaSetDevice() and the gpu_affinity[] lookup.
  if (dev < 0 || dev >= deviceCount) {
    errorQuda("Device %d is out of range (%d CUDA devices found)", dev, deviceCount);
  }

  // Used for applying the gauge field boundary condition
  if ( commCoords(3) == 0 ) qudaPt0 = true;
  else qudaPt0 = false;

  if ( commCoords(3) == commDim(3)-1 ) qudaPtNm1 = true;
  else qudaPtNm1 = false;

  cudaDeviceProp deviceProp;
  err = cudaGetDeviceProperties(&deviceProp, dev);
  if (err != cudaSuccess) {
    errorQuda("cudaGetDeviceProperties failed for device %d: %s", dev, cudaGetErrorString(err));
  }
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);
  }

  printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);
  err = cudaSetDevice(dev);
  if (err != cudaSuccess) {
    errorQuda("cudaSetDevice(%d) failed: %s", dev, cudaGetErrorString(err));
  }

#ifdef HAVE_NUMA
  // a negative gpu_affinity entry means no affinity is applied for that device
  if (numa_config_set) {
    if (gpu_affinity[dev] >= 0) {
      printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]);
      if (numa_run_on_node(gpu_affinity[dev]) != 0) {
        printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]);
      }
    }
  }
#endif

  initCache();
  quda::initBlas();
}
void wfm::init(WilsonArg *wilson_p) /* pointer to Wilson type structure */ { int spinor_words; /* size of the spinor field on the */ /* sublattice checkerboard */ int half_spinor_words; /* size of the spin-projected "half_spinors*/ /* on the sublattice checkerboard including*/ /* the communications padding */ int slx; /* x-direction size of node sublattice */ int sly; /* y-direction size of node sublattice */ int slz; /* z-direction size of node sublattice */ int slt; /* t-direction size of node sublattice */ int i; int mu; SloppyPrecision = wilson_p->SloppyPrecision; WFM_BGL = wilson_p->WFM_BGL; // if ( isBoss() ) printf("wfm::init setting up BG/L MMU state\n"); mmu_optimise(); mmu_print(); // CoreCount( wilson_p->CoreCount ); CoreCount( 1 ); if ( WFM_BGL ) PAD_HALF_SPINOR_SIZE = 12; else PAD_HALF_SPINOR_SIZE = 16; if ( WFM_BGL && (nthread > 1) && SloppyPrecision ) { if ( isBoss() ) printf("Bagel does not maintain L1 coherence in dual core + single precision mode on BlueGene\n"); if ( isBoss() ) printf("Get on to IBM to give me access to SWOA MMU options, or even better a non-cache image of DRAM\n"); if ( isBoss() ) printf("If they give me the tools, I'm happy to do the heroics of mainting sfw coherence\n"); if ( isBoss() ) printf("Bagel insanity check exiting\n"); exit(-1); } IR = wilson_p->instruction_reg_num; /*--------------------------------------------------------------------------*/ /* Set sublattice direction sizes */ /*--------------------------------------------------------------------------*/ local_latt[0] = wilson_p->local_latt[0]; local_latt[1] = wilson_p->local_latt[1]; local_latt[2] = wilson_p->local_latt[2]; local_latt[3] = wilson_p->local_latt[3]; slx = local_latt[0]; sly = local_latt[1]; slz = local_latt[2]; slt = local_latt[3]; #if (defined USE_COMMS_QMP) && (!defined UNIFORM_SEED_NO_COMMS) QMP_bool_t qmp_inited=QMP_is_initialized(); if( !qmp_inited ) { if ( isBoss() ) printf("QMP_not_initialized\n"); exit(-1); } const int *ncoor = 
QMP_get_logical_coordinates(); base_parity =(ncoor[0]*local_latt[0] + ncoor[1]*local_latt[1] + ncoor[2]*local_latt[2] + ncoor[3]*local_latt[3])&0x1; #else base_parity = 0; #endif /*--------------------------------------------------------------------------*/ /* Set periodic wrap back or not */ /*--------------------------------------------------------------------------*/ local_comm[0] = wilson_p->local_comm[0]; local_comm[1] = wilson_p->local_comm[1]; local_comm[2] = wilson_p->local_comm[2]; local_comm[3] = wilson_p->local_comm[3]; #ifdef UNIFORM_SEED_NO_COMMS for(int i=0;i<4;i++) if(local_comm[0]!=1){ fprintf(stderr,"wfm::local_comm[%d]=%d!\n",i,local_comm[i]); exit(-33); } #endif /*-----------------------------------------------------------------------*/ /* compute the subgrd volume of each chkbd ... at least two local dims */ /* must be even for this code to be correct. */ /*-----------------------------------------------------------------------*/ vol = (slx * sly * slz * slt)/2; nbound[0] = (sly * slz * slt)/2; nbound[1] = (slx * slz * slt)/2; nbound[2] = (slx * sly * slt)/2; nbound[3] = (slx * sly * slz)/2; allbound = nbound[0] + nbound[1] + nbound[2] + nbound[3]; if ( nbound[0] * slx * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even x logic bomb\n"); exit(-1); } if ( nbound[1] * sly * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even y logic bomb\n"); exit(-1); } if ( nbound[2] * slz * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even z logic bomb\n"); exit(-1); } if ( nbound[3] * slt * 2 != (slx*sly*slz*slt) ) { if ( isBoss() ) printf("wfm::init Even t logic bomb\n"); exit(-1); } /*------------------------------------------------------------------------*/ /* Check shape */ /*------------------------------------------------------------------------*/ if ( (slx&1) ) { if ( isBoss() ) printf("Bagel is refusing to run as x-sub latt is odd\n"); exit(-1); } if ( (sly&1) &&(slz&1)&&(slt&1) ) { if ( isBoss() ) 
printf("Bagel is refusing to run as y,z,t sub latts are all odd\n"); exit(-1); } /*--------------------------------------------------------------------------*/ /* Reserve memory for 1 temporary spinor (needed by mdagm) */ /*--------------------------------------------------------------------------*/ spinor_words = SPINOR_SIZE * vol; spinor_tmp = (Float *)ALLOC(spinor_words*sizeof(Float)*2); //printf("wfm_init::spinor_tmp=%p\n",spinor_tmp); // VRB.Flow(cname,fname,"spinor_tmp=%p\n",spinor_tmp); #ifdef USE_QALLOC // If we used QALLOC, and the ALLOC macro failed we can try // qalloc but without the QFAST flag. Even tho the spinor_tmp is // not communicated we leave the QCOMMS bit on in case it puts // spinor tmp into a better place in the memory map if(spinor_tmp == 0) { if ( isBoss() ) printf("BAGEL: Warning spinor_tmp has spilled out of Edram\n"); spinor_tmp = (Float *) qalloc(QCOMMS,spinor_words*sizeof(Float)*2); } #endif // USE QALLOC if(spinor_tmp == 0){ if ( isBoss() ) printf("wfm::spinor_tmp allocate\n"); exit(-1); } //~~ //~~ twisted mass fermions: sets WilsonArg.spinor_tmp tp //~~ address of temporary spinor in wfm class //~~ wilson_p->spinor_tmp = spinor_tmp; //~~ /*--------------------------------------------------------------------------*/ /* Reserve memory for the 4 forward and 4 backward spin projected half */ /* spinors. */ /*--------------------------------------------------------------------------*/ /*PAB 10/1/2001 */ half_spinor_words = NMinusPlus * ND * PAD_HALF_SPINOR_SIZE * vol; #ifndef USE_COMMS_QMP two_spinor = (Float *)ALLOC(half_spinor_words*sizeof(Float)); #ifdef USE_QALLOC // If we are using QALLOC and the ALLOC macro failed we can still // try to get slow memory. 
Leave on the QCOMMS bit for good memory map // placement if(two_spinor == 0) { if ( isBoss() ) printf("BAGEL : warning two spinors have spilled out of Edram\n"); two_spinor = (Float *)qalloc(QCOMMS,half_spinor_words*sizeof(Float)); } #endif // USE_QALLOC if(two_spinor == 0){ if ( isBoss() ) printf("wfm::two_spinor allocate\n"); exit(-1); } #else // Since two spinor is now communicated because of the Tface // receive I have to allocate it in the style of QMP two_spinor_mem_t = QMP_allocate_aligned_memory( half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, (QMP_MEM_COMMS|QMP_MEM_FAST)); if( two_spinor_mem_t == 0x0 ) { // Try slow allocation two_spinor_mem_t = QMP_allocate_aligned_memory( half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, QMP_MEM_COMMS); if( two_spinor_mem_t == 0x0 ) { if ( isBoss() ) printf("wfm_init::could not allocate two spinor_mem_t\n"); exit(-1); } } two_spinor = (Float *)QMP_get_memory_pointer(two_spinor_mem_t); if (two_spinor == 0x0) { if ( isBoss() ) printf("wfm::init QMP_get_memory_pointer returned NULL pointer from non NULL QMP_mem_t\n"); exit(-1); } #endif /*--------------------------------------------------------------------------*/ /* Reserve memory for the 4 forward and 4 backward spin projected half */ /* spinors. 
*/ /*--------------------------------------------------------------------------*/ for ( int pm = 0;pm<2;pm++ ) { for ( mu = 0 ; mu < 4 ; mu++) if (local_comm[mu]==0) { half_spinor_words = PAD_HALF_SPINOR_SIZE * nbound[mu]; // These things are (potentially) communicated so need QMP Style // allocation if using QMP // // Note: I am allocating the buffers in all directions regardless // of whether we are communicating in that dir or not (Copying CPS) #ifndef USE_COMMS_QMP // Not using QMP recv_bufs[pm][mu] = (Float *)ALLOC(half_spinor_words*sizeof(Float)); #ifdef USE_QALLOC // If ALLOC fails try slow memory but with QCOMMS bit still set if( recv_bufs[pm][mu] == 0x0 ) recv_bufs[pm][mu] = (Float *)qalloc(QCOMMS, half_spinor_words*sizeof(Float)); #endif if(recv_bufs[pm][mu] == 0){ if ( isBoss() ) printf("wfm::recv_bufs allocate\n"); exit(-1); } send_bufs[pm][mu]=(Float *)SEND_ALLOC(half_spinor_words*sizeof(Float)); #ifdef USE_QALLOC // If SEND ALLOC macro fails try slow memory but with QNONCACHE bit // still set if( send_bufs[pm][mu] == 0 ) send_bufs[pm][mu]=(Float *)qalloc(QNONCACHE, half_spinor_words*sizeof(Float)); #endif if(send_bufs[pm][mu] == 0){ if ( isBoss() ) printf("wfm::send_bufs allocate\n"); exit(-1); } #else /* QMP memory allocation: A little involved */ /* Must allocate "opaque" QMP_mem_t first and then get aligned pointer out of it. 
It's either what is below or a very complicated send alloc */ /* Peter in the CPS allocs recv_bufs with ALLOC = QCOMMS|FAST */ recv_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, (QMP_MEM_COMMS|QMP_MEM_FAST)); if( recv_bufs_mem_t[pm][mu] == 0x0 ) { // If QMP_allocate memory fails with FAST, try SLOW but keep COMMS recv_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, QMP_MEM_COMMS); if( recv_bufs_mem_t[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init recv_bufs_mem_t[%d][%d]: QMP_allocate_aligned_memory returned NULL\n", pm, mu); exit(-1); } } /* Now get the aligned pointer */ recv_bufs[pm][mu] =(Float *)QMP_get_memory_pointer(recv_bufs_mem_t[pm][mu]); if( recv_bufs[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init recv_bufs[%d][%d]: NULL aligned pointer in non NULL QMP_mem_t struct \n", pm, mu); exit(-1); } /* Now do the same for the send bufs */ /* In CPS Peter allocates as SEND_ALLOC = QNONCACHE | QFAST */ send_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, (QMP_MEM_NONCACHE|QMP_MEM_FAST)); if( send_bufs_mem_t[pm][mu] == 0x0 ) { // if allocator fails, try slow but still NONCACHE send_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float), WFM_ALIGN_ARG, QMP_MEM_NONCACHE); if( send_bufs_mem_t[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init: send_bufs_mem_t[%d][%d]: QMP_allocate_aligned_memory returned NULL\n", pm, mu); exit(-1); } } /* Now get the aligned pointer */ send_bufs[pm][mu] =(Float *)QMP_get_memory_pointer(send_bufs_mem_t[pm][mu]); if( send_bufs[pm][mu] == 0x0 ) { if ( isBoss() ) printf("wfm::init send_bufs[%d][%d]: NULL aligned pointer in non NULL QMP_mem_t struct \n", pm, mu); exit(-1); } #endif } } /*----------------------------------------------------------------------*/ /* Build the pointer table */ 
/*----------------------------------------------------------------------*/ pointers_init(); /*----------------------------------------------------------------------*/ /* Initialise the comms */ /*----------------------------------------------------------------------*/ comm_init(); }