// Copy a device-resident parity hw field back to the host, unpacking from the
// packed double2/float2 device layout into the requested CPU precision.
static void retrieveParityHw(void *res, ParityHw hw, QudaPrecision cpu_prec)
{
  if (hw.precision != QUDA_HALF_PRECISION) {
    void *packedHw1 = pinned_malloc(hw.bytes);
    cudaMemcpy(packedHw1, hw.data, hw.bytes, cudaMemcpyDeviceToHost);

    if (hw.precision == QUDA_DOUBLE_PRECISION) {
      unpackParityHw((double*)res, (double2*)packedHw1, hw.volume);
    } else {
      if (cpu_prec == QUDA_DOUBLE_PRECISION) {
        unpackParityHw((double*)res, (float2*)packedHw1, hw.volume);
      } else {
        unpackParityHw((float*)res, (float2*)packedHw1, hw.volume);
      }
    }
    host_free(packedHw1);
  } else { // half precision: not implemented
    /*
    ParityHw tmp = allocateParityHw(hw.X, QUDA_SINGLE_PRECISION);
    copyCuda(tmp, hw);
    retrieveParityHw(res, tmp, cpu_prec, dirac_order);
    freeParityHw(tmp);
    */
  }
}
// Pack a host parity hw field into the device layout and copy it to the GPU.
static void loadParityHw(ParityHw ret, void *hw, QudaPrecision cpu_prec)
{
  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("CUDA double precision requires CPU double precision");
  }

  if (ret.precision != QUDA_HALF_PRECISION) {
    void *packedHw1 = pinned_malloc(ret.bytes);

    if (ret.precision == QUDA_DOUBLE_PRECISION) {
      packParityHw((double2*)packedHw1, (double*)hw, ret.volume);
    } else {
      if (cpu_prec == QUDA_DOUBLE_PRECISION) {
        packParityHw((float2*)packedHw1, (double*)hw, ret.volume);
      } else {
        packParityHw((float2*)packedHw1, (float*)hw, ret.volume);
      }
    }
    cudaMemcpy(ret.data, packedHw1, ret.bytes, cudaMemcpyHostToDevice);
    host_free(packedHw1);
  } else { // half precision: not implemented
    /*
    ParityHw tmp = allocateParityHw(ret.X, QUDA_SINGLE_PRECISION);
    loadParityHw(tmp, hw, cpu_prec, dirac_order);
    copyCuda(ret, tmp);
    freeParityHw(tmp);
    */
  }
}
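// A minimal round-trip sketch (not part of the original file), assuming a
// hypothetical allocateParityHw()/freeParityHw() pair as used in the disabled
// half-precision paths above: upload a host array, then read it back.
/*
void parityHwRoundTrip(void *host_in, void *host_out, const int X[4])
{
  ParityHw d = allocateParityHw(X, QUDA_SINGLE_PRECISION); // hypothetical allocator
  loadParityHw(d, host_in, QUDA_SINGLE_PRECISION);         // host -> device (packed float2)
  retrieveParityHw(host_out, d, QUDA_SINGLE_PRECISION);    // device -> host (unpacked)
  freeParityHw(d);                                         // hypothetical deallocator
}
*/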
CloverField::CloverField(const CloverFieldParam &param) :
  LatticeField(param), bytes(0), norm_bytes(0), nColor(3), nSpin(4),
  clover(0), norm(0), cloverInv(0), invNorm(0), order(param.order),
  create(param.create), trlog(static_cast<double*>(pinned_malloc(2*sizeof(double))))
{
  if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);

  if (order == QUDA_QDPJIT_CLOVER_ORDER && create != QUDA_REFERENCE_FIELD_CREATE)
    errorQuda("QDPJIT ordered clover fields only supported for reference fields");

  real_length = 2*volumeCB*nColor*nColor*nSpin*nSpin/2; // block-diagonal Hermitian (72 reals)
  length = 2*stride*nColor*nColor*nSpin*nSpin/2;

  bytes = length*precision;
  bytes = ALIGNMENT_ADJUST(bytes);
  if (precision == QUDA_HALF_PRECISION) {
    norm_bytes = sizeof(float)*2*stride*2; // 2 chirality
    norm_bytes = ALIGNMENT_ADJUST(norm_bytes);
  }

  // for twisted mass only:
  twisted = false; // param.twisted;
  mu2 = 0.0;       // param.mu2;
}
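// A minimal construction sketch (not in the original source), assuming the
// CloverFieldParam members the constructor above actually reads (order, create)
// plus the usual LatticeFieldParam geometry fields; the concrete field type and
// any member not read above are assumptions.
/*
CloverFieldParam param;
param.nDim = 4;                          // the constructor requires exactly 4 dimensions
for (int d = 0; d < 4; d++) param.x[d] = 16;
param.precision = QUDA_DOUBLE_PRECISION; // no separate norm field needed at double
param.order = QUDA_PACKED_CLOVER_ORDER;  // QDPJIT order would require REFERENCE create
param.create = QUDA_NULL_FIELD_CREATE;
cudaCloverField clover(param);           // hypothetical concrete subclass
*/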
static void gauge_force_test(void)
{
  int max_length = 6;

  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE, "", stdout);

  qudaGaugeParam = newQudaGaugeParam();

  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);

  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.reconstruct_sloppy = link_recon;
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  size_t gSize = qudaGaugeParam.cpu_prec; // QudaPrecision enum values equal the size in bytes

  void* sitelink;
  void* sitelink_1d;
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif

  // this is a hack to have the site links generated in 2d
  // and then copied to a 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for (int i = 0; i < 4; i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#else
  for (int i = 0; i < 4; i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif

  // fill the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);

  // copy the 2d sitelink to 1d MILC format
  for (int dir = 0; dir < 4; dir++) {
    for (int i = 0; i < V; i++) {
      char* src = ((char*)sitelink_2d[dir]) + i*gaugeSiteSize*qudaGaugeParam.cpu_prec;
      char* dst = ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }

  if (qudaGaugeParam.gauge_order == QUDA_MILC_GAUGE_ORDER) {
    sitelink = sitelink_1d;
  } else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }

#ifdef MULTI_GPU
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for (int i = 0; i < 4; i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1 = Z[0];
  int X2 = Z[1];
  int X3 = Z[2];
  int X4 = Z[3];

  // copy the interior of the local lattice into the extended lattice, which
  // carries a border of width 2 in every direction
  for (int i = 0; i < V_ex; i++) {
    int sid = i;
    int oddBit = 0;
    if (i >= Vh_ex) {
      sid = i - Vh_ex;
      oddBit = 1;
    }

    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    if (x1 < 2 || x1 >= X1+2 ||
        x2 < 2 || x2 >= X2+2 ||
        x3 < 2 || x3 >= X3+2 ||
        x4 < 2 || x4 >= X4+2) {
      continue; // border site: filled later by the ghost exchange
    }

    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;

    int idx = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + x1) >> 1;
    if (oddBit) idx += Vh;

    for (int dir = 0; dir < 4; dir++) {
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst + i*gaugeSiteSize*gSize, src + idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    } // dir
  } // i

  for (int dir = 0; dir < 4; dir++) {
    for (int i = 0; i < V_ex; i++) {
      char* src = ((char*)sitelink_ex_2d[dir]) + i*gaugeSiteSize*qudaGaugeParam.cpu_prec;
      char* dst = ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
#endif

  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  // initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);

  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for (unsigned int i = 0; i < sizeof(loop_coeff_f)/sizeof(float); i++) {
    loop_coeff_d[i] = loop_coeff_f[i];
  }

  void* loop_coeff;
  if (qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION) {
    loop_coeff = (void*)&loop_coeff_f[0];
  } else {
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);

  int** input_path_buf[4];
  for (int dir = 0; dir < 4; dir++) {
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));
    for (int i = 0; i < num_paths; i++) {
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if (dir == 0)      memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if (dir == 1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if (dir == 2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if (dir == 3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }

  struct timeval t0, t1;
  double timeinfo[3];
  /* run multiple times so the warmup overhead of the first run is excluded */
  for (int i = 0; i < attempts; i++) {
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths,
                          max_length, eb3, &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }

  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  // the flop count per site comes from the CPU implementation in MILC, gauge_force_imp.c
  int flops = 153004;

  if (verify_results) {
    for (int i = 0; i < attempts; i++) {
#ifdef MULTI_GPU
      // last arg = 0 means no communication optimization, i.e. exchange data in
      // all directions even if they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
                               QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
                            input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
                            input_path_buf, length, loop_coeff, num_paths);
#endif
    }

    int res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
    printf("Test %s\n", (1 == res) ? "PASSED" : "FAILED");
  }

  double perf = 1.0*flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f ms, total time = %.2f ms\n",
         timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance: %.2f GFLOPS\n", kernel_perf, perf);

  for (int dir = 0; dir < 4; dir++) {
    for (int i = 0; i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }

  host_free(sitelink_1d);
  for (int dir = 0; dir < 4; dir++) host_free(sitelink_2d[dir]);

#ifdef MULTI_GPU
  host_free(sitelink_ex_1d);
  for (int dir = 0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif

  host_free(mom);
  host_free(refmom);
  endQuda();
}
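// The QDP -> MILC reordering above appears twice in gauge_force_test(); a small
// sketch (not in the original source) capturing the same copy. QDP order keeps
// one contiguous array per direction; MILC order interleaves the four directions
// site by site, hence the (4*i + dir) destination index.
/*
static void qdpToMilc(void* milc, void* const qdp[4], int volume, size_t linkBytes)
{
  for (int dir = 0; dir < 4; dir++) {
    for (int i = 0; i < volume; i++) {
      char* src = (char*)qdp[dir] + i*linkBytes;
      char* dst = (char*)milc + (4*i + dir)*linkBytes; // site-major, direction-minor
      memcpy(dst, src, linkBytes);
    }
  }
}
*/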
void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
{
  if (cudaGauge->Precision() != cpuGauge->Precision()) {
    errorQuda("Mismatch between CPU precision and CUDA precision");
  }
  QudaPrecision prec = cudaGauge->Precision();

#ifdef MULTI_GPU
  const int* Z = cudaGauge->X();
#endif
  int pad = cudaGauge->Pad();
  // checkerboard surface volumes for each direction
  int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
  int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
  int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
  int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;

  static void* ghost_cpuGauge[4];
  static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
  static int allocated = 0;
  int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};

  if (!allocated) {
    for (int i = 0; i < 4; i++) {
      size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;
#ifdef GPU_DIRECT
      ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
#else
      ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
#endif
    }

    /*
     *  nu |     |
     *     |_____|
     *       mu
     */
    for (int nu = 0; nu < 4; nu++) {
      for (int mu = 0; mu < 4; mu++) {
        if (nu == mu) {
          ghost_cpuGauge_diag[nu*4+mu] = NULL;
        } else {
          // find the two remaining directions
          int dir1, dir2;
          for (dir1 = 0; dir1 < 4; dir1++) {
            if (dir1 != nu && dir1 != mu) break;
          }
          for (dir2 = 0; dir2 < 4; dir2++) {
            if (dir2 != nu && dir2 != mu && dir2 != dir1) break;
          }
          //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
          size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
#ifdef GPU_DIRECT
          ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
#else
          ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
#endif
          memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
        }
      }
    }

    allocated = 1;
  }

  int optflag = 1; // drives the ghost-zone exchange used to pack all links
  exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge,
                        ghost_cpuGauge_diag, prec, param, optflag);
#endif

  do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                   ghost_cpuGauge, ghost_cpuGauge_diag, cudaGauge->Reconstruct(), cudaGauge->Bytes(),
                   cudaGauge->VolumeCB(), pad, Vsh_x, Vsh_y, Vsh_z, Vsh_t, prec, cpuGauge->Order());

#ifdef MULTI_GPU
  if (!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
    for (int i = 0; i < 4; i++) {
      host_free(ghost_cpuGauge[i]);
    }
    for (int i = 0; i < 4; i++) {
      for (int j = 0; j < 4; j++) {
        if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
      }
    }
    allocated = 0;
  }
#endif
}
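// A minimal call sketch (not part of the original source). The two fields must
// share a precision, and param->X[] must describe the local lattice. As the
// cleanup branch above shows, setting QUDA_FAT_PRESERVE_COMM_MEM in
// param->preserve_gauge keeps the static ghost buffers alive across calls.
/*
void uploadLinks(cudaGaugeField* gpuLinks, cpuGaugeField* cpuLinks, QudaGaugeParam* param)
{
  param->preserve_gauge |= QUDA_FAT_PRESERVE_COMM_MEM; // reuse ghost buffers across calls
  loadLinkToGPU(gpuLinks, cpuLinks, param);
}
*/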
cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) :
  GaugeField(param), pinned(param.pinned)
{
  if (precision == QUDA_HALF_PRECISION) {
    errorQuda("CPU fields do not support half precision");
  }
  if (pad != 0) {
    errorQuda("CPU fields do not support non-zero padding");
  }
  if (reconstruct != QUDA_RECONSTRUCT_NO && reconstruct != QUDA_RECONSTRUCT_10) {
    errorQuda("Reconstruction type %d not supported", reconstruct);
  }
  if (reconstruct == QUDA_RECONSTRUCT_10 && order != QUDA_MILC_GAUGE_ORDER) {
    errorQuda("10-reconstruction only supported with MILC gauge order");
  }

  if (order == QUDA_QDP_GAUGE_ORDER) {
    // one contiguous array per dimension
    gauge = (void**)safe_malloc(nDim * sizeof(void*));

    for (int d = 0; d < nDim; d++) {
      size_t nbytes = volume * reconstruct * precision;
      if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
        gauge[d] = (pinned ? pinned_malloc(nbytes) : safe_malloc(nbytes));
        if (create == QUDA_ZERO_FIELD_CREATE) {
          memset(gauge[d], 0, nbytes);
        }
      } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
        gauge[d] = ((void**)param.gauge)[d];
      } else {
        errorQuda("Unsupported creation type %d", create);
      }
    }
  } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER ||
             order == QUDA_BQCD_GAUGE_ORDER) {
    // a single contiguous array for all dimensions
    if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
      size_t nbytes = nDim * volume * reconstruct * precision;
      gauge = (void**)(pinned ? pinned_malloc(nbytes) : safe_malloc(nbytes));
      if (create == QUDA_ZERO_FIELD_CREATE) {
        memset(gauge, 0, nbytes);
      }
    } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
      gauge = (void**)param.gauge;
    } else {
      errorQuda("Unsupported creation type %d", create);
    }
  } else {
    errorQuda("Unsupported gauge order type %d", order);
  }

  // no need to exchange data if this is a momentum field
  if (link_type != QUDA_ASQTAD_MOM_LINKS) {
    // the ghost zone is always 2-dimensional
    for (int i = 0; i < nDim; i++) {
      size_t nbytes = nFace * surface[i] * reconstruct * precision;
      ghost[i] = safe_malloc(nbytes); // no need to use pinned memory for this
    }
    // exchange the boundaries
    exchangeGhost();
  }

  // compute the fat link max now in case it is needed later (i.e., for half precision)
  if (param.compute_fat_link_max) fat_link_max = maxGauge(*this);
}
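// A minimal construction sketch (not in the original source), assuming the
// GaugeFieldParam members the constructor above actually reads (order, create,
// pinned, gauge, compute_fat_link_max) plus the usual LatticeFieldParam geometry
// fields; any member not read above is an assumption.
/*
GaugeFieldParam param;
for (int d = 0; d < 4; d++) param.x[d] = 16;
param.precision = QUDA_DOUBLE_PRECISION;  // half precision is rejected for CPU fields
param.pad = 0;                            // non-zero padding is rejected
param.reconstruct = QUDA_RECONSTRUCT_NO;  // RECONSTRUCT_10 would require MILC order
param.order = QUDA_QDP_GAUGE_ORDER;       // one array per dimension
param.create = QUDA_ZERO_FIELD_CREATE;    // allocate and zero-initialize
cpuGaugeField links(param);
*/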