cudaCloverField::cudaCloverField(const void *h_clov, const void *h_clov_inv,
                                 const QudaPrecision cpu_prec,
                                 const QudaCloverFieldOrder cpu_order,
                                 const CloverFieldParam &param)
  : CloverField(param), clover(0), norm(0), cloverInv(0), invNorm(0)
{
  if (h_clov) {
    clover = device_malloc(bytes);
    if (precision == QUDA_HALF_PRECISION) {
      norm = device_malloc(norm_bytes);
    }

    even = clover;
    odd = (char*)clover + bytes/2;

    evenNorm = norm;
    oddNorm = (char*)norm + norm_bytes/2;

    loadCPUField(clover, norm, h_clov, cpu_prec, cpu_order);
  }

  if (h_clov_inv) {
    cloverInv = device_malloc(bytes);
    if (precision == QUDA_HALF_PRECISION) {
      invNorm = device_malloc(norm_bytes); // allocate norm_bytes (not bytes) for the norm field
    }

    evenInv = cloverInv;
    oddInv = (char*)cloverInv + bytes/2;

    evenInvNorm = invNorm;
    oddInvNorm = (char*)invNorm + norm_bytes/2;

    total_bytes += bytes + norm_bytes;

    loadCPUField(cloverInv, invNorm, h_clov_inv, cpu_prec, cpu_order);

    // this is a hack to ensure that we can autotune the clover
    // operator when just using symmetric preconditioning
    if (!clover) {
      clover = cloverInv;
      even = evenInv;
      odd = oddInv;
    }
    if (!norm) {
      norm = invNorm;
      evenNorm = evenInvNorm;
      oddNorm = oddInvNorm;
    }
  }

#ifdef USE_TEXTURE_OBJECTS
  createTexObject(evenTex, evenNormTex, even, evenNorm);
  createTexObject(oddTex, oddNormTex, odd, oddNorm);
  createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm);
  createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm);
#endif
}
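// The even/odd assignments above are a single allocation carved into two half-volume
// parities. Below is a minimal standalone sketch of that pointer split, assuming plain
// cudaMalloc in place of QUDA's device_malloc; all names are illustrative, not QUDA API.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const size_t bytes = 1 << 20;                     // total buffer for both parities (illustrative)
  void *clover = nullptr;
  if (cudaMalloc(&clover, bytes) != cudaSuccess) return 1;

  // Even parity occupies the first half of the buffer, odd parity the second half.
  void *even = clover;
  void *odd  = static_cast<char*>(clover) + bytes/2;

  printf("even=%p odd=%p (offset %zu bytes)\n", even, odd, bytes/2);
  cudaFree(clover);
  return 0;
}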
static void do_storeLinkToCPU(Float *cpuGauge, FloatN *even, FloatN *odd, int bytes,
                              int Vh, int stride, QudaPrecision prec)
{
  int datalen = 4*Vh*gaugeSiteSize*sizeof(Float);
  double *unpackedDataEven = (double *) device_malloc(datalen);
  double *unpackedDataOdd = unpackedDataEven;

  // unpack even data kernel
  link_format_gpu_to_cpu((void*)unpackedDataEven, (void*)even, Vh, stride, prec, streams[0]);

#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
#else
  cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
#endif

  // unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpackedDataOdd, (void*)odd, Vh, stride, prec, streams[0]);

#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen,
                  cudaMemcpyDeviceToHost, streams[0]);
#else
  cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
#endif

  device_free(unpackedDataEven);
}
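// The function above stages unpacked links in a device buffer and copies them back to the
// host, using cudaMemcpyAsync on a stream when GPU_DIRECT (page-locked host memory) is
// enabled. The following is a self-contained sketch of that staging pattern under those
// assumptions; the trivial fill kernel stands in for link_format_gpu_to_cpu, and all names
// here are hypothetical.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void fill(double *out, int n) {           // stand-in for the unpack kernel
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = static_cast<double>(i);
}

int main() {
  const int n = 1 << 16;
  const size_t nbytes = n * sizeof(double);

  double *staging = nullptr;                          // device staging buffer
  cudaMalloc(&staging, nbytes);

  double *host = nullptr;                             // pinned host memory: needed for
  cudaMallocHost(&host, nbytes);                      // truly asynchronous D2H copies

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  fill<<<(n + 255) / 256, 256, 0, stream>>>(staging, n);
  cudaMemcpyAsync(host, staging, nbytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);                      // wait before touching the host data

  printf("host[42] = %g\n", host[42]);

  cudaStreamDestroy(stream);
  cudaFreeHost(host);
  cudaFree(staging);
  return 0;
}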
static ParityHw allocateParityHw(int *X, QudaPrecision precision)
{
  ParityHw ret;

  ret.precision = precision;
  ret.X[0] = X[0]/2;
  ret.volume = X[0]/2;
  for (int d=1; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  ret.Ns = 2;
  ret.length = ret.volume*ret.Nc*ret.Ns*2;

  if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
  else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
  else ret.bytes = ret.length*sizeof(short);

  ret.data = device_malloc(ret.bytes);
  cudaMemset(ret.data, 0, ret.bytes);

  if (precision == QUDA_HALF_PRECISION) {
    errorQuda("Half precision not supported at present"); // FIXME
    //ret.dataNorm = device_malloc(2*ret.bytes/spinorSiteSize);
  }

  return ret;
}
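// As a concrete check of the sizing above: for an 8^4 lattice at single precision, the
// parity half-lattice has 4*8*8*8 = 2048 sites; with Nc = 3, Ns = 2 and 2 reals per complex
// component, length = 24576 and bytes = 98304. A standalone sketch of that arithmetic
// (illustrative only, mirroring the fields used above):
#include <cstdio>

int main() {
  int X[4] = {8, 8, 8, 8};
  int volume = X[0] / 2;                  // checkerboard (parity) half-lattice
  for (int d = 1; d < 4; d++) volume *= X[d];

  const int Nc = 3, Ns = 2;
  int length = volume * Nc * Ns * 2;      // 2 reals per complex component
  size_t bytes = length * sizeof(float);  // single precision

  printf("volume=%d length=%d bytes=%zu\n", volume, length, bytes);  // 2048 24576 98304
  return 0;
}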
static void do_loadLinkToGPU_ex(const int *X, void *even, void *odd, void **cpuGauge,
                                QudaReconstructType reconstruct, int bytes, int Vh_ex,
                                int pad, QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
{
  int len = Vh_ex*gaugeSiteSize*prec;

  char *tmp_even = (char *) device_malloc(4*len);
  char *tmp_odd = tmp_even;

  // even links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (int i=0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
#else
      cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#else
    cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#endif
  }
  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh_ex, pad, 0, prec,
                         cpu_order, 0 /*default stream*/);

  // odd links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (int i=0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len,
                      cudaMemcpyHostToDevice);
#else
      cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len,
                 cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len,
                    cudaMemcpyHostToDevice);
#else
    cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len,
               cudaMemcpyHostToDevice);
#endif
  }
  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh_ex, pad, 0, prec,
                         cpu_order, 0 /*default stream*/);

  device_free(tmp_even);
}
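// The two branches above differ only in the host layout: QDP order keeps four separate
// per-direction link arrays, while MILC order keeps one contiguous array covering all four
// directions. A host-only sketch of packing either layout into a single staging buffer,
// with memcpy standing in for the host-to-device copies; sizes and names are illustrative.
#include <cstring>
#include <cstdlib>

// Pack into a staging buffer laid out as [dir0 | dir1 | dir2 | dir3].
static void pack_links(char *staging, void **qdp, void *milc, size_t len, bool qdp_order)
{
  if (qdp_order) {
    for (int i = 0; i < 4; i++) memcpy(staging + i*len, qdp[i], len);  // four separate copies
  } else {
    memcpy(staging, milc, 4*len);                                      // one contiguous copy
  }
}

int main() {
  const size_t len = 1024;                        // bytes per direction (illustrative)
  char *staging = (char*)malloc(4*len);

  char *qdp[4];                                   // QDP: four per-direction arrays
  for (int i = 0; i < 4; i++) qdp[i] = (char*)calloc(len, 1);
  pack_links(staging, (void**)qdp, nullptr, len, true);

  char *milc = (char*)calloc(4*len, 1);           // MILC: one contiguous array
  pack_links(staging, nullptr, milc, len, false);

  for (int i = 0; i < 4; i++) free(qdp[i]);
  free(milc);
  free(staging);
  return 0;
}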
cudaCloverField::cudaCloverField(const CloverFieldParam &param) : CloverField(param)
{
  if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_REFERENCE_FIELD_CREATE)
    errorQuda("Create type %d not supported", create);

  if (param.direct) {
    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      clover = device_malloc(bytes);
      if (precision == QUDA_HALF_PRECISION) norm = device_malloc(norm_bytes);
    } else {
      clover = param.clover;
      norm = param.norm;
    }

    even = clover;
    odd = (char*)clover + bytes/2;

    evenNorm = norm;
    oddNorm = (char*)norm + norm_bytes/2;

    total_bytes += bytes + norm_bytes;
  }

  if (param.inverse) {
    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      cloverInv = device_malloc(bytes);
      if (precision == QUDA_HALF_PRECISION) invNorm = device_malloc(norm_bytes);
    } else {
      cloverInv = param.cloverInv;
      invNorm = param.invNorm;
    }

    evenInv = cloverInv;
    oddInv = (char*)cloverInv + bytes/2;

    evenInvNorm = invNorm;
    oddInvNorm = (char*)invNorm + norm_bytes/2;

    total_bytes += bytes + norm_bytes;

    // this is a hack to ensure that we can autotune the clover
    // operator when just using symmetric preconditioning
    if (!param.direct) {
      clover = cloverInv;
      even = evenInv;
      odd = oddInv;
      norm = invNorm;
      evenNorm = evenInvNorm;
      oddNorm = oddInvNorm;
    }
  }

#ifdef USE_TEXTURE_OBJECTS
  createTexObject(evenTex, evenNormTex, even, evenNorm);
  createTexObject(oddTex, oddNormTex, odd, oddNorm);
  createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm);
  createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm);
#endif

  twisted = param.twisted;
  mu2 = param.mu2;
}
static void do_loadLinkToGPU(int *X, void *even, void *odd, void **cpuGauge,
                             void **ghost_cpuGauge, void **ghost_cpuGauge_diag,
                             QudaReconstructType reconstruct, int bytes, int Vh, int pad,
                             int Vsh_x, int Vsh_y, int Vsh_z, int Vsh_t,
                             QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
{
  int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);

  int i;
  int len = Vh*gaugeSiteSize*prec;

#ifdef MULTI_GPU
  int glen[4] = {
    Vsh_x*gaugeSiteSize*prec,
    Vsh_y*gaugeSiteSize*prec,
    Vsh_z*gaugeSiteSize*prec,
    Vsh_t*gaugeSiteSize*prec
  };
  int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;
#else
  int ghostV = 0;
#endif

  int glen_sum = ghostV*gaugeSiteSize*prec;
  char *tmp_even = (char *) device_malloc(4*(len+glen_sum));
  char *tmp_odd = tmp_even;

  // even links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (i=0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len,
                      cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef MULTI_GPU
    errorQuda("Multi-GPU for MILC gauge order is not supported");
#endif
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#endif
  }

  for (i=0; i < 4; i++) {
#ifdef MULTI_GPU
    // dir: the source direction
    char *dest = tmp_even + i*(len+glen_sum) + len;
    for (int dir = 0; dir < 4; dir++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir]) + i*2*glen[dir], glen[dir],
                      cudaMemcpyHostToDevice, streams[0]);
      cudaMemcpyAsync(dest + glen[dir],
                      ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + i*2*glen[dir],
                      glen[dir], cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir]) + i*2*glen[dir], glen[dir],
                 cudaMemcpyHostToDevice);
      cudaMemcpy(dest + glen[dir],
                 ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + i*2*glen[dir],
                 glen[dir], cudaMemcpyHostToDevice);
#endif
      dest += 2*glen[dir];
    }

    // fill in the diagonal ghost zones:
    // nu is the link direction i; mu iterates over 0..3 with mu != nu
    int nu = i;
    for (int mu = 0; mu < 4; mu++) {
      if (nu == mu) continue;
      // dir1 and dir2 are the two directions orthogonal to both nu and mu
      int dir1, dir2;
      for (dir1=0; dir1 < 4; dir1++) {
        if (dir1 != nu && dir1 != mu) break;
      }
      for (dir2=0; dir2 < 4; dir2++) {
        if (dir2 != nu && dir2 != mu && dir2 != dir1) break;
      }
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                      X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                      cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                 X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
#endif
    }
#endif // MULTI_GPU
  }
  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh, pad, ghostV, prec,
                         cpu_order, streams[0]);

  // odd links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (i=0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec,
                      len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec,
                 len, cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh*gaugeSiteSize*prec, 4*len,
                    cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh*gaugeSiteSize*prec, 4*len,
               cudaMemcpyHostToDevice);
#endif
  }

  for (i=0; i < 4; i++) {
#ifdef MULTI_GPU
    char *dest = tmp_odd + i*(len+glen_sum) + len;
    for (int dir = 0; dir < 4; dir++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir]) + glen[dir] + i*2*glen[dir],
                      glen[dir], cudaMemcpyHostToDevice, streams[0]);
      cudaMemcpyAsync(dest + glen[dir],
                      ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + glen[dir] + i*2*glen[dir],
                      glen[dir], cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir]) + glen[dir] + i*2*glen[dir],
                 glen[dir], cudaMemcpyHostToDevice);
      cudaMemcpy(dest + glen[dir],
                 ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + glen[dir] + i*2*glen[dir],
                 glen[dir], cudaMemcpyHostToDevice);
#endif
      dest += 2*glen[dir];
    }

    // fill in the diagonal ghost zones:
    // nu is the link direction i; mu iterates over 0..3 with mu != nu
    int nu = i;
    for (int mu = 0; mu < 4; mu++) {
      if (nu == mu) continue;
      // dir1 and dir2 are the two directions orthogonal to both nu and mu
      int dir1, dir2;
      for (dir1=0; dir1 < 4; dir1++) {
        if (dir1 != nu && dir1 != mu) break;
      }
      for (dir2=0; dir2 < 4; dir2++) {
        if (dir2 != nu && dir2 != mu && dir2 != dir1) break;
      }
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest + mu*Vh_2d_max*gaugeSiteSize*prec,
                      ((char*)ghost_cpuGauge_diag[nu*4+mu]) + X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                      X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                      cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest + mu*Vh_2d_max*gaugeSiteSize*prec,
                 ((char*)ghost_cpuGauge_diag[nu*4+mu]) + X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                 X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
#endif
    }
#endif // MULTI_GPU
  }
  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh, pad, ghostV, prec,
                         cpu_order, streams[0]);

  cudaStreamSynchronize(streams[0]);

  device_free(tmp_even);
}
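// The dir1/dir2 search in the diagonal-ghost fill simply picks, in ascending order, the two
// lattice directions orthogonal to both nu and mu. A small standalone check of that
// selection (same loop structure as above; illustrative only):
#include <cstdio>

int main() {
  for (int nu = 0; nu < 4; nu++) {
    for (int mu = 0; mu < 4; mu++) {
      if (nu == mu) continue;
      int dir1, dir2;
      for (dir1 = 0; dir1 < 4; dir1++) if (dir1 != nu && dir1 != mu) break;
      for (dir2 = 0; dir2 < 4; dir2++) if (dir2 != nu && dir2 != mu && dir2 != dir1) break;
      // e.g. nu=0, mu=1 gives dir1=2, dir2=3: the slab spans an X[2]*X[3]/2 plane.
      printf("nu=%d mu=%d -> dir1=%d dir2=%d\n", nu, mu, dir1, dir2);
    }
  }
  return 0;
}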