Example #1
  cudaCloverField::cudaCloverField(const void *h_clov, const void *h_clov_inv, 
				   const QudaPrecision cpu_prec, 
				   const QudaCloverFieldOrder cpu_order,
				   const CloverFieldParam &param)
    : CloverField(param), clover(0), norm(0), cloverInv(0), invNorm(0)
  {
    if (h_clov) {
      clover = device_malloc(bytes);
      if (precision == QUDA_HALF_PRECISION) {
	norm = device_malloc(norm_bytes);
      }

      even = clover;
      odd = (char*)clover + bytes/2;
    
      evenNorm = norm;
      oddNorm = (char*)norm + norm_bytes/2;

      total_bytes += bytes + norm_bytes;

      loadCPUField(clover, norm, h_clov, cpu_prec, cpu_order);
    } 

    if (h_clov_inv) {
      cloverInv = device_malloc(bytes);
      if (precision == QUDA_HALF_PRECISION) {
	invNorm = device_malloc(norm_bytes);
      }

      evenInv = cloverInv;
      oddInv = (char*)cloverInv + bytes/2;
    
      evenInvNorm = invNorm;
      oddInvNorm = (char*)invNorm + norm_bytes/2;

      total_bytes += bytes + norm_bytes;

      loadCPUField(cloverInv, invNorm, h_clov_inv, cpu_prec, cpu_order);

      // this is a hack to ensure that we can autotune the clover
      // operator when just using symmetric preconditioning
      if (!clover) {
	clover = cloverInv;
	even = evenInv;
	odd = oddInv;
      }
      if (!norm) {
	norm = invNorm;
	evenNorm = evenInvNorm;
	oddNorm = oddInvNorm;
      }
    } 

#ifdef USE_TEXTURE_OBJECTS
    createTexObject(evenTex, evenNormTex, even, evenNorm);
    createTexObject(oddTex, oddNormTex, odd, oddNorm);
    createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm);
    createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm);
#endif
    
  }
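A minimal sketch of how this constructor might be driven from application code, assuming the CloverFieldParam has already been filled in elsewhere; the host buffers, their size, and the enum values chosen here are illustrative and not taken from the original.

    // hypothetical call site: upload a host clover term and its inverse
    void *h_clov     = safe_malloc(host_bytes);   // packed host clover term (host_bytes assumed)
    void *h_clov_inv = safe_malloc(host_bytes);   // packed host inverse
    // ... fill h_clov / h_clov_inv from the application ...

    cudaCloverField *clov =
      new cudaCloverField(h_clov, h_clov_inv,
                          QUDA_DOUBLE_PRECISION,      // cpu_prec
                          QUDA_PACKED_CLOVER_ORDER,   // cpu_order
                          param);                     // previously constructed CloverFieldParam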
Example #2
  static void do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd, 
				int bytes, int Vh, int stride, QudaPrecision prec) 
  {  
    int datalen = 4*Vh*gaugeSiteSize*sizeof(Float);

    // single device scratch buffer, reused for the even and then the odd half
    // since all of the work below is issued in order on streams[0]
    double *unpackedDataEven = (double *) device_malloc(datalen);
    double *unpackedDataOdd = unpackedDataEven;

    //unpack even data kernel
    link_format_gpu_to_cpu((void*)unpackedDataEven, (void*)even, Vh, stride, prec, streams[0]);

#ifdef GPU_DIRECT 
    cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
#else
    cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
#endif
  
    //unpack odd data kernel
    link_format_gpu_to_cpu((void*)unpackedDataOdd, (void*)odd, Vh, stride, prec, streams[0]);
#ifdef GPU_DIRECT 
    cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost, streams[0]);  
#else
    cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);  
#endif
  
    // ensure the copies on streams[0] have completed before the scratch buffer
    // is freed and before the host touches cpuGauge
    cudaStreamSynchronize(streams[0]);

    device_free(unpackedDataEven);
  }
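The single device scratch buffer above can serve both halves only because the unpack kernels and the copies are all issued on streams[0], so they execute in order. A small, self-contained sketch of that same-stream ordering guarantee (the kernels, sizes, and names here are illustrative and not QUDA API):

    #include <cuda_runtime.h>

    __global__ void fillEven(float *d, int n) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n) d[i] = 2.0f*i; }
    __global__ void fillOdd (float *d, int n) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n) d[i] = 2.0f*i + 1.0f; }

    int main() {
      const int n = 1 << 20;
      cudaStream_t s;  cudaStreamCreate(&s);

      float *scratch, *hostEven, *hostOdd;
      cudaMalloc(&scratch, n*sizeof(float));
      cudaMallocHost(&hostEven, n*sizeof(float));   // pinned, so the async copies are truly asynchronous
      cudaMallocHost(&hostOdd,  n*sizeof(float));

      // everything goes to stream s, so each operation waits for the previous one
      fillEven<<<(n+255)/256, 256, 0, s>>>(scratch, n);
      cudaMemcpyAsync(hostEven, scratch, n*sizeof(float), cudaMemcpyDeviceToHost, s);
      fillOdd<<<(n+255)/256, 256, 0, s>>>(scratch, n);  // overwrites scratch only after the copy above has read it
      cudaMemcpyAsync(hostOdd,  scratch, n*sizeof(float), cudaMemcpyDeviceToHost, s);

      cudaStreamSynchronize(s);                         // do not read or free before this point
      cudaFree(scratch);  cudaFreeHost(hostEven);  cudaFreeHost(hostOdd);
      cudaStreamDestroy(s);
      return 0;
    }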
Example #3
static ParityHw allocateParityHw(int *X, QudaPrecision precision)
{
    ParityHw ret;

    ret.precision = precision;
    ret.X[0] = X[0]/2;          // checkerboarding: one parity holds half the sites in X
    ret.volume = X[0]/2;
    for (int d=1; d<4; d++) {
        ret.X[d] = X[d];
        ret.volume *= X[d];
    }
    ret.Nc = 3;
    ret.Ns = 2;
    ret.length = ret.volume*ret.Nc*ret.Ns*2;   // reals per parity field (factor 2 for complex)

    if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
    else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
    else ret.bytes = ret.length*sizeof(short);

    ret.data = device_malloc(ret.bytes);
    cudaMemset(ret.data, 0, ret.bytes);

    if (precision == QUDA_HALF_PRECISION) {
        errorQuda("Half precision not supported at present"); //FIXME
        //ret.dataNorm = device_malloc(2*ret.bytes/spinorSiteSize);
    }
    return ret;
}
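For concreteness, the sizes this routine produces for an assumed 8^4 lattice in single precision (just the arithmetic from the code above):

    int X[4] = {8, 8, 8, 8};
    int volume = X[0]/2 * X[1]*X[2]*X[3];           // 2048 sites of one parity
    int length = volume * 3 /*Nc*/ * 2 /*Ns*/ * 2;  // 24576 reals (factor 2 for complex)
    size_t bytes = length * sizeof(float);          // 98304 bytes allocated for ret.data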
Example #4
  static void
  do_loadLinkToGPU_ex(const int* X, void *even, void *odd, void**cpuGauge,
		      QudaReconstructType reconstruct, int bytes, int Vh_ex, int pad,
		      QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
  {
    int len = Vh_ex*gaugeSiteSize*prec;
    
    char *tmp_even = (char *) device_malloc(4*len);
    char *tmp_odd = tmp_even;

    //even links
    if(cpu_order == QUDA_QDP_GAUGE_ORDER){
      for(int i=0; i < 4; i++){
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
#else
	cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif
      
      }
    } else { //QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT 
      cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#else
      cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#endif
    }
  
    link_format_cpu_to_gpu((void*)even, (void*)tmp_even,  reconstruct, Vh_ex, pad, 0, prec, cpu_order, 0/*default stream*/);
  
    //odd links
    if(cpu_order == QUDA_QDP_GAUGE_ORDER){
      for(int i=0; i < 4; i++){
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#else
	cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#endif
      }
    } else {//QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT 
      cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#else
      cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#endif    
    }
    link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh_ex, pad, 0, prec, cpu_order, 0 /*default stream*/);
  
    device_free(tmp_even);
  }
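For reference, the byte counts this routine works with, assuming the usual 18-real link (3x3 complex matrix) and the fact that the QudaPrecision value doubles as the size in bytes; the lattice extent below is illustrative:

    int Vh_ex = 20*20*20*20/2;            // extended half-volume for an illustrative 20^4 box
    int gaugeSiteSize = 18;               // 3x3 complex matrix = 18 reals per link
    int prec = 4;                         // QUDA_SINGLE_PRECISION, also the size in bytes
    int len  = Vh_ex*gaugeSiteSize*prec;  // 5,760,000 bytes per direction and parity
    // tmp_even then holds 4*len bytes (four directions of one parity back to back),
    // and the odd half of each QDP-ordered cpuGauge[i] starts at byte offset len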
Example #5
  cudaCloverField::cudaCloverField(const CloverFieldParam &param) : CloverField(param) {
    
    if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_REFERENCE_FIELD_CREATE) 
      errorQuda("Create type %d not supported", create);

    if (param.direct) {
      if (create != QUDA_REFERENCE_FIELD_CREATE) {
	clover = device_malloc(bytes);
	if (precision == QUDA_HALF_PRECISION) norm = device_malloc(norm_bytes);
      } else {
	clover = param.clover;
	norm = param.norm;
      }

      even = clover;
      odd = (char*)clover + bytes/2;
    
      evenNorm = norm;
      oddNorm = (char*)norm + norm_bytes/2;

      total_bytes += bytes + norm_bytes;
    } 

    if (param.inverse) {
      if (create != QUDA_REFERENCE_FIELD_CREATE) {
	cloverInv = device_malloc(bytes);
	if (precision == QUDA_HALF_PRECISION) invNorm = device_malloc(norm_bytes);
      } else {
	cloverInv = param.cloverInv;
	invNorm = param.invNorm;
      }

      evenInv = cloverInv;
      oddInv = (char*)cloverInv + bytes/2;
    
      evenInvNorm = invNorm;
      oddInvNorm = (char*)invNorm + norm_bytes/2;

      total_bytes += bytes + norm_bytes;

      // this is a hack to ensure that we can autotune the clover
      // operator when just using symmetric preconditioning
      if (!param.direct) {
	clover = cloverInv;
	even = evenInv;
	odd = oddInv;
	norm = invNorm;
	evenNorm = evenInvNorm;
	oddNorm = oddInvNorm;
      }
    } 

#ifdef USE_TEXTURE_OBJECTS
    createTexObject(evenTex, evenNormTex, even, evenNorm);
    createTexObject(oddTex, oddNormTex, odd, oddNorm);
    createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm);
    createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm);
#endif
    twisted = param.twisted;
    mu2 = param.mu2;

  }
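A minimal sketch of driving this constructor, allocating both the clover term and its inverse; only the fields the constructor above actually reads are shown, the create flag is assumed to live in CloverFieldParam, and the remaining geometry/precision setup is assumed to happen elsewhere.

    CloverFieldParam param;
    // ... set lattice geometry, precision, pad, etc. ...
    param.create  = QUDA_NULL_FIELD_CREATE;  // allocate fresh device memory
    param.direct  = true;                    // allocate the clover term itself
    param.inverse = true;                    // and its inverse
    param.twisted = false;
    param.mu2     = 0.0;

    cudaCloverField clov(param);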
Example #6
  static void 
  do_loadLinkToGPU(int* X, void *even, void*odd, void **cpuGauge, void** ghost_cpuGauge,
		   void** ghost_cpuGauge_diag, 
		   QudaReconstructType reconstruct, int bytes, int Vh, int pad, 
		   int Vsh_x, int Vsh_y, int Vsh_z, int Vsh_t,
		   QudaPrecision prec, QudaGaugeFieldOrder cpu_order) 
  {
    int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
    Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);

    int i;
    int len = Vh*gaugeSiteSize*prec;   // bytes per direction and parity (QudaPrecision value == byte size)

#ifdef MULTI_GPU    
    int glen[4] = {
      Vsh_x*gaugeSiteSize*prec,
      Vsh_y*gaugeSiteSize*prec,
      Vsh_z*gaugeSiteSize*prec,
      Vsh_t*gaugeSiteSize*prec
    };
    int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t)+4*Vh_2d_max;
#else
    int ghostV = 0;
#endif  

    int glen_sum = ghostV*gaugeSiteSize*prec;
    char *tmp_even = (char *) device_malloc(4*(len+glen_sum));
    char *tmp_odd = tmp_even; 

    //even links
    if(cpu_order == QUDA_QDP_GAUGE_ORDER){
      for(i=0;i < 4; i++){
#ifdef GPU_DIRECT
	cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]); 
#else
	cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice); 
#endif
      }
    } else { //QUDA_MILC_GAUGE_ORDER
    
#ifdef MULTI_GPU
      errorQuda("Multi-GPU for MILC gauge order is not supported");
#endif    
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
#endif
    }


    for(i=0;i < 4;i++){
#ifdef MULTI_GPU 
      //dir: the source direction
      char* dest = tmp_even + i*(len+glen_sum)+len;
      for(int dir = 0; dir < 4; dir++){
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]); 
	cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]); 	
#else
	cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice); 
	cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice); 
#endif
	dest += 2*glen[dir];
      }
      //fill in the diagonal ghost zones
      //nu is i; mu iterates over 0..3 with mu != nu
      int nu = i;
      for(int mu = 0; mu < 4; mu++){
	if(nu  == mu ){
	  continue;
	}
	int dir1, dir2;
	for(dir1=0; dir1 < 4; dir1 ++){
	  if(dir1 != nu && dir1 != mu){
	    break;
	  }
	}
	for(dir2=0; dir2 < 4; dir2 ++){
	  if(dir2 != nu && dir2 != mu && dir2 != dir1){
	    break;
	  }
	}
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu], 
			X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);	
#else	
	cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu], 
		   X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);	
#endif
      
      }
    
#endif
    }    
  
    link_format_cpu_to_gpu((void*)even, (void*)tmp_even,  reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]); 

    //odd links
    if(cpu_order ==  QUDA_QDP_GAUGE_ORDER){
      for(i=0;i < 4; i++){
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[0]);
#else
	cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#endif
      }
    }else{  //QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT 
      cudaMemcpyAsync(tmp_odd , ((char*)cpuGauge)+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_odd, (char*)cpuGauge+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#endif    
    }
  

    for(i=0;i < 4; i++){
#ifdef MULTI_GPU  
      char* dest = tmp_odd + i*(len+glen_sum)+len;
      for(int dir = 0; dir < 4; dir++){
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]); 
	cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir], 
			cudaMemcpyHostToDevice, streams[0]); 
#else
	cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice); 
	cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice); 

#endif

	dest += 2*glen[dir];
      }
      //fill in the diagonal ghost zones
      //nu is i; mu iterates over 0..3 with mu != nu
      int nu = i;
      for(int mu = 0; mu < 4; mu++){
	if(nu  == mu ){
	  continue;
	}
	int dir1, dir2;
	for(dir1=0; dir1 < 4; dir1 ++){
	  if(dir1 != nu && dir1 != mu){
	    break;
	  }
	}
	for(dir2=0; dir2 < 4; dir2 ++){
	  if(dir2 != nu && dir2 != mu && dir2 != dir1){
	    break;
	  }
	}
#ifdef GPU_DIRECT 
	cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec, 
			X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);	
#else
	cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec, 
		   X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice );		
#endif
      }
      

#endif
    }
    link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]); 
  
    cudaStreamSynchronize(streams[0]);

    device_free(tmp_even);

  }
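For reference, a sketch of the staging-buffer layout that the copies above assume for each direction i, read off the offsets in the code (the helper names are illustrative):

    // within tmp_even (and tmp_odd), direction i occupies len + glen_sum bytes:
    //   body links            at i*(len+glen_sum)           (len bytes)
    //   face ghosts, dir 0..3 immediately after the body    (2*glen[dir] bytes each)
    //   diagonal ghosts       after all face ghosts         (4*Vh_2d_max*gaugeSiteSize*prec bytes)
    size_t bodyOffset (int i, size_t len, size_t glen_sum) { return (size_t)i*(len + glen_sum); }
    size_t ghostOffset(int i, size_t len, size_t glen_sum) { return bodyOffset(i, len, glen_sum) + len; }
    size_t diagOffset (int i, size_t len, size_t glen_sum, const int glen[4]) {
      return ghostOffset(i, len, glen_sum) + 2*((size_t)glen[0] + glen[1] + glen[2] + glen[3]);
    }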