Beispiel #1
0
static void retrieveParityHw(void *res, ParityHw hw, QudaPrecision cpu_prec)
{
    if (hw.precision != QUDA_HALF_PRECISION) {

        void *packedHw1 = pinned_malloc(hw.bytes);
        cudaMemcpy(packedHw1, hw.data, hw.bytes, cudaMemcpyDeviceToHost);

        if (hw.precision == QUDA_DOUBLE_PRECISION) {
            unpackParityHw((double*)res, (double2*)packedHw1, hw.volume);
        } else {
            if (cpu_prec == QUDA_DOUBLE_PRECISION) {
                unpackParityHw((double*)res, (float2*)packedHw1, hw.volume);
            } else {
                unpackParityHw((float*)res, (float2*)packedHw1, hw.volume);
            }
        }
        host_free(packedHw1);

    } else {
        //half precision
        /*
          ParityHw tmp = allocateParityHw(hw.X, QUDA_SINGLE_PRECISION);
          copyCuda(tmp, hw);
          retrieveParityHw(res, tmp, cpu_prec, dirac_order);
          freeParityHw(tmp);
        */
    }
}
Beispiel #2
0
void static loadParityHw(ParityHw ret, void *hw, QudaPrecision cpu_prec)
{
    if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
        errorQuda("CUDA double precision requires CPU double precision");
    }

    if (ret.precision != QUDA_HALF_PRECISION) {

        void *packedHw1 = pinned_malloc(ret.bytes);

        if (ret.precision == QUDA_DOUBLE_PRECISION) {
            packParityHw((double2*)packedHw1, (double*)hw, ret.volume);
        } else {
            if (cpu_prec == QUDA_DOUBLE_PRECISION) {
                packParityHw((float2*)packedHw1, (double*)hw, ret.volume);
            } else {
                packParityHw((float2*)packedHw1, (float*)hw, ret.volume);
            }
        }
        cudaMemcpy(ret.data, packedHw1, ret.bytes, cudaMemcpyHostToDevice);
        host_free(packedHw1);

    } else {
        //half precision
        /*
          ParityHw tmp = allocateParityHw(ret.X, QUDA_SINGLE_PRECISION);
          loadParityHw(tmp, hw, cpu_prec, dirac_order);
          copyCuda(ret, tmp);
          freeParityHw(tmp);
        */
    }
}
Beispiel #3
0
  CloverField::CloverField(const CloverFieldParam &param) :
    LatticeField(param), bytes(0), norm_bytes(0), nColor(3), nSpin(4), 
    clover(0), norm(0), cloverInv(0), invNorm(0), order(param.order), create(param.create),
    trlog(static_cast<double*>(pinned_malloc(2*sizeof(double))))
  {
    if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);

    if (order == QUDA_QDPJIT_CLOVER_ORDER && create != QUDA_REFERENCE_FIELD_CREATE)
      errorQuda("QDPJIT ordered clover fields only supported for reference fields");

    real_length = 2*volumeCB*nColor*nColor*nSpin*nSpin/2;  // block-diagonal Hermitian (72 reals)
    length = 2*stride*nColor*nColor*nSpin*nSpin/2;

    bytes = length*precision;
    bytes = ALIGNMENT_ADJUST(bytes);
    if (precision == QUDA_HALF_PRECISION) {
      norm_bytes = sizeof(float)*2*stride*2; // 2 chirality
      norm_bytes = ALIGNMENT_ADJUST(norm_bytes);
    }
//for twisted mass only:
    twisted = false;//param.twisted;
    mu2 = 0.0; //param.mu2;
  }
Beispiel #4
0
static void
gauge_force_test(void) 
{
  int max_length = 6;    
  
  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE,"",stdout);

  qudaGaugeParam = newQudaGaugeParam();
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;
  
  setDims(qudaGaugeParam.X);
  
  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;  
  qudaGaugeParam.reconstruct_sloppy = link_recon;  
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links   
  
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  size_t gSize = qudaGaugeParam.cpu_prec;
    
  void* sitelink;  
  void* sitelink_1d;
  
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif
  
  // this is a hack to have site link generated in 2d 
  // then copied to 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 
#else
  for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif
  
  // fills the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);
  
  //copy the 2d sitelink to 1d milc format 
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V; i++){
      char* src =  ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  if (qudaGaugeParam.gauge_order ==  QUDA_MILC_GAUGE_ORDER){ 
    sitelink =  sitelink_1d;    
  }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }
  
#ifdef MULTI_GPU
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1= Z[0];
  int X2= Z[1];
  int X3= Z[2];
  int X4= Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }
    
    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
      continue;
    }
    
    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;
    
    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i
  
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V_ex; i++){
      char* src =  ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  
#endif

  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  //initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);      
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);
  
  
  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
    loop_coeff_d[i] = loop_coeff_f[i];
  }
    
  void* loop_coeff;
  if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
    loop_coeff = (void*)&loop_coeff_f[0];
  }else{
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);
  
  int** input_path_buf[4];
  for(int dir =0; dir < 4; dir++){
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));    
    for(int i=0;i < num_paths;i++){
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }
  
  struct timeval t0, t1;
  double timeinfo[3];
  /* Multiple execution to exclude warmup time in the first run*/
  for (int i =0;i < attempts; i++){
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink,  input_path_buf, length,
			  loop_coeff_d, num_paths, max_length, eb3,
			  &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }
 
  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  //The number comes from CPU implementation in MILC, gauge_force_imp.c
  int flops=153004;
    
  if (verify_results){	
    for(int i = 0;i < attempts;i++){
#ifdef MULTI_GPU
      //last arg=0 means no optimization for communication, i.e. exchange data in all directions 
      //even they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
			       QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);    
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#endif
    }
  
    int res;
    res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    
    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
    
    printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  }  

  double perf = 1.0* flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f  total time =%.2f ms\n", 
	 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf);
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }
  
  host_free(sitelink_1d);
  for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]);
  
#ifdef MULTI_GPU  
  host_free(sitelink_ex_1d);
  for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif


  host_free(mom);
  host_free(refmom);
  endQuda();
}            
Beispiel #5
0
  void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
  {
    if (cudaGauge->Precision() != cpuGauge->Precision()){
      errorQuda("Mismatch between CPU precision and CUDA precision");
    }
    QudaPrecision prec = cudaGauge->Precision();

#ifdef MULTI_GPU
    const int* Z = cudaGauge->X();
#endif
    int pad = cudaGauge->Pad();
    int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
    int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
    int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
    int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;

    static void* ghost_cpuGauge[4];
    static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
    static int allocated = 0;
    int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};
  
    if (!allocated) {

      for(int i=0;i < 4; i++) {
	size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;      
#ifdef GPU_DIRECT 
	ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
#else
	ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
#endif
      }

      /*
       *  nu |     |
       *     |_____|
       *       mu     
       */
    
      for(int nu=0;nu < 4;nu++){
	for(int mu=0; mu < 4;mu++){
	  if(nu == mu){
	    ghost_cpuGauge_diag[nu*4+mu] = NULL;
	  }else{
	    //the other directions
	    int dir1, dir2;
	    for(dir1= 0; dir1 < 4; dir1++){
	      if(dir1 !=nu && dir1 != mu){
		break;
	      }
	    }
	    for(dir2=0; dir2 < 4; dir2++){
	      if(dir2 != nu && dir2 != mu && dir2 != dir1){
		break;
	      }
	    }
	    //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);

	    size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
#ifdef GPU_DIRECT 
	    ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
#else
	    ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
#endif
	    memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
	  }	
	}
      }
      allocated = 1;
    }

    int optflag=1;
    // driver for for packalllink
    exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);
  
#endif
  
    do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(), 
		     ghost_cpuGauge, ghost_cpuGauge_diag, 
		     cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad, 
		     Vsh_x, Vsh_y, Vsh_z, Vsh_t, 
		     prec, cpuGauge->Order());
  
#ifdef MULTI_GPU
    if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
    
      for(int i=0;i < 4;i++){
	host_free(ghost_cpuGauge[i]);
      }
      for(int i=0;i <4; i++){ 
	for(int j=0;j <4; j++){
	  if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
	}
      }    
      allocated = 0;
    }
#endif
  
  }
Beispiel #6
0
  cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) : 
    GaugeField(param), pinned(param.pinned)
  {
    if (precision == QUDA_HALF_PRECISION) {
      errorQuda("CPU fields do not support half precision");
    }
    if (pad != 0) {
      errorQuda("CPU fields do not support non-zero padding");
    }
    if (reconstruct != QUDA_RECONSTRUCT_NO && reconstruct != QUDA_RECONSTRUCT_10) {
      errorQuda("Reconstruction type %d not supported", reconstruct);
    }
    if (reconstruct == QUDA_RECONSTRUCT_10 && order != QUDA_MILC_GAUGE_ORDER) {
      errorQuda("10-reconstruction only supported with MILC gauge order");
    }

    if (order == QUDA_QDP_GAUGE_ORDER) {

      gauge = (void**) safe_malloc(nDim * sizeof(void*));

      for (int d=0; d<nDim; d++) {
	size_t nbytes = volume * reconstruct * precision;
	if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
	  gauge[d] = (pinned ? pinned_malloc(nbytes) : safe_malloc(nbytes));
	  if (create == QUDA_ZERO_FIELD_CREATE){
	    memset(gauge[d], 0, nbytes);
	  }
	} else if (create == QUDA_REFERENCE_FIELD_CREATE) {
	  gauge[d] = ((void**)param.gauge)[d];
	} else {
	  errorQuda("Unsupported creation type %d", create);
	}
      }
    
    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER) {

      if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
	size_t nbytes = nDim * volume * reconstruct * precision;
	gauge = (void **) (pinned ? pinned_malloc(nbytes) : safe_malloc(nbytes));
	if(create == QUDA_ZERO_FIELD_CREATE){
	  memset(gauge, 0, nbytes);
	}
      } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
	gauge = (void**) param.gauge;
      } else {
	errorQuda("Unsupported creation type %d", create);
      }

    } else {
      errorQuda("Unsupported gauge order type %d", order);
    }
  
    // no need to exchange data if this is a momentum field
    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
      // Ghost zone is always 2-dimensional    
      for (int i=0; i<nDim; i++) {
	size_t nbytes = nFace * surface[i] * reconstruct * precision;
	ghost[i] = safe_malloc(nbytes); // no need to use pinned memory for this
      }  
      // exchange the boundaries
      exchangeGhost();
    }

    // compute the fat link max now in case it is needed later (i.e., for half precision)
    if (param.compute_fat_link_max) fat_link_max = maxGauge(*this);
  }