コード例 #1
0
ファイル: su3_test.c プロジェクト: adenbley/quda
void init() {

  param.cpu_prec = QUDA_DOUBLE_PRECISION;
  param.cuda_prec = QUDA_SINGLE_PRECISION;
  param.reconstruct = QUDA_RECONSTRUCT_12;
  param.cuda_prec_sloppy = param.cuda_prec;
  param.reconstruct_sloppy = param.reconstruct;
  
  param.X[0] = 8;
  param.X[1] = 8;
  param.X[2] = 8;
  param.X[3] = 8;
  setDims(param.X);

  param.anisotropy = 1.0;
  param.t_boundary = QUDA_ANTI_PERIODIC_T;
  param.gauge_fix = QUDA_GAUGE_FIXED_NO;
#ifdef MULTI_GPU
  param.ga_pad = param.X[0]*param.X[1]*param.X[2]/2;
#endif

  // construct gauge fields
  for (int dir = 0; dir < 4; dir++) {
    gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec);
    new_gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec);
  }

  int dev = 0;
  initQuda(dev);
}
コード例 #2
0
ファイル: pack_test.cpp プロジェクト: adenbley/quda
void init() {

  param.cpu_prec = QUDA_SINGLE_PRECISION;
  param.cuda_prec = QUDA_HALF_PRECISION;
  param.reconstruct = QUDA_RECONSTRUCT_8;
  param.cuda_prec_sloppy = param.cuda_prec;
  param.reconstruct_sloppy = param.reconstruct;
  
  param.X[0] = 4;
  param.X[1] = 4;
  param.X[2] = 4;
  param.X[3] = 4;
  param.ga_pad = 0;
  setDims(param.X);

  param.anisotropy = 2.3;
  param.t_boundary = QUDA_ANTI_PERIODIC_T;
  param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  // construct input fields
  for (int dir = 0; dir < 4; dir++) {
    qdpGauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec);
  }
  cpsGauge = malloc(4*V*gaugeSiteSize*param.cpu_prec);

  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor = 3;
  csParam.nSpin = 4;
  csParam.nDim = 4;
  for (int d=0; d<4; d++) csParam.x[d] = param.X[d];
  csParam.precision = QUDA_SINGLE_PRECISION;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  csParam.create = QUDA_NULL_FIELD_CREATE;

  spinor = new cpuColorSpinorField(csParam);
  spinor2 = new cpuColorSpinorField(csParam);

  spinor->Source(QUDA_RANDOM_SOURCE);

  initQuda(0);

  csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
  csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
  csParam.pad = 0;
  csParam.precision = QUDA_HALF_PRECISION;

  cudaSpinor = new cudaColorSpinorField(csParam);
}
コード例 #3
0
ファイル: su3_test.cpp プロジェクト: amabdelrehim/quda
void init() {

  param = newQudaGaugeParam();

  param.cpu_prec = QUDA_DOUBLE_PRECISION;
  param.cuda_prec = prec;
  param.reconstruct = link_recon;
  param.cuda_prec_sloppy = prec;
  param.reconstruct_sloppy = link_recon;
  
  param.type = QUDA_WILSON_LINKS;
  param.tadpole_coeff = 0.8;
  param.gauge_order = QUDA_QDP_GAUGE_ORDER;

  param.X[0] = xdim;
  param.X[1] = ydim;
  param.X[2] = zdim;
  param.X[3] = tdim;
  setDims(param.X);

  param.anisotropy = 1.0;
  param.t_boundary = QUDA_PERIODIC_T;
  param.gauge_fix = QUDA_GAUGE_FIXED_NO;
#ifdef MULTI_GPU
  int x_face_size = param.X[1]*param.X[2]*param.X[3]/2;
  int y_face_size = param.X[0]*param.X[2]*param.X[3]/2;
  int z_face_size = param.X[0]*param.X[1]*param.X[3]/2;
  int t_face_size = param.X[0]*param.X[1]*param.X[2]/2;
  int pad_size = MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  param.ga_pad = pad_size;    
#else
  param.ga_pad = 0;    
#endif

  // construct gauge fields
  for (int dir = 0; dir < 4; dir++) {
    gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec);
    new_gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec);
  }

  initQuda(device);
}
コード例 #4
0
int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.1;

  set_params(&gaugeParam, &inv_param,
	     xdim, ydim, zdim, tdim,
	     cpu_prec, prec, prec_sloppy,
	     link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
	     0.8);
  
  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
    
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
	((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
	((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }  
 
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;
  
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  
  
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->V());    
  }else{
    constructSpinorField((double*)in->V());
  }

  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
   tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
   tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);

  int fat_pad = tmp_value;
  int link_pad =  3*tmp_value;

#ifdef MULTI_GPU
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  cpuFat->exchangeGhost();
  ghost_fatlink = (void**)cpuFat->Ghost();
  
  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  cpuLong->exchangeGhost();
  ghost_longlink = (void**)cpuLong->Ghost();
#endif
  
  if(testtype == 6){    
    record_gauge(gaugeParam.X, fatlink, fat_pad,
		 longlink, link_pad,
		 link_recon, link_recon_sloppy,
		 &gaugeParam);        
   }else{ 
    
#ifdef MULTI_GPU
 

    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = fat_pad;
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = link_pad;
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#else
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.reconstruct = link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#endif
  }

  
  double time0 = -((double)clock()); // Start the timer
  
  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;

  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    
    invertQuda(out->V(), in->V(), &inv_param);
    
    time0 += clock(); 
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU    
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
#endif
    
    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

    {
      double sol = norm_2(out->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      double refe = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    }
    break;

  case 1: //odd
	
    volume = Vh;    
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->V(), in->V(), &inv_param);	
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);	
#endif
    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
	
    break;
    
  case 2: //full spinor

    errorQuda("full spinor not supported\n");
    break;
    
  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:

#define NUM_OFFSETS 7
        
    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81};
    double offsets[NUM_OFFSETS];	
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;
    
    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;    
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);       
    }
    
    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->V();
    }

    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }
    
    len=Vh;
    volume = Vh;      

    if (testtype == 3 || testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
    } else if (testtype == 4){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }
    
    double residue_sq;
    if (testtype == 6){
      invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);
    }else{      
      invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);	
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    
    printfQuda("checking the solution\n");
    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
#endif
      mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      
      printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
	ret |=1;
      }
    }

    if (ret ==1){
      errorQuda("Converge failed!\n");
    }

    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }

    
  }//switch
    

  if (testtype <=2){

    printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
	
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}
コード例 #5
0
ファイル: wilson_dslash_test.cpp プロジェクト: mchengcit/quda
void init(int argc, char **argv) {


  kernelPackT = false; // Set true for kernel T face packing
  cuda_prec= prec;

  gauge_param = newQudaGaugeParam();
  inv_param = newQudaInvertParam();

  gauge_param.X[0] = xdim;
  gauge_param.X[1] = ydim;
  gauge_param.X[2] = zdim;
  gauge_param.X[3] = tdim;
  setDims(gauge_param.X);

  gauge_param.anisotropy = 1.0;

  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_PERIODIC_T;

  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = link_recon;
  gauge_param.reconstruct_sloppy = link_recon;
  gauge_param.cuda_prec_sloppy = cuda_prec;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  inv_param.kappa = 0.1;

  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
    inv_param.mu = 0.01;
    inv_param.twist_flavor = QUDA_TWIST_MINUS;
  }

  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger = dagger;

  inv_param.cpu_prec = cpu_prec;
  if (inv_param.cpu_prec != gauge_param.cpu_prec) 
    errorQuda("Gauge and spinor cpu precisions must match");

  inv_param.cuda_prec = cuda_prec;

#ifndef MULTI_GPU // free parameter for single GPU
  gauge_param.ga_pad = 0;
#else // must be this one c/b face for multi gpu
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;    
#endif
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  //inv_param.sp_pad = 24*24*24;
  //inv_param.cl_pad = 24*24*24;

  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  if (test_type == 2) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
  }

  inv_param.dslash_type = dslash_type;

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    inv_param.clover_cpu_prec = cpu_prec;
    inv_param.clover_cuda_prec = cuda_prec;
    inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    //if (test_type > 0) {
      hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
      hostCloverInv = hostClover; // fake it
      /*} else {
      hostClover = NULL;
      hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
      }*/
  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {

  }

  //inv_param.verbosity = QUDA_VERBOSE;

  // construct input fields
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);

  ColorSpinorParam csParam;
  
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor = 3;
  csParam.nSpin = 4;
  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
    csParam.twistFlavor = inv_param.twist_flavor;
  }
  csParam.nDim = 4;
  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  if (test_type < 2) {
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    csParam.x[0] /= 2;
  } else {
    csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  }    
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis; 
  csParam.create = QUDA_ZERO_FIELD_CREATE;

  //csParam.verbose = QUDA_DEBUG_VERBOSE;

  spinor = new cpuColorSpinorField(csParam);
  spinorOut = new cpuColorSpinorField(csParam);
  spinorRef = new cpuColorSpinorField(csParam);

  csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  csParam.x[0] = gauge_param.X[0];
  
  printfQuda("Randomizing fields... ");

  if (strcmp(latfile,"")) {  // load in the command line supplied gauge field
    read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
    construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param);
  } else { // else generate a random SU(3) field
    construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
  }

  spinor->Source(QUDA_RANDOM_SOURCE);

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
    double diag = 1.0; // constant added to the diagonal

    if (test_type == 2) {
      construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
    } else {
      construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
    }
  }
  printfQuda("done.\n"); fflush(stdout);
  
  initQuda(device);

  printfQuda("Sending gauge field to GPU\n");
  loadGaugeQuda(hostGauge, &gauge_param);

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    printfQuda("Sending clover field to GPU\n");
    loadCloverQuda(hostClover, hostCloverInv, &inv_param);
    //clover = cudaCloverPrecise;
  }

  if (!transfer) {
    csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
    csParam.pad = inv_param.sp_pad;
    csParam.precision = inv_param.cuda_prec;
    if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
      csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    } else {
      /* Single and half */
      csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
    }
 
    if (test_type < 2) {
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /= 2;
    }

    printfQuda("Creating cudaSpinor\n");
    cudaSpinor = new cudaColorSpinorField(csParam);
    printfQuda("Creating cudaSpinorOut\n");
    cudaSpinorOut = new cudaColorSpinorField(csParam);

    if (test_type == 2) csParam.x[0] /= 2;

    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    tmp1 = new cudaColorSpinorField(csParam);
    if (dslash_type == QUDA_CLOVER_WILSON_DSLASH ||
	dslash_type == QUDA_TWISTED_MASS_DSLASH) {
      tmp2 = new cudaColorSpinorField(csParam);
    }

    printfQuda("Sending spinor field to GPU\n");
    *cudaSpinor = *spinor;

    std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
      norm2(*cudaSpinor) << std::endl;

    bool pc = (test_type != 2);
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);
    diracParam.verbose = QUDA_VERBOSE;
    diracParam.tmp1 = tmp1;
    diracParam.tmp2 = tmp2;
    
    dirac = Dirac::create(diracParam);
  } else {
    std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
  }
    
}
コード例 #6
0
ファイル: gauge_force_test.cpp プロジェクト: azrael417/quda
static void
gauge_force_test(void) 
{
  int max_length = 6;    
  
  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE,"",stdout);

  qudaGaugeParam = newQudaGaugeParam();
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;
  
  setDims(qudaGaugeParam.X);
  
  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;  
  qudaGaugeParam.reconstruct_sloppy = link_recon;  
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links   
  
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  size_t gSize = qudaGaugeParam.cpu_prec;
    
  void* sitelink;  
  void* sitelink_1d;
  
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif
  
  // this is a hack to have site link generated in 2d 
  // then copied to 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 
#else
  for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif
  
  // fills the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);
  
  //copy the 2d sitelink to 1d milc format 
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V; i++){
      char* src =  ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  if (qudaGaugeParam.gauge_order ==  QUDA_MILC_GAUGE_ORDER){ 
    sitelink =  sitelink_1d;    
  }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }
  
#ifdef MULTI_GPU
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1= Z[0];
  int X2= Z[1];
  int X3= Z[2];
  int X4= Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }
    
    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
      continue;
    }
    
    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;
    
    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i
  
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V_ex; i++){
      char* src =  ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  
#endif

  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  //initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);      
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);
  
  
  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
    loop_coeff_d[i] = loop_coeff_f[i];
  }
    
  void* loop_coeff;
  if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
    loop_coeff = (void*)&loop_coeff_f[0];
  }else{
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);
  
  int** input_path_buf[4];
  for(int dir =0; dir < 4; dir++){
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));    
    for(int i=0;i < num_paths;i++){
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }
  
  struct timeval t0, t1;
  double timeinfo[3];
  /* Multiple execution to exclude warmup time in the first run*/
  for (int i =0;i < attempts; i++){
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink,  input_path_buf, length,
			  loop_coeff_d, num_paths, max_length, eb3,
			  &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }
 
  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  //The number comes from CPU implementation in MILC, gauge_force_imp.c
  int flops=153004;
    
  if (verify_results){	
    for(int i = 0;i < attempts;i++){
#ifdef MULTI_GPU
      //last arg=0 means no optimization for communication, i.e. exchange data in all directions 
      //even they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
			       QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);    
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#endif
    }
  
    int res;
    res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    
    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
    
    printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  }  

  double perf = 1.0* flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f  total time =%.2f ms\n", 
	 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf);
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }
  
  host_free(sitelink_1d);
  for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]);
  
#ifdef MULTI_GPU  
  host_free(sitelink_ex_1d);
  for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif


  host_free(mom);
  host_free(refmom);
  endQuda();
}            
コード例 #7
0
static int
unitarize_link_test()
{

  QudaGaugeParam qudaGaugeParam = newQudaGaugeParam();

  initQuda(0);

  cpu_prec = prec;
  gSize = cpu_prec;  
  qudaGaugeParam.anisotropy = 1.0;
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);
  
  QudaPrecision link_prec = QUDA_SINGLE_PRECISION;
  QudaReconstructType link_recon = QUDA_RECONSTRUCT_NO;

  qudaGaugeParam.cpu_prec  = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.type = QUDA_WILSON_LINKS;


  hisq::fermion_force::hisqForceInitCuda(&qudaGaugeParam);
  
  qudaGaugeParam.t_boundary  	   = QUDA_PERIODIC_T;
  qudaGaugeParam.anisotropy  	   = 1.0;
  qudaGaugeParam.cuda_prec_sloppy   = prec;
  qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  qudaGaugeParam.gauge_fix   	   = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad      	   = 0;
  qudaGaugeParam.packed_size 	   = 0;
  qudaGaugeParam.gaugeGiB    	   = 0;
  qudaGaugeParam.flag              = false;

   
  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
    | QUDA_FAT_PRESERVE_GPU_GAUGE
    | QUDA_FAT_PRESERVE_COMM_MEM;

  setFatLinkPadding(QUDA_COMPUTE_FAT_STANDARD, &qudaGaugeParam);
 
  GaugeFieldParam gParam(0, qudaGaugeParam);
  gParam.pad = 0;
  gParam.create    = QUDA_REFERENCE_FIELD_CREATE;
  gParam.link_type = QUDA_WILSON_LINKS;
  gParam.order     = QUDA_MILC_GAUGE_ORDER;
  cpuGaugeField *cpuOutLink  = new cpuGaugeField(gParam);

  gParam.pad         = 0;
  gParam.create      = QUDA_NULL_FIELD_CREATE;
  gParam.link_type   = QUDA_WILSON_LINKS;
  gParam.order       = QUDA_QDP_GAUGE_ORDER;
  gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
  cudaGaugeField *cudaULink   = new cudaGaugeField(gParam);  

  initCommonConstants(*cudaFatLink);

  void* fatlink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(fatlink == NULL){
    errorQuda("ERROR: allocating fatlink failed\n");
  }
  
  void* sitelink[4];
  for(int i=0;i < 4;i++){
    cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize);
    if(sitelink[i] == NULL){
      errorQuda("ERROR; allocate sitelink[%d] failed\n", i);
    }
  }
  
  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  double act_path_coeff[6];
  act_path_coeff[0] = 0.625000;
  act_path_coeff[1] = -0.058479;
  act_path_coeff[2] = -0.087719;
  act_path_coeff[3] = 0.030778;
  act_path_coeff[4] = -0.007200;
  act_path_coeff[5] = -0.123113;


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam,
			   QUDA_COMPUTE_FAT_STANDARD);
  } // gauge order is QDP_GAUGE_ORDER

  cpuOutLink->setGauge((void**)fatlink);
  cudaFatLink->loadCPUField(*cpuOutLink, QUDA_CPU_FIELD_LOCATION);
 

 
  hisq::setUnitarizeLinksConstants(unitarize_eps,
				   max_allowed_error,
				   reunit_allow_svd,
				   reunit_svd_only,
				   svd_rel_error,
				   svd_abs_error);
 
  hisq::setUnitarizeLinksPadding(0,0);

  int* num_failures_dev;
  cudaMalloc(&num_failures_dev, sizeof(int));
  cudaMemset(num_failures_dev, 0, sizeof(int));

  struct timeval t0, t1;

  gettimeofday(&t0,NULL);
  hisq::unitarizeLinksCuda(qudaGaugeParam,*cudaFatLink, cudaULink, num_failures_dev);
  cudaThreadSynchronize();
  gettimeofday(&t1,NULL);

  int num_failures=0;
  cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);

 delete cudaFatLink;
 delete cudaULink;
 for(int dir=0; dir<4; ++dir) cudaFreeHost(sitelink[dir]);
  cudaFree(num_failures_dev); 
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
   
  printfQuda("Unitarization time: %g ms\n", TDIFF(t0,t1)*1000); 
  return num_failures;
}
コード例 #8
0
int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.95;

  set_params(&gaugeParam, &inv_param,
	     sdim, sdim, sdim, tdim,
	     cpu_prec, prec, prec_sloppy,
	     link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
	     0.8);
  
  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
    
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
	((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
	((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }

  
#ifdef MULTI_GPU

  //exchange_init_dims(gaugeParam.X);
  int ghost_link_len[4] = {
    Vs_x*gaugeSiteSize*gSize,
    Vs_y*gaugeSiteSize*gSize,
    Vs_z*gaugeSiteSize*gSize,
    Vs_t*gaugeSiteSize*gSize
  };

  for(int i=0;i < 4;i++){
    ghost_fatlink[i] = malloc(ghost_link_len[i]);
    ghost_longlink[i] = malloc(3*ghost_link_len[i]);
    if (ghost_fatlink[i] == NULL || ghost_longlink[i] == NULL){
      printf("ERROR: malloc failed for ghost fatlink or ghost longlink\n");
      exit(1);
    }
  }

  //exchange_cpu_links4dir(fatlink, ghost_fatlink, longlink, ghost_longlink, gaugeParam.cpu_prec);

  void *fat_send[4], *long_send[4];
  for(int i=0;i < 4;i++){
    fat_send[i] = malloc(ghost_link_len[i]);
    long_send[i] = malloc(3*ghost_link_len[i]);
  }

  set_dim(Z);
  pack_ghost(fatlink, fat_send, 1, gaugeParam.cpu_prec);
  pack_ghost(longlink, long_send, 3, gaugeParam.cpu_prec);

  int dummyFace = 1;
  FaceBuffer faceBuf (Z, 4, 18, dummyFace, gaugeParam.cpu_prec);
  faceBuf.exchangeCpuLink((void**)ghost_fatlink, (void**)fat_send, 1);
  faceBuf.exchangeCpuLink((void**)ghost_longlink, (void**)long_send, 3);

  for (int i=0; i<4; i++) {
    free(fat_send[i]);
    free(long_send[i]);
  }

#endif

 
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;
  
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  
  
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->v);    
  }else{
    constructSpinorField((double*)in->v);
  }


  
  
#ifdef MULTI_GPU

  if(testtype == 6){
    record_gauge(fatlink, ghost_fatlink[3], Vsh_t,
		 longlink, ghost_longlink[3], 3*Vsh_t,
		 link_recon, link_recon_sloppy,
		 &gaugeParam);
   }else{
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2);
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = 3*MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2);
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }

#else
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  loadGaugeQuda(fatlink, &gaugeParam);
  
  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  gaugeParam.reconstruct = link_recon;
  gaugeParam.reconstruct_sloppy = link_recon_sloppy;
  loadGaugeQuda(longlink, &gaugeParam);
#endif
  
  double time0 = -((double)clock()); // Start the timer
  
  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;


  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    
    invertQuda(out->v, in->v, &inv_param);
    
    time0 += clock(); 
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU    
    matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, QUDA_EVEN_PARITY);
#endif
    
    mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    break;

  case 1: //odd
	
    volume = Vh;    
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->v, in->v, &inv_param);	
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, QUDA_ODD_PARITY);	
#endif
    mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
	
    break;
    
  case 2: //full spinor

    errorQuda("full spinor not supported\n");
    break;
    
  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:

#define NUM_OFFSETS 4
        
    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33};
    double offsets[NUM_OFFSETS];	
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;
    
    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;    
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);       
    }
    
    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->v;
    }

    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }
    
    len=Vh;
    volume = Vh;      
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;

    if (testtype == 3){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
    } else if (testtype == 4||testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }
    
    double residue_sq;
    if (testtype == 6){
      //invertMultiShiftQudaMixed(spinorOutArray, in->v, &inv_param, offsets, num_offsets, &residue_sq);
    }else{      
      invertMultiShiftQuda(outArray, in->v, &inv_param, offsets, num_offsets, &residue_sq);	
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    
    printfQuda("checking the solution\n");
    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }
    
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, 
		       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->v, fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, parity);
#endif
      mxpy(in->v, ref->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      
      printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
	ret |=1;
	errorQuda("Converge failed!\n");
      }
    }
    
    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }

    
  }//switch
    

  if (testtype <=2){

    printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
	
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}
コード例 #9
0
int main(int argc, char **argv)
{
  // set QUDA parameters

  int device = 0; // CUDA device number

  QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_HALF_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();
 
  gauge_param.X[0] = 16; 
  gauge_param.X[1] = 16;
  gauge_param.X[2] = 16;
  gauge_param.X[3] = 16;
  inv_param.Ls = 16;

  gauge_param.anisotropy = 1.0;
  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
  
  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH;
  inv_param.inv_type = QUDA_CG_INVERTER;

  inv_param.mass = 0.01;
  inv_param.m5 = -1.5;
  double kappa5 = 0.5/(5 + inv_param.m5);

  inv_param.tol = 5e-8;
  inv_param.maxiter = 1000;
  inv_param.reliable_delta = 0.1;

  inv_param.solution_type = QUDA_MAT_SOLUTION;
  inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE;
  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;

  inv_param.cpu_prec = cpu_prec;
  inv_param.cuda_prec = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
  inv_param.prec_precondition = cuda_prec_sloppy;
  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  inv_param.dirac_tune = QUDA_TUNE_YES;
  inv_param.preserve_dirac = QUDA_PRESERVE_DIRAC_YES;

  gauge_param.ga_pad = 0; // 24*24*24;
  inv_param.sp_pad = 0;   // 24*24*24;
  inv_param.cl_pad = 0;   // 24*24*24;

  inv_param.verbosity = QUDA_VERBOSE;

  // Everything between here and the call to initQuda() is application-specific.

  // set parameters for the reference Dslash, and prepare fields to be loaded
  setDims(gauge_param.X, inv_param.Ls);

  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

  void *gauge[4];

  for (int dir = 0; dir < 4; dir++) {
    gauge[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param);

  void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
  void *spinorOut = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
  void *spinorCheck = malloc(V*spinorSiteSize*sSize*inv_param.Ls);

  // create a point source at 0
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) *((float*)spinorIn) = 1.0;
  else *((double*)spinorIn) = 1.0;

  // start the timer
  double time0 = -((double)clock());

  // initialize the QUDA library
  initQuda(device);

  // load the gauge field
  loadGaugeQuda((void*)gauge, &gauge_param);

  // perform the inversion
  invertQuda(spinorOut, spinorIn, &inv_param);

  // stop the timer
  time0 += clock();
  time0 /= CLOCKS_PER_SEC;

  printf("Device memory used:\n   Spinor: %f GiB\n    Gauge: %f GiB\n", 
	 inv_param.spinorGiB, gauge_param.gaugeGiB);
  printf("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", 
	 inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);

  if (inv_param.solution_type == QUDA_MAT_SOLUTION) { 
    mat(spinorCheck, gauge, spinorOut, kappa5, 0, inv_param.cpu_prec, 
	gauge_param.cpu_prec, inv_param.mass); 
    if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION)
      ax(0.5/kappa5, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
  } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) {   
    matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, 
	  inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass);
    if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION)
      ax(0.25/(kappa5*kappa5), spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
  }

  mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
  double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
  double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec);
  printf("Relative residual: requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));

  // finalize the QUDA library
  endQuda();

  return 0;
}
コード例 #10
0
ファイル: quda_interface.c プロジェクト: Marcogarofalo/tmLQCD
void _initQuda() {
  if( quda_initialized )
    return;

  if( g_debug_level > 0 )
    if(g_proc_id == 0)
      printf("\n# QUDA: Detected QUDA version %d.%d.%d\n\n", QUDA_VERSION_MAJOR, QUDA_VERSION_MINOR, QUDA_VERSION_SUBMINOR);
  if( QUDA_VERSION_MAJOR == 0 && QUDA_VERSION_MINOR < 7) {
    fprintf(stderr, "Error: minimum QUDA version required is 0.7.0 (for support of chiral basis and removal of bug in mass normalization with preconditioning).\n");
    exit(-2);
  }

  gauge_param = newQudaGaugeParam();
  inv_param = newQudaInvertParam();

  // *** QUDA parameters begin here (sloppy prec. will be adjusted in invert)
  QudaPrecision cpu_prec  = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;

  QudaTune tune = QUDA_TUNE_YES;


  // *** the remainder should not be changed for this application
  // local lattice size
#if USE_LZ_LY_LX_T
  gauge_param.X[0] = LZ;
  gauge_param.X[1] = LY;
  gauge_param.X[2] = LX;
  gauge_param.X[3] = T;
#else
  gauge_param.X[0] = LX;
  gauge_param.X[1] = LY;
  gauge_param.X[2] = LZ;
  gauge_param.X[3] = T;
#endif

  inv_param.Ls = 1;

  gauge_param.anisotropy = 1.0;
  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;

  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = 18;
  gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = 18;
  gauge_param.cuda_prec_precondition = cuda_prec_precondition;
  gauge_param.reconstruct_precondition = 18;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  inv_param.dagger = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
  inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION;

  inv_param.pipeline = 0;
  inv_param.gcrNkrylov = 10;

  // require both L2 relative and heavy quark residual to determine convergence
//  inv_param.residual_type = (QudaResidualType)(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL);
  inv_param.tol_hq = 1.0;//1e-3; // specify a tolerance for the residual for heavy quark residual
  inv_param.reliable_delta = 1e-2; // ignored by multi-shift solver

  // domain decomposition preconditioner parameters
  inv_param.inv_type_precondition = QUDA_CG_INVERTER;
  inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ;
  inv_param.precondition_cycle = 1;
  inv_param.tol_precondition = 1e-1;
  inv_param.maxiter_precondition = 10;
  inv_param.verbosity_precondition = QUDA_SILENT;
  inv_param.cuda_prec_precondition = cuda_prec_precondition;
  inv_param.omega = 1.0;

  inv_param.cpu_prec = cpu_prec;
  inv_param.cuda_prec = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;

  inv_param.clover_cpu_prec = cpu_prec;
  inv_param.clover_cuda_prec = cuda_prec;
  inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
  inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;

  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
  inv_param.gamma_basis = QUDA_CHIRAL_GAMMA_BASIS;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
  inv_param.output_location = QUDA_CPU_FIELD_LOCATION;

  inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;

  gauge_param.ga_pad = 0; // 24*24*24/2;
  inv_param.sp_pad = 0; // 24*24*24/2;
  inv_param.cl_pad = 0; // 24*24*24/2;

  // For multi-GPU, ga_pad must be large enough to store a time-slice
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;

  // solver verbosity
  if( g_debug_level == 0 )
    inv_param.verbosity = QUDA_SILENT;
  else if( g_debug_level == 1 )
    inv_param.verbosity = QUDA_SUMMARIZE;
  else
    inv_param.verbosity = QUDA_VERBOSE;

  // general verbosity
  setVerbosityQuda( QUDA_SUMMARIZE, "# QUDA: ", stdout);

  // declare the grid mapping used for communications in a multi-GPU grid
#if USE_LZ_LY_LX_T
  int grid[4] = {g_nproc_z, g_nproc_y, g_nproc_x, g_nproc_t};
#else
  int grid[4] = {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t};
#endif

  initCommsGridQuda(4, grid, commsMap, NULL);

  // alloc gauge_quda
  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

  for (int dir = 0; dir < 4; dir++) {
    gauge_quda[dir] = (double*) malloc(VOLUME*18*gSize);
    if(gauge_quda[dir] == NULL) {
      fprintf(stderr, "_initQuda: malloc for gauge_quda[dir] failed");
      exit(-2);
    }
  }

  // alloc space for a temp. spinor, used throughout this module
  tempSpinor  = (double*)malloc( 2*VOLUME*24*sizeof(double) ); /* factor 2 for doublet */
  if(tempSpinor == NULL) {
    fprintf(stderr, "_initQuda: malloc for tempSpinor failed");
    exit(-2);
  }

  // initialize the QUDA library
#ifdef MPI
  initQuda(-1); //sets device numbers automatically
#else
  initQuda(0);  //scalar build: use device 0
#endif
  quda_initialized = 1;
}
コード例 #11
0
void init()
{    

  initQuda(device);

  gaugeParam = newQudaGaugeParam();
  inv_param = newQudaInvertParam();
  
  gaugeParam.X[0] = X[0] = xdim;
  gaugeParam.X[1] = X[1] = ydim;
  gaugeParam.X[2] = X[2] = zdim;
  gaugeParam.X[3] = X[3] = tdim;

  setDims(gaugeParam.X);
  setSpinorSiteSize(6);

  gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
  gaugeParam.cuda_prec = prec;
  gaugeParam.reconstruct = link_recon;
  gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
  gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
    
  gaugeParam.anisotropy = 1.0;
  gaugeParam.tadpole_coeff = 0.8;
  gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
  gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  gaugeParam.gaugeGiB = 0;
    
  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
  inv_param.cuda_prec = prec;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dagger = dagger;
  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dslash_type = QUDA_ASQTAD_DSLASH;

  inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
  inv_param.output_location = QUDA_CPU_FIELD_LOCATION;

  int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]);
  tmpint = MAX(tmpint, X[0]*X[1]*X[3]);
  tmpint = MAX(tmpint, X[0]*X[1]*X[2]);
  
  
  gaugeParam.ga_pad = tmpint;
  inv_param.sp_pad = tmpint;

  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  if (test_type < 2) {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    csParam.x[0] /= 2;
  } else {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    csParam.siteSubset = QUDA_FULL_SITE_SUBSET;	
  }

  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
  csParam.create = QUDA_ZERO_FIELD_CREATE;    

  spinor = new cpuColorSpinorField(csParam);
  spinorOut = new cpuColorSpinorField(csParam);
  spinorRef = new cpuColorSpinorField(csParam);

  csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  csParam.x[0] = gaugeParam.X[0];
    
  printfQuda("Randomizing fields ...\n");
    
  spinor->Source(QUDA_RANDOM_SOURCE);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
    
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  if (fatlink == NULL || longlink == NULL){
    errorQuda("ERROR: malloc failed for fatlink/longlink");
  }
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
  
#ifdef MULTI_GPU
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  cpuFat->exchangeGhost();
  ghost_fatlink = cpuFat->Ghost();

  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  cpuLong->exchangeGhost();
  ghost_longlink = cpuLong->Ghost();

  int x_face_size = X[1]*X[2]*X[3]/2;
  int y_face_size = X[0]*X[2]*X[3]/2;
  int z_face_size = X[0]*X[1]*X[3]/2;
  int t_face_size = X[0]*X[1]*X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gaugeParam.ga_pad = pad_size;    
#endif

  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;

  printfQuda("Fat links sending..."); 
  loadGaugeQuda(fatlink, &gaugeParam);
  printfQuda("Fat links sent\n"); 
  
  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;  

#ifdef MULTI_GPU
  gaugeParam.ga_pad = 3*pad_size;
#endif

  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = link_recon;
  printfQuda("Long links sending..."); 
  loadGaugeQuda(longlink, &gaugeParam);
  printfQuda("Long links sent...\n"); 

    printfQuda("Sending fields to GPU..."); 
    
  if (!transfer) {

    //csParam.verbose = QUDA_DEBUG_VERBOSE;
	
    csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    csParam.pad = inv_param.sp_pad;
    csParam.precision = inv_param.cuda_prec;
    if (test_type < 2){
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /=2;
    }

    printfQuda("Creating cudaSpinor\n");
    cudaSpinor = new cudaColorSpinorField(csParam);

    printfQuda("Creating cudaSpinorOut\n");
    cudaSpinorOut = new cudaColorSpinorField(csParam);
	
    printfQuda("Sending spinor field to GPU\n");
    *cudaSpinor = *spinor;
	
    cudaDeviceSynchronize();
    checkCudaError();
	
    double spinor_norm2 = norm2(*spinor);
    double cuda_spinor_norm2=  norm2(*cudaSpinor);
    printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2);
	
    if(test_type == 2){
      csParam.x[0] /=2;
    }
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    tmp = new cudaColorSpinorField(csParam);

    bool pc = (test_type != 2);
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);

    diracParam.verbose = QUDA_VERBOSE;
    diracParam.tmp1=tmp;

    dirac = Dirac::create(diracParam);
	
  } else {
    errorQuda("Error not suppported");
  }
    
  return;
}
コード例 #12
0
ファイル: 1hadr.GPU.mom.cpp プロジェクト: cpviolator/QUDA-CPS
int main(int argc, char *argv[]) {

  int seed = atoi(argv[1]);
  int device = atoi(argv[2]);

  initQuda(device);
  Start(&argc,&argv);

  DoArg do_arg;
  setup_do_arg(do_arg, seed, NSITES_3D, NSITES_T, BETA);
  GJP.Initialize(do_arg);

  //VRB.DeactivateAll();
  
  GwilsonFclover lat;
  CommonArg c_arg;

  //Declare args for Gaussian Smearing
  QPropWGaussArg g_arg;
  g_arg.gauss_link_smear_type=GAUSS_LS_TYPE;   //Link smearing
  g_arg.gauss_link_smear_coeff=GAUSS_LS_COEFF; //Link smearing
  g_arg.gauss_link_smear_N=GAUSS_LS_N;         //Link smearing hits
  g_arg.gauss_N = GAUSS_N;                     //Source/Sink smearing hits
  g_arg.gauss_W = sqrt(KAPPA*4*g_arg.gauss_N); //Smearing parameter.

  char is_qu[5];
  #ifdef QUENCH
    GhbArg ghb_arg;
    ghb_arg.num_iter = 1;
    AlgGheatBath hb(lat, &c_arg, &ghb_arg);
    strcpy(is_qu,"QUEN");
  #else
    HmdArg hmd_arg;
    setup_hmd_arg(hmd_arg);
    AlgHmcPhi hmc(lat, &c_arg, &hmd_arg);
    strcpy(is_qu,"UNQU");
  #endif

  int sweep_counter = 0;
  int total_updates = NTHERM + NSKIP*(NDATA-1);

  QPropWArg arg0;
  arg0.t=0;
  arg0.x=0;
  arg0.y=0;
  arg0.z=0;
  arg0.cg.mass = MASS;
  arg0.cg.stop_rsd = STOP_RSD;
  arg0.cg.max_num_iter = MAX_NUM_ITER;
  arg0.cg.Inverter = INVERTER_TYPE;
  arg0.cg.bicgstab_n = BICGSTAB_N;

  int x2[4];
  WilsonMatrix t4;		
  Float d0_t4t4c_re_tr = 0.0;
  int x2_idx = 0;
  int vol3d = pow(NSITES_3D,3);
  char lattice[256]; //lattice config file
  char file[256];  //output file

  //////////////////////
  // Start simulation //
  ////////////////////// 

  while (sweep_counter < total_updates) {
    for (int n = 1; n <= NSKIP; n++) {
#ifdef READ
      //do nothing
#else
      #ifdef QUENCH 
	hb.run();
      #else 
	hmc.run();
      #endif
#endif
      sweep_counter++;
      if (!UniqueID()) {
        printf("step %d complete\n",sweep_counter);
        fflush(stdout);
      }
    }

    if (sweep_counter == NTHERM) printf("thermalization complete. \n");
    if (sweep_counter >= NTHERM) {

      // Use this code to specify a gauge configuration.
      #ifdef QUENCH
        sprintf(lattice, LATT_PATH"QU/lat_hb_B%.2f_%d-%d_%d.dat", BETA, NSITES_3D, NSITES_T, sweep_counter);
      #else
	sprintf(lattice, LATT_PATH"UNQ/lat_hmc_B%.2f_M%.3f_%d-%d_%d.dat", BETA, NSITES_3D, NSITES_T, sweep_counter);
      #endif
#ifdef READ
      ReadLatticeParallel(lat,lattice);
#else
      WriteLatticeParallel(lat,lattice);
#endif
      gaugecounter = 1;

      // Get Point Source Propagator
      // This will place a unit wall source t plane set at the coordinates
      // specified by arg0, modulated by a phase set by P. It will then be
      // smeared using the parameters specified by g_arg.

      //Set the momentum phase.
      int P[3] = {P1,P2,P3};

      //Smear the source using the parameters set by g_arg.
      QPropWMomSrcSmeared qprop0(lat, &arg0, P, &g_arg, &c_arg);
      // Smear the sink with the same g_arg parameters.
      qprop0.GaussSmearSinkProp(g_arg);
      
      //Sum over x2
      for (x2[3]=0; x2[3]<GJP.TnodeSites(); x2[3]++) {
	//Reinitialise trace
	d0_t4t4c_re_tr *= 0.0;	
	for (x2[2]=0; x2[2]<GJP.ZnodeSites(); x2[2]++)
	  for (x2[1]=0; x2[1]<GJP.YnodeSites(); x2[1]++)
	    for (x2[0]=0; x2[0]<GJP.XnodeSites(); x2[0]++) {
	      x2_idx = lat.GsiteOffset(x2)/4;

	      //Get propagator sinked at x2.
	      t4 = qprop0[x2_idx];
	      //Get the real part of the trace.
	      d0_t4t4c_re_tr += MMDag_re_tr(t4);
	    }
	
	//////////////////////////
	// Write trace to file. //
	//////////////////////////
	
	//Write data file so that the data can be reproduced from the name of the file.
	sprintf(file, DATAPATH"MOM_%d%d%d_GPU_%d_B%.2f_M%.3f_N%d_W%.3f_n%d_xi%.2f_1pion_%s_stout_%d-%d.dat",
	P[0], P[1], P[2], seed, BETA, MASS, g_arg.gauss_N, g_arg.gauss_W, g_arg.gauss_link_smear_N, 
	g_arg.gauss_link_smear_coeff, is_qu, NSITES_3D, NSITES_T);
	
	FILE *t4tr=Fopen(file,"a");
	Fprintf(t4tr,"%d %d %d %.16e\n", sweep_counter, x2[3], 0, d0_t4t4c_re_tr);
	Fclose(t4tr);
	cout<<"time slice = "<<x2[3]<<" complete."<<endl;
	
	//////////////////////////////////////////
	// End trace summation at time slice t. //
	//////////////////////////////////////////
	
      }
    }
  }
  
  ////////////////////
  // End simulation //
  ////////////////////
  
  //End();
  endQuda();
  return 0;
}
コード例 #13
0
ファイル: invert_dw_quda_v3.c プロジェクト: etmc/cvc
int main(int argc, char **argv) {
  
  int c, i, mu, status;
  int ispin, icol, isc;
  int n_c = 3;
  int n_s = 4;
  int count        = 0;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  int grid_size[4];
  int l_LX_at, l_LXstart_at;
  int x0, x1, x2, x3, ix, iix, iy, is, it, i3;
  int sl0, sl1, sl2, sl3, have_source_flag=0;
  int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3;
  int check_residuum = 0;
  unsigned int VOL3, V5;
  int do_gt   = 0;
  int full_orbit = 0;
  int smear_source = 0;
  char filename[200], source_filename[200], source_filename_write[200];
  double ratime, retime;
  double plaq_r=0., plaq_m=0., norm, norm2;
  double spinor1[24];
  double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL;
  double _1_2_kappa, _2_kappa, phase;
  FILE *ofs;
  int mu_trans[4] = {3, 0, 1, 2};
  int threadid, nthreads;
  int timeslice, source_timeslice;
  char rng_file_in[100], rng_file_out[100];
  int *source_momentum=NULL;
  int source_momentum_class = -1;
  int source_momentum_no = 0;
  int source_momentum_runs = 1;
  int imom;
  int num_gpu_on_node=0, rank;
  int source_location_5d_iseven;
  int convert_sign=0;
#ifdef HAVE_QUDA
  int rotate_gamma_basis = 1;
#else
  int rotate_gamma_basis = 0;
#endif
  omp_lock_t *lck = NULL, gen_lck[1];
  int key = 0;


  /****************************************************************************/
  /* for smearing parallel to inversion                                       */
  double *smearing_spinor_field[] = {NULL,NULL};
  int dummy_flag = 0;
  /****************************************************************************/


  /****************************************************************************/
#if (defined HAVE_QUDA) && (defined MULTI_GPU)
  int x_face_size, y_face_size, z_face_size, t_face_size, pad_size;
#endif
  /****************************************************************************/

  /************************************************/
  int qlatt_nclass;
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL;
  double **qlatt_list=NULL;
  /************************************************/

  /************************************************/
  double boundary_condition_factor;
  int boundary_condition_factor_set = 0;
  /************************************************/

//#ifdef MPI       
//  kernelPackT = true;
//#endif

  /***********************************************
   * QUDA parameters
   ***********************************************/
#ifdef HAVE_QUDA
  QudaPrecision cpu_prec         = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec        = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();
#endif

  while ((c = getopt(argc, argv, "soch?vgf:p:b:S:R:")) != -1) {
    switch (c) {
    case 'v':
      g_verbose = 1;
      break;
    case 'g':
      do_gt = 1;
      break;
    case 'f':
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'c':
      check_residuum = 1;
      fprintf(stdout, "# [invert_dw_quda] will check residuum again\n");
      break;
    case 'p':
      n_c = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] will use number of colors = %d\n", n_c);
      break;
    case 'o':
      full_orbit = 1;
      fprintf(stdout, "# [invert_dw_quda] will invert for full orbit, if source momentum set\n");
    case 's':
      smear_source = 1;
      fprintf(stdout, "# [invert_dw_quda] will smear the sources if they are read from file\n");
      break;
    case 'b':
      boundary_condition_factor = atof(optarg);
      boundary_condition_factor_set = 1;
      fprintf(stdout, "# [invert_dw_quda] const. boundary condition factor set to %e\n", boundary_condition_factor);
      break;
    case 'S':
      convert_sign = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] using convert sign %d\n", convert_sign);
      break;
    case 'R':
      rotate_gamma_basis = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] rotate gamma basis %d\n", rotate_gamma_basis);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  // get the time stamp
  g_the_time = time(NULL);

  /**************************************
   * set the default values, read input
   **************************************/
  if(filename_set==0) strcpy(filename, "cvc.input");
  if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename);
  read_input_parser(filename);

#ifdef MPI
#ifdef HAVE_QUDA
  grid_size[0] = g_nproc_x;
  grid_size[1] = g_nproc_y;
  grid_size[2] = g_nproc_z;
  grid_size[3] = g_nproc_t;
  fprintf(stdout, "# [] g_nproc = (%d,%d,%d,%d)\n", g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t);
  initCommsQuda(argc, argv, grid_size, 4);
#else
  MPI_Init(&argc, &argv);
#endif
#endif

#if (defined PARALLELTX) || (defined PARALLELTXY)
  EXIT_WITH_MSG(1, "[] Error, 2-dim./3-dim. MPI-Version not yet implemented");
#endif


  // some checks on the input data
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "[invert_dw_quda] Error, T and L's must be set\n");
    usage();
  }

  // set number of openmp threads

  // initialize MPI parameters
  mpi_init(argc, argv);
  
  // the volume of a timeslice
  VOL3 = LX*LY*LZ;
  V5   = T*LX*LY*LZ*L5;
  g_kappa5d = 0.5 / (5. + g_m5);
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] kappa5d = %e\n", g_kappa5d);

  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n"\
		  "# [%2d] L5           = %3d\n",\
                  g_cart_id, g_cart_id, T, g_cart_id, Tstart, g_cart_id, L5);


#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
    exit(2);
  }
#endif

  if(init_geometry() != 0) {
    fprintf(stderr, "[invert_dw_quda] Error from init_geometry\n");
    EXIT(1);
  }
  geometry();

  if( init_geometry_5d() != 0 ) {
    fprintf(stderr, "[invert_dw_quda] Error from init_geometry_5d\n");
    EXIT(2);
  }
  geometry_5d();

  /**************************************
   * initialize the QUDA library
   **************************************/
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] initializing quda\n");
#ifdef HAVE_QUDA
  // cudaGetDeviceCount(&num_gpu_on_node);
  if(g_gpu_per_node<0) {
    if(g_cart_id==0) fprintf(stderr, "[] Error, number of GPUs per node not set\n");
    EXIT(106);
  } else {
    num_gpu_on_node = g_gpu_per_node;
  }
#ifdef MPI
  rank = comm_rank();
#else
  rank = 0;
#endif
  g_gpu_device_number = rank % num_gpu_on_node;
  fprintf(stdout, "# [] process %d/%d uses device %d\n", rank, g_cart_id, g_gpu_device_number);

  initQuda(g_gpu_device_number);

#endif
 
  /**************************************
   * prepare the gauge field
   **************************************/
  // read the gauge field from file
  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(strcmp( gaugefilename_prefix, "identity")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up unit gauge field\n");
    for(ix=0;ix<VOLUME; ix++) {
      for(mu=0;mu<4;mu++) {
        _cm_eq_id(g_gauge_field+_GGI(ix,mu));
      }
    }
  } else if(strcmp( gaugefilename_prefix, "random")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up random gauge field with seed = %d\n", g_seed);
    init_rng_state(g_seed, &g_rng_state);
    random_gauge_field(g_gauge_field, 1.);
    plaquette(&plaq_m);
    sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
    check_error(write_lime_gauge_field(filename, plaq_m, Nconf, 64), "write_lime_gauge_field", NULL, 12);
  } else {
    if(g_gauge_file_format == 0) {
      // ILDG
      sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_lime_gauge_field_doubleprec(filename);
    } else if(g_gauge_file_format == 1) {
      // NERSC
      sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r);
      //status = read_nersc_gauge_field_3x3(g_gauge_field, filename, &plaq_r);

    }
    if(status != 0) {
      fprintf(stderr, "[invert_dw_quda] Error, could not read gauge field");
      EXIT(12);
    }
  }
#ifdef MPI
  xchange_gauge();
#endif

  // measure the plaquette
  plaquette(&plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Read plaquette value    : %25.16e\n", plaq_r);

#ifndef HAVE_QUDA
  if(N_Jacobi>0) {
#endif
    // allocate the smeared / qdp ordered gauge field
    alloc_gauge_field(&gauge_field_smeared, VOLUMEPLUSRAND);
    for(i=0;i<4;i++) {
      gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME;
    }
#ifndef HAVE_QUDA
  }
#endif

#ifdef HAVE_QUDA
  // transcribe the gauge field

  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy,mu)
  for(ix=0;ix<VOLUME;ix++) {
    iy = g_lexic2eot[ix];
    for(mu=0;mu<4;mu++) {
      _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu));
    }
  }
  // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition)
  if(g_proc_coords[0]==g_nproc_t-1) {
    if(!boundary_condition_factor_set) boundary_condition_factor = -1.;
    fprintf(stdout, "# [] process %d multiplies gauge-field timeslice T_global-1 with boundary condition factor %e\n", g_cart_id,
      boundary_condition_factor);

  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy)
    for(ix=0;ix<VOL3;ix++) {
      iix = (T-1)*VOL3 + ix;
      iy = g_lexic2eot[iix];
      _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.);
    }
  }

  // QUDA precision parameters
  switch(g_cpu_prec) {
    case 0: cpu_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = half\n"); break;
    case 1: cpu_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = single\n"); break;
    case 2: cpu_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = double\n"); break;
    default: cpu_prec = QUDA_DOUBLE_PRECISION; break;
  }
  switch(g_gpu_prec) {
    case 0: cuda_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = half\n"); break;
    case 1: cuda_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = single\n"); break;
    case 2: cuda_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = double\n"); break;
    default: cuda_prec = QUDA_DOUBLE_PRECISION; break;
  }
  switch(g_gpu_prec_sloppy) {
    case 0: cuda_prec_sloppy = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = half\n"); break;
    case 1: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = single\n"); break;
    case 2: cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = double\n"); break;
    default: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; break;
  }

  // QUDA gauge parameters
  gauge_param.X[0] = LX;
  gauge_param.X[1] = LY;
  gauge_param.X[2] = LZ;
  gauge_param.X[3] = T;
  inv_param.Ls = L5;

  gauge_param.anisotropy  = 1.0;
  gauge_param.type        = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary  = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec           = cpu_prec;
  gauge_param.cuda_prec          = cuda_prec;
  gauge_param.reconstruct        = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy   = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix          = QUDA_GAUGE_FIXED_NO;

  gauge_param.ga_pad = 0;
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  // For multi-GPU, ga_pad must be large enough to store a time-slice
#ifdef MULTI_GPU
  x_face_size = inv_param.Ls * gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  y_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  z_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  t_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  pad_size = _MAX(x_face_size, y_face_size);
  pad_size = _MAX(pad_size, z_face_size);
  pad_size = _MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;
  if(g_cart_id==0) printf("# [invert_dw_quda] pad_size = %d\n", pad_size);
#endif

  // load the gauge field
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] loading gauge field\n");
  loadGaugeQuda((void*)gauge_qdp, &gauge_param);
  gauge_qdp[0] = NULL; 
  gauge_qdp[1] = NULL; 
  gauge_qdp[2] = NULL; 
  gauge_qdp[3] = NULL; 

#endif

  /*********************************************
   * APE smear the gauge field
   *********************************************/
  if(N_Jacobi>0) {
    memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUMEPLUSRAND*sizeof(double));
    fprintf(stdout, "# [invert_dw_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape);
    APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape);
    xchange_gauge_field(gauge_field_smeared);
  }

  // allocate memory for the spinor fields
#ifdef HAVE_QUDA
  no_fields = 3+2;
#else
  no_fields = 6+2;
#endif
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND*L5);
  smearing_spinor_field[0] = g_spinor_field[no_fields-2];
  smearing_spinor_field[1] = g_spinor_field[no_fields-1];

  switch(g_source_type) {
    case 0:
    case 5:
      // the source locaton
      sl0 =   g_source_location                              / (LX_global*LY_global*LZ);
      sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / (          LY_global*LZ);
      sl2 = ( g_source_location % (          LY_global*LZ) ) / (                    LZ);
      sl3 =   g_source_location %                      LZ;
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3);
      source_proc_coords[0] = sl0 / T;
      source_proc_coords[1] = sl1 / LX;
      source_proc_coords[2] = sl2 / LY;
      source_proc_coords[3] = sl3 / LZ;
    #ifdef MPI
      MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id);
    #else
      g_source_proc_id = 0;
    #endif
      have_source_flag = g_source_proc_id == g_cart_id;
    
      lsl0 = sl0 % T;
      lsl1 = sl1 % LX;
      lsl2 = sl2 % LY;
      lsl3 = sl3 % LZ;
      if(have_source_flag) {
        fprintf(stdout, "# [invert_dw_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3);
      }
      break;
    case 2:
    case 3:
    case 4:
      // the source timeslice
#ifdef MPI
      source_proc_coords[0] = g_source_timeslice / T;
      source_proc_coords[1] = 0;
      source_proc_coords[2] = 0;
      source_proc_coords[3] = 0;
      MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id);
      have_source_flag = ( g_source_proc_id == g_cart_id );
      source_timeslice = have_source_flag ? g_source_timeslice % T : -1;
#else
      g_source_proc_id = 0;
      have_source_flag = 1;
      source_timeslice = g_source_timeslice;
#endif
      break;
  }

#ifdef HAVE_QUDA
  /*************************************************************
   * QUDA inverter parameters
   *************************************************************/
  inv_param.dslash_type    = QUDA_DOMAIN_WALL_DSLASH;

  if(strcmp(g_inverter_type_name, "cg") == 0) {
    inv_param.inv_type       = QUDA_CG_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using cg inverter\n"); 
  } else if(strcmp(g_inverter_type_name, "bicgstab") == 0) {
    inv_param.inv_type       = QUDA_BICGSTAB_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using bicgstab inverter\n");
#ifdef MULTI_GPU    
  } else if(strcmp(g_inverter_type_name, "gcr") == 0) {
    inv_param.inv_type       = QUDA_GCR_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using gcr inverter\n"); 
#endif
  } else {
    if(g_cart_id==0) fprintf(stderr, "[invert_dw_quda] Error, unrecognized inverter type %s\n", g_inverter_type_name);
    EXIT(123);
  }


  if(inv_param.inv_type == QUDA_CG_INVERTER) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.solve_type    = QUDA_NORMEQ_PC_SOLVE;
  } else if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.solve_type    = QUDA_DIRECT_PC_SOLVE;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
    inv_param.solve_type    = QUDA_DIRECT_PC_SOLVE;
  }

  inv_param.m5             = g_m5;
  inv_param.kappa          = 0.5 / (5. + inv_param.m5);
  inv_param.mass           = g_m0;

  inv_param.tol            = solver_precision;
  inv_param.maxiter        = niter_max;
  inv_param.reliable_delta = reliable_delta;

#ifdef MPI
  // domain decomposition preconditioner parameters
  if(inv_param.inv_type == QUDA_GCR_INVERTER) {
    if(g_cart_id == 0) printf("# [] settup DD parameters\n");
    inv_param.gcrNkrylov     = 15;
    inv_param.inv_type_precondition = QUDA_MR_INVERTER;
    inv_param.tol_precondition = 1e-6;
    inv_param.maxiter_precondition = 200;
    inv_param.verbosity_precondition = QUDA_VERBOSE;
    inv_param.prec_precondition = cuda_prec_sloppy;
    inv_param.omega = 0.7;
  }
#endif

  inv_param.matpc_type         = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger             = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION;

  inv_param.cpu_prec         = cpu_prec;
  inv_param.cuda_prec        = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;

  inv_param.verbosity = QUDA_VERBOSE;

  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;
#ifdef MPI
  inv_param.preserve_dirac = QUDA_PRESERVE_DIRAC_YES;
  inv_param.prec_precondition = cuda_prec_sloppy;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_tune = QUDA_TUNE_NO;
#endif
#endif

  /*******************************************
   * write initial rng state to file
   *******************************************/
  if( g_source_type==2 && g_coherent_source==2 ) {
    sprintf(rng_file_out, "%s.0", g_rng_filename);
    status = init_rng_stat_file (g_seed, rng_file_out);
    if( status != 0 ) {
      fprintf(stderr, "[invert_dw_quda] Error, could not write rng status\n");
      EXIT(210);
    }
  } else if( (g_source_type==2 /*&& g_coherent_source==1*/) || g_source_type==3 || g_source_type==4) {
    if( init_rng_state(g_seed, &g_rng_state) != 0 ) {
      fprintf(stderr, "[invert_dw_quda] Error, could initialize rng state\n");
      EXIT(211);
    }
  }

  /*******************************************
   * prepare locks for openmp
   *******************************************/
  nthreads = g_num_threads - 1;
  lck = (omp_lock_t*)malloc(nthreads * sizeof(omp_lock_t));
  if(lck == NULL) {
      EXIT_WITH_MSG(97, "[invert_dw_quda] Error, could not allocate lck\n");
  }
  // init locks
  for(i=0;i<nthreads;i++) {
    omp_init_lock(lck+i);
  }
  omp_init_lock(gen_lck);

  // check the source momenta
  if(g_source_momentum_set) {
    source_momentum = (int*)malloc(3*sizeof(int));

    if(g_source_momentum[0]<0) g_source_momentum[0] += LX_global;
    if(g_source_momentum[1]<0) g_source_momentum[1] += LY_global;
    if(g_source_momentum[2]<0) g_source_momentum[2] += LZ_global;
    fprintf(stdout, "# [invert_dw_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);


    if(full_orbit) {
      status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
      if(status != 0) {
        if(g_cart_id==0) fprintf(stderr, "\n[invert_dw_quda] Error while creating O_3-lists\n");
        EXIT(4);
      }
      source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]];
      source_momentum_no    = qlatt_count[source_momentum_class];
      source_momentum_runs  = source_momentum_class==0 ? 1 : source_momentum_no + 1;
      if(g_cart_id==0) fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n",
          source_momentum_class, source_momentum_no, source_momentum_runs);
    }
  }

  if(g_source_type == 5) {
    if(g_seq_source_momentum_set) {
      if(g_seq_source_momentum[0]<0) g_seq_source_momentum[0] += LX_global;
      if(g_seq_source_momentum[1]<0) g_seq_source_momentum[1] += LY_global;
      if(g_seq_source_momentum[2]<0) g_seq_source_momentum[2] += LZ_global;
    } else if(g_source_momentum_set) {
      g_seq_source_momentum[0] = g_source_momentum[0];
      g_seq_source_momentum[1] = g_source_momentum[1];
      g_seq_source_momentum[2] = g_source_momentum[2];
    }
    fprintf(stdout, "# [invert_dw_quda] using final sequential source momentum ( %d, %d, %d )\n",
        g_seq_source_momentum[0], g_seq_source_momentum[1], g_seq_source_momentum[2]);
  }


  /***********************************************
   * loop on spin-color-index
   ***********************************************/
  for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++)
//  for(isc=g_source_index[0]; isc<=g_source_index[0]; isc++)
  {
    ispin = isc / n_c;
    icol  = isc % n_c;

    for(imom=0; imom<source_momentum_runs; imom++) {

      /***********************************************
       * set source momentum
       ***********************************************/
      if(g_source_momentum_set) {
        if(imom == 0) {
          if(full_orbit) {
            source_momentum[0] = 0;
            source_momentum[1] = 0;
            source_momentum[2] = 0;
          } else {
            source_momentum[0] = g_source_momentum[0];
            source_momentum[1] = g_source_momentum[1];
            source_momentum[2] = g_source_momentum[2];
          }
        } else {
          source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY_global*LZ_global);
          source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY_global*LZ_global) ) / LZ_global;
          source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ_global;
        }
        if(g_cart_id==0) fprintf(stdout, "# [] run no. %d, source momentum (%d, %d, %d)\n",
            imom, source_momentum[0], source_momentum[1], source_momentum[2]);
      
      }
 
      /***********************************************
       * prepare the souce
       ***********************************************/
      if(g_read_source == 0) {  // create source
        switch(g_source_type) {
          case 0:
            // point source
            if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating point source\n");
            for(ix=0;ix<L5*VOLUME;ix++) { _fv_eq_zero(g_spinor_field[0]+ix); }
            if(have_source_flag) {
              if(g_source_momentum_set) {
                phase = 2*M_PI*( source_momentum[0]*sl1/(double)LX_global + source_momentum[1]*sl2/(double)LY_global + source_momentum[2]*sl3/(double)LZ_global );
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)  ] = cos(phase);
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)+1] = sin(phase);
              } else {
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)  ] = 1.;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d",
                  filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol);
            }
#ifdef HAVE_QUDA
            // set matpc_tpye
            source_location_5d_iseven = ( (g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin<n_s/2) || (!g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin>=n_s/2) ) ? 1 : 0;
            if(source_location_5d_iseven) {
              inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_EVEN_EVEN\n");
            } else {
              inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_ODD_ODD\n");
            }
#endif
            break;
          case 2:
            // timeslice source
            if(g_coherent_source==1) {
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating coherent timeslice source\n");
              status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1);
              if(status != 0) {
                fprintf(stderr, "[invert_dw_quda] Error from prepare source, status was %d\n", status);
#ifdef MPI
                MPI_Abort(MPI_COMM_WORLD, 123);
                MPI_Finalize();
#endif
                exit(123);
              }
              check_error(prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1),
                  "prepare_coherent_timeslice_source", NULL, 123);
              timeslice = g_coherent_source_base;
            } else {
              if(g_coherent_source==2) {
                timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global;
                fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n");
                check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, g_rng_state, 1),
                    "prepare_timeslice_source", NULL, 123);
              } else {
                if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n");
                check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_state, 1),
                    "prepare_timeslice_source", NULL, 124);
                timeslice = g_source_timeslice;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc);
            }
            break;
          case 3:
            // timeslice sources for one-end trick (spin dilution)
            fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n");
            check_error( prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum, isc%n_s, g_rng_state, \
                ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end", NULL, 125 );
            c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          case 4:
            // timeslice sources for one-end trick (spin and color dilution )
            fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n");
            check_error(prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum,\
                isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1)  && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end_color", NULL, 126);
            c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c);
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          case 5:
            if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] preparing sequential point source\n");
            check_error( prepare_sequential_point_source (g_spinor_field[0], isc, sl0, g_seq_source_momentum, 
                  smear_source, g_spinor_field[1], gauge_field_smeared), "prepare_sequential_point_source", NULL, 33);
            sprintf(source_filename, "%s.%.4d.t%.2dx%.2d.y%.2d.z%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf,
                sl0, sl1, sl2, sl3, isc, g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);
            break;
          default:
            fprintf(stderr, "\nError, unrecognized source type\n");
            exit(32);
            break;
        }
      } else { // read source
        switch(g_source_type) {
          case 0:  // point source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \
                  filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else  {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc);
            }
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            break;
          case 2:  // timeslice source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice,
                  isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc);
            }
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            break;
          default:
            check_error(1, "source type", NULL, 104);
            break;
          case -1:  // timeslice source
            sprintf(source_filename, "%s", filename_prefix2);
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            break;
        }
      }  // of if g_read_source
  
      if(g_write_source) {
        check_error(write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision), "write_propagator", NULL, 27);
      }

/***********************************************************************************************
 * here threads split: 
 ***********************************************************************************************/
      if(dummy_flag==0) strcpy(source_filename_write, source_filename);
      memcpy((void*)(smearing_spinor_field[0]), (void*)(g_spinor_field[0]), 24*VOLUME*sizeof(double));
      if(dummy_flag>0) {
        // copy only if smearing has been done; otherwise do not copy, do not invert
        if(g_cart_id==0) fprintf(stdout, "# [] copy smearing field -> g field\n");
        memcpy((void*)(g_spinor_field[0]), (void*)(smearing_spinor_field[1]), 24*VOLUME*sizeof(double));
      }

      omp_set_num_threads(g_num_threads);
#pragma omp parallel private(threadid, _2_kappa, is, ix, iy, iix, ratime, retime) shared(key,g_read_source, smear_source, N_Jacobi, kappa_Jacobi, smearing_spinor_field, g_spinor_field, nthreads, convert_sign, VOLUME, VOL3, T, L5, isc, rotate_gamma_basis, g_cart_id) firstprivate(inv_param, gauge_param, ofs)
{
      threadid = omp_get_thread_num();

  if(threadid < nthreads) {
      fprintf(stdout, "# [] proc%.2d thread%.2d starting source preparation\n", g_cart_id, threadid);

      // smearing
      if( ( !g_read_source || (g_read_source && smear_source ) ) && N_Jacobi > 0 ) {
        if(g_cart_id==0) fprintf(stdout, "#  [invert_dw_quda] smearing source with N_Jacobi=%d, kappa_Jacobi=%e\n", N_Jacobi, kappa_Jacobi);
        Jacobi_Smearing_threaded(gauge_field_smeared, smearing_spinor_field[0], smearing_spinor_field[1], kappa_Jacobi, N_Jacobi, threadid, nthreads);
      }


      /***********************************************
       * create the 5-dim. source field
       ***********************************************/
      if(convert_sign == 0) {
        spinor_4d_to_5d_threaded(smearing_spinor_field[0], smearing_spinor_field[0], threadid, nthreads);
      }  else if(convert_sign == 1 || convert_sign == -1) {
        spinor_4d_to_5d_sign_threaded(smearing_spinor_field[0], smearing_spinor_field[0], convert_sign, threadid, nthreads);
      }


      for(is=0; is<L5; is++) {
        for(it=threadid; it<T; it+=nthreads) {
          memcpy((void*)(g_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), (void*)(smearing_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), VOL3*24*sizeof(double));
        }
      }


      // reorder, multiply with g2
      for(is=0; is<L5; is++) {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = (is*T+it)*VOL3 + i3;
            _fv_eq_zero(smearing_spinor_field[1]+_GSI(ix));
      }}} 

      if(rotate_gamma_basis) {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(0, ix);
            _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix));
        }}
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(L5-1, ix);
            _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME));
        }}
      } else {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(0, ix);
            _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix));
        }}
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(L5-1, ix);
            _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME));
        }}
      }
      fprintf(stdout, "# [] proc%.2d thread%.2d finished source preparation\n", g_cart_id, threadid);

  } else if(threadid == g_num_threads-1 && dummy_flag > 0) {  // else branch on threadid
      fprintf(stdout, "# [] proc%.2d thread%.2d starting inversion for dummy_flag = %d\n", g_cart_id, threadid, dummy_flag);

      /***********************************************
       * perform the inversion
       ***********************************************/
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n");

      xchange_field_5d(g_spinor_field[0]);
      memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double));
      ratime = CLOCK;
#ifdef MPI
      if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER  || inv_param.inv_type == QUDA_GCR_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n");
        invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else if(inv_param.inv_type == QUDA_CG_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n");
        testCG(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else {
        if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n");
      }
#else
      invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
#endif
      retime = CLOCK;

      if(g_cart_id==0) {
        fprintf(stdout, "# [invert_dw_quda] QUDA time:  %e seconds\n", inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] wall time:  %e seconds\n", retime-ratime);
        fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
      }
  }  // of if threadid

// wait till all threads are here
#pragma omp barrier

      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa5d;
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
        }
      }
  
#pragma omp barrier
      // reorder, multiply with g2
      for(is=0;is<L5;is++) {
      for(ix=threadid; ix<VOLUME; ix+=g_num_threads) {
        iy  = lexic2eot_5d(is, ix);
        iix = is*VOLUME + ix;
        _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy));
      }}
#pragma omp barrier
      if(rotate_gamma_basis) {
        for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) {
          _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
        }
      } else {
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        }
      }
      if(g_cart_id==0 && threadid==g_num_threads-1) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime);

#pragma omp single
  {

#ifdef MPI
      xchange_field_5d(g_spinor_field[1]);
#endif
      /***********************************************
       * check residuum
       ***********************************************/
      if(check_residuum && dummy_flag>0) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
#ifdef MPI
        xchange_field_5d(g_spinor_field[2]);
        xchange_field_5d(g_spinor_field[1]);
#endif
        memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double));

        //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field_5d(g_spinor_field[1], ofs);
        //fclose(ofs);

        Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]);
  
        for(ix=0;ix<VOLUME*L5;ix++) {
          _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix));
        }
  
        spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5);
        spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5);
        if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );

      }
  
      if(dummy_flag>0) {
        /***********************************************
         * create 4-dim. propagator
         ***********************************************/
        if(convert_sign == 0) {
          spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]);
        } else if(convert_sign == -1 || convert_sign == +1) {
          spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign);
        }
  
        /***********************************************
         * write the solution 
         ***********************************************/
        sprintf(filename, "%s.inverted", source_filename_write);
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename);
        check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22);
        
        //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field(g_spinor_field[1], ofs);
        //fclose(ofs);
      }

      if(check_residuum) memcpy(g_spinor_field[2], smearing_spinor_field[0], 24*VOLUME*L5*sizeof(double));

  }  // of omp single

}    // of omp parallel region

      if(dummy_flag > 0) strcpy(source_filename_write, source_filename);

      dummy_flag++;
 
    }  // of loop on momenta

  }  // of isc

#if 0
  // last inversion

  {
      memcpy(g_spinor_field[0], smearing_spinor_field[1], 24*VOLUME*L5*sizeof(double));
      if(g_cart_id==0) fprintf(stdout, "# [] proc%.2d starting last inversion\n", g_cart_id);


      /***********************************************
       * perform the inversion
       ***********************************************/
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n");

      xchange_field_5d(g_spinor_field[0]);
      memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double));
      ratime = CLOCK;
#ifdef MPI
      if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER  || inv_param.inv_type == QUDA_GCR_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n");
        invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else if(inv_param.inv_type == QUDA_CG_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n");
        testCG(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else {
        if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n");
      }
#else
      invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
#endif
      retime = CLOCK;

      if(g_cart_id==0) {
        fprintf(stdout, "# [invert_dw_quda] QUDA time:  %e seconds\n", inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] wall time:  %e seconds\n", retime-ratime);
        fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
      }

      omp_set_num_threads(g_num_threads);
#pragma omp parallel private(threadid,_2_kappa,is,ix,iy,iix) shared(VOLUME,L5,g_kappa,g_spinor_field,g_num_threads)
    {
      threadid = omp_get_thread_num();

      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa5d;
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
        }
      }
#pragma omp barrier
      // reorder, multiply with g2
      for(is=0;is<L5;is++) {
      for(ix=threadid; ix<VOLUME; ix+=g_num_threads) {
        iy  = lexic2eot_5d(is, ix);
        iix = is*VOLUME + ix;
        _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy));
      }}
#pragma omp barrier
      if(rotate_gamma_basis) {
        for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) {
          _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
        }
      } else {
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        }
      }

    }  // end of parallel region

    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime);


#ifdef MPI
      xchange_field_5d(g_spinor_field[1]);
#endif
      /***********************************************
       * check residuum
       ***********************************************/
      if(check_residuum && dummy_flag>0) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
#ifdef MPI
        xchange_field_5d(g_spinor_field[2]);
#endif
        memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double));

        //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field_5d(g_spinor_field[1], ofs);
        //fclose(ofs);


        Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]);
  
        for(ix=0;ix<VOLUME*L5;ix++) {
          _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix));
        }
  
        spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5);
        spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5);
        if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );

      }
  
      /***********************************************
       * create 4-dim. propagator
       ***********************************************/
      if(convert_sign == 0) {
        spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]);
      } else if(convert_sign == -1 || convert_sign == +1) {
        spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign);
      }
  
      /***********************************************
       * write the solution 
       ***********************************************/
      sprintf(filename, "%s.inverted", source_filename_write);
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename);
      check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22);
        
      //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id);
      //ofs = fopen(filename, "w");
      //printf_spinor_field(g_spinor_field[1], ofs);
      //fclose(ofs);
  }  // of last inversion

#endif  // of if 0

  /***********************************************
   * free the allocated memory, finalize 
   ***********************************************/

#ifdef HAVE_QUDA
  // finalize the QUDA library
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] finalizing quda\n");
#ifdef MPI
  freeGaugeQuda();
#endif
  endQuda();
#endif
  if(g_gauge_field != NULL) free(g_gauge_field);
  if(gauge_field_smeared != NULL) free(gauge_field_smeared);
  if(no_fields>0) {
    if(g_spinor_field!=NULL) {
      for(i=0; i<no_fields; i++) if(g_spinor_field[i]!=NULL) free(g_spinor_field[i]);
      free(g_spinor_field);
    }
  }
  free_geometry();

  if(g_source_momentum_set && full_orbit) {
    finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep);
    if(qlatt_map != NULL) {
      free(qlatt_map[0]);
      free(qlatt_map);
    }
  }
  if(source_momentum != NULL) free(source_momentum);
  if(lck != NULL) free(lck);


#ifdef MPI
#ifdef HAVE_QUDA
  endCommsQuda();
#else
  MPI_Finalize();
#endif
#endif
  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time));
  }
  return(0);
}
コード例 #14
0
ファイル: invert_quda_cg.c プロジェクト: etmc/cvc
int main(int argc, char **argv) {
  
  int c, i, mu, status;
  int ispin, icol, isc;
  int n_c = 3;
  int n_s = 4;
  int count        = 0;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  int l_LX_at, l_LXstart_at;
  int x0, x1, x2, x3, ix, iix, iy;
  int sl0, sl1, sl2, sl3, have_source_flag=0;
  int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3, source_proc_id;
  int check_residuum = 0;
  unsigned int VOL3;
  int do_gt   = 0;
  int full_orbit = 0;
  char filename[200], source_filename[200];
  double ratime, retime;
  double plaq_r=0., plaq_m=0., norm, norm2;
  // double spinor1[24], spinor2[24];
  double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL;
  double _1_2_kappa, _2_kappa, phase;
  FILE *ofs;
  int mu_trans[4] = {3, 0, 1, 2};
  int threadid, nthreads;
  int timeslice;
  char rng_file_in[100], rng_file_out[100];
  int *source_momentum=NULL;
  int source_momentum_class = -1;
  int source_momentum_no = 0;
  int source_momentum_runs = 1;
  int imom;

  /************************************************/
  int qlatt_nclass;
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL;
  double **qlatt_list=NULL;
  /************************************************/
       

  /***********************************************
   * QUDA parameters
   ***********************************************/
  QudaPrecision cpu_prec         = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec        = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();


#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "och?vgf:p:")) != -1) {
    switch (c) {
    case 'v':
      g_verbose = 1;
      break;
    case 'g':
      do_gt = 1;
      break;
    case 'f':
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'c':
      check_residuum = 1;
      fprintf(stdout, "# [invert_quda] will check residuum again\n");
      break;
    case 'p':
      n_c = atoi(optarg);
      fprintf(stdout, "# [invert_quda] will use number of colors = %d\n", n_c);
      break;
    case 'o':
      full_orbit = 1;
      fprintf(stdout, "# [invert_quda] will invert for full orbit, if source momentum set\n");
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  // get the time stamp
  g_the_time = time(NULL);

  /**************************************
   * set the default values, read input
   **************************************/
  if(filename_set==0) strcpy(filename, "cvc.input");
  if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename);
  read_input_parser(filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, T and L's must be set\n");
    usage();
  }
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, kappa should be > 0.n");
    usage();
  }

  // set number of openmp threads
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#else
  fprintf(stdout, "[invert_quda_cg] Warning, resetting global number of threads to 1\n");
  g_num_threads = 1;
#endif

  /* initialize MPI parameters */
  mpi_init(argc, argv);
  
  // the volume of a timeslice
  VOL3 = LX*LY*LZ;

  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n",\
		  g_cart_id, g_cart_id, T, g_cart_id, Tstart);

#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
    exit(2);
  }
#endif

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
#endif
    exit(1);
  }

  geometry();


  /**************************************
   * initialize the QUDA library
   **************************************/
  fprintf(stdout, "# [invert_quda] initializing quda\n");
  initQuda(g_gpu_device_number);
  
  /**************************************
   * prepare the gauge field
   **************************************/
  // read the gauge field from file
  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(strcmp( gaugefilename_prefix, "identity")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_quda] Setting up unit gauge field\n");
    for(ix=0;ix<VOLUME; ix++) {
      for(mu=0;mu<4;mu++) {
        _cm_eq_id(g_gauge_field+_GGI(ix,mu));
      }
    }
  } else {
    if(g_gauge_file_format == 0) {
      // ILDG
      sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_lime_gauge_field_doubleprec(filename);
    } else if(g_gauge_file_format == 1) {
      // NERSC
      sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r);
    }
    if(status != 0) {
      fprintf(stderr, "[invert_quda] Error, could not read gauge field");
#ifdef MPI
      MPI_Abort(MPI_COMM_WORLD, 12);
      MPI_Finalize();
#endif
      exit(12);
    }
  }
#ifdef MPI
  xchange_gauge();
#endif

  // measure the plaquette
  plaquette(&plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Read plaquette value    : %25.16e\n", plaq_r);

  // allocate the smeared / qdp ordered gauge field
  alloc_gauge_field(&gauge_field_smeared, VOLUME);
  for(i=0;i<4;i++) {
    gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME;
  }


  // transcribe the gauge field
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy,mu)
#endif
  for(ix=0;ix<VOLUME;ix++) {
    iy = g_lexic2eot[ix];
    for(mu=0;mu<4;mu++) {
      _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu));
    }
  }
  // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition)
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy)
#endif
  for(ix=0;ix<VOL3;ix++) {
    iix = (T-1)*VOL3 + ix;
    iy = g_lexic2eot[iix];
    _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.);
  }


  // QUDA gauge parameters
  gauge_param.X[0] = LX_global;
  gauge_param.X[1] = LY_global;
  gauge_param.X[2] = LZ_global;
  gauge_param.X[3] = T_global;

  gauge_param.anisotropy  = 1.0;
  gauge_param.type        = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary  = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec           = cpu_prec;
  gauge_param.cuda_prec          = cuda_prec;
  gauge_param.reconstruct        = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy   = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix          = QUDA_GAUGE_FIXED_NO;

  gauge_param.ga_pad = 0;

  // load the gauge field
  fprintf(stdout, "# [invert_quda] loading gauge field\n");
  loadGaugeQuda((void*)gauge_qdp, &gauge_param);
  gauge_qdp[0] = NULL; 
  gauge_qdp[1] = NULL; 
  gauge_qdp[2] = NULL; 
  gauge_qdp[3] = NULL; 

  /*********************************************
   * APE smear the gauge field
   *********************************************/
  memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUME*sizeof(double));
  if(N_ape>0) {
    fprintf(stdout, "# [invert_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape);
#ifdef OPENMP
     APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape);
#else
    for(i=0; i<N_ape; i++) {
       APE_Smearing_Step(gauge_field_smeared, alpha_ape);
     }
#endif
  }

  /* allocate memory for the spinor fields */
  no_fields = 3;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND);

  /* the source locaton */
  sl0 =   g_source_location                              / (LX_global*LY_global*LZ);
  sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / (          LY_global*LZ);
  sl2 = ( g_source_location % (          LY_global*LZ) ) / (                    LZ);
  sl3 =   g_source_location %                      LZ;
  if(g_cart_id==0) fprintf(stdout, "# [invert_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3);
  source_proc_coords[0] = sl0 / T;
  source_proc_coords[1] = sl1 / LX;
  source_proc_coords[2] = sl2 / LY;
  source_proc_coords[3] = sl3 / LZ;
#ifdef MPI
  MPI_Cart_rank(g_cart_grid, source_proc_coords, &source_proc_id);
#else
  source_proc_id = 0;
#endif
  have_source_flag = source_proc_id == g_cart_id;

  lsl0 = sl0 % T;
  lsl1 = sl1 % LX;
  lsl2 = sl2 % LY;
  lsl3 = sl3 % LZ;
  if(have_source_flag) {
    fprintf(stdout, "# [invert_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3);
  }

  // QUDA inverter parameters
  inv_param.dslash_type    = QUDA_WILSON_DSLASH;
//  inv_param.inv_type       = QUDA_BICGSTAB_INVERTER;
  inv_param.inv_type       = QUDA_CG_INVERTER;
  inv_param.kappa          = g_kappa;
  inv_param.tol            = solver_precision;
  inv_param.maxiter        = niter_max;
  inv_param.reliable_delta = reliable_delta;

  inv_param.solution_type      = QUDA_MAT_SOLUTION;
//  inv_param.solve_type         = QUDA_DIRECT_PC_SOLVE;
  inv_param.solve_type         = QUDA_NORMEQ_PC_SOLVE;
  inv_param.matpc_type         = QUDA_MATPC_EVEN_EVEN; // QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger             = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION;

  inv_param.cpu_prec         = cpu_prec;
  inv_param.cuda_prec        = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
  inv_param.preserve_source  = QUDA_PRESERVE_SOURCE_NO;
  inv_param.dirac_order      = QUDA_DIRAC_ORDER;

  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  inv_param.verbosity = QUDA_VERBOSE;

  // write initial rng state to file
  if(g_source_type==2 && g_coherent_source==2) {
    sprintf(rng_file_out, "%s.0", g_rng_filename);
    if( init_rng_stat_file (g_seed, rng_file_out) != 0 ) {
      fprintf(stderr, "[invert_quda] Error, could not write rng status\n");
      exit(210);
    }
  } else if(g_source_type==3 || g_source_type==4) {
    if( init_rng_state(g_seed, &g_rng_state) != 0 ) {
      fprintf(stderr, "[invert_quda] Error, could initialize rng state\n");
      exit(211);
    }
  }

  // check the source momenta
  if(g_source_momentum_set) {
    source_momentum = (int*)malloc(3*sizeof(int));

    if(g_source_momentum[0]<0) g_source_momentum[0] += LX;
    if(g_source_momentum[1]<0) g_source_momentum[1] += LY;
    if(g_source_momentum[2]<0) g_source_momentum[2] += LZ;
    fprintf(stdout, "# [invert_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);


    if(full_orbit) {
      status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
      if(status != 0) {
        fprintf(stderr, "\n[invert_quda] Error while creating O_3-lists\n");
        exit(4);
      }
      source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]];
      source_momentum_no    = qlatt_count[source_momentum_class];
      source_momentum_runs  = source_momentum_class==0 ? 1 : source_momentum_no + 1;
      fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n",
          source_momentum_class, source_momentum_no, source_momentum_runs);
    }
  }


  /***********************************************
   * loop on spin-color-index
   ***********************************************/
  for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++) {
    ispin = isc / n_c;
    icol  = isc % n_c;

    for(imom=0; imom<source_momentum_runs; imom++) {

      /***********************************************
       * set source momentum
       ***********************************************/
      if(g_source_momentum_set) {
        if(imom == 0) {
          if(full_orbit) {
            source_momentum[0] = 0;
            source_momentum[1] = 0;
            source_momentum[2] = 0;
          } else {
            source_momentum[0] = g_source_momentum[0];
            source_momentum[1] = g_source_momentum[1];
            source_momentum[2] = g_source_momentum[2];
          }
        } else {
          source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY*LZ);
          source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY*LZ) ) / LZ;
          source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ;
        }
        fprintf(stdout, "# [] run no. %d, source momentum (%d, %d, %d)\n", imom, source_momentum[0], source_momentum[1], source_momentum[2]);
      }
 
      /***********************************************
       * prepare the souce
       ***********************************************/
      if(g_read_source == 0) {  // create source
        switch(g_source_type) {
          case 0:
            // point source
            fprintf(stdout, "# [invert_quda] Creating point source\n");
            for(ix=0;ix<24*VOLUME;ix++) g_spinor_field[0][ix] = 0.;
            if(have_source_flag) {
              if(g_source_momentum_set) {
                phase = 2*M_PI*( source_momentum[0]*lsl1/(double)LX + source_momentum[1]*lsl2/(double)LY + source_momentum[2]*lsl3/(double)LZ );
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)  ] = cos(phase);
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)+1] = sin(phase);
              } else {
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)  ] = 1.;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d",
                  filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol);
            }
            break;
          case 2:
            // timeslice source
            if(g_coherent_source==1) {
              fprintf(stdout, "# [invert_quda] Creating coherent timeslice source\n");
              status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_filename, NULL);
              if(status != 0) {
                fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                exit(123);
              }
              timeslice = g_coherent_source_base;
            } else {
              if(g_coherent_source==2) {
                strcpy(rng_file_in, rng_file_out);
                if(isc == g_source_index[1]) { strcpy(rng_file_out, g_rng_filename); }
                else                         { sprintf(rng_file_out, "%s.%d", g_rng_filename, isc+1); }
                timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global;
                fprintf(stdout, "# [invert_quda] Creating timeslice source\n");
                status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, rng_file_in, rng_file_out);
                if(status != 0) {
                  fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                  exit(123);
                }
              } else {
                fprintf(stdout, "# [invert_quda] Creating timeslice source\n");
                status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_filename, g_rng_filename);
                if(status != 0) {
                  fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                  exit(124);
                }
                timeslice = g_source_timeslice;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc);
            }
            break;
          case 3:
            // timeslice sources for one-end trick (spin dilution)
            fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n");
            status = prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum, isc%n_s, g_rng_state, \
                ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 ) );
            if(status != 0) {
              fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
              exit(125);
            }
            c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          case 4:
            // timeslice sources for one-end trick (spin and color dilution )
            fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n");
            status = prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum,\
                isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1)  && imom==source_momentum_runs-1 ) );
            if(status != 0) {
              fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
              exit(126);
            }
            c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c);
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          default:
            fprintf(stderr, "\nError, unrecognized source type\n");
            exit(32);
            break;
        }
      } else { // read source
        switch(g_source_type) {
          case 0:  // point source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \
                  filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else  {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc);
            }
            fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename);
            status = read_lime_spinor(g_spinor_field[0], source_filename, 0);
            if(status != 0) {
              fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename);
              exit(115);
            }
            break;
          case 2:  // timeslice source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice,
                  isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc);
            }
            fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename);
            status = read_lime_spinor(g_spinor_field[0], source_filename, 0);
            if(status != 0) {
              fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename);
              exit(115);
            }
            break;
          default:
            fprintf(stderr, "[] Error, unrecognized source type for reading\n");
            exit(104);
            break;
        }
      }  // of if g_read_source
  
      //sprintf(filename, "%s.ascii", source_filename);
      //ofs = fopen(filename, "w");
      //printf_spinor_field(g_spinor_field[0], ofs);
      //fclose(ofs);
  
      if(g_write_source) {
        status = write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision);
        if(status != 0) {
          fprintf(stderr, "Error from write_propagator, status was %d\n", status);
          exit(27);
        }
      }
  
      // smearing
      if(N_Jacobi > 0) {
  #ifdef OPENMP
        Jacobi_Smearing_Step_one_threads(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], N_Jacobi, kappa_Jacobi);
  #else
        for(c=0; c<N_Jacobi; c++) {
          Jacobi_Smearing_Step_one(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], kappa_Jacobi);
        }
  #endif
      }
  
      // multiply with g2
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
      }
  
      // transcribe the spinor field to even-odd ordering with coordinates (x,y,z,t)
      for(ix=0;ix<VOLUME;ix++) {
        iy = g_lexic2eot[ix];
        _fv_eq_fv(g_spinor_field[2]+_GSI(iy), g_spinor_field[1]+_GSI(ix));
      }
  
  
      /***********************************************
       * perform the inversion
       ***********************************************/
      fprintf(stdout, "# [invert_quda] starting inversion\n");
      ratime = (double)clock() / CLOCKS_PER_SEC;
  
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_zero(g_spinor_field[1]+_GSI(ix) );
      }
  
      invertQuda(g_spinor_field[1], g_spinor_field[2], &inv_param);
  
      retime = (double)clock() / CLOCKS_PER_SEC;
      fprintf(stdout, "# [invert_quda] inversion done in %e seconds\n", retime-ratime);
      fprintf(stdout, "# [invert_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
  
      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa;
        for(ix=0;ix<VOLUME;ix++) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
        }
      }
  
      // transcribe the spinor field to lexicographical order with (t,x,y,z)
      for(ix=0;ix<VOLUME;ix++) {
        iy = g_lexic2eot[ix];
        _fv_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[1]+_GSI(iy));
      }
      // multiply with g2
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[2]+_GSI(ix));
      }
  
      /***********************************************
       * check residuum
       ***********************************************/
      if(check_residuum) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
        Q_Wilson_phi(g_spinor_field[2], g_spinor_field[1]);
  
        for(ix=0;ix<VOLUME;ix++) {
          _fv_mi_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        }
  
        spinor_scalar_product_re(&norm, g_spinor_field[2], g_spinor_field[2], VOLUME);
        spinor_scalar_product_re(&norm2, g_spinor_field[0], g_spinor_field[0], VOLUME);
        fprintf(stdout, "\n# [invert_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );
      }
  
      /***********************************************
       * write the solution 
       ***********************************************/
      sprintf(filename, "%s.inverted", source_filename);
      fprintf(stdout, "# [invert_quda] writing propagator to file %s\n", filename);
      status = write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision);
      if(status != 0) {
        fprintf(stderr, "Error from write_propagator, status was %d\n", status);
        exit(22);
      }
 
    }  // of loop on momenta

  }  // of isc

  /***********************************************
   * free the allocated memory, finalize 
   ***********************************************/

  // finalize the QUDA library
  fprintf(stdout, "# [invert_quda] finalizing quda\n");
  endQuda();

  free(g_gauge_field);
  free(gauge_field_smeared);
  for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
  free(g_spinor_field);
  free_geometry();

  if(g_source_momentum_set && full_orbit) {
    finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep);
    if(qlatt_map != NULL) {
      free(qlatt_map[0]);
      free(qlatt_map);
    }
  }
  if(source_momentum != NULL) free(source_momentum);

#ifdef MPI
  MPI_Finalize();
#endif

  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time));
  }
  return(0);
}
コード例 #15
0
void init() {

  gauge_param = newQudaGaugeParam();
  inv_param = newQudaInvertParam();

  gauge_param.X[0] = 12;
  gauge_param.X[1] = 12;
  gauge_param.X[2] = 12;
  gauge_param.X[3] = 12;
  
  setDims(gauge_param.X, Ls);

  gauge_param.anisotropy = 2.3;

  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
  gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
  gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
  gauge_param.type = QUDA_WILSON_LINKS;

  inv_param.inv_type = QUDA_CG_INVERTER;

  inv_param.mass = 0.01;
  inv_param.m5 = -1.5;
  kappa5 = 0.5/(5 + inv_param.m5);

  inv_param.Ls = Ls;
  
  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger = dagger;

  inv_param.cpu_prec = cpu_prec;
  inv_param.cuda_prec = cuda_prec;

  gauge_param.ga_pad = 0;
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  if (test_type == 2) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
  }

  inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH;

  inv_param.verbosity = QUDA_VERBOSE;

  // construct input fields
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);

  ColorSpinorParam csParam;
  
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor = 3;
  csParam.nSpin = 4;
  csParam.nDim = 5;
  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
  csParam.x[4] = Ls;
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  if (test_type < 2) {
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    csParam.x[0] /= 2;
  } else {
    csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  }    
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  
  spinor = new cpuColorSpinorField(csParam);
  spinorOut = new cpuColorSpinorField(csParam);
  spinorRef = new cpuColorSpinorField(csParam);

  csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  csParam.x[0] = gauge_param.X[0];
  
  printfQuda("Randomizing fields... ");

  construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
  spinor->Source(QUDA_RANDOM_SOURCE);

  printfQuda("done.\n"); fflush(stdout);
  
  int dev = 0;
  initQuda(dev);

  printfQuda("Sending gauge field to GPU\n");

  loadGaugeQuda(hostGauge, &gauge_param);

  if (!transfer) {
    csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
    csParam.pad = inv_param.sp_pad;
    csParam.precision = inv_param.cuda_prec;
    if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
      csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    } else {
      /* Single and half */
      csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
    }
 
    if (test_type < 2) {
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /= 2;
    }

    printfQuda("Creating cudaSpinor\n");
    cudaSpinor = new cudaColorSpinorField(csParam);
    printfQuda("Creating cudaSpinorOut\n");
    cudaSpinorOut = new cudaColorSpinorField(csParam);

    if (test_type == 2) csParam.x[0] /= 2;

    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    tmp = new cudaColorSpinorField(csParam);

    printfQuda("Sending spinor field to GPU\n");
    *cudaSpinor = *spinor;

    std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
      norm2(*cudaSpinor) << std::endl;

    bool pc = (test_type != 2);
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);
    diracParam.verbose = QUDA_DEBUG_VERBOSE;
    diracParam.tmp1 = tmp;
    diracParam.tmp2 = tmp2;
    
    dirac = Dirac::create(diracParam);

  } else {
    std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
  }
    
}
コード例 #16
0
ファイル: llfat_test.cpp プロジェクト: azrael417/quda
  static void
llfat_test(int test)
{

  QudaGaugeParam qudaGaugeParam;
#ifdef MULTI_GPU
  void* ghost_sitelink[4];
  void* ghost_sitelink_diag[16];
#endif


  initQuda(device);

  cpu_prec = prec;
  gSize = cpu_prec;  
  qudaGaugeParam = newQudaGaugeParam();

  qudaGaugeParam.anisotropy = 1.0;

  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);

  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  /*
     qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
     | QUDA_FAT_PRESERVE_GPU_GAUGE
     | QUDA_FAT_PRESERVE_COMM_MEM;
     */
  qudaGaugeParam.preserve_gauge =0;
  void* fatlink;
  if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for fatlink\n");
  }

  void* longlink;
  if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for longlink\n");
  } // page-locked memory

  void* sitelink[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink\n");
    }
  }

  void* sitelink_ex[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) ==  cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n");
    }
  }


  void* milc_sitelink;
  milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(milc_sitelink == NULL){
    errorQuda("ERROR: allocating milc_sitelink failed\n");
  }

  void* milc_sitelink_ex;
  milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
  if(milc_sitelink_ex == NULL){
    errorQuda("Error: allocating milc_sitelink failed\n");
  }



  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  if(gauge_order == QUDA_MILC_GAUGE_ORDER){
    for(int i=0; i<V; ++i){
      for(int dir=0; dir<4; ++dir){
        char* src = (char*)sitelink[dir];
        memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
      }	
    }
  }

  int X1=Z[0];
  int X2=Z[1];
  int X3=Z[2];
  int X4=Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }

    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;


    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
#ifdef MULTI_GPU
      continue;
#endif
    }



    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;

    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink[dir];
      char* dst = (char*)sitelink_ex[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);

      // milc ordering 
      memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i


  double act_path_coeff[6];
  for(int i=0;i < 6;i++){
    act_path_coeff[i]= 0.1*i;
  }


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  struct timeval t0, t1;


  void** sitelink_ptr; 
  QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD;
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink;
  }else{
    sitelink_ptr = (test) ? (void**)milc_sitelink_ex : (void**)milc_sitelink;
  }
  void* longlink_ptr = longlink;
#ifdef MULTI_GPU
  if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation
#endif

  gettimeofday(&t0, NULL);
  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method);
  gettimeofday(&t1, NULL);


  double secs = TDIFF(t0,t1);

  void* fat_reflink[4];
  void* long_reflink[4];
  for(int i=0;i < 4;i++){
    fat_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(fat_reflink[i] == NULL){
      errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i);
    }
    long_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i);
  }

  if (verify_results){

    //FIXME: we have this compplication because references takes coeff as float/double 
    //        depending on the precision while the GPU code aways take coeff as double
    void* coeff;
    double coeff_dp[6];
    float  coeff_sp[6];
    for(int i=0;i < 6;i++){
      coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
    }
    if(prec == QUDA_DOUBLE_PRECISION){
      coeff = coeff_dp;
    }else{
      coeff = coeff_sp;
    }
#ifdef MULTI_GPU
    int optflag = 0;
    //we need x,y,z site links in the back and forward T slice
    // so it is 3*2*Vs_t
    int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
    for(int i=0;i < 4; i++){
      ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
      if (ghost_sitelink[i] == NULL){
        printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
        exit(1);
      }
    }

    /*
       nu |     |
       |_____|
       mu
       */

    for(int nu=0;nu < 4;nu++){
      for(int mu=0; mu < 4;mu++){
        if(nu == mu){
          ghost_sitelink_diag[nu*4+mu] = NULL;
        }else{
          //the other directions
          int dir1, dir2;
          for(dir1= 0; dir1 < 4; dir1++){
            if(dir1 !=nu && dir1 != mu){
              break;
            }
          }
          for(dir2=0; dir2 < 4; dir2++){
            if(dir2 != nu && dir2 != mu && dir2 != dir1){
              break;
            }
          }
          ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
          if(ghost_sitelink_diag[nu*4+mu] == NULL){
            errorQuda("malloc failed for ghost_sitelink_diag\n");
          }

          memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
        }

      }
    }

    exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
    llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
  

    {
      int R[4] = {2,2,2,2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
      computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
    }
#else
    llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
    computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
#endif

  }//verify_results

  //format change for fatlink and longlink
  void* myfatlink[4];
  void* mylonglink[4];
  for(int i=0;i < 4;i++){
    myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
    if(myfatlink[i] == NULL){
      printf("Error: malloc failed for myfatlink[%d]\n", i);
      exit(1);
    }
    mylonglink[i] = malloc(V*gaugeSiteSize*gSize);
    if(mylonglink[i] == NULL){
      printf("Error: malloc failed for mylonglink[%d]\n", i);
      exit(1);
    }
    memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
    memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
  }

  for(int i=0;i < V; i++){
    for(int dir=0; dir< 4; dir++){
      char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);

      src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);
    }
  }

  if (verify_results) {
    printfQuda("Checking fat links...\n");
    int res=1;
    for(int dir=0; dir<4; dir++){
      res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    }
    
    strong_check_link(myfatlink, "GPU results: ",
		      fat_reflink, "CPU reference results:",
		      V, qudaGaugeParam.cpu_prec);
    
    printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
#ifdef MULTI_GPU
    if(test){
#endif
      printfQuda("Checking long links...\n");
      res = 1;
      for(int dir=0; dir<4; ++dir){
	res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
      }
      
      strong_check_link(mylonglink, "GPU results: ",
			long_reflink, "CPU reference results:",
			V, qudaGaugeParam.cpu_prec);
      
      printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
      
#ifdef MULTI_GPU
    }else{ // !test
      printfQuda("Extended volume is required for multi-GPU long-link construction\n");
    }
#endif
  }

  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
  int flops= 61632;
#ifdef MULTI_GPU
  if(test) flops += (252*4); // long-link contribution 
#else
  flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale)
#endif

  double perf = 1.0* flops*volume/(secs*1024*1024*1024);
  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);


  for(int i=0;i < 4;i++){
    free(myfatlink[i]);
  }

#ifdef MULTI_GPU
  if (verify_results){
    int i;
    for(i=0;i < 4;i++){
      free(ghost_sitelink[i]);
    }
    for(i=0;i <4; i++){
      for(int j=0;j <4; j++){
        if (i==j){
          continue;
        }
        free(ghost_sitelink_diag[i*4+j]);
      }
    }
  }
#endif

  for(int i=0;i < 4; i++){
    cudaFreeHost(sitelink[i]);
    cudaFreeHost(sitelink_ex[i]);
    free(fat_reflink[i]);
  }
  cudaFreeHost(fatlink);
  cudaFreeHost(longlink);
  if(milc_sitelink) free(milc_sitelink);
  if(milc_sitelink_ex) free(milc_sitelink_ex);
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
}
コード例 #17
0
  int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  set_params(&gaugeParam, &inv_param,
      xdim, ydim, zdim, tdim,
      cpu_prec, prec, prec_sloppy,
      link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
      0.8);

  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setSpinorSiteSize(6);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  fatlink = malloc(4*V*gaugeSiteSize*gSize);
  longlink = malloc(4*V*gaugeSiteSize*gSize);

  construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, 
				 &gaugeParam, dslash_type);

  const double cos_pi_3 = 0.5; // Cos(pi/3)
  const double sin_pi_3 = sqrt(0.75); // Sin(pi/3)

  for(int dir=0; dir<4; ++dir){
    for(int i=0; i<V; ++i){
      for(int j=0; j<gaugeSiteSize; ++j){
        if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
          ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX;
          if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links
            if(j%2 == 0){
              const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j];
              const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1];
              ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]     = real*cos_pi_3 - imag*sin_pi_3;
              ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
            }
          }
          ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
          ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j];
        }else{
          ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX;
          if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links
            if(j%2 == 0){
              const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j];
              const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1];
              ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]     = real*cos_pi_3 - imag*sin_pi_3;
              ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
            }
          }
          ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
          ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
          ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j];
        }
      }
    }
  }


  ColorSpinorParam csParam;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;

  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  

  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->V());    
  }else{
    constructSpinorField((double*)in->V());
  }

#ifdef MULTI_GPU
  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
  tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
  tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);

  int fat_pad = tmp_value;
  int link_pad =  3*tmp_value;

  // FIXME: currently assume staggered is SU(3)
  gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? 
    QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  ghost_fatlink = (void**)cpuFat->Ghost();

  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  ghost_longlink = (void**)cpuLong->Ghost();

  gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? 
    QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.ga_pad = fat_pad;
  gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION;
  loadGaugeQuda(fatlink, &gaugeParam);

  if (dslash_type == QUDA_ASQTAD_DSLASH) {
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = link_pad;
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }
#else
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION;
  loadGaugeQuda(fatlink, &gaugeParam);

  if (dslash_type == QUDA_ASQTAD_DSLASH) {
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.reconstruct = link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }
#endif

  double time0 = -((double)clock()); // Start the timer

  double nrm2=0;
  double src2=0;
  int ret = 0;



  switch(test_type){
    case 0: //even
      if(inv_type == QUDA_GCR_INVERTER){
      	inv_param.inv_type = QUDA_GCR_INVERTER;
      	inv_param.gcrNkrylov = 50;
      }else if(inv_type == QUDA_PCG_INVERTER){
	inv_param.inv_type = QUDA_PCG_INVERTER;
      }
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;

      invertQuda(out->V(), in->V(), &inv_param);

      time0 += clock(); 
      time0 /= CLOCKS_PER_SEC;



#ifdef MULTI_GPU    
      matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, 
          out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
      matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
#endif

      mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

      break;

    case 1: //odd
      if(inv_type == QUDA_GCR_INVERTER){
      	inv_param.inv_type = QUDA_GCR_INVERTER;
      	inv_param.gcrNkrylov = 50;
      }else if(inv_type == QUDA_PCG_INVERTER){
	inv_param.inv_type = QUDA_PCG_INVERTER;
      }

      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
      invertQuda(out->V(), in->V(), &inv_param);	
      time0 += clock(); // stop the timer
      time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, 
          out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
      matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);	
#endif
      mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

      break;

    case 2: //full spinor

      errorQuda("full spinor not supported\n");
      break;

    case 3: //multi mass CG, even
    case 4:

#define NUM_OFFSETS 12

      {    
        double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205};
        inv_param.num_offset = NUM_OFFSETS;
        // these can be set independently
        for (int i=0; i<inv_param.num_offset; i++) {
          inv_param.tol_offset[i] = inv_param.tol;
          inv_param.tol_hq_offset[i] = inv_param.tol_hq;
        }
        void* outArray[NUM_OFFSETS];
        int len;

        cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
        spinorOutArray[0] = out;    
        for(int i=1;i < inv_param.num_offset; i++){
          spinorOutArray[i] = new cpuColorSpinorField(csParam);       
        }

        for(int i=0;i < inv_param.num_offset; i++){
          outArray[i] = spinorOutArray[i]->V();
          inv_param.offset[i] = 4*masses[i]*masses[i];
        }

        len=Vh;

        if (test_type == 3) {
          inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
        } else {
          inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
        }

        invertMultiShiftQuda(outArray, in->V(), &inv_param);	

        cudaDeviceSynchronize();
        time0 += clock(); // stop the timer
        time0 /= CLOCKS_PER_SEC;

        printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", 
            time0, inv_param.secs, inv_param.iter, inv_param.secs,
            inv_param.gflops/inv_param.secs);


        printfQuda("checking the solution\n");
        QudaParity parity = QUDA_INVALID_PARITY;
        if (inv_param.solve_type == QUDA_NORMOP_SOLVE){
          //parity = QUDA_EVENODD_PARITY;
          errorQuda("full parity not supported\n");
        }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
          parity = QUDA_EVEN_PARITY;
        }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
          parity = QUDA_ODD_PARITY;
        }else{
          errorQuda("ERROR: invalid spinor parity \n");
          exit(1);
        }
        for(int i=0;i < inv_param.num_offset;i++){
          printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
          matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, 
              spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, 
              gaugeParam.cpu_prec, tmp, parity);
#else
          matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
#endif
          mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
          double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
          double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
          double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z);
          double l2r = sqrt(nrm2/src2);

          printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
              i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, 
              inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr);

          //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge
          if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){
            ret |=1;
          }
        }

        for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i];
      }
      break;

    default:
      errorQuda("Unsupported test type");

  }//switch

  if (test_type <=2){

    double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z);
    double l2r = sqrt(nrm2/src2);

    printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
        inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);

    printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", 
        time0, inv_param.secs, inv_param.iter, inv_param.secs,
        inv_param.gflops/inv_param.secs);
  }

  end();
  return ret;
}
コード例 #18
0
ファイル: invert_test.cpp プロジェクト: sunilrt/quda
int main(int argc, char **argv)
{

  for (int i = 1; i < argc; i++){
    if(process_command_line_option(argc, argv, &i) == 0){
      continue;
    } 
    printfQuda("ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }

  if (prec_sloppy == QUDA_INVALID_PRECISION){
    prec_sloppy = prec;
  }
  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){
    link_recon_sloppy = link_recon;
  }

  // initialize QMP/MPI, QUDA comms grid and RNG (test_util.cpp)
  initComms(argc, argv, gridsize_from_cmdline);

  display_test_info();

  // *** QUDA parameters begin here.

  if (dslash_type != QUDA_WILSON_DSLASH &&
      dslash_type != QUDA_CLOVER_WILSON_DSLASH &&
      dslash_type != QUDA_TWISTED_MASS_DSLASH &&
      dslash_type != QUDA_DOMAIN_WALL_4D_DSLASH &&
      dslash_type != QUDA_MOBIUS_DWF_DSLASH &&
      dslash_type != QUDA_TWISTED_CLOVER_DSLASH &&
      dslash_type != QUDA_DOMAIN_WALL_DSLASH) {
    printfQuda("dslash_type %d not supported\n", dslash_type);
    exit(0);
  }

  QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec = prec;
  QudaPrecision cuda_prec_sloppy = prec_sloppy;
  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();
 
  double kappa5;

  gauge_param.X[0] = xdim;
  gauge_param.X[1] = ydim;
  gauge_param.X[2] = zdim;
  gauge_param.X[3] = tdim;
  inv_param.Ls = 1;

  gauge_param.anisotropy = 1.0;
  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
  
  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = link_recon;
  gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = link_recon_sloppy;
  gauge_param.cuda_prec_precondition = cuda_prec_precondition;
  gauge_param.reconstruct_precondition = link_recon_sloppy;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  inv_param.dslash_type = dslash_type;

  inv_param.mass = mass;
  inv_param.kappa = 1.0 / (2.0 * (1 + 3/gauge_param.anisotropy + mass));

  if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
    inv_param.mu = 0.12;
    inv_param.epsilon = 0.1385;
    inv_param.twist_flavor = twist_flavor;
    inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) ? 2 : 1;
  } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
             dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
    inv_param.m5 = -1.8;
    kappa5 = 0.5/(5 + inv_param.m5);  
    inv_param.Ls = Lsdim;
  } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
    inv_param.m5 = -1.8;
    kappa5 = 0.5/(5 + inv_param.m5);  
    inv_param.Ls = Lsdim;
    for(int k = 0; k < Lsdim; k++)
    {
      // b5[k], c[k] values are chosen for arbitrary values,
      // but the difference of them are same as 1.0
      inv_param.b_5[k] = 1.452;
      inv_param.c_5[k] = 0.452;
    }
  }

  // offsets used only by multi-shift solver
  inv_param.num_offset = 4;
  double offset[4] = {0.01, 0.02, 0.03, 0.04};
  for (int i=0; i<inv_param.num_offset; i++) inv_param.offset[i] = offset[i];

  inv_param.inv_type = inv_type;
  if (inv_param.dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  } else {
    inv_param.solution_type = multishift ? QUDA_MATPCDAG_MATPC_SOLUTION : QUDA_MATPC_SOLUTION;
  }
  inv_param.matpc_type = matpc_type;

  inv_param.dagger = QUDA_DAG_NO;
  inv_param.mass_normalization = normalization;
  inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION;

  if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || 
      dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
      dslash_type == QUDA_MOBIUS_DWF_DSLASH ||
      dslash_type == QUDA_TWISTED_MASS_DSLASH || 
      dslash_type == QUDA_TWISTED_CLOVER_DSLASH || 
      multishift || inv_type == QUDA_CG_INVERTER) {
    inv_param.solve_type = QUDA_NORMOP_PC_SOLVE;
  } else {
    inv_param.solve_type = QUDA_DIRECT_PC_SOLVE;
  }

  inv_param.pipeline = 0;

  inv_param.Nsteps = 2;
  inv_param.gcrNkrylov = 10;
  inv_param.tol = 1e-7;
  inv_param.tol_restart = 1e-3; //now theoretical background for this parameter... 
#if __COMPUTE_CAPABILITY__ >= 200
  // require both L2 relative and heavy quark residual to determine convergence
  inv_param.residual_type = static_cast<QudaResidualType>(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL);
  inv_param.tol_hq = 1e-3; // specify a tolerance for the residual for heavy quark residual
#else
  // Pre Fermi architecture only supports L2 relative residual norm
  inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
#endif
  // these can be set individually
  for (int i=0; i<inv_param.num_offset; i++) {
    inv_param.tol_offset[i] = inv_param.tol;
    inv_param.tol_hq_offset[i] = inv_param.tol_hq;
  }
  inv_param.maxiter = 10000;
  inv_param.reliable_delta = 1e-1;
  inv_param.use_sloppy_partial_accumulator = 0;
  inv_param.max_res_increase = 1;

  // domain decomposition preconditioner parameters
  inv_param.inv_type_precondition = precon_type;
    
  inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ;
  inv_param.precondition_cycle = 1;
  inv_param.tol_precondition = 1e-1;
  inv_param.maxiter_precondition = 10;
  inv_param.verbosity_precondition = QUDA_SILENT;
  inv_param.cuda_prec_precondition = cuda_prec_precondition;
  inv_param.omega = 1.0;

  inv_param.cpu_prec = cpu_prec;
  inv_param.cuda_prec = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
  inv_param.output_location = QUDA_CPU_FIELD_LOCATION;

  inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;

  gauge_param.ga_pad = 0; // 24*24*24/2;
  inv_param.sp_pad = 0; // 24*24*24/2;
  inv_param.cl_pad = 0; // 24*24*24/2;

  // For multi-GPU, ga_pad must be large enough to store a time-slice
#ifdef MULTI_GPU
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;    
#endif

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
    inv_param.clover_cpu_prec = cpu_prec;
    inv_param.clover_cuda_prec = cuda_prec;
    inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
    inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    inv_param.clover_coeff = 1.5*inv_param.kappa;
  }

  inv_param.verbosity = QUDA_VERBOSE;

  // *** Everything between here and the call to initQuda() is
  // *** application-specific.

  // set parameters for the reference Dslash, and prepare fields to be loaded
  if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
      dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
      dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
    dw_setDims(gauge_param.X, inv_param.Ls);
  } else {
    setDims(gauge_param.X);
  }

  setSpinorSiteSize(24);

  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

  void *gauge[4], *clover_inv=0, *clover=0;

  for (int dir = 0; dir < 4; dir++) {
    gauge[dir] = malloc(V*gaugeSiteSize*gSize);
  }

  if (strcmp(latfile,"")) {  // load in the command line supplied gauge field
    read_gauge_field(latfile, gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
    construct_gauge_field(gauge, 2, gauge_param.cpu_prec, &gauge_param);
  } else { // else generate a random SU(3) field
    construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param);
  }

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
    double diag = 1.0; // constant added to the diagonal

    size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
    clover_inv = malloc(V*cloverSiteSize*cSize);
    construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);

    // The uninverted clover term is only needed when solving the unpreconditioned
    // system or when using "asymmetric" even/odd preconditioning.
    int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ||
			  inv_param.solve_type == QUDA_NORMOP_PC_SOLVE);
    int asymmetric = preconditioned &&
                         (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
                          inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);
    if (!preconditioned) {
      clover = clover_inv;
      clover_inv = NULL;
    } else if (asymmetric) { // fake it by using the same random matrix
      clover = clover_inv;   // for both clover and clover_inv
    } else {
      clover = NULL;
    }
  }

  void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
  void *spinorCheck = malloc(V*spinorSiteSize*sSize*inv_param.Ls);

  void *spinorOut = NULL, **spinorOutMulti = NULL;
  if (multishift) {
    spinorOutMulti = (void**)malloc(inv_param.num_offset*sizeof(void *));
    for (int i=0; i<inv_param.num_offset; i++) {
      spinorOutMulti[i] = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
    }
  } else {
    spinorOut = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
  }

  memset(spinorIn, 0, inv_param.Ls*V*spinorSiteSize*sSize);
  memset(spinorCheck, 0, inv_param.Ls*V*spinorSiteSize*sSize);
  if (multishift) {
    for (int i=0; i<inv_param.num_offset; i++) memset(spinorOutMulti[i], 0, inv_param.Ls*V*spinorSiteSize*sSize);    
  } else {
    memset(spinorOut, 0, inv_param.Ls*V*spinorSiteSize*sSize);
  }

  // create a point source at 0 (in each subvolume...  FIXME)

  // create a point source at 0 (in each subvolume...  FIXME)
  
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) {
    //((float*)spinorIn)[0] = 1.0;
    for (int i=0; i<inv_param.Ls*V*spinorSiteSize; i++) ((float*)spinorIn)[i] = rand() / (float)RAND_MAX;
  } else {
    //((double*)spinorIn)[0] = 1.0;
    for (int i=0; i<inv_param.Ls*V*spinorSiteSize; i++) ((double*)spinorIn)[i] = rand() / (double)RAND_MAX;
  }

  // start the timer
  double time0 = -((double)clock());

  // initialize the QUDA library
  initQuda(device);

  // load the gauge field
  loadGaugeQuda((void*)gauge, &gauge_param);

  // load the clover term, if desired
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);

  if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param);

  // perform the inversion
  if (multishift) {
    invertMultiShiftQuda(spinorOutMulti, spinorIn, &inv_param);
  } else {
    invertQuda(spinorOut, spinorIn, &inv_param);
  }

  // stop the timer
  time0 += clock();
  time0 /= CLOCKS_PER_SEC;
    
  printfQuda("Device memory used:\n   Spinor: %f GiB\n    Gauge: %f GiB\n", 
	 inv_param.spinorGiB, gauge_param.gaugeGiB);
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
    printfQuda("   Clover: %f GiB\n", inv_param.cloverGiB);
  }
  printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", 
	 inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);

  if (multishift) {
    if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
      errorQuda("Mass normalization not supported for multi-shift solver in invert_test");
    }

    void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls);

    printfQuda("Host residuum checks: \n");
    for(int i=0; i < inv_param.num_offset; i++) {
      ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
      
      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
	  errorQuda("Twisted mass solution type not supported");
        tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                 inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
        tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                 inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0,
                  inv_param.cpu_prec, gauge_param);
        wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
                  inv_param.cpu_prec, gauge_param);
      } else {
        printfQuda("Domain wall not supported for multi-shift\n");
        exit(-1);
      }

      axpy(inv_param.offset[i], spinorOutMulti[i], spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      mxpy(spinorIn, spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(spinorIn, Vh*spinorSiteSize, inv_param.cpu_prec);
      double l2r = sqrt(nrm2 / src2);

      printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n",
		 i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, 
		 inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i]);
    }
    free(spinorTmp);

  } else {
    
    if (inv_param.solution_type == QUDA_MAT_SOLUTION) {

      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
	if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)      
	  tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0, inv_param.cpu_prec, gauge_param);
	else
	{
          int tm_offset = V*spinorSiteSize; //12*spinorRef->Volume(); 	  
	  void *evenOut = spinorCheck;
	  void *oddOut  = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset);
    
	  void *evenIn  = spinorOut;
	  void *oddIn   = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset);
    
	  tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param);	
	}
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
        dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
//      } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
//        dw_4d_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
//      } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
//        mdw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
      } else {
        printfQuda("Unsupported dslash_type\n");
        exit(-1);
      }
      if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
        if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || 
            dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
            dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
          ax(0.5/kappa5, spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
        } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH && twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
          ax(0.5/inv_param.kappa, spinorCheck, 2*V*spinorSiteSize, inv_param.cpu_prec);
	} else {
          ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
        }
      }

    } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) {

      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
	  errorQuda("Twisted mass solution type not supported");
        tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                 inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, 
                  inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
        dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass);
      } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
        dw_4d_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass);
      } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
        double *kappa_b, *kappa_c;
        kappa_b = (double*)malloc(Lsdim*sizeof(double));
        kappa_c = (double*)malloc(Lsdim*sizeof(double));
        for(int xs = 0 ; xs < Lsdim ; xs++)
        {
          kappa_b[xs] = 1.0/(2*(inv_param.b_5[xs]*(4.0 + inv_param.m5) + 1.0));
          kappa_c[xs] = 1.0/(2*(inv_param.c_5[xs]*(4.0 + inv_param.m5) - 1.0));
        }
        mdw_matpc(spinorCheck, gauge, spinorOut, kappa_b, kappa_c, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
        free(kappa_b);
        free(kappa_c);
      } else {
        printfQuda("Unsupported dslash_type\n");
        exit(-1);
      }

      if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
        if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
            dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
            dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
          ax(0.25/(kappa5*kappa5), spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
        } else {
          ax(0.25/(inv_param.kappa*inv_param.kappa), spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      
	}
      }

    } else if (inv_param.solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {

      void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls);

      ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
      
      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
	  errorQuda("Twisted mass solution type not supported");
        tm_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                 inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
        tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                 inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0,
                  inv_param.cpu_prec, gauge_param);
        wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
                  inv_param.cpu_prec, gauge_param);
      } else {
        printfQuda("Unsupported dslash_type\n");
        exit(-1);
      }

      if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
	errorQuda("Mass normalization not implemented");
      }

      free(spinorTmp);
    }


    int vol = inv_param.solution_type == QUDA_MAT_SOLUTION ? V : Vh;
    mxpy(spinorIn, spinorCheck, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
    double nrm2 = norm_2(spinorCheck, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
    double src2 = norm_2(spinorIn, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
    double l2r = sqrt(nrm2 / src2);

    printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n",
	       inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq);

  }

  freeGaugeQuda();
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda();
  
  // finalize the QUDA library
  endQuda();

  finalizeComms();

  return 0;
}