int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.1;

  set_params(&gaugeParam, &inv_param,
	     xdim, ydim, zdim, tdim,
	     cpu_prec, prec, prec_sloppy,
	     link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
	     0.8);
  
  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
    
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
	((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
	((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }  
 
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;
  
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  
  
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->V());    
  }else{
    constructSpinorField((double*)in->V());
  }

  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
   tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
   tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);

  int fat_pad = tmp_value;
  int link_pad =  3*tmp_value;

#ifdef MULTI_GPU
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  cpuFat->exchangeGhost();
  ghost_fatlink = (void**)cpuFat->Ghost();
  
  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  cpuLong->exchangeGhost();
  ghost_longlink = (void**)cpuLong->Ghost();
#endif
  
  if(testtype == 6){    
    record_gauge(gaugeParam.X, fatlink, fat_pad,
		 longlink, link_pad,
		 link_recon, link_recon_sloppy,
		 &gaugeParam);        
   }else{ 
    
#ifdef MULTI_GPU
 

    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = fat_pad;
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = link_pad;
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#else
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.reconstruct = link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#endif
  }

  
  double time0 = -((double)clock()); // Start the timer
  
  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;

  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    
    invertQuda(out->V(), in->V(), &inv_param);
    
    time0 += clock(); 
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU    
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
#endif
    
    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

    {
      double sol = norm_2(out->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      double refe = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    }
    break;

  case 1: //odd
	
    volume = Vh;    
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->V(), in->V(), &inv_param);	
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);	
#endif
    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
	
    break;
    
  case 2: //full spinor

    errorQuda("full spinor not supported\n");
    break;
    
  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:

#define NUM_OFFSETS 7
        
    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81};
    double offsets[NUM_OFFSETS];	
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;
    
    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;    
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);       
    }
    
    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->V();
    }

    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }
    
    len=Vh;
    volume = Vh;      

    if (testtype == 3 || testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
    } else if (testtype == 4){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }
    
    double residue_sq;
    if (testtype == 6){
      invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);
    }else{      
      invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);	
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    
    printfQuda("checking the solution\n");
    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
#endif
      mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      
      printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
	ret |=1;
      }
    }

    if (ret ==1){
      errorQuda("Converge failed!\n");
    }

    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }

    
  }//switch
    

  if (testtype <=2){

    printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
	
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}
int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.95;

  set_params(&gaugeParam, &inv_param,
	     sdim, sdim, sdim, tdim,
	     cpu_prec, prec, prec_sloppy,
	     link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
	     0.8);
  
  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
    
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
	((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
	((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }

  
#ifdef MULTI_GPU

  //exchange_init_dims(gaugeParam.X);
  int ghost_link_len[4] = {
    Vs_x*gaugeSiteSize*gSize,
    Vs_y*gaugeSiteSize*gSize,
    Vs_z*gaugeSiteSize*gSize,
    Vs_t*gaugeSiteSize*gSize
  };

  for(int i=0;i < 4;i++){
    ghost_fatlink[i] = malloc(ghost_link_len[i]);
    ghost_longlink[i] = malloc(3*ghost_link_len[i]);
    if (ghost_fatlink[i] == NULL || ghost_longlink[i] == NULL){
      printf("ERROR: malloc failed for ghost fatlink or ghost longlink\n");
      exit(1);
    }
  }

  //exchange_cpu_links4dir(fatlink, ghost_fatlink, longlink, ghost_longlink, gaugeParam.cpu_prec);

  void *fat_send[4], *long_send[4];
  for(int i=0;i < 4;i++){
    fat_send[i] = malloc(ghost_link_len[i]);
    long_send[i] = malloc(3*ghost_link_len[i]);
  }

  set_dim(Z);
  pack_ghost(fatlink, fat_send, 1, gaugeParam.cpu_prec);
  pack_ghost(longlink, long_send, 3, gaugeParam.cpu_prec);

  int dummyFace = 1;
  FaceBuffer faceBuf (Z, 4, 18, dummyFace, gaugeParam.cpu_prec);
  faceBuf.exchangeCpuLink((void**)ghost_fatlink, (void**)fat_send, 1);
  faceBuf.exchangeCpuLink((void**)ghost_longlink, (void**)long_send, 3);

  for (int i=0; i<4; i++) {
    free(fat_send[i]);
    free(long_send[i]);
  }

#endif

 
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;
  
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  
  
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->v);    
  }else{
    constructSpinorField((double*)in->v);
  }


  
  
#ifdef MULTI_GPU

  if(testtype == 6){
    record_gauge(fatlink, ghost_fatlink[3], Vsh_t,
		 longlink, ghost_longlink[3], 3*Vsh_t,
		 link_recon, link_recon_sloppy,
		 &gaugeParam);
   }else{
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2);
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = 3*MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2);
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }

#else
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  loadGaugeQuda(fatlink, &gaugeParam);
  
  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  gaugeParam.reconstruct = link_recon;
  gaugeParam.reconstruct_sloppy = link_recon_sloppy;
  loadGaugeQuda(longlink, &gaugeParam);
#endif
  
  double time0 = -((double)clock()); // Start the timer
  
  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;


  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    
    invertQuda(out->v, in->v, &inv_param);
    
    time0 += clock(); 
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU    
    matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, QUDA_EVEN_PARITY);
#endif
    
    mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    break;

  case 1: //odd
	
    volume = Vh;    
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->v, in->v, &inv_param);	
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, QUDA_ODD_PARITY);	
#endif
    mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
	
    break;
    
  case 2: //full spinor

    errorQuda("full spinor not supported\n");
    break;
    
  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:

#define NUM_OFFSETS 4
        
    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33};
    double offsets[NUM_OFFSETS];	
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;
    
    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;    
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);       
    }
    
    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->v;
    }

    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }
    
    len=Vh;
    volume = Vh;      
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;

    if (testtype == 3){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
    } else if (testtype == 4||testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }
    
    double residue_sq;
    if (testtype == 6){
      //invertMultiShiftQudaMixed(spinorOutArray, in->v, &inv_param, offsets, num_offsets, &residue_sq);
    }else{      
      invertMultiShiftQuda(outArray, in->v, &inv_param, offsets, num_offsets, &residue_sq);	
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    
    printfQuda("checking the solution\n");
    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }
    
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, 
		       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->v, fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, parity);
#endif
      mxpy(in->v, ref->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      
      printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
	ret |=1;
	errorQuda("Converge failed!\n");
      }
    }
    
    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }

    
  }//switch
    

  if (testtype <=2){

    printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
	
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}