Esempio n. 1
0
void packTest() {

  float spinorGiB = (float)Vh*spinorSiteSize*param.cuda_prec / (1 << 30);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);

  printf("Sending fields to GPU...\n"); fflush(stdout);
  
  stopwatchStart();
  param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
  createGaugeField(&cudaGauge, cpsGauge, param.cuda_prec, param.cpu_prec, param.gauge_order, param.reconstruct, 
		   param.gauge_fix, param.t_boundary, param.X, 1.0, 1.0, param.ga_pad, param.type);
  double cpsGtime = stopwatchReadSeconds();
  printf("CPS Gauge send time = %e seconds\n", cpsGtime);

  stopwatchStart();
  restoreGaugeField(cpsGauge, &cudaGauge, param.cpu_prec, param.gauge_order);
  double cpsGRtime = stopwatchReadSeconds();
  printf("CPS Gauge restore time = %e seconds\n", cpsGRtime);

  stopwatchStart();
  param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  createGaugeField(&cudaGauge, qdpGauge, param.cuda_prec, param.cpu_prec, param.gauge_order, param.reconstruct, 
		   param.gauge_fix, param.t_boundary, param.X, 1.0, 1.0, param.ga_pad, param.type);
  double qdpGtime = stopwatchReadSeconds();
  printf("QDP Gauge send time = %e seconds\n", qdpGtime);

  stopwatchStart();
  restoreGaugeField(qdpGauge, &cudaGauge, param.cpu_prec, param.gauge_order);
  double qdpGRtime = stopwatchReadSeconds();
  printf("QDP Gauge restore time = %e seconds\n", qdpGRtime);

  stopwatchStart();
  *cudaSpinor = *spinor;
  double sSendTime = stopwatchReadSeconds();
  printf("Spinor send time = %e seconds\n", sSendTime);

  stopwatchStart();
  *spinor2 = *cudaSpinor;
  double sRecTime = stopwatchReadSeconds();
  printf("Spinor receive time = %e seconds\n", sRecTime);
  
  std::cout << "Norm check: CPU = " << norm2(*spinor) << 
    ", CUDA = " << norm2(*cudaSpinor) << 
    ", CPU =  " << norm2(*spinor2) << std::endl;

  cpuColorSpinorField::Compare(*spinor, *spinor2, 1);

}
Esempio n. 2
0
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
{
  int k=0;
  int rUpdate = 0;
    
  cudaColorSpinorField r(b);

  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param); 
  
  mat(r, x, y);
  zeroCuda(y);

  double r2 = xmyNormCuda(b, r);
  rUpdate ++;
  
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (invParam.cuda_prec_sloppy == x.Precision()) {
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  cudaColorSpinorField p(rSloppy);

  double r2_old;
  double src_norm = norm2(b);
  double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  double alpha, beta;
  double pAp;

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);

  quda::blas_flops = 0;

  stopwatchStart();
  while (r2 > stop && k<invParam.maxiter) {

    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
    pAp = reDotProductCuda(p, Ap);
    alpha = r2 / pAp;        
    r2_old = r2;
    r2 = axpyNormCuda(-alpha, Ap, rSloppy);

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
    if (!(updateR || updateX)) {
      beta = r2 / r2_old;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    } else {
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
      xpyCuda(x, y); // swap these around?
      mat(r, y, x); // here we can use x as tmp
      r2 = xmyNormCuda(b, r);
      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
      zeroCuda(xSloppy);

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;      
      rUpdate++;

      beta = r2 / r2_old; 
      xpayCuda(rSloppy, beta, p);
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE)
      printfQuda("CG: %d iterations, r2 = %e\n", k, r2);
  }

  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  
  if (k==invParam.maxiter) 
    warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  //  printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = k;

  quda::blas_flops = 0;

  if (invParam.verbosity >= QUDA_SUMMARIZE){
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);
    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", 
	       k, sqrt(r2/src_norm), sqrt(true_res / src_norm));    
  }

  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
Esempio n. 3
0
void packTest() {

  float spinorGiB = (float)Vh*spinorSiteSize*param.cuda_prec / (1 << 30);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);

  printf("Sending fields to GPU...\n"); fflush(stdout);
  
  /*{
    param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
    
    GaugeFieldParam cpsParam(cpsCpuGauge_p, param);
    cpuGaugeField cpsCpuGauge(cpsParam);
    cpsParam.create = QUDA_NULL_FIELD_CREATE;
    cpsParam.precision = param.cuda_prec;
    cpsParam.reconstruct = param.reconstruct;
    cpsParam.pad = param.ga_pad;
    cpsParam.order = (cpsParam.precision == QUDA_DOUBLE_PRECISION || 
		      cpsParam.reconstruct == QUDA_RECONSTRUCT_NO ) ?
      QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
    cudaGaugeField cudaCpsGauge(cpsParam);

    stopwatchStart();
    cudaCpsGauge.loadCPUField(cpsCpuGauge);    
    double cpsGtime = stopwatchReadSeconds();
    printf("CPS Gauge send time = %e seconds\n", cpsGtime);

    stopwatchStart();
    cudaCpsGauge.saveCPUField(cpsCpuGauge);
    double cpsGRtime = stopwatchReadSeconds();
    printf("CPS Gauge restore time = %e seconds\n", cpsGRtime);
  }*/

  {
    param.gauge_order = QUDA_QDP_GAUGE_ORDER;
    
    GaugeFieldParam qdpParam(qdpCpuGauge_p, param);
    cpuGaugeField qdpCpuGauge(qdpParam);
    qdpParam.create = QUDA_NULL_FIELD_CREATE;
    qdpParam.precision = param.cuda_prec;
    qdpParam.reconstruct = param.reconstruct;
    qdpParam.pad = param.ga_pad;
    qdpParam.order = (qdpParam.precision == QUDA_DOUBLE_PRECISION || 
		      qdpParam.reconstruct == QUDA_RECONSTRUCT_NO ) ?
      QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
    cudaGaugeField cudaQdpGauge(qdpParam);

    stopwatchStart();
    cudaQdpGauge.loadCPUField(qdpCpuGauge);    
    double qdpGtime = stopwatchReadSeconds();
    printf("QDP Gauge send time = %e seconds\n", qdpGtime);

    stopwatchStart();
    cudaQdpGauge.saveCPUField(qdpCpuGauge);
    double qdpGRtime = stopwatchReadSeconds();
    printf("QDP Gauge restore time = %e seconds\n", qdpGRtime);
  }

  stopwatchStart();
  *cudaSpinor = *spinor;
  double sSendTime = stopwatchReadSeconds();
  printf("Spinor send time = %e seconds\n", sSendTime);

  stopwatchStart();
  *spinor2 = *cudaSpinor;
  double sRecTime = stopwatchReadSeconds();
  printf("Spinor receive time = %e seconds\n", sRecTime);
  
  std::cout << "Norm check: CPU = " << norm2(*spinor) << 
    ", CUDA = " << norm2(*cudaSpinor) << 
    ", CPU =  " << norm2(*spinor2) << std::endl;

  cpuColorSpinorField::Compare(*spinor, *spinor2, 1);

}
Esempio n. 4
0
void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b)
{
 
  int num_offset = invParam.num_offset;
  double *offset = invParam.offset;
  double *residue_sq = invParam.tol_offset;
 
  if (num_offset == 0) return;

  int *finished = new int [num_offset];
  double *zeta_i = new double[num_offset];
  double *zeta_im1 = new double[num_offset];
  double *zeta_ip1 = new double[num_offset];
  double *beta_i = new double[num_offset];
  double *beta_im1 = new double[num_offset];
  double *alpha = new double[num_offset];
  int i, j;
  
  int j_low = 0;   
  int num_offset_now = num_offset;
  for (i=0; i<num_offset; i++) {
    finished[i] = 0;
    zeta_im1[i] = zeta_i[i] = 1.0;
    beta_im1[i] = -1.0;
    alpha[i] = 0.0;
  }
  
  //double msq_x4 = offset[0];

  cudaColorSpinorField *r = new cudaColorSpinorField(b);
  
  cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset], *r_sloppy;
  
  ColorSpinorParam param;
  param.create = QUDA_ZERO_FIELD_CREATE;
  param.precision = invParam.cuda_prec_sloppy;
  
  if (invParam.cuda_prec_sloppy == x[0]->Precision()) {
    for (i=0; i<num_offset; i++){
      x_sloppy[i] = x[i];
      zeroCuda(*x_sloppy[i]);
    }
    r_sloppy = r;
  } else {
    for (i=0; i<num_offset; i++) {
      x_sloppy[i] = new cudaColorSpinorField(*x[i], param);
    }
    param.create = QUDA_COPY_FIELD_CREATE;
    r_sloppy = new cudaColorSpinorField(*r, param);
  }
  
  cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];  
  for(i=0;i < num_offset;i++){
    p[i]= new cudaColorSpinorField(*r_sloppy);    
  }
  
  param.create = QUDA_ZERO_FIELD_CREATE;
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param);
  
  double b2 = 0.0;
  b2 = normCuda(b);
    
  double r2 = b2;
  double r2_old;
  double stop = r2*invParam.tol*invParam.tol; // stopping condition of solver
    
  double pAp;
    
  int k = 0;
    
  stopwatchStart();
  while (r2 > stop &&  k < invParam.maxiter) {
    //dslashCuda_st(tmp_sloppy, fatlinkSloppy, longlinkSloppy, p[0], 1 - oddBit, 0);
    //dslashAxpyCuda(Ap, fatlinkSloppy, longlinkSloppy, tmp_sloppy, oddBit, 0, p[0], msq_x4);
    matSloppy(*Ap, *p[0]);
    if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){
      axpyCuda(offset[0], *p[0], *Ap);
    }
    pAp = reDotProductCuda(*p[0], *Ap);
    beta_i[0] = r2 / pAp;        

    zeta_ip1[0] = 1.0;
    for (j=1; j<num_offset_now; j++) {
      zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low];
      double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]);
      double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+(offset[j]-offset[0])*beta_i[j_low]);
      /*THISBLOWSUP
	zeta_ip1[j] /= c1 + c2;
	beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
      */
      /*TRYTHIS*/
      if( (c1+c2) != 0.0 )
	zeta_ip1[j] /= (c1 + c2); 
      else {
	zeta_ip1[j] = 0.0;
	finished[j] = 1;
      }
      if( zeta_i[j] != 0.0) {
	beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
      } else  {
	zeta_ip1[j] = 0.0;
	beta_i[j] = 0.0;
	finished[j] = 1;
	if (invParam.verbosity >= QUDA_VERBOSE)
	  printfQuda("SETTING A ZERO, j=%d, num_offset_now=%d\n",j,num_offset_now);
	//if(j==num_offset_now-1)node0_PRINTF("REDUCING OFFSET\n");
	if(j==num_offset_now-1) num_offset_now--;
	// don't work any more on finished solutions
	// this only works if largest offsets are last, otherwise
	// just wastes time multiplying by zero
      }
    }	
	
    r2_old = r2;
    r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy);

    alpha[0] = r2 / r2_old;
	
    for (j=1; j<num_offset_now; j++) {
      /*THISBLOWSUP
	alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
	(zeta_i[j] * beta_i[j_low]);
      */
      /*TRYTHIS*/
      if( zeta_i[j] * beta_i[j_low] != 0.0)
	alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
	  (zeta_i[j] * beta_i[j_low]);
      else {
	alpha[j] = 0.0;
	finished[j] = 1;
      }
    }
	
    axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]);	
    for (j=1; j<num_offset_now; j++) {
      axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]);
    }
    
    for (j=0; j<num_offset_now; j++) {
      beta_im1[j] = beta_i[j];
      zeta_im1[j] = zeta_i[j];
      zeta_i[j] = zeta_ip1[j];
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE){
      printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2);
    }
  }
    
  if (x[0]->Precision() != x_sloppy[0]->Precision()) {
    for(i=0;i < num_offset; i++){
      copyCuda(*x[i], *x_sloppy[i]);
    }
  }

  *residue_sq = r2;

  invParam.secs = stopwatchReadSeconds();
     
  if (k==invParam.maxiter) {
    warningQuda("Exceeded maximum iterations %d\n", invParam.maxiter);
  }
    
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  invParam.gflops = gflops;
  invParam.iter = k;
  
  // Calculate the true residual of the system with the smallest shift
  mat(*r, *x[0]); 
  axpyCuda(offset[0],*x[0], *r); // Offset it.

  double true_res = xmyNormCuda(b, *r);
  if (invParam.verbosity >= QUDA_SUMMARIZE){
    printfQuda("MultiShift CG: Converged after %d iterations, r2 = %e, relative true_r2 = %e\n", 
	       k,r2, (true_res / b2));
  }    
  if (invParam.verbosity >= QUDA_VERBOSE){
    printfQuda("MultiShift CG: Converged after %d iterations\n", k);
    printfQuda(" shift=0 resid_rel=%e\n", sqrt(true_res/b2));
    for(int i=1; i < num_offset; i++) { 
      mat(*r, *x[i]); 
      axpyCuda(offset[i],*x[i], *r); // Offset it.
      true_res = xmyNormCuda(b, *r);
      printfQuda(" shift=%d resid_rel=%e\n",i, sqrt(true_res/b2));
    }
  }      
  
  delete r;
  for(i=0;i < num_offset; i++){
    delete p[i];
  }
  delete p;
  delete Ap;
  
  if (invParam.cuda_prec_sloppy != x[0]->Precision()) {
    for(i=0;i < num_offset;i++){
      delete x_sloppy[i];
    }
    delete r_sloppy;
  }
  delete x_sloppy;
  
  delete []finished;
  delete []zeta_i;
  delete []zeta_im1;
  delete []zeta_ip1;
  delete []beta_i;
  delete []beta_im1;
  delete []alpha;
 
}
Esempio n. 5
0
int main(int argc, char *argv[]) {

  Start(&argc,&argv);
  int seed = atoi(argv[1]);         //
  int SINPz_Pz   = atof(argv[2]);   // integer percentage of the tolerance of sin(p)/p at Z.
  int SINPxy_Pxy = atof(argv[3]);   // integer percentage of the tolerance of sin(p)/p at XY.
  //int t_in = atoi(argv[5]);         //

  DoArg do_arg;
  setup_do_arg(do_arg, seed); 
  GJP.Initialize(do_arg);  

  GwilsonFclover lat;
  CommonArg c_arg;

  //Declare args for Gaussian Smearing
  QPropWGaussArg g_arg_mom;
  setup_g_arg(g_arg_mom);


  int sweep_counter = 0;
  int total_updates = NTHERM + NSKIP*(NDATA-1);

  #ifdef QUENCH
  GhbArg ghb_arg;
  ghb_arg.num_iter = 1;
  AlgGheatBath hb(lat, &c_arg, &ghb_arg);
  #else
  HmdArg hmd_arg; 
  setup_hmd_arg(hmd_arg);
  AlgHmcPhi hmc(lat, &c_arg, &hmd_arg); 
  #endif

  //Declare args for source at 0.
  QPropWArg arg_0;
  setup_qpropwarg_cg(arg_0);
  arg_0.x = 0;
  arg_0.y = 0;
  arg_0.z = 0;
  arg_0.t = 0;

  //Declare args for source at z.
  QPropWArg arg_z;
  setup_qpropwarg_cg(arg_z);

  // Propagator calculation objects and memory allocation
  //
  // Using x[4] = X(x,y,z,t)
  //       y[4] = Y(x,y,z,t)
  //       z[4] = Z(x,y,z,t)
  int x[4];
  int y[4];
  int z[4];
  int x_idx4d, x_idx3d, y_idx4d, y_idx3d, z_idx4d, z_idx3d;
  int vol4d = GJP.XnodeSites()*GJP.YnodeSites()*GJP.ZnodeSites()*GJP.TnodeSites();
  int vol3d = GJP.XnodeSites()*GJP.YnodeSites()*GJP.ZnodeSites();
  int xnodes = GJP.XnodeSites();
  int ynodes = GJP.YnodeSites();
  int znodes = GJP.ZnodeSites();
  double norm = pow(vol3d, -0.5);
  
  int max_mom = NSITES_3D;
  mom3D mom(max_mom, SINPz_Pz/(1.0*100));

  int s1 = 0;
  int c1 = 0;
  int s2 = 0;
  int c2 = 0;
  int sc_idx = 0;

  //use t to represent the time slice.
  //int t = 0;

  //In these arrays, we will use the index convention [sink_index + vol3d*source_index]
  WilsonMatrix *t3_arr = (WilsonMatrix*)smalloc(vol3d*vol3d*sizeof(WilsonMatrix));
  WilsonMatrix *t2_arr = (WilsonMatrix*)smalloc(vol3d*vol3d*sizeof(WilsonMatrix));
  //Initialise
  for (int i=0; i<vol3d*vol3d; i++) {
    t3_arr[i]    *= 0.0;
    t2_arr[i]    *= 0.0;
  }

  //Arrays to store the trace data
  fftw_complex *FT_t4 = (fftw_complex*)smalloc(vol3d*sizeof(fftw_complex));
  fftw_complex *FT_t2 = (fftw_complex*)smalloc(vol3d*vol3d*sizeof(fftw_complex));
  fftw_complex *FT_t3 = (fftw_complex*)smalloc(vol3d*vol3d*sizeof(fftw_complex));
  
  //Use this array several times for 9d D0, D1, D2.
  fftw_complex *FT_9d  = (fftw_complex*)smalloc(vol3d*vol3d*vol3d*sizeof(fftw_complex));
  
  //Momentum source array.
  fftw_complex *FFTW_mom_arr  = (fftw_complex*)smalloc(vol3d*sizeof(fftw_complex));
  //Initialise
  for (int i=0; i<vol3d*vol3d*vol3d; i++) {
    for(int a=0; a<2; a++){
      FT_9d[i][a]  = 0.0;    
      if(i<vol3d*vol3d) {
	FT_t3[i][a]  = 0.0;
	FT_t2[i][a]  = 0.0;
      }
      if(i<vol3d) {
	FT_t4[i][a]  = 0.0;
	FFTW_mom_arr[i][a]  = 0.0;
      }
    }
  }
 //gaahhbage 
  FFT_F(9, NSITES_3D, FT_9d);
  FFT_B(9, NSITES_3D, FT_9d);

  FFT_F(6, NSITES_3D, FT_t2);
  FFT_B(6, NSITES_3D, FT_t2);
 
  FFT_F(3, NSITES_3D, FFTW_mom_arr);
  FFT_B(3, NSITES_3D, FFTW_mom_arr); 

  WilsonMatrix t1;
  WilsonMatrix t1c;
  WilsonMatrix t4;
  WilsonMatrix t4c;
  WilsonMatrix t4t1c;
  WilsonMatrix t2t3c;
  WilsonMatrix t3;
  WilsonMatrix t3c;
  WilsonMatrix t2;
  WilsonMatrix t2c;
		
  //Rcomplex mom_src;
  //WilsonMatrix temp;

  Rcomplex t1t1c_tr;
  Rcomplex t4t4c_tr;
  Rcomplex d2_tr;
  Rcomplex t2t2c_tr;
  Rcomplex t3t3c_tr;

  //////////////////////
  // Start simulation //
  ////////////////////// 

  Float *time = (Float*)smalloc(10*sizeof(Float));
  for(int a=0; a<10; a++) time[a] = 0.0;

  char lattice[256];
  
  while (sweep_counter < total_updates) {
    for (int n = 1; n <= NSKIP; n++) {
#ifndef READ
      #ifdef QUENCH
      hb.run(); 
      #else
      hmc.run();
      #endif
#endif
      sweep_counter++;
      if (!UniqueID()) {
        printf("step %d complete\n",sweep_counter);
        fflush(stdout);
      }
    }
    
    if (sweep_counter == NTHERM) {
      printf("thermalization complete. \n");
    }
    if (sweep_counter >= NTHERM) {
      // Use this code to specify a gauge configuration.
      #ifdef QUENCH
        sprintf(lattice, LATT_PATH"QU/lat_hb_B%.2f_%d-%d_%d.dat", BETA, NSITES_3D, NSITES_T, sweep_counter);
      #else
        sprintf(lattice, LATT_PATH"UNQ/lat_hmc_B%.2f_M%.3f_%d-%d_%d.dat", BETA, MASS, NSITES_3D, NSITES_T, sweep_counter);
      #endif
#ifdef READ
      ReadLatticeParallel(lat,lattice);
#else
      WriteLatticeParallel(lat,lattice);
#endif
      
      gaugecounter = 1;
      
      // We will compute two arrays of momentum source propagators.
      // One array is of t2 S(x,z)
      // One array is of t3 S(y,z)
      // Each array will be indexed arr[sink_index + vol*source_index].
      
      // The sources for these arrays are calculated using the backaward FT of momentum states.
      // E.G., momemtum state P_0=(0,0,0) is used to calculated the position space state
      // X_0[n] = \frac{1}{sqrt(V)} * \sum_{m} e^{(-2i*pi/N)*n*m} * P_0[m].
      // This source is then used in the inversion to calculate an propagator M_0. M_0 <P_0|  has,
      // strong overlap with the P_0 state. This is repeated for small momenta (e.g. |P| < 1) and the propagators
      // from each inversion are summed and normalised by the number of momenta used k:
      // M = 1/sqrt(k) sum_k M_k <P_k|  The resulting propagator M has strong overlap with the low momentum states.
      // N.B. One can show that using all possible momenta K, the full propagator matrix is recovered.
      
      // The 0-mom source at the origin is calculated outside the time loop.
      int P0[3] = {0,0,0};
      
      arg_0.t = 0;
      QPropWMomSrcSmeared qprop_0(lat, &arg_0, P0, &g_arg_mom, &c_arg);
      qprop_0.GaussSmearSinkProp(g_arg_mom);
      cout<<"Sink Smear 0 complete."<<endl;
      
      //////////////////////////////////
      // Begin loop over time slices. //
      //////////////////////////////////

      for (int t=0; t<GJP.TnodeSites(); t++) {
	//Reinitialise all propagator arrays.
        for (int i=0; i<vol3d*vol3d; i++) {
	  t2_arr[i]    *= 0.0;
	  t3_arr[i]    *= 0.0;
        }
	
	stopwatchStart();
	
	//Generate momentum source
	int n_mom_srcs    = 0;

	for (mom.P[2] = 0; mom.P[2] < max_mom; mom.P[2]++)
	  for (mom.P[1] = 0; mom.P[1] < max_mom; mom.P[1]++)
	    for (mom.P[0] = 0; mom.P[0] < max_mom; mom.P[0]++) {
	      
	      cout<<"MOM = "<<mom.P[0]<<" "<<mom.P[1]<<" "<<mom.P[2]<<endl;
	      
	      cout<<"NORM_MOM_SZE = "<<mom.mod()/M_PI<<endl;
	      //frac = sin(p)/p
	      Float frac = sin(mom.mod())/(mom.mod());
	      cout<<"SIN(Pz)/Pz = "<<frac<<endl;
	      
	      if(frac > mom.sin_cutoff || (mom.P[0] == 0 && mom.P[1] == 0 && mom.P[2] == 0) ){
		
		//Set momentum
		int P[3] = {mom.P[0], mom.P[1], mom.P[2]};
		
		// The CPS momentum source function uses an unnormalised
		// source, so we take the product of both normalisation
		// factors and place them here on the FFTW_mom_arr.
		// A further normalisation to perform comes from the number n_mom_srcs
		// of momentum sources. This is done later in when the trace of
		// of the propagators is caculated.
		
		
		//Get Momentum Propagator
		arg_z.t = t;
		//QPropWMomSrc qprop_mom(lat, &arg_z, P, &c_arg);
		QPropWMomSrcSmeared qprop_mom(lat, &arg_z, P, &g_arg_mom, &c_arg);
		cout<<"Inversion "<<(n_mom_srcs+1)<<" complete."<<endl;
		qprop_mom.GaussSmearSinkProp(g_arg_mom);
		cout<<"Sink Smear "<<(n_mom_srcs+1)<<" complete."<<endl;
		
		int z_idx4d, z_idx3d, x_idx4d, x_idx3d, y_idx4d, y_idx3d;
		//Loop over sources at z.
		z[3] = t;
		for (z[2]=0; z[2]<znodes; z[2]++)
		  for (z[1]=0; z[1]<ynodes; z[1]++)
		    for (z[0]=0; z[0]<xnodes; z[0]++) {
		      z_idx4d = lat.GsiteOffset(z)/4;
		      z_idx3d = z_idx4d - vol3d*z[3];
		      
		      cout<<"mom_src "<<qprop_mom.mom_src(z_idx4d)<<endl;
		      
		      //Loop over sinks at x.
		      x[3] = 0;
		      for (x[2]=0; x[2]<znodes; x[2]++)
			for (x[1]=0; x[1]<ynodes; x[1]++)
			  for (x[0]=0; x[0]<xnodes; x[0]++) {
			    x_idx4d = lat.GsiteOffset(x)/4;
			    x_idx3d = x_idx4d - vol3d*x[3];
			    
			    //Build t2 array.
			    t2_arr[x_idx3d + vol3d*z_idx3d] += qprop_mom[x_idx4d]*conj(qprop_mom.mom_src(z_idx4d));
			  }
		      
		      //Loop over sinks at y.
		      y[3] = t;
		      for (y[2]=0; y[2]<znodes; y[2]++)
			for (y[1]=0; y[1]<ynodes; y[1]++)
			  for (y[0]=0; y[0]<xnodes; y[0]++) {
			    y_idx4d = lat.GsiteOffset(y)/4;
			    y_idx3d = y_idx4d - vol3d*y[3];
			    
			    //Build t3 array.
			    t3_arr[y_idx3d + vol3d*z_idx3d] += qprop_mom[y_idx4d]*conj(qprop_mom.mom_src(z_idx4d));
			  }
		    }
		n_mom_srcs++; 
		cout << "momentum sources: "<<1+mom.P[2]*max_mom*max_mom + mom.P[1]*max_mom + mom.P[0]<<" / "<<pow(max_mom,3)<<" checked"<<endl;
	      }
	    }
	
	cout<<"FLAG 1"<<endl;
	//inversions + fill      
	time[1] = stopwatchReadSeconds();
	stopwatchStart();
	
	//////////////////////////////////////////////////////////////////
	// End momentum source propagator calculation for time slice t. //
	//////////////////////////////////////////////////////////////////
	
	
	///////////////////////////////////////////////
	// Begin summation of trace at time slice t. //
	///////////////////////////////////////////////
	      
	// The t1, t1c, t4, and t4c propagators are calculated 'on the fly'
	// within the trace summation.
      
	//Reinitialise all trace variables
	
	t1  *= 0.0;
	t1c *= 0.0;
	t2  *= 0.0;
	t2c *= 0.0;
	t3  *= 0.0;
	t3c *= 0.0;
	t4  *= 0.0;
	t4c *= 0.0;
	t4t1c *= 0.0;
	t2t3c *= 0.0;
      
	t1t1c_tr *= 0.0;
	t2t2c_tr *= 0.0;
	t3t3c_tr *= 0.0;
	t4t4c_tr *= 0.0;
	d2_tr *= 0.0;
            
	for (int i=0; i<vol3d*vol3d*vol3d; i++) 
	  for(int a=0; a<2; a++) {
	    FT_9d[i][a] = 0.0;
	    if(i<vol3d*vol3d) {
	      FT_t3[i][a] = 0.0;
	      FT_t2[i][a] = 0.0;
	    }
	    if(i<vol3d) {
	      FT_t4[i][a] = 0.0;
	    }
	  }
	//Sum over X
	x[3] = 0;
	for (x[2]=0; x[2]<znodes; x[2]++)
	  for (x[1]=0; x[1]<ynodes; x[1]++)
	    for (x[0]=0; x[0]<xnodes; x[0]++) {
	      x_idx4d = lat.GsiteOffset(x)/4;
	      x_idx3d = x_idx4d - vol3d*x[3];
	      
	      t1 = qprop_0[x_idx4d];
	      t1c = t1.conj_cp();
	      
	      //Sum over Y
	      y[3] = t;
	      for (y[2]=0; y[2]<znodes; y[2]++)
		for (y[1]=0; y[1]<ynodes; y[1]++)
		  for (y[0]=0; y[0]<xnodes; y[0]++) {
		    y_idx4d = lat.GsiteOffset(y)/4;
		    y_idx3d = y_idx4d - vol3d*y[3];
		    
		    t4 = qprop_0[y_idx4d];
		  
		    // Use this condition so that t4t4c is calculated only once
		    // over X per time slice.
		    if (x_idx3d == 0) {
		      //Perform t4t4c trace sum for D0 graph.
		      FT_t4[y_idx3d][0] = MMDag_re_tr(t4);
		      FT_t4[y_idx3d][1] = 0.0;
		    }
		    
		    //Declare new Wilson Matrix t4*t1c for D2 and compute
		    t4t1c = t4;
		    t4t1c *= t1c;
		    
		    //Sum over Z.
		    z[3] = t;
		    for (z[2]=0; z[2]<znodes; z[2]++)
		      for (z[1]=0; z[1]<ynodes; z[1]++)
			for (z[0]=0; z[0]<xnodes; z[0]++) {
			  z_idx4d = lat.GsiteOffset(z)/4;
			  z_idx3d = z_idx4d - vol3d*z[3];
			  
			  //Declare new Wilson Matrix t2*t3c and compute it.			  
			  t2t3c = t2_arr[x_idx3d + vol3d*z_idx3d];
			  t3c   = t3_arr[y_idx3d + vol3d*z_idx3d].conj_cp();
			  t2t3c *= t3c;
			  
			  //Perform t4t1c * t2t3c trace sum for D2 graph.
			  d2_tr = Trace(t4t1c, t2t3c);
			  
			  //Create 9d array for D2.			  
			  FT_9d[x_idx3d + vol3d*(y_idx3d + vol3d*z_idx3d)][0] = d2_tr.real();
			  FT_9d[x_idx3d + vol3d*(y_idx3d + vol3d*z_idx3d)][1] = d2_tr.imag();
			  
			  ///////////////////////////////////////////////////////////////////
			  // Use this condition so that t2t2c is calculated only over
			  // x1 and x3 loops per time slice. 
			  if (y_idx3d == 0) {
			    //Retrieve propagators for t2t2c trace sum.
			    FT_t2[x_idx3d + vol3d*z_idx3d][0] = MMDag_re_tr(t2_arr[x_idx3d + vol3d*z_idx3d]);
			    FT_t2[x_idx3d + vol3d*z_idx3d][1] = 0.0;
			  }
			  // Use this condition so that t3t3c is calculated only over
			  // x2 and x3 loops per time slice. 
			  if (x_idx3d == 0) {
			    
			    //Retrieve propagators for t3t3c trace sum.
			    FT_t3[y_idx3d + vol3d*z_idx3d][0] = MMDag_re_tr(t3_arr[y_idx3d + vol3d*z_idx3d]);
			    FT_t3[y_idx3d + vol3d*z_idx3d][1] = 0.0;
			  }
			  ///////////////////////////////////////////////////////////////////
			}
		  }
	    }
	
	//Fill the trace arrays
	time[2] = stopwatchReadSeconds();
	
	cout<<"FLAG 3"<<endl;

	///////////////////////////////////////////////
	// Write traces to file for post-processing. //
	///////////////////////////////////////////////
      
	char file[256];
  	FFT_F(6, NSITES_3D, FT_t2);
	FFT_F(6, NSITES_3D, FT_t3);
	FFT_F(3, NSITES_3D, FT_t4);

	// if(t==0) {    
	// sprintf(file, "%d-%d_3-0.1_msmsFT_6d_data/t1t1c_TR_%d_%d-%d_%d_%d.dat",  NSITES_3D, NSITES_T, n_mom_srcs, NSITES_3D, NSITES_T, sweep_counter, t);	  
	// FILE *qt1tr   = Fopen(file, "a");
	// for(int snk =0; snk<vol3d; snk++) {
	// Fprintf(qt1tr,  "%d %d %d %.16e %.16e\n", sweep_counter, t, snk, FT_t4[snk][0], FT_t4[snk][1]);
	// }
	// Fclose(qt1tr);
	// }
	
	sprintf(file, DATAPATH"t4t4c_TR_%d_%d-%d_%d_%d.dat",  n_mom_srcs, NSITES_3D, NSITES_T, sweep_counter, t);
	FILE *qt4tr   = Fopen(file, "a");
	for(int snk =0; snk<vol3d; snk++) {
	  Fprintf(qt4tr,  "%d %d %d %.16e %.16e\n", sweep_counter, t, snk, FT_t4[snk][0], FT_t4[snk][1]);
	}
	
	sprintf(file, DATAPATH"t2t2c_TR_%d_%d-%d_%d_%d.dat",  n_mom_srcs, NSITES_3D, NSITES_T, sweep_counter, t);
	FILE *qt2tr   = Fopen(file, "a");
	sprintf(file, DATAPATH"t3t3c_TR_%d_%d-%d_%d_%d.dat",  n_mom_srcs, NSITES_3D, NSITES_T, sweep_counter, t);
	FILE *qt3tr   = Fopen(file, "a");
	
	for(int src =0; src<vol3d; src++) {
	  for(int snk =0; snk<vol3d; snk++) {
	    Fprintf(qt2tr,"%d %d %d %d %.16e %.16e\n", sweep_counter, t, src, snk, FT_t2[snk + vol3d*src][0], FT_t2[snk + vol3d*src][1]);
	    Fprintf(qt3tr,"%d %d %d %d %.16e %.16e\n", sweep_counter, t, src, snk, FT_t3[snk + vol3d*src][0], FT_t3[snk + vol3d*src][1]);
	  }
	}
	

	Fclose(qt2tr);
	Fclose(qt3tr);
	Fclose(qt4tr);

	//////////////////////////
	// FFT the 9D D2 array. //
	//////////////////////////
      
	stopwatchStart();
	
	FFT_F(9, NSITES_3D, FT_9d);
	//time for D2 6d FFT
	time[4] = stopwatchReadSeconds();      
	//wtf == 'write to file', include/FFTW_functions.cpp
	FFT_wtf_ZYX(FT_9d, 2, SINPz_Pz, SINPxy_Pxy, n_mom_srcs, NSITES_3D, NSITES_T, sweep_counter, t);
	
	//sprintf(file, "T_data/times_%d-%d_%d_%d.dat", NSITES_3D, NSITES_T, sweep_counter, t);
	//FILE *time_fp = Fopen(file, "a");
	//Fprintf(time_fp, "%.4f %.4f %.4f %.4f\n", time[1], time[2], time[3], time[4]);
	//Fclose(time_fp); 
	
	//////////////////////////////////////////
	// End trace summation at time slice t. //
	//////////////////////////////////////////
      }
    }
  }
  ////////////////////
  // End simulation //
  ////////////////////
  
  sfree(t2_arr);
  sfree(t3_arr);

  //sfree(FT_t1);
  sfree(FT_t4);
  sfree(FT_t2);
  sfree(FT_t3);
  sfree(FT_9d);  


  sfree(time);

  //End();
  return 0;
}