Example #1
0
int main(int argc, char **argv) {
  
  int c, i, mu, status;
  int ispin, icol, isc;
  int n_c = 3;
  int n_s = 4;
  int count        = 0;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  int grid_size[4];
  int l_LX_at, l_LXstart_at;
  int x0, x1, x2, x3, ix, iix, iy, is, it, i3;
  int sl0, sl1, sl2, sl3, have_source_flag=0;
  int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3;
  int check_residuum = 0;
  unsigned int VOL3, V5;
  int do_gt   = 0;
  int full_orbit = 0;
  int smear_source = 0;
  char filename[200], source_filename[200], source_filename_write[200];
  double ratime, retime;
  double plaq_r=0., plaq_m=0., norm, norm2;
  double spinor1[24];
  double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL;
  double _1_2_kappa, _2_kappa, phase;
  FILE *ofs;
  int mu_trans[4] = {3, 0, 1, 2};
  int threadid, nthreads;
  int timeslice, source_timeslice;
  char rng_file_in[100], rng_file_out[100];
  int *source_momentum=NULL;
  int source_momentum_class = -1;
  int source_momentum_no = 0;
  int source_momentum_runs = 1;
  int imom;
  int num_gpu_on_node=0, rank;
  int source_location_5d_iseven;
  int convert_sign=0;
#ifdef HAVE_QUDA
  int rotate_gamma_basis = 1;
#else
  int rotate_gamma_basis = 0;
#endif
  omp_lock_t *lck = NULL, gen_lck[1];
  int key = 0;


  /****************************************************************************/
  /* for smearing parallel to inversion                                       */
  double *smearing_spinor_field[] = {NULL,NULL};
  int dummy_flag = 0;
  /****************************************************************************/


  /****************************************************************************/
#if (defined HAVE_QUDA) && (defined MULTI_GPU)
  int x_face_size, y_face_size, z_face_size, t_face_size, pad_size;
#endif
  /****************************************************************************/

  /************************************************/
  int qlatt_nclass;
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL;
  double **qlatt_list=NULL;
  /************************************************/

  /************************************************/
  double boundary_condition_factor;
  int boundary_condition_factor_set = 0;
  /************************************************/

//#ifdef MPI       
//  kernelPackT = true;
//#endif

  /***********************************************
   * QUDA parameters
   ***********************************************/
#ifdef HAVE_QUDA
  QudaPrecision cpu_prec         = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec        = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();
#endif

  while ((c = getopt(argc, argv, "soch?vgf:p:b:S:R:")) != -1) {
    switch (c) {
    case 'v':
      g_verbose = 1;
      break;
    case 'g':
      do_gt = 1;
      break;
    case 'f':
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'c':
      check_residuum = 1;
      fprintf(stdout, "# [invert_dw_quda] will check residuum again\n");
      break;
    case 'p':
      n_c = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] will use number of colors = %d\n", n_c);
      break;
    case 'o':
      full_orbit = 1;
      fprintf(stdout, "# [invert_dw_quda] will invert for full orbit, if source momentum set\n");
    case 's':
      smear_source = 1;
      fprintf(stdout, "# [invert_dw_quda] will smear the sources if they are read from file\n");
      break;
    case 'b':
      boundary_condition_factor = atof(optarg);
      boundary_condition_factor_set = 1;
      fprintf(stdout, "# [invert_dw_quda] const. boundary condition factor set to %e\n", boundary_condition_factor);
      break;
    case 'S':
      convert_sign = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] using convert sign %d\n", convert_sign);
      break;
    case 'R':
      rotate_gamma_basis = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] rotate gamma basis %d\n", rotate_gamma_basis);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  // get the time stamp
  g_the_time = time(NULL);

  /**************************************
   * set the default values, read input
   **************************************/
  if(filename_set==0) strcpy(filename, "cvc.input");
  if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename);
  read_input_parser(filename);

#ifdef MPI
#ifdef HAVE_QUDA
  grid_size[0] = g_nproc_x;
  grid_size[1] = g_nproc_y;
  grid_size[2] = g_nproc_z;
  grid_size[3] = g_nproc_t;
  fprintf(stdout, "# [] g_nproc = (%d,%d,%d,%d)\n", g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t);
  initCommsQuda(argc, argv, grid_size, 4);
#else
  MPI_Init(&argc, &argv);
#endif
#endif

#if (defined PARALLELTX) || (defined PARALLELTXY)
  EXIT_WITH_MSG(1, "[] Error, 2-dim./3-dim. MPI-Version not yet implemented");
#endif


  // some checks on the input data
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "[invert_dw_quda] Error, T and L's must be set\n");
    usage();
  }

  // set number of openmp threads

  // initialize MPI parameters
  mpi_init(argc, argv);
  
  // the volume of a timeslice
  VOL3 = LX*LY*LZ;
  V5   = T*LX*LY*LZ*L5;
  g_kappa5d = 0.5 / (5. + g_m5);
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] kappa5d = %e\n", g_kappa5d);

  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n"\
		  "# [%2d] L5           = %3d\n",\
                  g_cart_id, g_cart_id, T, g_cart_id, Tstart, g_cart_id, L5);


#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
    exit(2);
  }
#endif

  if(init_geometry() != 0) {
    fprintf(stderr, "[invert_dw_quda] Error from init_geometry\n");
    EXIT(1);
  }
  geometry();

  if( init_geometry_5d() != 0 ) {
    fprintf(stderr, "[invert_dw_quda] Error from init_geometry_5d\n");
    EXIT(2);
  }
  geometry_5d();

  /**************************************
   * initialize the QUDA library
   **************************************/
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] initializing quda\n");
#ifdef HAVE_QUDA
  // cudaGetDeviceCount(&num_gpu_on_node);
  if(g_gpu_per_node<0) {
    if(g_cart_id==0) fprintf(stderr, "[] Error, number of GPUs per node not set\n");
    EXIT(106);
  } else {
    num_gpu_on_node = g_gpu_per_node;
  }
#ifdef MPI
  rank = comm_rank();
#else
  rank = 0;
#endif
  g_gpu_device_number = rank % num_gpu_on_node;
  fprintf(stdout, "# [] process %d/%d uses device %d\n", rank, g_cart_id, g_gpu_device_number);

  initQuda(g_gpu_device_number);

#endif
 
  /**************************************
   * prepare the gauge field
   **************************************/
  // read the gauge field from file
  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(strcmp( gaugefilename_prefix, "identity")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up unit gauge field\n");
    for(ix=0;ix<VOLUME; ix++) {
      for(mu=0;mu<4;mu++) {
        _cm_eq_id(g_gauge_field+_GGI(ix,mu));
      }
    }
  } else if(strcmp( gaugefilename_prefix, "random")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up random gauge field with seed = %d\n", g_seed);
    init_rng_state(g_seed, &g_rng_state);
    random_gauge_field(g_gauge_field, 1.);
    plaquette(&plaq_m);
    sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
    check_error(write_lime_gauge_field(filename, plaq_m, Nconf, 64), "write_lime_gauge_field", NULL, 12);
  } else {
    if(g_gauge_file_format == 0) {
      // ILDG
      sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_lime_gauge_field_doubleprec(filename);
    } else if(g_gauge_file_format == 1) {
      // NERSC
      sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r);
      //status = read_nersc_gauge_field_3x3(g_gauge_field, filename, &plaq_r);

    }
    if(status != 0) {
      fprintf(stderr, "[invert_dw_quda] Error, could not read gauge field");
      EXIT(12);
    }
  }
#ifdef MPI
  xchange_gauge();
#endif

  // measure the plaquette
  plaquette(&plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Read plaquette value    : %25.16e\n", plaq_r);

#ifndef HAVE_QUDA
  if(N_Jacobi>0) {
#endif
    // allocate the smeared / qdp ordered gauge field
    alloc_gauge_field(&gauge_field_smeared, VOLUMEPLUSRAND);
    for(i=0;i<4;i++) {
      gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME;
    }
#ifndef HAVE_QUDA
  }
#endif

#ifdef HAVE_QUDA
  // transcribe the gauge field

  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy,mu)
  for(ix=0;ix<VOLUME;ix++) {
    iy = g_lexic2eot[ix];
    for(mu=0;mu<4;mu++) {
      _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu));
    }
  }
  // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition)
  if(g_proc_coords[0]==g_nproc_t-1) {
    if(!boundary_condition_factor_set) boundary_condition_factor = -1.;
    fprintf(stdout, "# [] process %d multiplies gauge-field timeslice T_global-1 with boundary condition factor %e\n", g_cart_id,
      boundary_condition_factor);

  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy)
    for(ix=0;ix<VOL3;ix++) {
      iix = (T-1)*VOL3 + ix;
      iy = g_lexic2eot[iix];
      _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.);
    }
  }

  // QUDA precision parameters
  switch(g_cpu_prec) {
    case 0: cpu_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = half\n"); break;
    case 1: cpu_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = single\n"); break;
    case 2: cpu_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = double\n"); break;
    default: cpu_prec = QUDA_DOUBLE_PRECISION; break;
  }
  switch(g_gpu_prec) {
    case 0: cuda_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = half\n"); break;
    case 1: cuda_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = single\n"); break;
    case 2: cuda_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = double\n"); break;
    default: cuda_prec = QUDA_DOUBLE_PRECISION; break;
  }
  switch(g_gpu_prec_sloppy) {
    case 0: cuda_prec_sloppy = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = half\n"); break;
    case 1: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = single\n"); break;
    case 2: cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = double\n"); break;
    default: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; break;
  }

  // QUDA gauge parameters
  gauge_param.X[0] = LX;
  gauge_param.X[1] = LY;
  gauge_param.X[2] = LZ;
  gauge_param.X[3] = T;
  inv_param.Ls = L5;

  gauge_param.anisotropy  = 1.0;
  gauge_param.type        = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary  = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec           = cpu_prec;
  gauge_param.cuda_prec          = cuda_prec;
  gauge_param.reconstruct        = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy   = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix          = QUDA_GAUGE_FIXED_NO;

  gauge_param.ga_pad = 0;
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  // For multi-GPU, ga_pad must be large enough to store a time-slice
#ifdef MULTI_GPU
  x_face_size = inv_param.Ls * gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  y_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  z_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  t_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  pad_size = _MAX(x_face_size, y_face_size);
  pad_size = _MAX(pad_size, z_face_size);
  pad_size = _MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;
  if(g_cart_id==0) printf("# [invert_dw_quda] pad_size = %d\n", pad_size);
#endif

  // load the gauge field
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] loading gauge field\n");
  loadGaugeQuda((void*)gauge_qdp, &gauge_param);
  gauge_qdp[0] = NULL; 
  gauge_qdp[1] = NULL; 
  gauge_qdp[2] = NULL; 
  gauge_qdp[3] = NULL; 

#endif

  /*********************************************
   * APE smear the gauge field
   *********************************************/
  if(N_Jacobi>0) {
    memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUMEPLUSRAND*sizeof(double));
    fprintf(stdout, "# [invert_dw_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape);
    APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape);
    xchange_gauge_field(gauge_field_smeared);
  }

  // allocate memory for the spinor fields
#ifdef HAVE_QUDA
  no_fields = 3+2;
#else
  no_fields = 6+2;
#endif
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND*L5);
  smearing_spinor_field[0] = g_spinor_field[no_fields-2];
  smearing_spinor_field[1] = g_spinor_field[no_fields-1];

  switch(g_source_type) {
    case 0:
    case 5:
      // the source location
      sl0 =   g_source_location                              / (LX_global*LY_global*LZ);
      sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / (          LY_global*LZ);
      sl2 = ( g_source_location % (          LY_global*LZ) ) / (                    LZ);
      sl3 =   g_source_location %                      LZ;
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3);
      source_proc_coords[0] = sl0 / T;
      source_proc_coords[1] = sl1 / LX;
      source_proc_coords[2] = sl2 / LY;
      source_proc_coords[3] = sl3 / LZ;
    #ifdef MPI
      MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id);
    #else
      g_source_proc_id = 0;
    #endif
      have_source_flag = g_source_proc_id == g_cart_id;
    
      lsl0 = sl0 % T;
      lsl1 = sl1 % LX;
      lsl2 = sl2 % LY;
      lsl3 = sl3 % LZ;
      if(have_source_flag) {
        fprintf(stdout, "# [invert_dw_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3);
      }
      break;
    case 2:
    case 3:
    case 4:
      // the source timeslice
#ifdef MPI
      source_proc_coords[0] = g_source_timeslice / T;
      source_proc_coords[1] = 0;
      source_proc_coords[2] = 0;
      source_proc_coords[3] = 0;
      MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id);
      have_source_flag = ( g_source_proc_id == g_cart_id );
      source_timeslice = have_source_flag ? g_source_timeslice % T : -1;
#else
      g_source_proc_id = 0;
      have_source_flag = 1;
      source_timeslice = g_source_timeslice;
#endif
      break;
  }

#ifdef HAVE_QUDA
  /*************************************************************
   * QUDA inverter parameters
   *************************************************************/
  inv_param.dslash_type    = QUDA_DOMAIN_WALL_DSLASH;

  if(strcmp(g_inverter_type_name, "cg") == 0) {
    inv_param.inv_type       = QUDA_CG_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using cg inverter\n"); 
  } else if(strcmp(g_inverter_type_name, "bicgstab") == 0) {
    inv_param.inv_type       = QUDA_BICGSTAB_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using bicgstab inverter\n");
#ifdef MULTI_GPU    
  } else if(strcmp(g_inverter_type_name, "gcr") == 0) {
    inv_param.inv_type       = QUDA_GCR_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using gcr inverter\n"); 
#endif
  } else {
    if(g_cart_id==0) fprintf(stderr, "[invert_dw_quda] Error, unrecognized inverter type %s\n", g_inverter_type_name);
    EXIT(123);
  }


  if(inv_param.inv_type == QUDA_CG_INVERTER) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.solve_type    = QUDA_NORMEQ_PC_SOLVE;
  } else if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.solve_type    = QUDA_DIRECT_PC_SOLVE;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
    inv_param.solve_type    = QUDA_DIRECT_PC_SOLVE;
  }

  inv_param.m5             = g_m5;
  inv_param.kappa          = 0.5 / (5. + inv_param.m5);
  inv_param.mass           = g_m0;

  inv_param.tol            = solver_precision;
  inv_param.maxiter        = niter_max;
  inv_param.reliable_delta = reliable_delta;

#ifdef MPI
  // domain decomposition preconditioner parameters
  if(inv_param.inv_type == QUDA_GCR_INVERTER) {
    if(g_cart_id == 0) printf("# [] settup DD parameters\n");
    inv_param.gcrNkrylov     = 15;
    inv_param.inv_type_precondition = QUDA_MR_INVERTER;
    inv_param.tol_precondition = 1e-6;
    inv_param.maxiter_precondition = 200;
    inv_param.verbosity_precondition = QUDA_VERBOSE;
    inv_param.prec_precondition = cuda_prec_sloppy;
    inv_param.omega = 0.7;
  }
#endif

  inv_param.matpc_type         = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger             = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION;

  inv_param.cpu_prec         = cpu_prec;
  inv_param.cuda_prec        = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;

  inv_param.verbosity = QUDA_VERBOSE;

  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;
#ifdef MPI
  inv_param.preserve_dirac = QUDA_PRESERVE_DIRAC_YES;
  inv_param.prec_precondition = cuda_prec_sloppy;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_tune = QUDA_TUNE_NO;
#endif
#endif

  /*******************************************
   * write initial rng state to file
   *******************************************/
  if( g_source_type==2 && g_coherent_source==2 ) {
    sprintf(rng_file_out, "%s.0", g_rng_filename);
    status = init_rng_stat_file (g_seed, rng_file_out);
    if( status != 0 ) {
      fprintf(stderr, "[invert_dw_quda] Error, could not write rng status\n");
      EXIT(210);
    }
  } else if( (g_source_type==2 /*&& g_coherent_source==1*/) || g_source_type==3 || g_source_type==4) {
    if( init_rng_state(g_seed, &g_rng_state) != 0 ) {
      fprintf(stderr, "[invert_dw_quda] Error, could initialize rng state\n");
      EXIT(211);
    }
  }

  /*******************************************
   * prepare locks for openmp
   *******************************************/
  nthreads = g_num_threads - 1;
  lck = (omp_lock_t*)malloc(nthreads * sizeof(omp_lock_t));
  if(lck == NULL) {
      EXIT_WITH_MSG(97, "[invert_dw_quda] Error, could not allocate lck\n");
  }
  // init locks
  for(i=0;i<nthreads;i++) {
    omp_init_lock(lck+i);
  }
  omp_init_lock(gen_lck);

  // check the source momenta
  if(g_source_momentum_set) {
    source_momentum = (int*)malloc(3*sizeof(int));

    if(g_source_momentum[0]<0) g_source_momentum[0] += LX_global;
    if(g_source_momentum[1]<0) g_source_momentum[1] += LY_global;
    if(g_source_momentum[2]<0) g_source_momentum[2] += LZ_global;
    fprintf(stdout, "# [invert_dw_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);


    if(full_orbit) {
      status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
      if(status != 0) {
        if(g_cart_id==0) fprintf(stderr, "\n[invert_dw_quda] Error while creating O_3-lists\n");
        EXIT(4);
      }
      source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]];
      source_momentum_no    = qlatt_count[source_momentum_class];
      source_momentum_runs  = source_momentum_class==0 ? 1 : source_momentum_no + 1;
      if(g_cart_id==0) fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n",
          source_momentum_class, source_momentum_no, source_momentum_runs);
    }
  }

  if(g_source_type == 5) {
    if(g_seq_source_momentum_set) {
      if(g_seq_source_momentum[0]<0) g_seq_source_momentum[0] += LX_global;
      if(g_seq_source_momentum[1]<0) g_seq_source_momentum[1] += LY_global;
      if(g_seq_source_momentum[2]<0) g_seq_source_momentum[2] += LZ_global;
    } else if(g_source_momentum_set) {
      g_seq_source_momentum[0] = g_source_momentum[0];
      g_seq_source_momentum[1] = g_source_momentum[1];
      g_seq_source_momentum[2] = g_source_momentum[2];
    }
    fprintf(stdout, "# [invert_dw_quda] using final sequential source momentum ( %d, %d, %d )\n",
        g_seq_source_momentum[0], g_seq_source_momentum[1], g_seq_source_momentum[2]);
  }


  /***********************************************
   * loop on spin-color-index
   ***********************************************/
  for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++)
//  for(isc=g_source_index[0]; isc<=g_source_index[0]; isc++)
  {
    ispin = isc / n_c;
    icol  = isc % n_c;

    for(imom=0; imom<source_momentum_runs; imom++) {

      /***********************************************
       * set source momentum
       ***********************************************/
      if(g_source_momentum_set) {
        if(imom == 0) {
          if(full_orbit) {
            source_momentum[0] = 0;
            source_momentum[1] = 0;
            source_momentum[2] = 0;
          } else {
            source_momentum[0] = g_source_momentum[0];
            source_momentum[1] = g_source_momentum[1];
            source_momentum[2] = g_source_momentum[2];
          }
        } else {
          source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY_global*LZ_global);
          source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY_global*LZ_global) ) / LZ_global;
          source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ_global;
        }
        if(g_cart_id==0) fprintf(stdout, "# [] run no. %d, source momentum (%d, %d, %d)\n",
            imom, source_momentum[0], source_momentum[1], source_momentum[2]);
      
      }
 
      /***********************************************
       * prepare the source
       ***********************************************/
      if(g_read_source == 0) {  // create source
        switch(g_source_type) {
          case 0:
            // point source
            if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating point source\n");
            for(ix=0;ix<L5*VOLUME;ix++) { _fv_eq_zero(g_spinor_field[0]+ix); }
            if(have_source_flag) {
              if(g_source_momentum_set) {
                phase = 2*M_PI*( source_momentum[0]*sl1/(double)LX_global + source_momentum[1]*sl2/(double)LY_global + source_momentum[2]*sl3/(double)LZ_global );
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)  ] = cos(phase);
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)+1] = sin(phase);
              } else {
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)  ] = 1.;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d",
                  filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol);
            }
#ifdef HAVE_QUDA
            // set matpc_type
            source_location_5d_iseven = ( (g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin<n_s/2) || (!g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin>=n_s/2) ) ? 1 : 0;
            if(source_location_5d_iseven) {
              inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_EVEN_EVEN\n");
            } else {
              inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_ODD_ODD\n");
            }
#endif
            break;
          case 2:
            // timeslice source
            if(g_coherent_source==1) {
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating coherent timeslice source\n");
              status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1);
              if(status != 0) {
                fprintf(stderr, "[invert_dw_quda] Error from prepare source, status was %d\n", status);
#ifdef MPI
                MPI_Abort(MPI_COMM_WORLD, 123);
                MPI_Finalize();
#endif
                exit(123);
              }
              check_error(prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1),
                  "prepare_coherent_timeslice_source", NULL, 123);
              timeslice = g_coherent_source_base;
            } else {
              if(g_coherent_source==2) {
                timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global;
                fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n");
                check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, g_rng_state, 1),
                    "prepare_timeslice_source", NULL, 123);
              } else {
                if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n");
                check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_state, 1),
                    "prepare_timeslice_source", NULL, 124);
                timeslice = g_source_timeslice;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc);
            }
            break;
          case 3:
            // timeslice sources for one-end trick (spin dilution)
            fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n");
            check_error( prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum, isc%n_s, g_rng_state, \
                ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end", NULL, 125 );
            c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          case 4:
            // timeslice sources for one-end trick (spin and color dilution )
            fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n");
            check_error(prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum,\
                isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1)  && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end_color", NULL, 126);
            c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c);
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          case 5:
            if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] preparing sequential point source\n");
            check_error( prepare_sequential_point_source (g_spinor_field[0], isc, sl0, g_seq_source_momentum, 
                  smear_source, g_spinor_field[1], gauge_field_smeared), "prepare_sequential_point_source", NULL, 33);
            sprintf(source_filename, "%s.%.4d.t%.2dx%.2d.y%.2d.z%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf,
                sl0, sl1, sl2, sl3, isc, g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);
            break;
          default:
            fprintf(stderr, "\nError, unrecognized source type\n");
            exit(32);
            break;
        }
      } else { // read source
        switch(g_source_type) {
          case 0:  // point source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \
                  filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else  {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc);
            }
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            break;
          case 2:  // timeslice source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice,
                  isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc);
            }
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            break;
          default:
            check_error(1, "source type", NULL, 104);
            break;
          case -1:  // timeslice source
            sprintf(source_filename, "%s", filename_prefix2);
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            break;
        }
      }  // of if g_read_source
  
      if(g_write_source) {
        check_error(write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision), "write_propagator", NULL, 27);
      }

/***********************************************************************************************
 * here threads split: 
 ***********************************************************************************************/
      if(dummy_flag==0) strcpy(source_filename_write, source_filename);
      memcpy((void*)(smearing_spinor_field[0]), (void*)(g_spinor_field[0]), 24*VOLUME*sizeof(double));
      if(dummy_flag>0) {
        // copy only if smearing has been done; otherwise do not copy, do not invert
        if(g_cart_id==0) fprintf(stdout, "# [] copy smearing field -> g field\n");
        memcpy((void*)(g_spinor_field[0]), (void*)(smearing_spinor_field[1]), 24*VOLUME*sizeof(double));
      }

      omp_set_num_threads(g_num_threads);
#pragma omp parallel private(threadid, _2_kappa, is, ix, iy, iix, ratime, retime) shared(key,g_read_source, smear_source, N_Jacobi, kappa_Jacobi, smearing_spinor_field, g_spinor_field, nthreads, convert_sign, VOLUME, VOL3, T, L5, isc, rotate_gamma_basis, g_cart_id) firstprivate(inv_param, gauge_param, ofs)
{
      threadid = omp_get_thread_num();

  if(threadid < nthreads) {
      fprintf(stdout, "# [] proc%.2d thread%.2d starting source preparation\n", g_cart_id, threadid);

      // smearing
      if( ( !g_read_source || (g_read_source && smear_source ) ) && N_Jacobi > 0 ) {
        if(g_cart_id==0) fprintf(stdout, "#  [invert_dw_quda] smearing source with N_Jacobi=%d, kappa_Jacobi=%e\n", N_Jacobi, kappa_Jacobi);
        Jacobi_Smearing_threaded(gauge_field_smeared, smearing_spinor_field[0], smearing_spinor_field[1], kappa_Jacobi, N_Jacobi, threadid, nthreads);
      }


      /***********************************************
       * create the 5-dim. source field
       ***********************************************/
      if(convert_sign == 0) {
        spinor_4d_to_5d_threaded(smearing_spinor_field[0], smearing_spinor_field[0], threadid, nthreads);
      }  else if(convert_sign == 1 || convert_sign == -1) {
        spinor_4d_to_5d_sign_threaded(smearing_spinor_field[0], smearing_spinor_field[0], convert_sign, threadid, nthreads);
      }


      for(is=0; is<L5; is++) {
        for(it=threadid; it<T; it+=nthreads) {
          memcpy((void*)(g_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), (void*)(smearing_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), VOL3*24*sizeof(double));
        }
      }


      // reorder, multiply with g2
      for(is=0; is<L5; is++) {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = (is*T+it)*VOL3 + i3;
            _fv_eq_zero(smearing_spinor_field[1]+_GSI(ix));
      }}} 

      if(rotate_gamma_basis) {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(0, ix);
            _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix));
        }}
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(L5-1, ix);
            _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME));
        }}
      } else {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(0, ix);
            _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix));
        }}
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(L5-1, ix);
            _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME));
        }}
      }
      fprintf(stdout, "# [] proc%.2d thread%.2d finished source preparation\n", g_cart_id, threadid);

  } else if(threadid == g_num_threads-1 && dummy_flag > 0) {  // else branch on threadid
      fprintf(stdout, "# [] proc%.2d thread%.2d starting inversion for dummy_flag = %d\n", g_cart_id, threadid, dummy_flag);

      /***********************************************
       * perform the inversion
       ***********************************************/
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n");

      xchange_field_5d(g_spinor_field[0]);
      memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double));
      ratime = CLOCK;
#ifdef MPI
      if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER  || inv_param.inv_type == QUDA_GCR_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n");
        invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else if(inv_param.inv_type == QUDA_CG_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n");
        testCG(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else {
        if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n");
      }
#else
      invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
#endif
      retime = CLOCK;

      if(g_cart_id==0) {
        fprintf(stdout, "# [invert_dw_quda] QUDA time:  %e seconds\n", inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] wall time:  %e seconds\n", retime-ratime);
        fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
      }
  }  // of if threadid

// wait till all threads are here
#pragma omp barrier

      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa5d;
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
        }
      }
  
#pragma omp barrier
      // reorder, multiply with g2
      for(is=0;is<L5;is++) {
      for(ix=threadid; ix<VOLUME; ix+=g_num_threads) {
        iy  = lexic2eot_5d(is, ix);
        iix = is*VOLUME + ix;
        _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy));
      }}
#pragma omp barrier
      if(rotate_gamma_basis) {
        for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) {
          _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
        }
      } else {
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        }
      }
      if(g_cart_id==0 && threadid==g_num_threads-1) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime);

#pragma omp single
  {

#ifdef MPI
      xchange_field_5d(g_spinor_field[1]);
#endif
      /***********************************************
       * check residuum
       ***********************************************/
      if(check_residuum && dummy_flag>0) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
#ifdef MPI
        xchange_field_5d(g_spinor_field[2]);
        xchange_field_5d(g_spinor_field[1]);
#endif
        memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double));

        //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field_5d(g_spinor_field[1], ofs);
        //fclose(ofs);

        Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]);
  
        for(ix=0;ix<VOLUME*L5;ix++) {
          _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix));
        }
  
        spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5);
        spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5);
        if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );

      }
  
      if(dummy_flag>0) {
        /***********************************************
         * create 4-dim. propagator
         ***********************************************/
        if(convert_sign == 0) {
          spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]);
        } else if(convert_sign == -1 || convert_sign == +1) {
          spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign);
        }
  
        /***********************************************
         * write the solution 
         ***********************************************/
        sprintf(filename, "%s.inverted", source_filename_write);
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename);
        check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22);
        
        //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field(g_spinor_field[1], ofs);
        //fclose(ofs);
      }

      if(check_residuum) memcpy(g_spinor_field[2], smearing_spinor_field[0], 24*VOLUME*L5*sizeof(double));

  }  // of omp single

}    // of omp parallel region

      if(dummy_flag > 0) strcpy(source_filename_write, source_filename);

      dummy_flag++;
 
    }  // of loop on momenta

  }  // of isc

#if 0
  // last inversion

  {
      memcpy(g_spinor_field[0], smearing_spinor_field[1], 24*VOLUME*L5*sizeof(double));
      if(g_cart_id==0) fprintf(stdout, "# [] proc%.2d starting last inversion\n", g_cart_id);


      /***********************************************
       * perform the inversion
       ***********************************************/
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n");

      xchange_field_5d(g_spinor_field[0]);
      memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double));
      ratime = CLOCK;
#ifdef MPI
      if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER  || inv_param.inv_type == QUDA_GCR_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n");
        invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else if(inv_param.inv_type == QUDA_CG_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n");
        testCG(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else {
        if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n");
      }
#else
      invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
#endif
      retime = CLOCK;

      if(g_cart_id==0) {
        fprintf(stdout, "# [invert_dw_quda] QUDA time:  %e seconds\n", inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] wall time:  %e seconds\n", retime-ratime);
        fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
      }

      omp_set_num_threads(g_num_threads);
#pragma omp parallel private(threadid,_2_kappa,is,ix,iy,iix) shared(VOLUME,L5,g_kappa,g_spinor_field,g_num_threads)
    {
      threadid = omp_get_thread_num();

      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa5d;
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
        }
      }
#pragma omp barrier
      // reorder, multiply with g2
      for(is=0;is<L5;is++) {
      for(ix=threadid; ix<VOLUME; ix+=g_num_threads) {
        iy  = lexic2eot_5d(is, ix);
        iix = is*VOLUME + ix;
        _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy));
      }}
#pragma omp barrier
      if(rotate_gamma_basis) {
        for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) {
          _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
        }
      } else {
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        }
      }

    }  // end of parallel region

    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime);


#ifdef MPI
      xchange_field_5d(g_spinor_field[1]);
#endif
      /***********************************************
       * check residuum
       ***********************************************/
      if(check_residuum && dummy_flag>0) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
#ifdef MPI
        xchange_field_5d(g_spinor_field[2]);
#endif
        memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double));

        //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field_5d(g_spinor_field[1], ofs);
        //fclose(ofs);


        Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]);
  
        for(ix=0;ix<VOLUME*L5;ix++) {
          _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix));
        }
  
        spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5);
        spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5);
        if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );

      }
  
      /***********************************************
       * create 4-dim. propagator
       ***********************************************/
      if(convert_sign == 0) {
        spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]);
      } else if(convert_sign == -1 || convert_sign == +1) {
        spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign);
      }
  
      /***********************************************
       * write the solution 
       ***********************************************/
      sprintf(filename, "%s.inverted", source_filename_write);
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename);
      check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22);
        
      //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id);
      //ofs = fopen(filename, "w");
      //printf_spinor_field(g_spinor_field[1], ofs);
      //fclose(ofs);
  }  // of last inversion

#endif  // of if 0

  /***********************************************
   * free the allocated memory, finalize 
   ***********************************************/

#ifdef HAVE_QUDA
  // finalize the QUDA library
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] finalizing quda\n");
#ifdef MPI
  freeGaugeQuda();
#endif
  endQuda();
#endif
  if(g_gauge_field != NULL) free(g_gauge_field);
  if(gauge_field_smeared != NULL) free(gauge_field_smeared);
  if(no_fields>0) {
    if(g_spinor_field!=NULL) {
      for(i=0; i<no_fields; i++) if(g_spinor_field[i]!=NULL) free(g_spinor_field[i]);
      free(g_spinor_field);
    }
  }
  free_geometry();

  if(g_source_momentum_set && full_orbit) {
    finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep);
    if(qlatt_map != NULL) {
      free(qlatt_map[0]);
      free(qlatt_map);
    }
  }
  if(source_momentum != NULL) free(source_momentum);
  if(lck != NULL) free(lck);


#ifdef MPI
#ifdef HAVE_QUDA
  endCommsQuda();
#else
  MPI_Finalize();
#endif
#endif
  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time));
  }
  return(0);
}
Example #2
0
/*
 * Fill the two spinor fields P and Q with random noise on the global
 * time slice t.
 *
 * Both fields are first zeroed over VOLUME/2 entries.  For every site of
 * the time slice and every spin (4) / colour (3) component one random
 * number is drawn and mapped onto one of the four values
 * (+-1/sqrt(2)) + i (+-1/sqrt(2)).  Sites whose global coordinate sum is
 * even are written to P, all others to Q; the position inside the field
 * is taken from g_lexic2eosub.
 *
 * The random number is drawn on every process for every global site, but
 * only the process whose Cartesian rank matches the site's owner stores
 * the value (presumably to keep the random stream identical on all
 * ranks -- confirm).  Without TM_USE_MPI the owner rank stays 0.
 *
 * The generator is re-initialised with a seed built from _seed, sample,
 * t and nstore.  If ranlxd_init == 1 on entry, the previous generator
 * state is saved first and restored before returning.
 */
void source_generation_pion_only(spinor * const P, spinor * const Q,
				 const int t, const int sample, 
                                 const int nstore, const unsigned int _seed) {

  const double inv_sqrt2 = 1./sqrt(2.);
  int saved_state[105];
  int must_restore = 0;
  int owner[4], owner_rank = 0;
  int gx, gy, gz, spin, col;
  int lt, lx, ly, lz;
  int seed;
  double rnd;

  zero_spinor_field(P,VOLUME/2);
  zero_spinor_field(Q,VOLUME/2);

  /* remember the current generator state so it can be put back later */
  if(ranlxd_init == 1) {
    rlxd_get(saved_state);
    must_restore = 1;
  }

  /* NOTE(review): _seed is unsigned, so this sum is evaluated in unsigned
   * arithmetic before being handed to abs() -- confirm this is intended. */
  seed =(int) abs(_seed + sample + t*10*97 + nstore*100*53);
  rlxd_init(2, seed);

  /* local time index and owning process coordinate in time direction */
  lt = t - g_proc_coords[0]*T;
  owner[0] = t / T;

  for(gx = 0; gx < LX*g_nproc_x; gx++) {
    lx = gx - g_proc_coords[1]*LX;
    owner[1] = gx / LX;

    for(gy = 0; gy < LY*g_nproc_y; gy++) {
      ly = gy - g_proc_coords[2]*LY;
      owner[2] = gy / LY;

      for(gz = 0; gz < LZ*g_nproc_z; gz++) {
        _Complex double * site = NULL;
        int owned;

        lz = gz - g_proc_coords[3]*LZ;
        owner[3] = gz / LZ;
#ifdef TM_USE_MPI
        MPI_Cart_rank(g_cart_grid, owner, &owner_rank);
#endif
        owned = (g_cart_id == owner_rank);

        /* The local indices are only valid on the owning process, so the
         * table lookups must stay behind the ownership test. */
        if(owned) {
          const int eo = g_lexic2eosub[ g_ipt[lt][lx][ly][lz] ];
          /* local index + process offset is the global coordinate, hence
           * the parity is that of t + gx + gy + gz */
          const int even = ((t + gx + gy + gz) % 2 == 0);
          site = (_Complex double*)((even ? P : Q) + eo);
        }

        for(spin = 0; spin < 4; spin++) {
          for(col = 0; col < 3; col++) {
            /* always advance the generator, owned site or not */
            ranlxd(&rnd, 1);
            if(owned) {
              const int r = (int)floor(4.*rnd);
              /* r: 0 -> (+,+), 1 -> (+,-), 2 -> (-,+), else -> (-,-)
               * for (real, imaginary) signs */
              const double re = (r == 0 || r == 1) ? inv_sqrt2 : -inv_sqrt2;
              const double im = (r == 0 || r == 2) ? inv_sqrt2 : -inv_sqrt2;

              site[3*spin + col] = re + im * I;
            }
          }
        }
      }
    }
  }

  /* put the generator back into the state found on entry */
  if(must_restore) {
    rlxd_reset(saved_state);
  }
}
Example #3
0
void mpi_manager_2D::determin_OtherRanks() {

	// Find neighbouring ranks:
	MPI_Cart_shift(comm2d, 0, 1, &left , &right);
	MPI_Cart_shift(comm2d, 1, 1, &front, &back);

	// Determine ranks of neighbour processes:
	int shiftcoord[DIM];
	int lbound[DIM],ubound[DIM];
	for(int dim=0;dim<DIM;dim++){
		lbound[dim]=-nproc[dim];
		ubound[dim]= nproc[dim];
	}
	Neighbours.resize(lbound,ubound);
	Neighbours.clear();

	for(int dim0=-nproc[0]; dim0<=nproc[0]; dim0++){
		shiftcoord[0] = (coords[0]+dim0);
		if(shiftcoord[0] < 0) shiftcoord[0]+=nproc[0];
		for(int dim1=-nproc[1]; dim1<=nproc[1]; dim1++){
			shiftcoord[1] = (coords[1]+dim1);
			if(shiftcoord[1] < 0) shiftcoord[1]+=nproc[1];

			if(shiftcoord[0]>=0 && shiftcoord[0]<nproc[0] &&
			   shiftcoord[1]>=0 && shiftcoord[1]<nproc[1]) {
				// Now determine rank at relative shifted position
				// std::cout << " Cart ";
				// std::cout << shiftcoord[0] << " ";
				// std::cout << shiftcoord[1] << " ";
				// std::cout << rank << " ";
				// std::cout << nproc[0] << " ";
				// std::cout << nproc[1] << " ";
				// std::cout << std::endl;
				MPI_Cart_rank(comm2d, shiftcoord, &Neighbours(dim0,dim1));
			} else {
				// If outside domain set to error value
				Neighbours(dim0, dim1) = MPI_PROC_NULL;
			}

		}
	}

	NeighboursCyclic.resize(lbound,ubound);
	NeighboursCyclic.clear();

	for(int dim0=-nproc[0]; dim0<=nproc[0]; dim0++){
		shiftcoord[0] = (coords[0]+dim0)%nproc[0];
		if(shiftcoord[0] < 0) shiftcoord[0]+=nproc[0];
		for(int dim1=-nproc[1]; dim1<=nproc[1]; dim1++){
			shiftcoord[1] = (coords[1]+dim1)%nproc[1];
			if(shiftcoord[1] < 0) shiftcoord[1]+=nproc[1];

			// Now determine rank at relative shifted position
			MPI_Cart_rank(comm2d, shiftcoord, &NeighboursCyclic(dim0,dim1));

		}
	}
	
	// Now determine absolute position of ranks
	AllRanks.resize(Index::set(0,0),
	                Index::set(nproc[0]-1,nproc[1]-1));

	for(int dim1=0; dim1<nproc[1]; ++dim1) {
		for(int dim0=0; dim0<nproc[0]; ++dim0) {
			int coord[2] = {dim0, dim1};
			MPI_Cart_rank(comm2d, coord, &AllRanks(dim0, dim1));
		}
	}

}
Example #4
0
int main(int argc, char **argv) {
  
  int c, i, mu, status;
  int ispin, icol, isc;
  int n_c = 3;
  int n_s = 4;
  int count        = 0;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  int l_LX_at, l_LXstart_at;
  int x0, x1, x2, x3, ix, iix, iy;
  int sl0, sl1, sl2, sl3, have_source_flag=0;
  int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3, source_proc_id;
  int check_residuum = 0;
  unsigned int VOL3;
  int do_gt   = 0;
  int full_orbit = 0;
  char filename[200], source_filename[200];
  double ratime, retime;
  double plaq_r=0., plaq_m=0., norm, norm2;
  // double spinor1[24], spinor2[24];
  double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL;
  double _1_2_kappa, _2_kappa, phase;
  FILE *ofs;
  int mu_trans[4] = {3, 0, 1, 2};
  int threadid, nthreads;
  int timeslice;
  char rng_file_in[100], rng_file_out[100];
  int *source_momentum=NULL;
  int source_momentum_class = -1;
  int source_momentum_no = 0;
  int source_momentum_runs = 1;
  int imom;

  /************************************************/
  int qlatt_nclass;
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL;
  double **qlatt_list=NULL;
  /************************************************/
       

  /***********************************************
   * QUDA parameters
   ***********************************************/
  QudaPrecision cpu_prec         = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec        = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();


#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "och?vgf:p:")) != -1) {
    switch (c) {
    case 'v':
      g_verbose = 1;
      break;
    case 'g':
      do_gt = 1;
      break;
    case 'f':
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'c':
      check_residuum = 1;
      fprintf(stdout, "# [invert_quda] will check residuum again\n");
      break;
    case 'p':
      n_c = atoi(optarg);
      fprintf(stdout, "# [invert_quda] will use number of colors = %d\n", n_c);
      break;
    case 'o':
      full_orbit = 1;
      fprintf(stdout, "# [invert_quda] will invert for full orbit, if source momentum set\n");
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  // get the time stamp
  g_the_time = time(NULL);

  /**************************************
   * set the default values, read input
   **************************************/
  if(filename_set==0) strcpy(filename, "cvc.input");
  if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename);
  read_input_parser(filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, T and L's must be set\n");
    usage();
  }
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, kappa should be > 0.n");
    usage();
  }

  // set number of openmp threads
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#else
  fprintf(stdout, "[invert_quda_cg] Warning, resetting global number of threads to 1\n");
  g_num_threads = 1;
#endif

  /* initialize MPI parameters */
  mpi_init(argc, argv);
  
  // the volume of a timeslice
  VOL3 = LX*LY*LZ;

  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n",\
		  g_cart_id, g_cart_id, T, g_cart_id, Tstart);

#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
    exit(2);
  }
#endif

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
#endif
    exit(1);
  }

  geometry();


  /**************************************
   * initialize the QUDA library
   **************************************/
  fprintf(stdout, "# [invert_quda] initializing quda\n");
  initQuda(g_gpu_device_number);
  
  /**************************************
   * prepare the gauge field
   **************************************/
  // read the gauge field from file
  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(strcmp( gaugefilename_prefix, "identity")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_quda] Setting up unit gauge field\n");
    for(ix=0;ix<VOLUME; ix++) {
      for(mu=0;mu<4;mu++) {
        _cm_eq_id(g_gauge_field+_GGI(ix,mu));
      }
    }
  } else {
    if(g_gauge_file_format == 0) {
      // ILDG
      sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_lime_gauge_field_doubleprec(filename);
    } else if(g_gauge_file_format == 1) {
      // NERSC
      sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r);
    }
    if(status != 0) {
      fprintf(stderr, "[invert_quda] Error, could not read gauge field");
#ifdef MPI
      MPI_Abort(MPI_COMM_WORLD, 12);
      MPI_Finalize();
#endif
      exit(12);
    }
  }
#ifdef MPI
  xchange_gauge();
#endif

  // measure the plaquette
  plaquette(&plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Read plaquette value    : %25.16e\n", plaq_r);

  // allocate the smeared / qdp ordered gauge field
  alloc_gauge_field(&gauge_field_smeared, VOLUME);
  for(i=0;i<4;i++) {
    gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME;
  }


  // transcribe the gauge field
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy,mu)
#endif
  for(ix=0;ix<VOLUME;ix++) {
    iy = g_lexic2eot[ix];
    for(mu=0;mu<4;mu++) {
      _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu));
    }
  }
  // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition)
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#pragma omp parallel for private(ix,iy)
#endif
  for(ix=0;ix<VOL3;ix++) {
    iix = (T-1)*VOL3 + ix;
    iy = g_lexic2eot[iix];
    _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.);
  }


  // QUDA gauge parameters
  gauge_param.X[0] = LX_global;
  gauge_param.X[1] = LY_global;
  gauge_param.X[2] = LZ_global;
  gauge_param.X[3] = T_global;

  gauge_param.anisotropy  = 1.0;
  gauge_param.type        = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary  = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec           = cpu_prec;
  gauge_param.cuda_prec          = cuda_prec;
  gauge_param.reconstruct        = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy   = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix          = QUDA_GAUGE_FIXED_NO;

  gauge_param.ga_pad = 0;

  // load the gauge field
  fprintf(stdout, "# [invert_quda] loading gauge field\n");
  loadGaugeQuda((void*)gauge_qdp, &gauge_param);
  gauge_qdp[0] = NULL; 
  gauge_qdp[1] = NULL; 
  gauge_qdp[2] = NULL; 
  gauge_qdp[3] = NULL; 

  /*********************************************
   * APE smear the gauge field
   *********************************************/
  memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUME*sizeof(double));
  if(N_ape>0) {
    fprintf(stdout, "# [invert_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape);
#ifdef OPENMP
     APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape);
#else
    for(i=0; i<N_ape; i++) {
       APE_Smearing_Step(gauge_field_smeared, alpha_ape);
     }
#endif
  }

  /* allocate memory for the spinor fields */
  no_fields = 3;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND);

  /* the source locaton */
  sl0 =   g_source_location                              / (LX_global*LY_global*LZ);
  sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / (          LY_global*LZ);
  sl2 = ( g_source_location % (          LY_global*LZ) ) / (                    LZ);
  sl3 =   g_source_location %                      LZ;
  if(g_cart_id==0) fprintf(stdout, "# [invert_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3);
  source_proc_coords[0] = sl0 / T;
  source_proc_coords[1] = sl1 / LX;
  source_proc_coords[2] = sl2 / LY;
  source_proc_coords[3] = sl3 / LZ;
#ifdef MPI
  MPI_Cart_rank(g_cart_grid, source_proc_coords, &source_proc_id);
#else
  source_proc_id = 0;
#endif
  have_source_flag = source_proc_id == g_cart_id;

  lsl0 = sl0 % T;
  lsl1 = sl1 % LX;
  lsl2 = sl2 % LY;
  lsl3 = sl3 % LZ;
  if(have_source_flag) {
    fprintf(stdout, "# [invert_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3);
  }

  // QUDA inverter parameters
  inv_param.dslash_type    = QUDA_WILSON_DSLASH;
//  inv_param.inv_type       = QUDA_BICGSTAB_INVERTER;
  inv_param.inv_type       = QUDA_CG_INVERTER;
  inv_param.kappa          = g_kappa;
  inv_param.tol            = solver_precision;
  inv_param.maxiter        = niter_max;
  inv_param.reliable_delta = reliable_delta;

  inv_param.solution_type      = QUDA_MAT_SOLUTION;
//  inv_param.solve_type         = QUDA_DIRECT_PC_SOLVE;
  inv_param.solve_type         = QUDA_NORMEQ_PC_SOLVE;
  inv_param.matpc_type         = QUDA_MATPC_EVEN_EVEN; // QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger             = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION;

  inv_param.cpu_prec         = cpu_prec;
  inv_param.cuda_prec        = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
  inv_param.preserve_source  = QUDA_PRESERVE_SOURCE_NO;
  inv_param.dirac_order      = QUDA_DIRAC_ORDER;

  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  inv_param.verbosity = QUDA_VERBOSE;

  // write initial rng state to file
  if(g_source_type==2 && g_coherent_source==2) {
    sprintf(rng_file_out, "%s.0", g_rng_filename);
    if( init_rng_stat_file (g_seed, rng_file_out) != 0 ) {
      fprintf(stderr, "[invert_quda] Error, could not write rng status\n");
      exit(210);
    }
  } else if(g_source_type==3 || g_source_type==4) {
    if( init_rng_state(g_seed, &g_rng_state) != 0 ) {
      fprintf(stderr, "[invert_quda] Error, could initialize rng state\n");
      exit(211);
    }
  }

  // check the source momenta
  if(g_source_momentum_set) {
    source_momentum = (int*)malloc(3*sizeof(int));

    if(g_source_momentum[0]<0) g_source_momentum[0] += LX;
    if(g_source_momentum[1]<0) g_source_momentum[1] += LY;
    if(g_source_momentum[2]<0) g_source_momentum[2] += LZ;
    fprintf(stdout, "# [invert_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);


    if(full_orbit) {
      status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
      if(status != 0) {
        fprintf(stderr, "\n[invert_quda] Error while creating O_3-lists\n");
        exit(4);
      }
      source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]];
      source_momentum_no    = qlatt_count[source_momentum_class];
      source_momentum_runs  = source_momentum_class==0 ? 1 : source_momentum_no + 1;
      fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n",
          source_momentum_class, source_momentum_no, source_momentum_runs);
    }
  }


  /***********************************************
   * loop on spin-color-index
   ***********************************************/
  for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++) {
    ispin = isc / n_c;
    icol  = isc % n_c;

    for(imom=0; imom<source_momentum_runs; imom++) {

      /***********************************************
       * set source momentum
       ***********************************************/
      if(g_source_momentum_set) {
        if(imom == 0) {
          if(full_orbit) {
            source_momentum[0] = 0;
            source_momentum[1] = 0;
            source_momentum[2] = 0;
          } else {
            source_momentum[0] = g_source_momentum[0];
            source_momentum[1] = g_source_momentum[1];
            source_momentum[2] = g_source_momentum[2];
          }
        } else {
          source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY*LZ);
          source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY*LZ) ) / LZ;
          source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ;
        }
        fprintf(stdout, "# [] run no. %d, source momentum (%d, %d, %d)\n", imom, source_momentum[0], source_momentum[1], source_momentum[2]);
      }
 
      /***********************************************
       * prepare the souce
       ***********************************************/
      if(g_read_source == 0) {  // create source
        switch(g_source_type) {
          case 0:
            // point source
            fprintf(stdout, "# [invert_quda] Creating point source\n");
            for(ix=0;ix<24*VOLUME;ix++) g_spinor_field[0][ix] = 0.;
            if(have_source_flag) {
              if(g_source_momentum_set) {
                phase = 2*M_PI*( source_momentum[0]*lsl1/(double)LX + source_momentum[1]*lsl2/(double)LY + source_momentum[2]*lsl3/(double)LZ );
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)  ] = cos(phase);
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)+1] = sin(phase);
              } else {
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)  ] = 1.;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d",
                  filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol);
            }
            break;
          case 2:
            // timeslice source
            if(g_coherent_source==1) {
              fprintf(stdout, "# [invert_quda] Creating coherent timeslice source\n");
              status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_filename, NULL);
              if(status != 0) {
                fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                exit(123);
              }
              timeslice = g_coherent_source_base;
            } else {
              if(g_coherent_source==2) {
                strcpy(rng_file_in, rng_file_out);
                if(isc == g_source_index[1]) { strcpy(rng_file_out, g_rng_filename); }
                else                         { sprintf(rng_file_out, "%s.%d", g_rng_filename, isc+1); }
                timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global;
                fprintf(stdout, "# [invert_quda] Creating timeslice source\n");
                status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, rng_file_in, rng_file_out);
                if(status != 0) {
                  fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                  exit(123);
                }
              } else {
                fprintf(stdout, "# [invert_quda] Creating timeslice source\n");
                status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_filename, g_rng_filename);
                if(status != 0) {
                  fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                  exit(124);
                }
                timeslice = g_source_timeslice;
              }
            }
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc);
            }
            break;
          case 3:
            // timeslice sources for one-end trick (spin dilution)
            fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n");
            status = prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum, isc%n_s, g_rng_state, \
                ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 ) );
            if(status != 0) {
              fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
              exit(125);
            }
            c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          case 4:
            // timeslice sources for one-end trick (spin and color dilution )
            fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n");
            status = prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum,\
                isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1)  && imom==source_momentum_runs-1 ) );
            if(status != 0) {
              fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
              exit(126);
            }
            c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c);
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            }
            break;
          default:
            fprintf(stderr, "\nError, unrecognized source type\n");
            exit(32);
            break;
        }
      } else { // read source
        switch(g_source_type) {
          case 0:  // point source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \
                  filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else  {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc);
            }
            fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename);
            status = read_lime_spinor(g_spinor_field[0], source_filename, 0);
            if(status != 0) {
              fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename);
              exit(115);
            }
            break;
          case 2:  // timeslice source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice,
                  isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc);
            }
            fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename);
            status = read_lime_spinor(g_spinor_field[0], source_filename, 0);
            if(status != 0) {
              fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename);
              exit(115);
            }
            break;
          default:
            fprintf(stderr, "[] Error, unrecognized source type for reading\n");
            exit(104);
            break;
        }
      }  // of if g_read_source
  
      //sprintf(filename, "%s.ascii", source_filename);
      //ofs = fopen(filename, "w");
      //printf_spinor_field(g_spinor_field[0], ofs);
      //fclose(ofs);
  
      if(g_write_source) {
        status = write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision);
        if(status != 0) {
          fprintf(stderr, "Error from write_propagator, status was %d\n", status);
          exit(27);
        }
      }
  
      // smearing
      if(N_Jacobi > 0) {
  #ifdef OPENMP
        Jacobi_Smearing_Step_one_threads(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], N_Jacobi, kappa_Jacobi);
  #else
        for(c=0; c<N_Jacobi; c++) {
          Jacobi_Smearing_Step_one(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], kappa_Jacobi);
        }
  #endif
      }
  
      // multiply with g2
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
      }
  
      // transcribe the spinor field to even-odd ordering with coordinates (x,y,z,t)
      for(ix=0;ix<VOLUME;ix++) {
        iy = g_lexic2eot[ix];
        _fv_eq_fv(g_spinor_field[2]+_GSI(iy), g_spinor_field[1]+_GSI(ix));
      }
  
  
      /***********************************************
       * perform the inversion
       ***********************************************/
      fprintf(stdout, "# [invert_quda] starting inversion\n");
      ratime = (double)clock() / CLOCKS_PER_SEC;
  
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_zero(g_spinor_field[1]+_GSI(ix) );
      }
  
      invertQuda(g_spinor_field[1], g_spinor_field[2], &inv_param);
  
      retime = (double)clock() / CLOCKS_PER_SEC;
      fprintf(stdout, "# [invert_quda] inversion done in %e seconds\n", retime-ratime);
      fprintf(stdout, "# [invert_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
  
      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa;
        for(ix=0;ix<VOLUME;ix++) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
        }
      }
  
      // transcribe the spinor field to lexicographical order with (t,x,y,z)
      for(ix=0;ix<VOLUME;ix++) {
        iy = g_lexic2eot[ix];
        _fv_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[1]+_GSI(iy));
      }
      // multiply with g2
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[2]+_GSI(ix));
      }
  
      /***********************************************
       * check residuum
       ***********************************************/
      if(check_residuum) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
        Q_Wilson_phi(g_spinor_field[2], g_spinor_field[1]);
  
        for(ix=0;ix<VOLUME;ix++) {
          _fv_mi_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        }
  
        spinor_scalar_product_re(&norm, g_spinor_field[2], g_spinor_field[2], VOLUME);
        spinor_scalar_product_re(&norm2, g_spinor_field[0], g_spinor_field[0], VOLUME);
        fprintf(stdout, "\n# [invert_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );
      }
  
      /***********************************************
       * write the solution 
       ***********************************************/
      sprintf(filename, "%s.inverted", source_filename);
      fprintf(stdout, "# [invert_quda] writing propagator to file %s\n", filename);
      status = write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision);
      if(status != 0) {
        fprintf(stderr, "Error from write_propagator, status was %d\n", status);
        exit(22);
      }
 
    }  // of loop on momenta

  }  // of isc

  /***********************************************
   * free the allocated memory, finalize 
   ***********************************************/

  // finalize the QUDA library
  fprintf(stdout, "# [invert_quda] finalizing quda\n");
  endQuda();

  free(g_gauge_field);
  free(gauge_field_smeared);
  for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
  free(g_spinor_field);
  free_geometry();

  if(g_source_momentum_set && full_orbit) {
    finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep);
    if(qlatt_map != NULL) {
      free(qlatt_map[0]);
      free(qlatt_map);
    }
  }
  if(source_momentum != NULL) free(source_momentum);

#ifdef MPI
  MPI_Finalize();
#endif

  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time));
  }
  return(0);
}
Example #5
0
/**
 * Set up the 3D Cartesian MPI topology for the calling task.
 *
 * Stores the requested process grid, checks it against the number of MPI
 * tasks and against the grid extent, creates the Cartesian communicator
 * comm3d and then fills in: rank, coords, the six face neighbours
 * (left/right, front/back, bottom/top), the Neighbour table for offsets
 * -1..1 in every dimension, the AllRanks lookup table and the three plane
 * sub-communicators (comm_plane_xy/xz/yz) together with the task's rank in
 * each of them.
 *
 * Finalise() is called when a consistency check fails.
 *
 * @param nproc number of processes per dimension
 * @param mx    grid extent per dimension; each entry is required to be a
 *              power of two and not smaller than nproc in that dimension
 */
void mpi_manager_3D::setup(NumArray<int> &nproc, NumArray<int> &mx) {

	// Save number of processors in each dimension
	for(int dir=0; dir<DIM; ++dir) {
		this->nproc[dir] = nproc[dir];
	}

	// Determine the rank of the current task
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	// Get number of ranks from MPI
	int ntasks;
	MPI_Comm_size(MPI_COMM_WORLD, &ntasks);
	this->ntasks = ntasks;

	// The process grid has to use exactly all available tasks
	if(ntasks != nproc[0]*nproc[1]*nproc[2]){
		std::cerr << " Wrong number of processes " << std::endl;
		std::cout << ntasks << " " << nproc[0]*nproc[1]*nproc[2] << std::endl;
		Finalise();
	}

	if(rank==0) {
		std::cout << " Number of tasks: " << ntasks << std::endl;
	}

	// Check if grid can be subdivided as desired
	for(int dir = 0; dir < DIM; ++dir) {
		if(mx[dir] < nproc[dir] && nproc[dir] > 1) {
			if(rank == 0) {
				std::cerr << " Wrong grid topology for dimension ";
				std::cerr << dir << std::endl;
				std::cerr << "  mx[" << dir << "]:" << mx[dir] << std::endl;
				std::cerr << " nproc[" << dir << "]:" << nproc[dir] << std::endl;
			}
			Finalise();
		}
	}

	// Check if grid is a power of 2: log2(mx) has to be an integer up to
	// the rounding tolerance eps
	double eps = 1.e-12;
	for(int dir = 0; dir < DIM; ++dir) {
		double exponent = log(mx[dir])/log(2.);
		int i_exponent = static_cast<int>(exponent+eps);

		if(exponent - i_exponent > 2.*eps) {
			if(rank == 0) {
				std::cerr << " Error: grid must be of the form mx = 2^n ";
				std::cerr << std::endl;
				std::cerr << " Exiting " << std::endl;
			}
			Finalise();
		}
	}

	// Grid is not periodic
	int periods[3] = {false, false, false};
	int reorder = false;
	// If all is okay: Create new communicator "comm3d"
	MPI_Cart_create(MPI_COMM_WORLD, DIM, nproc, periods, reorder, &comm3d);

	// Report the topology type of the new communicator
	if (rank == 0) {
		int TopoType;
		std::cout << " Cart topology:  ";
		MPI_Topo_test(comm3d, &TopoType);
		switch (TopoType) {
		case MPI_UNDEFINED : 
			std::cout << " MPI_UNDEFINED " << std::endl;
			break;
		case MPI_GRAPH     :
			std::cout << "MPI_GRAPH" << std::endl;
			break;
		case MPI_CART      :
			std::cout << "MPI_CART" << std::endl;
			break;
		}
	}

	// Determine rank again for cartesian communicator -> overwrite rank
	MPI_Comm_rank(comm3d, &rank);

	// Translate rank to coordinates
	MPI_Cart_coords(comm3d, rank, DIM, coords);

	// Find neighbouring ranks
	// Syntax: comm3d, shift direction, displacement, source, destination
	MPI_Cart_shift(comm3d, 0, 1, &left , &right);
	MPI_Cart_shift(comm3d, 1, 1, &front, &back);
	MPI_Cart_shift(comm3d, 2, 1, &bottom, &top);

	if(rank==0) {
		std::cout << " nearby " << right << " " << back << " " << top << std::endl;
	}

	// Determine ranks of neighbour processes for all offsets -1..1
	int shiftcoord[DIM];
	int lbound[DIM],ubound[DIM];
	for(int dim=0;dim<DIM;dim++){
		lbound[dim]=-1;
		ubound[dim]= 1;
	}
	Neighbour.resize(lbound,ubound);
	Neighbour.clear();

	// Note: the shifted coordinates are wrapped around with a modulo, so a
	// task at the edge of the process grid gets the task at the opposite
	// edge as its neighbour in this table, although comm3d itself is created
	// non-periodic.
	for(int dim0=-1; dim0<=1; dim0++){
		shiftcoord[0] = (coords[0]+dim0)%nproc[0];
		if(shiftcoord[0] < 0) shiftcoord[0]+=nproc[0];
		for(int dim1=-1; dim1<=1; dim1++){
			shiftcoord[1] = (coords[1]+dim1)%nproc[1];
			if(shiftcoord[1] < 0) shiftcoord[1]+=nproc[1];
			for(int dim2=-1; dim2<=1; dim2++){
				shiftcoord[2] = (coords[2]+dim2)%nproc[2];
				if(shiftcoord[2] < 0) shiftcoord[2]+=nproc[2];
				MPI_Cart_rank(comm3d, shiftcoord,&Neighbour(dim0,dim1,dim2));
			}
		}
	}

	// Determine absolute position of any rank:
	AllRanks.resize(Index::set(0,0,0),
	               Index::set(nproc[0]-1,nproc[1]-1,nproc[2]-1));

	for(int dim0=0; dim0<nproc[0]; ++dim0) {
		for(int dim1=0; dim1<nproc[1]; ++dim1) {
			for(int dim2=0; dim2<nproc[2]; ++dim2) {
				int coord[3] = {dim0, dim1, dim2};
				MPI_Cart_rank(comm3d, coord, &AllRanks(dim0, dim1, dim2));
			}
		}
	}

	// Now make additional mpi groups relating to planes:

	int count(0);
	int num_xy = nproc[0]*nproc[1];
	int num_xz = nproc[0]*nproc[2];
	int num_yz = nproc[1]*nproc[2];

	// One rank list per plane position along each axis
	NumMatrix<int,1> x_ranks[nproc[0]];
	NumMatrix<int,1> y_ranks[nproc[1]];
	NumMatrix<int,1> z_ranks[nproc[2]];

	// Walk through z-axis -- xy plane
	for(int irz=0; irz<nproc[2]; irz++) {
		count = 0;
		z_ranks[irz].resize(Index::set(0), Index::set(num_xy));
		for(int irx=0; irx<nproc[0]; irx++) {
			for(int iry=0; iry<nproc[1]; iry++) {
				z_ranks[irz](count) = AllRanks(irx,iry,irz);
				count++;
			}
		}
	}

	// Walk through y-axis -- xz plane
	for(int iry=0; iry<nproc[1]; iry++) {
		count = 0;
		y_ranks[iry].resize(Index::set(0), Index::set(num_xz));
		for(int irx=0; irx<nproc[0]; irx++) {
			for(int irz=0; irz<nproc[2]; irz++) {
				y_ranks[iry](count) = AllRanks(irx,iry,irz);
				count++;
			}
		}
	}

	// Walk through x-axis -- yz plane
	for(int irx=0; irx<nproc[0]; irx++) {
		count = 0;
		x_ranks[irx].resize(Index::set(0), Index::set(num_yz));
		for(int iry=0; iry<nproc[1]; iry++) {
			for(int irz=0; irz<nproc[2]; irz++) {
				x_ranks[irx](count) = AllRanks(irx,iry,irz);
				count++;
			}
		}
	}

	// Build local groups:
	MPI_Group group_all, group_constz, group_consty, group_constx;
	// Get standard group handle:
	MPI_Comm_group(comm3d, &group_all);

	// Divide tasks into groups based on z-position
	MPI_Group_incl(group_all, num_xy, z_ranks[coords[2]], &group_constz);

	// Divide tasks into groups based on y-position
	MPI_Group_incl(group_all, num_xz, y_ranks[coords[1]], &group_consty);

	// Divide tasks into groups based on x-position
	MPI_Group_incl(group_all, num_yz, x_ranks[coords[0]], &group_constx);

	// Nothing below uses the groups (the plane communicators are obtained
	// from MPI_Cart_sub), so release the handles again instead of leaking
	// them.
	MPI_Group_free(&group_constz);
	MPI_Group_free(&group_consty);
	MPI_Group_free(&group_constx);
	MPI_Group_free(&group_all);

	// Plane communicators: keep two dimensions of comm3d, drop the third
	int remain_dims[3];
	// x-y plane:
	remain_dims[0] = 1;
	remain_dims[1] = 1;
	remain_dims[2] = 0;
	MPI_Cart_sub(comm3d, remain_dims, &comm_plane_xy);
	MPI_Comm_rank(comm_plane_xy, &rank_plane_xy);

	// x-z plane
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	remain_dims[2] = 1;
	MPI_Cart_sub(comm3d, remain_dims, &comm_plane_xz);
	MPI_Comm_rank(comm_plane_xz, &rank_plane_xz);

	// y-z plane
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	remain_dims[2] = 1;
	MPI_Cart_sub(comm3d, remain_dims, &comm_plane_yz);
	MPI_Comm_rank(comm_plane_yz, &rank_plane_yz);

}
Example #6
0
/**
 * Accumulates pieces of the spinor field on the nodes whose process
 * coordinates are 0 in the two dimensions listed in `which` (indices 0 and 1
 * of the process grid, i.e. the entries g_nproc_t and g_nproc_x of `dims`).
 *
 * On a collecting node:
 *   - *collectionRank is set to TRUE,
 *   - the local fields of all nodes sharing the remaining two coordinates are
 *     received, the node's own field is copied into slot 0, and a transpose
 *     followed by a 2d FFT is applied,
 *   - *field_collection and *membuff return the accumulated field array and
 *     its memory buffer; they are not freed here.
 * On every other node *collectionRank is set to FALSE, the local field is
 * sent to the collecting node, and *field_collection / *membuff stay NULL.
 *
 * Each process appends to a log file "./Process_<g_cart_id>.log"; logging is
 * skipped if that file cannot be opened.
 *
 * Without PARALLELXYZT, MPI and HAVE_FFTW all defined, only an error message
 * is printed by process 0 and none of the output arguments is written.
 */
void spinor_fft_reduce_2d(spinor *localSpinorField,int *collectionRank,spinor*** field_collection,spinor **membuff){
  /* this implementation is intended for four dimensional parallelisation */
#if (defined  PARALLELXYZT  && defined MPI && defined HAVE_FFTW)

  int sendRecvCoord[4];
  int i;
  int dims[]={g_nproc_t,g_nproc_x,g_nproc_y,g_nproc_z};


  /* logfile variables */
  char *logFilePrefix="Process";
  char logFileName[512];
  FILE *logFile;
  const int MSG_LOCALDATA = 457;
  MPI_Datatype mpi_local_spinor;
  /* the two process-grid dimensions that are reduced */
  const int which[]={0,1};


  (*field_collection)=NULL;
  (*membuff)=NULL;

  /* the log is best effort: logFile may be NULL and is checked before use */
  snprintf(logFileName,sizeof(logFileName),"./%s_%02d.log",logFilePrefix,g_cart_id);
  logFile=fopen(logFileName,"a");


  /* one MPI element of this type is a complete local spinor field */
  MPI_Type_contiguous(VOLUME, field_point, &mpi_local_spinor);
  MPI_Type_commit(&mpi_local_spinor);


  for(i=0;i<4;i++)
    sendRecvCoord[i]=g_proc_coords[i];

  if( g_proc_coords[which[0]] == 0 && g_proc_coords[which[1]] == 0 ){

    /* i am one of the nodes where data is accumulated */
    spinor **accu_field;
    spinor **fft_field;
    spinor *memory_buffer_accu_field;
    spinor *memory_buffer_fft_field;
    int REDUCTIONVOLUME=1;
    int recvRank;
    MPI_Request *requests;
    MPI_Status *status;
    int request_count=0;
    int num_requests;
    fftw_plan local_2d_fft_forward;

    *collectionRank=TRUE;

    /* calculate the number of reduced 2d volume accumulated in this node */

    /* number of spinor fields in local units */
    REDUCTIONVOLUME*=dims[which[0]]*dims[which[1]];

    /* number of receive messages: every contributing node except myself */
    num_requests=REDUCTIONVOLUME-1;

    /* reserve space for receive messages */
    requests=(MPI_Request*)malloc(sizeof(MPI_Request)*num_requests);
    status=(MPI_Status*)malloc(sizeof(MPI_Status)*num_requests);

    if(logFile!=NULL)
      fprintf(logFile,"reduction volume = %d\n",REDUCTIONVOLUME);

    /* allocate space for spinor field collection */
    allocate_spinor_field_array(&accu_field,&memory_buffer_accu_field,VOLUME,REDUCTIONVOLUME);
    allocate_spinor_field_array(&fft_field,&memory_buffer_fft_field,VOLUME,REDUCTIONVOLUME);


    /* receive from certain nodes pieces of the spinor field */
    for(sendRecvCoord[which[0]] = 0 ; sendRecvCoord[which[0]]< dims[which[0]] ; sendRecvCoord[which[0]]++){
      for(sendRecvCoord[which[1]] = 0 ; sendRecvCoord[which[1]]< dims[which[1]] ; sendRecvCoord[which[1]]++){
        /* skip my own coordinates, the local field is copied below */
        if( sendRecvCoord[which[0]] != 0 || sendRecvCoord[which[1]]  != 0){

          MPI_Cart_rank(g_cart_grid,sendRecvCoord,&recvRank);

          MPI_Irecv(accu_field[sendRecvCoord[which[0]]*dims[which[1]]+sendRecvCoord[which[1]] ] /* buffer */,
                    1, /* how may */
                    mpi_local_spinor, /* mpi data type */
                    recvRank, /* from whom i get it */
                    MSG_LOCALDATA, /* msg id */
                    g_cart_grid, /* communicator , status */
                    requests+request_count);
          ++request_count;

        }
      }
    }


    /* wait until all request finished */
    MPI_Waitall(num_requests, requests, status);

    assign(accu_field[0],localSpinorField,VOLUME);

    /* transpose in xp-t space */
    spinor_fft_transpose_xp_t(fft_field[0],accu_field[0],dims[0],dims[1],TRUE,1.);

    /* create fftw plan */
    local_2d_fft_forward=spinor_fftw_plan2d(fft_field[0],accu_field[0],T*dims[0],LX*dims[1],LY*LZ,1,FFTW_ESTIMATE);
    fftw_execute(local_2d_fft_forward);
    fftw_destroy_plan(local_2d_fft_forward);

    free_spinor_field_array(&memory_buffer_fft_field); memory_buffer_fft_field=NULL;

    /* the accumulated field is handed back to the caller */
    (*field_collection)=accu_field;
    (*membuff)=memory_buffer_accu_field;
    free(requests); requests = NULL;
    free(status); status=NULL;

  } else {
    int sendRank;
    MPI_Request request;
    MPI_Status status;

    *collectionRank=FALSE;

    /* coordinates of the "root" */
    sendRecvCoord[which[0]]=0;
    sendRecvCoord[which[1]]=0;

    MPI_Cart_rank(g_cart_grid,sendRecvCoord,&sendRank); 

    MPI_Isend(localSpinorField,1,mpi_local_spinor,sendRank,MSG_LOCALDATA,g_cart_grid,&request);

    MPI_Wait(&request,&status);

  }


  MPI_Type_free(&mpi_local_spinor);

  if(logFile!=NULL)
    fclose(logFile);

#else
  if(g_proc_id==0)
    fprintf(stderr,"Error: Please choose FOUR dimensional parallelization!!!\n");

#endif
}
Example #7
0
/*
 * Test of the MPI Cartesian topology routines.
 *
 * Creates a NUM_DIMS-dimensional Cartesian communicator over all processes,
 * checks topology type, dimension count and the rank <-> coordinate
 * mappings, shifts once along every dimension, then drops the first
 * dimension with MPI_Cart_sub and repeats the checks on the sub-communicator.
 * Each failed check increments `errors`; the total is printed at the end if
 * it is non-zero. The process always returns 0.
 */
int main( int argc, char **argv )
{
    int              rank, size, i;
    int              errors=0;
    int              dims[NUM_DIMS];
    int              periods[NUM_DIMS];
    int              coords[NUM_DIMS];
    int              new_coords[NUM_DIMS];
    int              reorder = 0;
    MPI_Comm         comm_temp, comm_cart, new_comm;
    int              topo_status;
    int              ndims;
    int              new_rank;
    int              remain_dims[NUM_DIMS];
    int              newnewrank;

    MPI_Init( &argc, &argv );

    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    /* Clear dims array and get dims for topology */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Dims_create ( size, NUM_DIMS, dims );

    /* Make a new communicator with a topology; the dimension count has to */
    /* be NUM_DIMS to match dims/periods and the checks below              */
    MPI_Cart_create ( MPI_COMM_WORLD, NUM_DIMS, dims, periods, reorder, &comm_temp );
    MPI_Comm_dup ( comm_temp, &comm_cart );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( comm_cart, &topo_status );
    if (topo_status != MPI_CART) errors++;

    /* How many dims do we have? */
    MPI_Cartdim_get( comm_cart, &ndims );
    if ( ndims != NUM_DIMS ) errors++;

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( comm_cart, NUM_DIMS, dims, periods, coords );

    /* Check that the coordinates are correct */
#if NUM_DIMS == 2
    if (rank != coords[1] + coords[0] * dims[1]) {
        errors++;
        fprintf( stderr, 
"Did not get expected coordinate (row major required by MPI standard 6.2)\n" );
    }
#endif
    /* Does the mapping from coords to rank work? */
    MPI_Cart_rank ( comm_cart, coords, &new_rank );
    if ( new_rank != rank ) errors++;

    /* Does the mapping from rank to coords work */
    MPI_Cart_coords ( comm_cart, rank, NUM_DIMS, new_coords );
    for (i=0;i<NUM_DIMS;i++) 
      if ( coords[i] != new_coords[i] ) 
        errors++;

    /* Shift by one in each dimension; the result is only printed (with */
    /* VERBOSE defined), it is not checked automatically.               */
    for (i=0;i<NUM_DIMS;i++) {
      int source, dest;
      MPI_Cart_shift(comm_cart, i, 1, &source, &dest);
#ifdef VERBOSE      
      printf ("[%d] Shifting %d in the %d dimension\n",rank,1,i);
      printf ("[%d]    source = %d  dest = %d\n",rank,source,dest); 
#endif
    }

    /* Subdivide: drop dimension 0, keep all others */
    remain_dims[0] = 0; 
    for (i=1; i<NUM_DIMS; i++) remain_dims[i] = 1;
    MPI_Cart_sub ( comm_cart, remain_dims, &new_comm );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( new_comm, &topo_status );
    if (topo_status != MPI_CART) errors++;

    /* How many dims do we have? */
    MPI_Cartdim_get( new_comm, &ndims );
    if ( ndims != NUM_DIMS-1 ) errors++;

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS-1;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( new_comm, ndims, dims, periods, coords );

    /* Does the mapping from coords to rank work? */
    MPI_Comm_rank ( new_comm, &newnewrank );
    MPI_Cart_rank ( new_comm, coords, &new_rank );
    if ( new_rank != newnewrank ) errors++;

    /* Does the mapping from rank to coords work */
    MPI_Cart_coords ( new_comm, new_rank, NUM_DIMS -1, new_coords );
    for (i=0;i<NUM_DIMS-1;i++) 
      if ( coords[i] != new_coords[i] ) 
        errors++;

    /* We're at the end */
    MPI_Comm_free( &new_comm );
    MPI_Comm_free( &comm_temp );
    MPI_Comm_free( &comm_cart );
    Test_Waitforall( );
    if (errors) printf( "[%d] done with %d ERRORS!\n", rank,errors );
    MPI_Finalize();
    return 0;
}
Example #8
0
/*
    Test for zero-dimensional Cartesian communicators.

    The original MPI standard implies that such communicators behave
    consistently with higher dimensional topologies, so they should work
    with any MPI implementation; MPI 2.1 states this requirement
    explicitly.  Every check that fails adds one to the error count that
    is handed to MTest_Finalize / MTestReturnValue.
*/
int main(int argc, char *argv[])
{
    int nfail = 0;
    int world_size, rank, ndims;
    MPI_Comm cart, subcart;

    MTest_Init(&argc, &argv);

    /* The test is only meaningful with more than one process */
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (world_size < 2) {
        fprintf(stderr, "This test needs at least 2 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* Zero dimensions: no dims and no periods arrays are passed */
    MPI_Cart_create(MPI_COMM_WORLD, 0, NULL, NULL, 0, &cart);

    if (cart == MPI_COMM_NULL) {
        /* Only world rank 0 treats a null communicator as an error */
        if (rank == 0) {
            nfail++;
            fprintf(stderr, "Communicator returned is null!");
        }
    } else {
        int cart_size;

        /* The communicator is expected to contain exactly one process */
        MPI_Comm_size(cart, &cart_size);
        if (cart_size != 1) {
            nfail++;
            fprintf(stderr, "Sizes is wrong in cart communicator.  Is %d, should be 1\n", cart_size);
        }

        /* Not a meaningful call, but it must not fail */
        MPI_Dims_create(1, 0, NULL);

        /* Start from a sentinel so an untouched result is detected */
        ndims = -1;
        MPI_Cartdim_get(cart, &ndims);
        if (ndims != 0) {
            nfail++;
            fprintf(stderr, "MPI_Cartdim_get: ndims is %d, should be 0\n", ndims);
        }

        /* Must not fail either */
        MPI_Cart_get(cart, 0, NULL, NULL, NULL);

        /* From here on rank holds the rank inside the cart communicator */
        MPI_Cart_rank(cart, NULL, &rank);
        if (rank != 0) {
            nfail++;
            fprintf(stderr, "MPI_Cart_rank: rank is %d, should be 0\n", rank);
        }

        /* Must not fail either */
        MPI_Cart_coords(cart, 0, 0, NULL);

        /* A sub-communicator of it has to be zero-dimensional again */
        MPI_Cart_sub(cart, NULL, &subcart);
        ndims = -1;
        MPI_Cartdim_get(subcart, &ndims);
        if (ndims != 0) {
            nfail++;
            fprintf(stderr, "MPI_Cart_sub did not return zero-dimensional communicator\n");
        }

        MPI_Barrier(cart);

        MPI_Comm_free(&cart);
        MPI_Comm_free(&subcart);
    }

    MTest_Finalize(nfail);

    return MTestReturnValue(nfail);
}
Example #9
0
/*
 * Driver "avc_exact2_lowmem_xspace": position-space contractions of a
 * link-dependent (conn, "cvc - lvc") and a purely local (conn2, "lvc - lvc")
 * current combination, for all 16 (mu,nu) components.
 *
 * Command line options (see the getopt string below):
 *   -v        set the verbose flag
 *   -g        set the do_gt flag
 *   -f <file> input file name (default "cvc.input")
 *   -w        check the Ward identity in position space (non-MPI builds only)
 *   -t <n>    number of OpenMP threads
 *   -a        additionally write the results in ASCII (non-MPI builds only)
 *   -m <id>   read propagators in MMS format with mass id <id>
 *   -o <dir>  prefix for the output file names
 *   -s        read up and down propagator from the same file
 *   -h, -?    call usage()
 *
 * Steps: read the input file, set up geometry and gauge field, measure the
 * plaquette, then for every colour ia, source Lorentz index nu and spin index
 * ir read one up-type and one down-type propagator component and accumulate
 * their contractions; finally normalise, write the results and free memory.
 *
 * Returns 0 on success; error paths call exit() with a non-zero code.
 */
int main(int argc, char **argv) {

  const int n_c = 3;  // number of colors

  int c, i, mu, nu, ir, ia, imunu;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  // initialised so that the parameter dump below never prints indeterminate
  // values in MPI builds, where this function does not assign them
  int l_LX_at = 0, l_LXstart_at = 0;
  int source_location, have_source_flag = 0;
  int x0, x1, x2, x3, ix;
  int sx0, sx1, sx2, sx3;
  int isimag[4];
  int gperm[5][4], gperm2[4][4];
  int check_position_space_WI=0;
  int num_threads = 1;
  int exitstatus;
  int write_ascii=0;
  int mms = 0, mass_id = -1;
  int outfile_prefix_set = 0;
  int source_proc_coords[4], source_proc_id = -1;
  int ud_single_file = 0;
  double gperm_sign[5][4], gperm2_sign[4][4];
  double *conn  = NULL;
  double *conn2 = NULL;
  double *work=NULL;
  int verbose = 0;
  int do_gt   = 0, status;
  // filename must be able to hold outfile_prefix plus the file name suffix
  char filename[512], contype[400], outfile_prefix[400];
  double ratime, retime;
  double plaq;
  double spinor1[24], spinor2[24], U_[18];
  double *phi=NULL, *chi=NULL;
  complex w;
  double Usourcebuff[72];
  FILE *ofs;

#ifdef MPI
  // per-process "has source" flags gathered on process 0; this used to be a
  // second declaration of `status`, which clashed with the int above
  int *source_status = NULL;
#endif

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "swah?vgf:t:m:o:")) != -1) {
    switch (c) {
    case 'v':
      verbose = 1;
      break;
    case 'g':
      do_gt = 1;
      break;
    case 'f':
      // bounded copy: optarg has arbitrary length
      snprintf(filename, sizeof(filename), "%s", optarg);
      filename_set=1;
      break;
    case 'w':
      check_position_space_WI = 1;
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will check Ward identity in position space\n");
      break;
    case 't':
      num_threads = atoi(optarg);
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will use %d threads in spacetime loops\n", num_threads);
      break;
    case 'a':
      write_ascii = 1;
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will write data in ASCII format too\n");
      break;
    case 'm':
      mms = 1;
      mass_id = atoi(optarg);
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will read propagators in MMS format with mass id %d\n", mass_id);
      break;
    case 'o':
      // bounded copy: optarg has arbitrary length
      snprintf(outfile_prefix, sizeof(outfile_prefix), "%s", optarg);
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will use prefix %s for output filenames\n", outfile_prefix);
      outfile_prefix_set = 1;
      break;
    case 's':
      ud_single_file = 1;
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will read up and down propagator from same file\n");
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] using global time stamp %s", ctime(&g_the_time));
  }

  /*********************************
   * set number of openmp threads
   *********************************/
#ifdef OPENMP
  omp_set_num_threads(num_threads);
#endif

  /* set the default values */
  if(filename_set==0) snprintf(filename, sizeof(filename), "%s", "cvc.input");
  fprintf(stdout, "# Reading input from file %s\n", filename);
  read_input_parser(filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "\n[avc_exact2_lowmem_xspace] T and L's must be set\n");
    usage();
  }
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stderr, "\n[avc_exact2_lowmem_xspace] kappa should be > 0.\n");
    usage();
  }

  /* initialize MPI parameters */
  mpi_init(argc, argv);
#ifdef MPI
  if((source_status = (int*)calloc(g_nproc, sizeof(int))) == (int*)NULL) {
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
    exit(7);
  }
#endif


  dims[0]=T_global; dims[1]=LX; dims[2]=LY; dims[3]=LZ;
#ifndef MPI
  T            = T_global;
  Tstart       = 0;
  l_LX_at      = LX;
  l_LXstart_at = 0;
#endif
  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
                  "# [%2d] Tstart       = %3d\n"\
                  "# [%2d] l_LX_at      = %3d\n"\
                  "# [%2d] l_LXstart_at = %3d\n",
                  g_cart_id, g_cart_id, T, g_cart_id, Tstart, g_cart_id, l_LX_at,
                  g_cart_id, l_LXstart_at);

#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
    exit(2);
  }
#endif

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 1);
    MPI_Finalize();
#endif
    exit(1);
  }

  geometry();

  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(!(strcmp(gaugefilename_prefix,"identity")==0)) {
    /* read the gauge field */
    snprintf(filename, sizeof(filename), "%s.%.4d", gaugefilename_prefix, Nconf);
    if(g_cart_id==0) fprintf(stdout, "reading gauge field from file %s\n", filename);
    read_lime_gauge_field_doubleprec(filename);
  } else {
    /* initialize unit matrices */
    if(g_cart_id==0) fprintf(stdout, "\n# [avc_exact] initializing unit matrices\n");
    for(ix=0;ix<VOLUME;ix++) {
      _cm_eq_id( g_gauge_field + _GGI(ix, 0) );
      _cm_eq_id( g_gauge_field + _GGI(ix, 1) );
      _cm_eq_id( g_gauge_field + _GGI(ix, 2) );
      _cm_eq_id( g_gauge_field + _GGI(ix, 3) );
    }
  }
#ifdef MPI
  xchange_gauge();
#endif

  /* measure the plaquette */
  plaquette(&plaq);
  if(g_cart_id==0) fprintf(stdout, "measured plaquette value: %25.16e\n", plaq);

  /* allocate memory for the spinor fields;
   * in MMS mode one extra field serves as work space */
  no_fields = 2;
  if(mms) no_fields++;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  if(g_spinor_field == NULL) {
    fprintf(stderr, "could not allocate memory for spinor fields\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 4);
    MPI_Finalize();
#endif
    exit(4);
  }
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND);
  if(mms) {
    work = g_spinor_field[no_fields-1];
  }

  /* allocate memory for the contractions:
   * 16 (mu,nu) components, 2 doubles (re, im) per site */
  conn = (double*)calloc(2 * 16 * VOLUME, sizeof(double));
  if( conn==(double*)NULL ) {
    fprintf(stderr, "could not allocate memory for contr. fields\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 3);
    MPI_Finalize();
#endif
    exit(3);
  }
#ifdef OPENMP
#pragma omp parallel for
#endif
  for(ix=0; ix<32*VOLUME; ix++) conn[ix] = 0.;

  conn2 = (double*)calloc(2 * 16 * VOLUME, sizeof(double));
  if( conn2 == NULL ) {
    fprintf(stderr, "could not allocate memory for contr. fields\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 3);
    MPI_Finalize();
#endif
    exit(3);
  }
#ifdef OPENMP
#pragma omp parallel for
#endif
  for(ix=0; ix<32*VOLUME; ix++) conn2[ix] = 0.;

  /***********************************************************
   * determine source coordinates, find out, if source_location is in this process
   ***********************************************************/
#if (defined PARALLELTX) || (defined PARALLELTXY)
  sx0 = g_source_location / (LX_global*LY_global*LZ);
  sx1 = (g_source_location%(LX_global*LY_global*LZ)) / (LY_global*LZ);
  sx2 = (g_source_location%(LY_global*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
  source_proc_coords[0] = sx0 / T;
  source_proc_coords[1] = sx1 / LX;
  source_proc_coords[2] = sx2 / LY;
  source_proc_coords[3] = 0;
  MPI_Cart_rank(g_cart_grid, source_proc_coords, &source_proc_id);
  have_source_flag = (int)(g_cart_id == source_proc_id);
  if(have_source_flag==1) {
    fprintf(stdout, "\n# process %2d has source location\n", source_proc_id);
    fprintf(stdout, "\n# global source coordinates: (%3d,%3d,%3d,%3d)\n",  sx0, sx1, sx2, sx3);
    fprintf(stdout, "\n# source proc coordinates: (%3d,%3d,%3d,%3d)\n",  source_proc_coords[0],
        source_proc_coords[1], source_proc_coords[2], source_proc_coords[3]);
  }
  // reduce the global coordinates to process-local ones
  sx0 = sx0 % T;
  sx1 = sx1 % LX;
  sx2 = sx2 % LY;
  sx3 = sx3 % LZ;
# else
  have_source_flag = (int)(g_source_location/(LX*LY*LZ)>=Tstart && g_source_location/(LX*LY*LZ)<(Tstart+T));
  if(have_source_flag==1) fprintf(stdout, "process %2d has source location\n", g_cart_id);
  sx0 = g_source_location/(LX*LY*LZ)-Tstart;
  sx1 = (g_source_location%(LX*LY*LZ)) / (LY*LZ);
  sx2 = (g_source_location%(LY*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
#endif
  if(have_source_flag==1) { 
    fprintf(stdout, "local source coordinates: (%3d,%3d,%3d,%3d)\n", sx0, sx1, sx2, sx3);
    source_location = g_ipt[sx0][sx1][sx2][sx3];
  }
#ifdef MPI
#  if (defined PARALLELTX) || (defined PARALLELTXY)
  // from here on have_source_flag holds the id of the process with the source
  have_source_flag = source_proc_id;
  // NOTE(review): Usourcebuff is never filled in this function before being
  // broadcast - confirm whether this call is still needed
  MPI_Bcast(Usourcebuff, 72, MPI_DOUBLE, have_source_flag, g_cart_grid);
#  else
  MPI_Gather(&have_source_flag, 1, MPI_INT, source_status, 1, MPI_INT, 0, g_cart_grid);
  if(g_cart_id==0) {
    for(mu=0; mu<g_nproc; mu++) fprintf(stdout, "status[%1d]=%d\n", mu,source_status[mu]);
  }
  if(g_cart_id==0) {
    // find the first process that reported the source; the search is bounded
    // by g_nproc so it cannot run past the gathered array
    for(have_source_flag=0; have_source_flag<g_nproc && source_status[have_source_flag]!=1; have_source_flag++);
    if(have_source_flag == g_nproc) {
      fprintf(stderr, "\n[avc_exact2_lowmem_xspace] Error, no process holds the source location\n");
      MPI_Abort(MPI_COMM_WORLD, 1);
      MPI_Finalize();
      exit(1);
    }
    fprintf(stdout, "have_source_flag= %d\n", have_source_flag);
  }
  MPI_Bcast(&have_source_flag, 1, MPI_INT, 0, g_cart_grid);
#  endif
  fprintf(stdout, "[%2d] have_source_flag = %d\n", g_cart_id, have_source_flag);
#else
  have_source_flag = 0;
#endif

  // start timing the contractions
#ifdef MPI
      ratime = MPI_Wtime();
#else
      ratime = (double)clock() / CLOCKS_PER_SEC;
#endif
  /***********************************************************
   *  initialize the Gamma matrices
   *
   *  every matrix is stored as a spin-index permutation
   *  (entries 0, 6, 12, 18 of the permutation table, divided
   *  by 6) together with one sign per spin row
   ***********************************************************/
  // gamma_5:
  gperm[4][0] = gamma_permutation[5][ 0] / 6;
  gperm[4][1] = gamma_permutation[5][ 6] / 6;
  gperm[4][2] = gamma_permutation[5][12] / 6;
  gperm[4][3] = gamma_permutation[5][18] / 6;
  gperm_sign[4][0] = gamma_sign[5][ 0];
  gperm_sign[4][1] = gamma_sign[5][ 6];
  gperm_sign[4][2] = gamma_sign[5][12];
  gperm_sign[4][3] = gamma_sign[5][18];
  // gamma_nu gamma_5
  for(nu=0;nu<4;nu++) {
    // permutation
    gperm[nu][0] = gamma_permutation[6+nu][ 0] / 6;
    gperm[nu][1] = gamma_permutation[6+nu][ 6] / 6;
    gperm[nu][2] = gamma_permutation[6+nu][12] / 6;
    gperm[nu][3] = gamma_permutation[6+nu][18] / 6;
    // is imaginary ?
    isimag[nu] = gamma_permutation[6+nu][0] % 2;
    // (overall) sign
    gperm_sign[nu][0] = gamma_sign[6+nu][ 0];
    gperm_sign[nu][1] = gamma_sign[6+nu][ 6];
    gperm_sign[nu][2] = gamma_sign[6+nu][12];
    gperm_sign[nu][3] = gamma_sign[6+nu][18];
    // write to stdout
    if(g_cart_id == 0) {
      fprintf(stdout, "# gamma_%d5 = (%f %d, %f %d, %f %d, %f %d)\n", nu,
          gperm_sign[nu][0], gperm[nu][0], gperm_sign[nu][1], gperm[nu][1], 
          gperm_sign[nu][2], gperm[nu][2], gperm_sign[nu][3], gperm[nu][3]);
    }
  }
  // gamma_nu (only printed, not used in the contractions below)
  for(nu=0;nu<4;nu++) {
    // permutation
    gperm2[nu][0] = gamma_permutation[nu][ 0] / 6;
    gperm2[nu][1] = gamma_permutation[nu][ 6] / 6;
    gperm2[nu][2] = gamma_permutation[nu][12] / 6;
    gperm2[nu][3] = gamma_permutation[nu][18] / 6;
    // (overall) sign
    gperm2_sign[nu][0] = gamma_sign[nu][ 0];
    gperm2_sign[nu][1] = gamma_sign[nu][ 6];
    gperm2_sign[nu][2] = gamma_sign[nu][12];
    gperm2_sign[nu][3] = gamma_sign[nu][18];
    // write to stdout
    if(g_cart_id == 0) {
      fprintf(stdout, "# gamma_%d = (%f %d, %f %d, %f %d, %f %d)\n", nu,
          gperm2_sign[nu][0], gperm2[nu][0], gperm2_sign[nu][1], gperm2[nu][1], 
          gperm2_sign[nu][2], gperm2[nu][2], gperm2_sign[nu][3], gperm2[nu][3]);
    }
  }

  /**********************************************************
   **********************************************************
   **
   ** first contribution
   **
   **********************************************************
   **********************************************************/  

  /**********************************************
   * loop on the Lorentz index nu at source 
   **********************************************/
for(ia=0; ia<n_c; ia++) {
  for(nu=0; nu<4; nu++) 
  {

    for(ir=0; ir<4; ir++) {

      // read 1 up-type propagator color components for spinor index ir
        if(!mms) {
          get_filename(filename, 0, 3*ir+ia, 1);
          exitstatus = read_lime_spinor(g_spinor_field[0], filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
            exit(111);
          }
          xchange_field(g_spinor_field[0]);
        } else {
          snprintf(filename, sizeof(filename), "%s.%.4d.00.%.2d.cgmms.%.2d.inverted", filename_prefix, Nconf, 3*ir+ia, mass_id);
          exitstatus = read_lime_spinor(work, filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
            exit(111);
          }
          xchange_field(work);
          Qf5(g_spinor_field[0], work, -g_mu);
          xchange_field(g_spinor_field[0]);
        }


      // read 1 dn-type propagator color components for spinor index gamma_perm ( ir )
        if(!mms) {
          if(ud_single_file) {
            get_filename(filename, 0, 3*gperm[nu][ir]+ia, 1);
            exitstatus = read_lime_spinor(g_spinor_field[1], filename, 1);
          } else {
            get_filename(filename, 0, 3*gperm[nu][ir]+ia, -1);
            exitstatus = read_lime_spinor(g_spinor_field[1], filename, 0);
          }
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
            exit(111);
          }
          xchange_field(g_spinor_field[1]);
        } else {
          snprintf(filename, sizeof(filename), "%s.%.4d.%.2d.%.2d.cgmms.%.2d.inverted", filename_prefix, Nconf, 4, 3*gperm[nu][ir]+ia, mass_id);
          exitstatus = read_lime_spinor(work, filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
            exit(111);
          }
          xchange_field(work);
          Qf5(g_spinor_field[1], work, g_mu);
          xchange_field(g_spinor_field[1]);
        }

        phi = g_spinor_field[0];
        chi = g_spinor_field[1];
        // 1) gamma_nu gamma_5 x U
        for(mu=0; mu<4; mu++) 
        {

          imunu = 4*mu+nu;

          // term with phi taken at the forward neighbour of ix in direction mu
#ifdef OPENMP
#pragma omp parallel for private(ix, spinor1, spinor2, U_, w)  shared(imunu, ia, nu, mu)
#endif
          for(ix=0; ix<VOLUME; ix++) {

            _cm_eq_cm_ti_co(U_, &g_gauge_field[_GGI(ix,mu)], &co_phase_up[mu]);

            _fv_eq_cm_ti_fv(spinor1, U_, phi+_GSI(g_iup[ix][mu]));
            _fv_eq_gamma_ti_fv(spinor2, mu, spinor1);
            _fv_mi_eq_fv(spinor2, spinor1);
            _fv_eq_gamma_ti_fv(spinor1, 5, spinor2);
            _co_eq_fv_dag_ti_fv(&w, chi+_GSI(ix), spinor1);
            if(!isimag[nu]) {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.re;
              conn[_GWI(imunu,ix,VOLUME)+1] += gperm_sign[nu][ir] * w.im;
            } else {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.im;
              conn[_GWI(imunu,ix,VOLUME)+1] -= gperm_sign[nu][ir] * w.re;
            }

          }  // of ix

          // term with chi taken at the forward neighbour of ix in direction mu
#ifdef OPENMP
#pragma omp parallel for private(ix, spinor1, spinor2, U_, w)  shared(imunu, ia, nu, mu)
#endif
          for(ix=0; ix<VOLUME; ix++) {
            _cm_eq_cm_ti_co(U_, &g_gauge_field[_GGI(ix,mu)], &co_phase_up[mu]);

            _fv_eq_cm_dag_ti_fv(spinor1, U_, phi+_GSI(ix));
            _fv_eq_gamma_ti_fv(spinor2, mu, spinor1);
            _fv_pl_eq_fv(spinor2, spinor1);
            _fv_eq_gamma_ti_fv(spinor1, 5, spinor2);
            _co_eq_fv_dag_ti_fv(&w, chi+_GSI(g_iup[ix][mu]), spinor1);
            if(!isimag[nu]) {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.re;
              conn[_GWI(imunu,ix,VOLUME)+1] += gperm_sign[nu][ir] * w.im;
            } else {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.im;
              conn[_GWI(imunu,ix,VOLUME)+1] -= gperm_sign[nu][ir] * w.re;
            }

          }  // of ix

          // contribution to local-local correlator
#ifdef OPENMP
#pragma omp parallel for private(ix, spinor1, spinor2, U_, w)  shared(imunu, ia, nu, mu)
#endif
          for(ix=0; ix<VOLUME; ix++) {
            _fv_eq_gamma_ti_fv(spinor2, mu, phi+_GSI(ix) );
            _fv_eq_gamma_ti_fv(spinor1, 5, spinor2);
            _co_eq_fv_dag_ti_fv(&w, chi+_GSI(ix), spinor1);
            if(!isimag[nu]) {
              conn2[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.re;
              conn2[_GWI(imunu,ix,VOLUME)+1] += gperm_sign[nu][ir] * w.im;
            } else {
              conn2[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.im;
              conn2[_GWI(imunu,ix,VOLUME)+1] -= gperm_sign[nu][ir] * w.re;
            }

          }  // of ix

        } // of mu
    }  // of ir

  }  // of nu
}  // of ia loop on colors

  
  // normalisation of contractions
#ifdef OPENMP
#pragma omp parallel for
#endif
  for(ix=0; ix<32*VOLUME; ix++) conn[ix] *= -0.5;

#ifdef OPENMP
#pragma omp parallel for
#endif
  for(ix=0; ix<32*VOLUME; ix++) conn2[ix] *= -1.;

#ifdef MPI
      retime = MPI_Wtime();
#else
      retime = (double)clock() / CLOCKS_PER_SEC;
#endif
  if(g_cart_id==0) fprintf(stdout, "contractions in %e seconds\n", retime-ratime);

  
  // save results
#ifdef MPI
  ratime = MPI_Wtime();
#else
  ratime = (double)clock() / CLOCKS_PER_SEC;
#endif
  if(outfile_prefix_set) {
    snprintf(filename, sizeof(filename), "%s/cvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  } else {
    snprintf(filename, sizeof(filename), "cvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", Nconf, sx0, sx1, sx2, sx3);
  }
  snprintf(contype, sizeof(contype), "cvc - lvc in position space, all 16 components");
  status = write_lime_contraction(conn, filename, 64, 16, contype, Nconf, 0);
  if(status != 0) {
    fprintf(stderr, "[] Error from write_lime_contractions, status was %d\n", status);
    exit(16);
  }

  if(outfile_prefix_set) {
    snprintf(filename, sizeof(filename), "%s/lvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  } else {
    snprintf(filename, sizeof(filename), "lvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", Nconf, sx0, sx1, sx2, sx3);
  }
  snprintf(contype, sizeof(contype), "lvc - lvc in position space, all 16 components");
  status = write_lime_contraction(conn2, filename, 64, 16, contype, Nconf, 0);
  if(status != 0) {
    fprintf(stderr, "[] Error from write_lime_contractions, status was %d\n", status);
    exit(17);
  }

#ifndef MPI
  if(write_ascii) {
    if(outfile_prefix_set) {
      snprintf(filename, sizeof(filename), "%s/cvc_lvc_x.%.4d.ascii", outfile_prefix, Nconf);
    } else {
      snprintf(filename, sizeof(filename), "cvc_lvc_x.%.4d.ascii", Nconf);
    }
    write_contraction(conn, NULL, filename, 16, 2, 0);

    if(outfile_prefix_set) {
      snprintf(filename, sizeof(filename), "%s/lvc_lvc_x.%.4d.ascii", outfile_prefix, Nconf);
    } else {
      snprintf(filename, sizeof(filename), "lvc_lvc_x.%.4d.ascii", Nconf);
    }
    write_contraction(conn2, NULL, filename, 16, 2, 0);
  }
#endif

#ifdef MPI
  retime = MPI_Wtime();
#else
  retime = (double)clock() / CLOCKS_PER_SEC;
#endif
  if(g_cart_id==0) fprintf(stdout, "saved position space results in %e seconds\n", retime-ratime);

#ifndef MPI
  // check the Ward identity in position space:
  // for every site and every nu, write the sum over mu of
  // conn[mu,nu](x) - conn[mu,nu](x - mu) to the file WI_X.<Nconf>
  if(check_position_space_WI) {
    snprintf(filename, sizeof(filename), "WI_X.%.4d", Nconf);
    ofs = fopen(filename,"w");
    if(ofs == NULL) {
      // the results are already saved, so only the check itself is skipped
      fprintf(stderr, "\n[avc_exact2_lowmem_xspace] Error, could not open file %s for writing\n", filename);
    } else {
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] checking Ward identity in position space ...\n");
      for(x0=0; x0<T;  x0++) {
      for(x1=0; x1<LX; x1++) {
      for(x2=0; x2<LY; x2++) {
      for(x3=0; x3<LZ; x3++) {
        fprintf(ofs, "# t=%2d x=%2d y=%2d z=%2d\n", x0, x1, x2, x3);
        ix=g_ipt[x0][x1][x2][x3];
        for(nu=0; nu<4; nu++) {
          w.re = conn[_GWI(4*0+nu,ix,VOLUME)] + conn[_GWI(4*1+nu,ix,VOLUME)]
               + conn[_GWI(4*2+nu,ix,VOLUME)] + conn[_GWI(4*3+nu,ix,VOLUME)]
               - conn[_GWI(4*0+nu,g_idn[ix][0],VOLUME)] - conn[_GWI(4*1+nu,g_idn[ix][1],VOLUME)]
               - conn[_GWI(4*2+nu,g_idn[ix][2],VOLUME)] - conn[_GWI(4*3+nu,g_idn[ix][3],VOLUME)];

          w.im = conn[_GWI(4*0+nu,ix,VOLUME)+1] + conn[_GWI(4*1+nu,ix,VOLUME)+1]
               + conn[_GWI(4*2+nu,ix,VOLUME)+1] + conn[_GWI(4*3+nu,ix,VOLUME)+1]
               - conn[_GWI(4*0+nu,g_idn[ix][0],VOLUME)+1] - conn[_GWI(4*1+nu,g_idn[ix][1],VOLUME)+1]
               - conn[_GWI(4*2+nu,g_idn[ix][2],VOLUME)+1] - conn[_GWI(4*3+nu,g_idn[ix][3],VOLUME)+1];
      
          fprintf(ofs, "\t%3d%25.16e%25.16e\n", nu, w.re, w.im);
        }
      }}}}
      fclose(ofs);
    }
  }
#endif

  /****************************************
   * free the allocated memory, finalize
   ****************************************/
  free(g_gauge_field);
  for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
  free(g_spinor_field);
  free_geometry();
  free(conn);
  free(conn2);
#ifdef MPI
  free(source_status);
  MPI_Finalize();
#endif

  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [cvc_lvc_exact2_lowmem_xspace] %s# [cvc_lvc_exact2_lowmem_xspace] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [cvc_lvc_exact2_lowmem_xspace] %s# [cvc_lvc_exact2_lowmem_xspace] end of run\n", ctime(&g_the_time));
  }

  return(0);

}