コード例 #1
ファイル: invert_dw_quda_v3.c プロジェクト: etmc/cvc
int main(int argc, char **argv) {
  int c, i, mu, status;
  int ispin, icol, isc;
  int n_c = 3;
  int n_s = 4;
  int count        = 0;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  int grid_size[4];
  int l_LX_at, l_LXstart_at;
  int x0, x1, x2, x3, ix, iix, iy, is, it, i3;
  int sl0, sl1, sl2, sl3, have_source_flag=0;
  int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3;
  int check_residuum = 0;
  unsigned int VOL3, V5;
  int do_gt   = 0;
  int full_orbit = 0;
  int smear_source = 0;
  char filename[200], source_filename[200], source_filename_write[200];
  double ratime, retime;
  double plaq_r=0., plaq_m=0., norm, norm2;
  double spinor1[24];
  double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL;
  double _1_2_kappa, _2_kappa, phase;
  FILE *ofs;
  int mu_trans[4] = {3, 0, 1, 2};
  int threadid, nthreads;
  int timeslice, source_timeslice;
  char rng_file_in[100], rng_file_out[100];
  int *source_momentum=NULL;
  int source_momentum_class = -1;
  int source_momentum_no = 0;
  int source_momentum_runs = 1;
  int imom;
  int num_gpu_on_node=0, rank;
  int source_location_5d_iseven;
  int convert_sign=0;
#ifdef HAVE_QUDA
  int rotate_gamma_basis = 1;
  int rotate_gamma_basis = 0;
  omp_lock_t *lck = NULL, gen_lck[1];
  int key = 0;

  /* for smearing parallel to inversion                                       */
  double *smearing_spinor_field[] = {NULL,NULL};
  int dummy_flag = 0;

#if (defined HAVE_QUDA) && (defined MULTI_GPU)
  int x_face_size, y_face_size, z_face_size, t_face_size, pad_size;

  int qlatt_nclass;
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL;
  double **qlatt_list=NULL;

  double boundary_condition_factor;
  int boundary_condition_factor_set = 0;

//#ifdef MPI       
//  kernelPackT = true;

  /* QUDA parameters */
#ifdef HAVE_QUDA
  QudaPrecision cpu_prec         = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec        = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  while ((c = getopt(argc, argv, "soch?vgf:p:b:S:R:")) != -1) {
    switch (c) {
    case 'v':
      g_verbose = 1;
    case 'g':
      do_gt = 1;
    case 'f':
      strcpy(filename, optarg);
    case 'c':
      check_residuum = 1;
      fprintf(stdout, "# [invert_dw_quda] will check residuum again\n");
    case 'p':
      n_c = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] will use number of colors = %d\n", n_c);
    case 'o':
      full_orbit = 1;
      fprintf(stdout, "# [invert_dw_quda] will invert for full orbit, if source momentum set\n");
    case 's':
      smear_source = 1;
      fprintf(stdout, "# [invert_dw_quda] will smear the sources if they are read from file\n");
    case 'b':
      boundary_condition_factor = atof(optarg);
      boundary_condition_factor_set = 1;
      fprintf(stdout, "# [invert_dw_quda] const. boundary condition factor set to %e\n", boundary_condition_factor);
    case 'S':
      convert_sign = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] using convert sign %d\n", convert_sign);
    case 'R':
      rotate_gamma_basis = atoi(optarg);
      fprintf(stdout, "# [invert_dw_quda] rotate gamma basis %d\n", rotate_gamma_basis);
    case 'h':
    case '?':

  // get the time stamp
  g_the_time = time(NULL);

  /* set the default values, read input */
  if(filename_set==0) strcpy(filename, "cvc.input");
  if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename);

#ifdef MPI
#ifdef HAVE_QUDA
  grid_size[0] = g_nproc_x;
  grid_size[1] = g_nproc_y;
  grid_size[2] = g_nproc_z;
  grid_size[3] = g_nproc_t;
  fprintf(stdout, "# [] g_nproc = (%d,%d,%d,%d)\n", g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t);
  initCommsQuda(argc, argv, grid_size, 4);
  MPI_Init(&argc, &argv);

#if (defined PARALLELTX) || (defined PARALLELTXY)
  EXIT_WITH_MSG(1, "[] Error, 2-dim./3-dim. MPI-Version not yet implemented");

  // some checks on the input data
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "[invert_dw_quda] Error, T and L's must be set\n");

  // set number of openmp threads

  // initialize MPI parameters
  mpi_init(argc, argv);
  // the volume of a timeslice
  VOL3 = LX*LY*LZ;
  V5   = T*LX*LY*LZ*L5;
  g_kappa5d = 0.5 / (5. + g_m5);
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] kappa5d = %e\n", g_kappa5d);

  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n"\
		  "# [%2d] L5           = %3d\n",\
                  g_cart_id, g_cart_id, T, g_cart_id, Tstart, g_cart_id, L5);

#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);

  if(init_geometry() != 0) {
    fprintf(stderr, "[invert_dw_quda] Error from init_geometry\n");

  if( init_geometry_5d() != 0 ) {
    fprintf(stderr, "[invert_dw_quda] Error from init_geometry_5d\n");

  /* initialize the QUDA library */
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] initializing quda\n");
#ifdef HAVE_QUDA
  // cudaGetDeviceCount(&num_gpu_on_node);
  if(g_gpu_per_node<0) {
    if(g_cart_id==0) fprintf(stderr, "[] Error, number of GPUs per node not set\n");
  } else {
    num_gpu_on_node = g_gpu_per_node;
#ifdef MPI
  rank = comm_rank();
  rank = 0;
  g_gpu_device_number = rank % num_gpu_on_node;
  fprintf(stdout, "# [] process %d/%d uses device %d\n", rank, g_cart_id, g_gpu_device_number);


  /* prepare the gauge field */
  // read the gauge field from file
  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(strcmp( gaugefilename_prefix, "identity")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up unit gauge field\n");
    for(ix=0;ix<VOLUME; ix++) {
      for(mu=0;mu<4;mu++) {
  } else if(strcmp( gaugefilename_prefix, "random")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up random gauge field with seed = %d\n", g_seed);
    init_rng_state(g_seed, &g_rng_state);
    random_gauge_field(g_gauge_field, 1.);
    sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
    check_error(write_lime_gauge_field(filename, plaq_m, Nconf, 64), "write_lime_gauge_field", NULL, 12);
  } else {
    if(g_gauge_file_format == 0) {
      // ILDG
      sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_lime_gauge_field_doubleprec(filename);
    } else if(g_gauge_file_format == 1) {
      // NERSC
      sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r);
      //status = read_nersc_gauge_field_3x3(g_gauge_field, filename, &plaq_r);

    if(status != 0) {
      fprintf(stderr, "[invert_dw_quda] Error, could not read gauge field");
#ifdef MPI

  // measure the plaquette
  if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Read plaquette value    : %25.16e\n", plaq_r);

#ifndef HAVE_QUDA
  if(N_Jacobi>0) {
    // allocate the smeared / qdp ordered gauge field
    alloc_gauge_field(&gauge_field_smeared, VOLUMEPLUSRAND);
    for(i=0;i<4;i++) {
      gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME;
#ifndef HAVE_QUDA

#ifdef HAVE_QUDA
  // transcribe the gauge field

#pragma omp parallel for private(ix,iy,mu)
  for(ix=0;ix<VOLUME;ix++) {
    iy = g_lexic2eot[ix];
    for(mu=0;mu<4;mu++) {
      _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu));
  // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition)
  if(g_proc_coords[0]==g_nproc_t-1) {
    if(!boundary_condition_factor_set) boundary_condition_factor = -1.;
    fprintf(stdout, "# [] process %d multiplies gauge-field timeslice T_global-1 with boundary condition factor %e\n", g_cart_id,

#pragma omp parallel for private(ix,iy)
    for(ix=0;ix<VOL3;ix++) {
      iix = (T-1)*VOL3 + ix;
      iy = g_lexic2eot[iix];
      _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.);

  // QUDA precision parameters
  switch(g_cpu_prec) {
    case 0: cpu_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = half\n"); break;
    case 1: cpu_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = single\n"); break;
    case 2: cpu_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = double\n"); break;
    default: cpu_prec = QUDA_DOUBLE_PRECISION; break;
  switch(g_gpu_prec) {
    case 0: cuda_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = half\n"); break;
    case 1: cuda_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = single\n"); break;
    case 2: cuda_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = double\n"); break;
    default: cuda_prec = QUDA_DOUBLE_PRECISION; break;
  switch(g_gpu_prec_sloppy) {
    case 0: cuda_prec_sloppy = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = half\n"); break;
    case 1: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = single\n"); break;
    case 2: cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = double\n"); break;
    default: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; break;

  // QUDA gauge parameters
  gauge_param.X[0] = LX;
  gauge_param.X[1] = LY;
  gauge_param.X[2] = LZ;
  gauge_param.X[3] = T;
  inv_param.Ls = L5;

  gauge_param.anisotropy  = 1.0;
  gauge_param.type        = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary  = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec           = cpu_prec;
  gauge_param.cuda_prec          = cuda_prec;
  gauge_param.reconstruct        = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy   = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix          = QUDA_GAUGE_FIXED_NO;

  gauge_param.ga_pad = 0;
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  // For multi-GPU, ga_pad must be large enough to store a time-slice
#ifdef MULTI_GPU
  x_face_size = inv_param.Ls * gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  y_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  z_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  t_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  pad_size = _MAX(x_face_size, y_face_size);
  pad_size = _MAX(pad_size, z_face_size);
  pad_size = _MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;
  if(g_cart_id==0) printf("# [invert_dw_quda] pad_size = %d\n", pad_size);

  // load the gauge field
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] loading gauge field\n");
  loadGaugeQuda((void*)gauge_qdp, &gauge_param);
  gauge_qdp[0] = NULL; 
  gauge_qdp[1] = NULL; 
  gauge_qdp[2] = NULL; 
  gauge_qdp[3] = NULL; 


  /* APE smear the gauge field */
  if(N_Jacobi>0) {
    memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUMEPLUSRAND*sizeof(double));
    fprintf(stdout, "# [invert_dw_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape);
    APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape);

  // allocate memory for the spinor fields
#ifdef HAVE_QUDA
  no_fields = 3+2;
  no_fields = 6+2;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND*L5);
  smearing_spinor_field[0] = g_spinor_field[no_fields-2];
  smearing_spinor_field[1] = g_spinor_field[no_fields-1];

  switch(g_source_type) {
    case 0:
    case 5:
      // the source location
      sl0 =   g_source_location                              / (LX_global*LY_global*LZ);
      sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / (          LY_global*LZ);
      sl2 = ( g_source_location % (          LY_global*LZ) ) / (                    LZ);
      sl3 =   g_source_location %                      LZ;
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3);
      source_proc_coords[0] = sl0 / T;
      source_proc_coords[1] = sl1 / LX;
      source_proc_coords[2] = sl2 / LY;
      source_proc_coords[3] = sl3 / LZ;
    #ifdef MPI
      MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id);
      g_source_proc_id = 0;
      have_source_flag = g_source_proc_id == g_cart_id;
      lsl0 = sl0 % T;
      lsl1 = sl1 % LX;
      lsl2 = sl2 % LY;
      lsl3 = sl3 % LZ;
      if(have_source_flag) {
        fprintf(stdout, "# [invert_dw_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3);
    case 2:
    case 3:
    case 4:
      // the source timeslice
#ifdef MPI
      source_proc_coords[0] = g_source_timeslice / T;
      source_proc_coords[1] = 0;
      source_proc_coords[2] = 0;
      source_proc_coords[3] = 0;
      MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id);
      have_source_flag = ( g_source_proc_id == g_cart_id );
      source_timeslice = have_source_flag ? g_source_timeslice % T : -1;
      g_source_proc_id = 0;
      have_source_flag = 1;
      source_timeslice = g_source_timeslice;

#ifdef HAVE_QUDA
  /* QUDA inverter parameters */
  inv_param.dslash_type    = QUDA_DOMAIN_WALL_DSLASH;

  if(strcmp(g_inverter_type_name, "cg") == 0) {
    inv_param.inv_type       = QUDA_CG_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using cg inverter\n"); 
  } else if(strcmp(g_inverter_type_name, "bicgstab") == 0) {
    inv_param.inv_type       = QUDA_BICGSTAB_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using bicgstab inverter\n");
#ifdef MULTI_GPU    
  } else if(strcmp(g_inverter_type_name, "gcr") == 0) {
    inv_param.inv_type       = QUDA_GCR_INVERTER;
    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using gcr inverter\n"); 
  } else {
    if(g_cart_id==0) fprintf(stderr, "[invert_dw_quda] Error, unrecognized inverter type %s\n", g_inverter_type_name);

  if(inv_param.inv_type == QUDA_CG_INVERTER) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.solve_type    = QUDA_NORMEQ_PC_SOLVE;
  } else if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.solve_type    = QUDA_DIRECT_PC_SOLVE;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
    inv_param.solve_type    = QUDA_DIRECT_PC_SOLVE;

  inv_param.m5             = g_m5;
  inv_param.kappa          = 0.5 / (5. + inv_param.m5);
  inv_param.mass           = g_m0;

  inv_param.tol            = solver_precision;
  inv_param.maxiter        = niter_max;
  inv_param.reliable_delta = reliable_delta;

#ifdef MPI
  // domain decomposition preconditioner parameters
  if(inv_param.inv_type == QUDA_GCR_INVERTER) {
    if(g_cart_id == 0) printf("# [] settup DD parameters\n");
    inv_param.gcrNkrylov     = 15;
    inv_param.inv_type_precondition = QUDA_MR_INVERTER;
    inv_param.tol_precondition = 1e-6;
    inv_param.maxiter_precondition = 200;
    inv_param.verbosity_precondition = QUDA_VERBOSE;
    inv_param.prec_precondition = cuda_prec_sloppy;
    inv_param.omega = 0.7;

  inv_param.matpc_type         = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger             = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION;

  inv_param.cpu_prec         = cpu_prec;
  inv_param.cuda_prec        = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;

  inv_param.verbosity = QUDA_VERBOSE;

  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;
#ifdef MPI
  inv_param.preserve_dirac = QUDA_PRESERVE_DIRAC_YES;
  inv_param.prec_precondition = cuda_prec_sloppy;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_tune = QUDA_TUNE_NO;

  /* write initial rng state to file */
  if( g_source_type==2 && g_coherent_source==2 ) {
    sprintf(rng_file_out, "%s.0", g_rng_filename);
    status = init_rng_stat_file (g_seed, rng_file_out);
    if( status != 0 ) {
      fprintf(stderr, "[invert_dw_quda] Error, could not write rng status\n");
  } else if( (g_source_type==2 /*&& g_coherent_source==1*/) || g_source_type==3 || g_source_type==4) {
    if( init_rng_state(g_seed, &g_rng_state) != 0 ) {
      fprintf(stderr, "[invert_dw_quda] Error, could initialize rng state\n");

  /* prepare locks for openmp */
  nthreads = g_num_threads - 1;
  lck = (omp_lock_t*)malloc(nthreads * sizeof(omp_lock_t));
  if(lck == NULL) {
      EXIT_WITH_MSG(97, "[invert_dw_quda] Error, could not allocate lck\n");
  // init locks
  for(i=0;i<nthreads;i++) {

  // check the source momenta
  if(g_source_momentum_set) {
    source_momentum = (int*)malloc(3*sizeof(int));

    if(g_source_momentum[0]<0) g_source_momentum[0] += LX_global;
    if(g_source_momentum[1]<0) g_source_momentum[1] += LY_global;
    if(g_source_momentum[2]<0) g_source_momentum[2] += LZ_global;
    fprintf(stdout, "# [invert_dw_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);

    if(full_orbit) {
      status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
      if(status != 0) {
        if(g_cart_id==0) fprintf(stderr, "\n[invert_dw_quda] Error while creating O_3-lists\n");
      source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]];
      source_momentum_no    = qlatt_count[source_momentum_class];
      source_momentum_runs  = source_momentum_class==0 ? 1 : source_momentum_no + 1;
      if(g_cart_id==0) fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n",
          source_momentum_class, source_momentum_no, source_momentum_runs);

  if(g_source_type == 5) {
    if(g_seq_source_momentum_set) {
      if(g_seq_source_momentum[0]<0) g_seq_source_momentum[0] += LX_global;
      if(g_seq_source_momentum[1]<0) g_seq_source_momentum[1] += LY_global;
      if(g_seq_source_momentum[2]<0) g_seq_source_momentum[2] += LZ_global;
    } else if(g_source_momentum_set) {
      g_seq_source_momentum[0] = g_source_momentum[0];
      g_seq_source_momentum[1] = g_source_momentum[1];
      g_seq_source_momentum[2] = g_source_momentum[2];
    fprintf(stdout, "# [invert_dw_quda] using final sequential source momentum ( %d, %d, %d )\n",
        g_seq_source_momentum[0], g_seq_source_momentum[1], g_seq_source_momentum[2]);

  /* loop on spin-color-index */
  for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++)
//  for(isc=g_source_index[0]; isc<=g_source_index[0]; isc++)
    ispin = isc / n_c;
    icol  = isc % n_c;

    for(imom=0; imom<source_momentum_runs; imom++) {

      /* set source momentum */
      if(g_source_momentum_set) {
        if(imom == 0) {
          if(full_orbit) {
            source_momentum[0] = 0;
            source_momentum[1] = 0;
            source_momentum[2] = 0;
          } else {
            source_momentum[0] = g_source_momentum[0];
            source_momentum[1] = g_source_momentum[1];
            source_momentum[2] = g_source_momentum[2];
        } else {
          source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY_global*LZ_global);
          source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY_global*LZ_global) ) / LZ_global;
          source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ_global;
        if(g_cart_id==0) fprintf(stdout, "# [] run no. %d, source momentum (%d, %d, %d)\n",
            imom, source_momentum[0], source_momentum[1], source_momentum[2]);
      /* prepare the source */
      if(g_read_source == 0) {  // create source
        switch(g_source_type) {
          case 0:
            // point source
            if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating point source\n");
            for(ix=0;ix<L5*VOLUME;ix++) { _fv_eq_zero(g_spinor_field[0]+ix); }
            if(have_source_flag) {
              if(g_source_momentum_set) {
                phase = 2*M_PI*( source_momentum[0]*sl1/(double)LX_global + source_momentum[1]*sl2/(double)LY_global + source_momentum[2]*sl3/(double)LZ_global );
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)  ] = cos(phase);
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)+1] = sin(phase);
              } else {
                g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)  ] = 1.;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d",
                  filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol);
#ifdef HAVE_QUDA
            // set matpc_type
            source_location_5d_iseven = ( (g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin<n_s/2) || (!g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin>=n_s/2) ) ? 1 : 0;
            if(source_location_5d_iseven) {
              inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_EVEN_EVEN\n");
            } else {
              inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_ODD_ODD\n");
          case 2:
            // timeslice source
            if(g_coherent_source==1) {
              if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating coherent timeslice source\n");
              status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1);
              if(status != 0) {
                fprintf(stderr, "[invert_dw_quda] Error from prepare source, status was %d\n", status);
#ifdef MPI
                MPI_Abort(MPI_COMM_WORLD, 123);
              check_error(prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1),
                  "prepare_coherent_timeslice_source", NULL, 123);
              timeslice = g_coherent_source_base;
            } else {
              if(g_coherent_source==2) {
                timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global;
                fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n");
                check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, g_rng_state, 1),
                    "prepare_timeslice_source", NULL, 123);
              } else {
                if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n");
                check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_state, 1),
                    "prepare_timeslice_source", NULL, 124);
                timeslice = g_source_timeslice;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc);
          case 3:
            // timeslice sources for one-end trick (spin dilution)
            fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n");
            check_error( prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum, isc%n_s, g_rng_state, \
                ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end", NULL, 125 );
            c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
          case 4:
            // timeslice sources for one-end trick (spin and color dilution )
            fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n");
            check_error(prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum,\
                isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1)  && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end_color", NULL, 126);
            c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c);
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
          case 5:
            if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] preparing sequential point source\n");
            check_error( prepare_sequential_point_source (g_spinor_field[0], isc, sl0, g_seq_source_momentum, 
                  smear_source, g_spinor_field[1], gauge_field_smeared), "prepare_sequential_point_source", NULL, 33);
            sprintf(source_filename, "%s.%.4d.t%.2dx%.2d.y%.2d.z%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf,
                sl0, sl1, sl2, sl3, isc, g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);
            fprintf(stderr, "\nError, unrecognized source type\n");
      } else { // read source
        switch(g_source_type) {
          case 0:  // point source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \
                  filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else  {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc);
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
          case 2:  // timeslice source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice,
                  isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc);
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
            check_error(1, "source type", NULL, 104);
          case -1:  // timeslice source
            sprintf(source_filename, "%s", filename_prefix2);
            fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename);
            check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115);
      }  // of if g_read_source
      if(g_write_source) {
        check_error(write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision), "write_propagator", NULL, 27);

/* here threads split: */
      if(dummy_flag==0) strcpy(source_filename_write, source_filename);
      memcpy((void*)(smearing_spinor_field[0]), (void*)(g_spinor_field[0]), 24*VOLUME*sizeof(double));
      if(dummy_flag>0) {
        // copy only if smearing has been done; otherwise do not copy, do not invert
        if(g_cart_id==0) fprintf(stdout, "# [] copy smearing field -> g field\n");
        memcpy((void*)(g_spinor_field[0]), (void*)(smearing_spinor_field[1]), 24*VOLUME*sizeof(double));

#pragma omp parallel private(threadid, _2_kappa, is, ix, iy, iix, ratime, retime) shared(key,g_read_source, smear_source, N_Jacobi, kappa_Jacobi, smearing_spinor_field, g_spinor_field, nthreads, convert_sign, VOLUME, VOL3, T, L5, isc, rotate_gamma_basis, g_cart_id) firstprivate(inv_param, gauge_param, ofs)
      threadid = omp_get_thread_num();

  if(threadid < nthreads) {
      fprintf(stdout, "# [] proc%.2d thread%.2d starting source preparation\n", g_cart_id, threadid);

      // smearing
      if( ( !g_read_source || (g_read_source && smear_source ) ) && N_Jacobi > 0 ) {
        if(g_cart_id==0) fprintf(stdout, "#  [invert_dw_quda] smearing source with N_Jacobi=%d, kappa_Jacobi=%e\n", N_Jacobi, kappa_Jacobi);
        Jacobi_Smearing_threaded(gauge_field_smeared, smearing_spinor_field[0], smearing_spinor_field[1], kappa_Jacobi, N_Jacobi, threadid, nthreads);

      /* create the 5-dim. source field */
      if(convert_sign == 0) {
        spinor_4d_to_5d_threaded(smearing_spinor_field[0], smearing_spinor_field[0], threadid, nthreads);
      }  else if(convert_sign == 1 || convert_sign == -1) {
        spinor_4d_to_5d_sign_threaded(smearing_spinor_field[0], smearing_spinor_field[0], convert_sign, threadid, nthreads);

      for(is=0; is<L5; is++) {
        for(it=threadid; it<T; it+=nthreads) {
          memcpy((void*)(g_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), (void*)(smearing_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), VOL3*24*sizeof(double));

      // reorder, multiply with g2
      for(is=0; is<L5; is++) {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = (is*T+it)*VOL3 + i3;

      if(rotate_gamma_basis) {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(0, ix);
            _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix));
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(L5-1, ix);
            _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME));
      } else {
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(0, ix);
            _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix));
        for(it=threadid; it<T; it+=nthreads) {
          for(i3=0; i3<VOL3; i3++) {
            ix = it * VOL3 + i3;
            iy = lexic2eot_5d(L5-1, ix);
            _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME));
      fprintf(stdout, "# [] proc%.2d thread%.2d finished source preparation\n", g_cart_id, threadid);

  } else if(threadid == g_num_threads-1 && dummy_flag > 0) {  // else branch on threadid
      fprintf(stdout, "# [] proc%.2d thread%.2d starting inversion for dummy_flag = %d\n", g_cart_id, threadid, dummy_flag);

      /* perform the inversion */
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n");

      memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double));
      ratime = CLOCK;
#ifdef MPI
      if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER  || inv_param.inv_type == QUDA_GCR_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n");
        invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else if(inv_param.inv_type == QUDA_CG_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n");
        testCG(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else {
        if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n");
      invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      retime = CLOCK;

      if(g_cart_id==0) {
        fprintf(stdout, "# [invert_dw_quda] QUDA time:  %e seconds\n", inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] wall time:  %e seconds\n", retime-ratime);
        fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
  }  // of if threadid

// wait till all threads are here
#pragma omp barrier

      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa5d;
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
#pragma omp barrier
      // reorder, multiply with g2
      for(is=0;is<L5;is++) {
      for(ix=threadid; ix<VOLUME; ix+=g_num_threads) {
        iy  = lexic2eot_5d(is, ix);
        iix = is*VOLUME + ix;
        _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy));
#pragma omp barrier
      if(rotate_gamma_basis) {
        for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) {
          _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
      } else {
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
      if(g_cart_id==0 && threadid==g_num_threads-1) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime);

#pragma omp single

#ifdef MPI
       * check residuum
      if(check_residuum && dummy_flag>0) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
#ifdef MPI
        memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double));

        //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field_5d(g_spinor_field[1], ofs);

        Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]);
        for(ix=0;ix<VOLUME*L5;ix++) {
          _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix));
        spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5);
        spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5);
        if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );

      if(dummy_flag>0) {
         * create 4-dim. propagator
        if(convert_sign == 0) {
          spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]);
        } else if(convert_sign == -1 || convert_sign == +1) {
          spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign);
         * write the solution 
        sprintf(filename, "%s.inverted", source_filename_write);
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename);
        check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22);
        //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field(g_spinor_field[1], ofs);

      if(check_residuum) memcpy(g_spinor_field[2], smearing_spinor_field[0], 24*VOLUME*L5*sizeof(double));

  }  // of omp single

}    // of omp parallel region

      if(dummy_flag > 0) strcpy(source_filename_write, source_filename);

    }  // of loop on momenta

  }  // of isc

#if 0
  // last inversion

      memcpy(g_spinor_field[0], smearing_spinor_field[1], 24*VOLUME*L5*sizeof(double));
      if(g_cart_id==0) fprintf(stdout, "# [] proc%.2d starting last inversion\n", g_cart_id);

       * perform the inversion
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n");

      memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double));
      ratime = CLOCK;
#ifdef MPI
      if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER  || inv_param.inv_type == QUDA_GCR_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n");
        invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else if(inv_param.inv_type == QUDA_CG_INVERTER) {
        if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n");
        testCG(g_spinor_field[1], g_spinor_field[0], &inv_param);
      } else {
        if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n");
      invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param);
      retime = CLOCK;

      if(g_cart_id==0) {
        fprintf(stdout, "# [invert_dw_quda] QUDA time:  %e seconds\n", inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs);
        fprintf(stdout, "# [invert_dw_quda] wall time:  %e seconds\n", retime-ratime);
        fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);

#pragma omp parallel private(threadid,_2_kappa,is,ix,iy,iix) shared(VOLUME,L5,g_kappa,g_spinor_field,g_num_threads)
      threadid = omp_get_thread_num();

      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa5d;
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
#pragma omp barrier
      // reorder, multiply with g2
      for(is=0;is<L5;is++) {
      for(ix=threadid; ix<VOLUME; ix+=g_num_threads) {
        iy  = lexic2eot_5d(is, ix);
        iix = is*VOLUME + ix;
        _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy));
#pragma omp barrier
      if(rotate_gamma_basis) {
        for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) {
          _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
      } else {
        for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) {
          _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix));

    }  // end of parallel region

    if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime);

#ifdef MPI
       * check residuum
      if(check_residuum && dummy_flag>0) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
#ifdef MPI
        memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double));

        //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id);
        //ofs = fopen(filename, "w");
        //printf_spinor_field_5d(g_spinor_field[1], ofs);

        Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]);
        for(ix=0;ix<VOLUME*L5;ix++) {
          _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix));
        spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5);
        spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5);
        if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );

       * create 4-dim. propagator
      if(convert_sign == 0) {
        spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]);
      } else if(convert_sign == -1 || convert_sign == +1) {
        spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign);
       * write the solution 
      sprintf(filename, "%s.inverted", source_filename_write);
      if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename);
      check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22);
      //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id);
      //ofs = fopen(filename, "w");
      //printf_spinor_field(g_spinor_field[1], ofs);
  }  // of last inversion

#endif  // of if 0

   * free the allocated memory, finalize 

#ifdef HAVE_QUDA
  // finalize the QUDA library
  if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] finalizing quda\n");
#ifdef MPI
  if(g_gauge_field != NULL) free(g_gauge_field);
  if(gauge_field_smeared != NULL) free(gauge_field_smeared);
  if(no_fields>0) {
    if(g_spinor_field!=NULL) {
      for(i=0; i<no_fields; i++) if(g_spinor_field[i]!=NULL) free(g_spinor_field[i]);

  if(g_source_momentum_set && full_orbit) {
    finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep);
    if(qlatt_map != NULL) {
  if(source_momentum != NULL) free(source_momentum);
  if(lck != NULL) free(lck);

#ifdef MPI
#ifdef HAVE_QUDA
  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time));
コード例 #2
ファイル: source_generation.c プロジェクト: Finkenrath/tmLQCD
/*
 * Fill the spinor fields P and Q with random entries for one fixed global
 * time coordinate t.
 *
 * Visible behaviour:
 *  - the random number generator is re-seeded from _seed, sample, t and
 *    nstore, so the same four inputs give the same random sequence;
 *  - every process walks over ALL global spatial sites and draws one random
 *    number per spin (4) and colour (3) component, which keeps the generator
 *    in step on all processes;
 *  - only the process whose cartesian rank owns the site converts the number
 *    into one of the four values (+-1/sqrt(2)) + i*(+-1/sqrt(2)) and stores it.
 *
 * Parameters:
 *  P, Q    destination spinor fields; which one receives a given site is
 *          decided by a parity test (see note further down)
 *  t       global time coordinate of the source
 *  sample  sample number, enters the seed only
 *  nstore  configuration counter, enters the seed only
 *  _seed   base seed
 *
 * NOTE(review): this excerpt has lost its closing braces and a few
 * statements (e.g. the body of the "save state" branch), so the nesting
 * shown by the indentation is the only guide to the original structure.
 */
void source_generation_pion_only(spinor * const P, spinor * const Q,
				 const int t, const int sample, 
                                 const int nstore, const unsigned int _seed) {

  int reset = 0, i, x, y, z, is, ic, lt, lx, ly, lz, id=0;
  int coords[4], seed, r;
  double rnumber, si=0., co=0.;
  int rlxd_state[105];
  const double sqr2 = 1./sqrt(2.);
  _Complex double * p = NULL;

  /* save the ranlxd_state if necessary */
  /* reset remembers that a generator state existed before this call */
  if(ranlxd_init == 1) {
    reset = 1;

  /* Compute the seed */
  /* NOTE(review): _seed is unsigned, so the sum is evaluated as unsigned
   * before being handed to abs() - confirm this conversion is intended. */
  seed =(int) abs(_seed + sample + t*10*97 + nstore*100*53);

  rlxd_init(2, seed);

  /* lt: time coordinate relative to this process; coords[0]: process
   * coordinate in time direction that owns global time t */
  lt = t - g_proc_coords[0]*T;
  coords[0] = t / T;
  /* loop over the full global spatial lattice; lx/ly/lz are the local
   * coordinates, coords[1..3] the owning process coordinates */
  for(x = 0; x < LX*g_nproc_x; x++) {
    lx = x - g_proc_coords[1]*LX;
    coords[1] = x / LX;
    for(y = 0; y < LY*g_nproc_y; y++) {
      ly = y - g_proc_coords[2]*LY;
      coords[2] = y / LY;
      for(z = 0; z < LZ*g_nproc_z; z++) {
	lz = z - g_proc_coords[3]*LZ;
	coords[3] = z / LZ;
#ifdef TM_USE_MPI
	/* id = cartesian rank of the process owning (t, x, y, z) */
	MPI_Cart_rank(g_cart_grid, coords, &id);
	for(is = 0; is < 4; is++) {
	  for(ic = 0; ic < 3; ic++) {
	    /* drawn on every process, used only on the owning one */
	    ranlxd(&rnumber, 1);
	    if(g_cart_id  == id) {
	      /* map rnumber to r in {0,1,2,3} and pick the sign pair (co, si) */
	      r = (int)floor(4.*rnumber);
	      if(r == 0)
		si = sqr2;
		co = sqr2;
	      else if(r == 1) {
		si = -sqr2;
		co = sqr2;
	      else if(r==2) {
		si = sqr2;
		co = -sqr2;
	      else {
		si = -sqr2;
		co = -sqr2;
	      /* index of the local site in the even/odd sub-lattice ordering */
	      i = g_lexic2eosub[ g_ipt[lt][lx][ly][lz] ];
	      /* NOTE(review): the opening line of the parity condition is missing
	       * from this excerpt; presumably even sites go to P and odd sites
	       * to Q - TODO confirm against the full source. */
		  + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) {
		p = (_Complex double*)(P + i);
	      else {
		p = (_Complex double*)(Q + i);
	      /* write complex component number 3*is+ic of the selected spinor */
	      (*(p+3*is+ic)) = co + si * I;
  /* reset the ranlxd if necessary */
  if(reset) {
コード例 #3
ファイル: mpi_manager.C プロジェクト: grisu48/Krylov
// Determine the ranks of other processes in the 2D cartesian communicator
// comm2d and store them in the member tables:
//   left/right, front/back  - direct neighbours from MPI_Cart_shift
//   Neighbours(d0,d1)       - rank at relative offset (d0,d1), or
//                             MPI_PROC_NULL when the range check fails
//   NeighboursCyclic(d0,d1) - rank at relative offset with wrap-around
//   AllRanks(c0,c1)         - rank at absolute process coordinate (c0,c1)
// NOTE(review): closing braces (and possibly further statements) are missing
// from this excerpt; indentation is the only hint at the original nesting.
void mpi_manager_2D::determin_OtherRanks() {

	// Find neighbouring ranks:
	// displacement 1 along dimension 0 (left/right) and dimension 1 (front/back)
	MPI_Cart_shift(comm2d, 0, 1, &left , &right);
	MPI_Cart_shift(comm2d, 1, 1, &front, &back);

	// Determine ranks of neighbour processes:
	int shiftcoord[DIM];
	int lbound[DIM],ubound[DIM];
	// only the assignment of ubound is visible here; lbound is declared but
	// no assignment to it appears in this excerpt
	for(int dim=0;dim<DIM;dim++){
		ubound[dim]= nproc[dim];

	// Table of relative offsets in [-nproc, +nproc] for both dimensions.
	// NOTE(review): a negative shifted coordinate gets nproc added before the
	// range check below, so it then passes that check - confirm that this
	// wrap is intended for the table that is filled with MPI_PROC_NULL
	// outside the domain.
	for(int dim0=-nproc[0]; dim0<=nproc[0]; dim0++){
		shiftcoord[0] = (coords[0]+dim0);
		if(shiftcoord[0] < 0) shiftcoord[0]+=nproc[0];
		for(int dim1=-nproc[1]; dim1<=nproc[1]; dim1++){
			shiftcoord[1] = (coords[1]+dim1);
			if(shiftcoord[1] < 0) shiftcoord[1]+=nproc[1];

			if(shiftcoord[0]>=0 && shiftcoord[0]<nproc[0] &&
			   shiftcoord[1]>=0 && shiftcoord[1]<nproc[1]) {
				// Now determine rank at relative shifted position
				// std::cout << " Cart ";
				// std::cout << shiftcoord[0] << " ";
				// std::cout << shiftcoord[1] << " ";
				// std::cout << rank << " ";
				// std::cout << nproc[0] << " ";
				// std::cout << nproc[1] << " ";
				// std::cout << std::endl;
				MPI_Cart_rank(comm2d, shiftcoord, &Neighbours(dim0,dim1));
			} else {
				// If outside domain set to error value
				Neighbours(dim0, dim1) = MPI_PROC_NULL;



	// Same offsets again, but with the shifted coordinate reduced modulo
	// nproc (and lifted to non-negative), i.e. with wrap-around
	for(int dim0=-nproc[0]; dim0<=nproc[0]; dim0++){
		shiftcoord[0] = (coords[0]+dim0)%nproc[0];
		if(shiftcoord[0] < 0) shiftcoord[0]+=nproc[0];
		for(int dim1=-nproc[1]; dim1<=nproc[1]; dim1++){
			shiftcoord[1] = (coords[1]+dim1)%nproc[1];
			if(shiftcoord[1] < 0) shiftcoord[1]+=nproc[1];

			// Now determine rank at relative shifted position
			MPI_Cart_rank(comm2d, shiftcoord, &NeighboursCyclic(dim0,dim1));

	// Now determine absolute position of ranks
	// AllRanks(dim0, dim1) = rank of the process at coordinate (dim0, dim1)

	for(int dim1=0; dim1<nproc[1]; ++dim1) {
		for(int dim0=0; dim0<nproc[0]; ++dim0) {
			int coord[2] = {dim0, dim1};
			MPI_Cart_rank(comm2d, coord, &AllRanks(dim0, dim1));

コード例 #4
ファイル: invert_quda_cg.c プロジェクト: etmc/cvc
/*
 * Driver program: reads a gauge field, hands it to QUDA, builds or reads a
 * source spinor per spin-colour index (and per source momentum), inverts with
 * QUDA's CG inverter and writes the resulting propagator to file.
 *
 * NOTE(review): this listing has lost closing braces, break/#else/#endif
 * lines and block-comment delimiters (lines starting with a bare '*' are the
 * remains of block comments); it does not compile as shown.
 */
int main(int argc, char **argv) {
  int c, i, mu, status;
  int ispin, icol, isc;       /* spin index, colour index, combined spin-colour index */
  int n_c = 3;                /* number of colours; overwritten by option -p */
  int n_s = 4;                /* number of spin components */
  int count        = 0;
  int filename_set = 0;       /* when 0, the input file name defaults to "cvc.input" */
  int dims[4]      = {0,0,0,0};
  int l_LX_at, l_LXstart_at;
  int x0, x1, x2, x3, ix, iix, iy;
  int sl0, sl1, sl2, sl3, have_source_flag=0;  /* global source coordinates (t,x,y,z) */
  int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3, source_proc_id;  /* owning process and local source coordinates */
  int check_residuum = 0;     /* set by option -c */
  unsigned int VOL3;          /* LX*LY*LZ, the volume of one timeslice */
  int do_gt   = 0;            /* set by option -g */
  int full_orbit = 0;         /* set by option -o */
  char filename[200], source_filename[200];
  double ratime, retime;      /* start / end time stamps of the inversion */
  double plaq_r=0., plaq_m=0., norm, norm2;  /* plaquette as read / as measured; residuum norms */
  // double spinor1[24], spinor2[24];
  double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL;
  double _1_2_kappa, _2_kappa, phase;
  FILE *ofs;
  /* gauge links of direction mu are stored in gauge_qdp[mu_trans[mu]] */
  int mu_trans[4] = {3, 0, 1, 2};
  int threadid, nthreads;
  int timeslice;
  char rng_file_in[100], rng_file_out[100];
  int *source_momentum=NULL;         /* 3-vector, allocated only if a source momentum is set */
  int source_momentum_class = -1;
  int source_momentum_no = 0;
  int source_momentum_runs = 1;      /* number of iterations of the momentum loop */
  int imom;

  /* momentum-orbit tables filled by make_qcont_orbits_3d_parity_avg */
  int qlatt_nclass;
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL;
  double **qlatt_list=NULL;

   * QUDA parameters
  /* host, device and sloppy device precision are all double */
  QudaPrecision cpu_prec         = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec        = QUDA_DOUBLE_PRECISION;
  QudaPrecision cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

#ifdef MPI
  MPI_Init(&argc, &argv);

  /* Command line options:
   *   -v verbose, -g set do_gt, -f <file> input file name, -c check residuum,
   *   -p <n> number of colours, -o invert for the full momentum orbit,
   *   -h / -? help.
   * NOTE(review): no break statements are visible between the cases in this
   * excerpt; if they are really absent every case falls through to the next.
   * NOTE(review): filename_set is not assigned in the visible -f branch, so
   * the default below would overwrite the name given with -f - confirm
   * against the full source. */
  while ((c = getopt(argc, argv, "och?vgf:p:")) != -1) {
    switch (c) {
    case 'v':
      g_verbose = 1;
    case 'g':
      do_gt = 1;
    case 'f':
      strcpy(filename, optarg);
    case 'c':
      check_residuum = 1;
      fprintf(stdout, "# [invert_quda] will check residuum again\n");
    case 'p':
      n_c = atoi(optarg);
      fprintf(stdout, "# [invert_quda] will use number of colors = %d\n", n_c);
    case 'o':
      full_orbit = 1;
      fprintf(stdout, "# [invert_quda] will invert for full orbit, if source momentum set\n");
    case 'h':
    case '?':

  // get the time stamp
  g_the_time = time(NULL);

   * set the default values, read input
  if(filename_set==0) strcpy(filename, "cvc.input");
  if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, T and L's must be set\n");
  /* NOTE(review): the message below ends in "0.n"; a newline escape "\n" was
   * presumably intended. */
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, kappa should be > 0.n");

  // set number of openmp threads
  // (with OPENMP defined this program forces a single thread)
#ifdef OPENMP
  fprintf(stdout, "[invert_quda_cg] Warning, resetting global number of threads to 1\n");
  g_num_threads = 1;

  /* initialize MPI parameters */
  mpi_init(argc, argv);
  // the volume of a timeslice
  VOL3 = LX*LY*LZ;

  /* every process prints its local time extent T and offset Tstart */
  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n",\
		  g_cart_id, g_cart_id, T, g_cart_id, Tstart);

#ifdef MPI
  /* a process without any timeslice aborts the whole run */
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 1);


   * initialize the QUDA library
  /* NOTE(review): only the log message is visible here; the actual QUDA
   * initialisation call does not appear in this excerpt. */
  fprintf(stdout, "# [invert_quda] initializing quda\n");
   * prepare the gauge field
  // read the gauge field from file
  /* a prefix of "identity" selects a unit gauge field instead of a file;
   * otherwise g_gauge_file_format picks the reader: 0 = ILDG, 1 = NERSC */
  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(strcmp( gaugefilename_prefix, "identity")==0 ) {
    if(g_cart_id==0) fprintf(stdout, "# [invert_quda] Setting up unit gauge field\n");
    for(ix=0;ix<VOLUME; ix++) {
      for(mu=0;mu<4;mu++) {
  } else {
    if(g_gauge_file_format == 0) {
      // ILDG
      sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_lime_gauge_field_doubleprec(filename);
    } else if(g_gauge_file_format == 1) {
      // NERSC
      sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf);
      if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename);
      status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r);
    if(status != 0) {
      fprintf(stderr, "[invert_quda] Error, could not read gauge field");
#ifdef MPI
      MPI_Abort(MPI_COMM_WORLD, 12);
#ifdef MPI

  // measure the plaquette
  /* NOTE(review): the call that fills plaq_m is not visible in this excerpt */
  if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m);
  if(g_cart_id==0) fprintf(stdout, "# Read plaquette value    : %25.16e\n", plaq_r);

  // allocate the smeared / qdp ordered gauge field
  /* gauge_qdp[i] points at the i-th block of 18*VOLUME doubles inside
   * gauge_field_smeared; the same buffer is overwritten again further down
   * for the APE smearing */
  alloc_gauge_field(&gauge_field_smeared, VOLUME);
  for(i=0;i<4;i++) {
    gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME;

  // transcribe the gauge field
  /* site ix goes to position g_lexic2eot[ix]; direction mu goes to block
   * mu_trans[mu] */
#ifdef OPENMP
#pragma omp parallel for private(ix,iy,mu)
  for(ix=0;ix<VOLUME;ix++) {
    iy = g_lexic2eot[ix];
    for(mu=0;mu<4;mu++) {
      _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu));
  // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition)
  /* NOTE(review): iix is assigned inside the loop but is not listed in the
   * private() clause of the pragma below. */
#ifdef OPENMP
#pragma omp parallel for private(ix,iy)
  for(ix=0;ix<VOL3;ix++) {
    iix = (T-1)*VOL3 + ix;
    iy = g_lexic2eot[iix];
    _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.);

  // QUDA gauge parameters
  /* lattice extents are passed in the order x, y, z, t */
  gauge_param.X[0] = LX_global;
  gauge_param.X[1] = LY_global;
  gauge_param.X[2] = LZ_global;
  gauge_param.X[3] = T_global;

  gauge_param.anisotropy  = 1.0;
  gauge_param.type        = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary  = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec           = cpu_prec;
  gauge_param.cuda_prec          = cuda_prec;
  gauge_param.reconstruct        = QUDA_RECONSTRUCT_12;
  gauge_param.cuda_prec_sloppy   = cuda_prec_sloppy;
  gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
  gauge_param.gauge_fix          = QUDA_GAUGE_FIXED_NO;

  gauge_param.ga_pad = 0;

  // load the gauge field
  /* after the upload the four direction pointers are cleared; the underlying
   * buffer gauge_field_smeared stays allocated */
  fprintf(stdout, "# [invert_quda] loading gauge field\n");
  loadGaugeQuda((void*)gauge_qdp, &gauge_param);
  gauge_qdp[0] = NULL; 
  gauge_qdp[1] = NULL; 
  gauge_qdp[2] = NULL; 
  gauge_qdp[3] = NULL; 

   * APE smear the gauge field
  /* start from a fresh copy of the original links (72 doubles per site),
   * then apply N_ape smearing steps with parameter alpha_ape */
  memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUME*sizeof(double));
  if(N_ape>0) {
    fprintf(stdout, "# [invert_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape);
#ifdef OPENMP
     APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape);
    for(i=0; i<N_ape; i++) {
       APE_Smearing_Step(gauge_field_smeared, alpha_ape);

  /* allocate memory for the spinor fields */
  /* three work fields: [0] source, [1] and [2] are used as scratch and
   * solution buffers further down */
  no_fields = 3;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND);

  /* the source location */
  /* decompose the linear index g_source_location into (t,x,y,z).
   * NOTE(review): the divisors mix LX_global/LY_global with the local LZ -
   * confirm this is intended when z is distributed over processes. */
  sl0 =   g_source_location                              / (LX_global*LY_global*LZ);
  sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / (          LY_global*LZ);
  sl2 = ( g_source_location % (          LY_global*LZ) ) / (                    LZ);
  sl3 =   g_source_location %                      LZ;
  if(g_cart_id==0) fprintf(stdout, "# [invert_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3);
  /* process-grid coordinates of the process that owns the source site */
  source_proc_coords[0] = sl0 / T;
  source_proc_coords[1] = sl1 / LX;
  source_proc_coords[2] = sl2 / LY;
  source_proc_coords[3] = sl3 / LZ;
  /* NOTE(review): as shown, source_proc_id is reset to 0 right after
   * MPI_Cart_rank; presumably an #else/#endif between the two lines was lost
   * in this excerpt - confirm against the full source. */
#ifdef MPI
  MPI_Cart_rank(g_cart_grid, source_proc_coords, &source_proc_id);
  source_proc_id = 0;
  have_source_flag = source_proc_id == g_cart_id;

  /* source coordinates relative to the owning process */
  lsl0 = sl0 % T;
  lsl1 = sl1 % LX;
  lsl2 = sl2 % LY;
  lsl3 = sl3 % LZ;
  if(have_source_flag) {
    fprintf(stdout, "# [invert_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3);

  // QUDA inverter parameters
  // Wilson dslash, CG inverter; kappa, tolerance and iteration limit come
  // from the input globals
  inv_param.dslash_type    = QUDA_WILSON_DSLASH;
//  inv_param.inv_type       = QUDA_BICGSTAB_INVERTER;
  inv_param.inv_type       = QUDA_CG_INVERTER;
  inv_param.kappa          = g_kappa;
  inv_param.tol            = solver_precision;
  inv_param.maxiter        = niter_max;
  inv_param.reliable_delta = reliable_delta;

  inv_param.solution_type      = QUDA_MAT_SOLUTION;
//  inv_param.solve_type         = QUDA_DIRECT_PC_SOLVE;
  inv_param.solve_type         = QUDA_NORMEQ_PC_SOLVE;
  inv_param.matpc_type         = QUDA_MATPC_EVEN_EVEN; // QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger             = QUDA_DAG_NO;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION;

  inv_param.cpu_prec         = cpu_prec;
  inv_param.cuda_prec        = cuda_prec;
  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
  // the source buffer may be overwritten by the inverter
  inv_param.preserve_source  = QUDA_PRESERVE_SOURCE_NO;
  inv_param.dirac_order      = QUDA_DIRAC_ORDER;

  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  inv_param.verbosity = QUDA_VERBOSE;

  // write initial rng state to file
  // source type 2 with coherent source mode 2 keeps the rng state in files
  // ("<g_rng_filename>.0" first); source types 3 and 4 keep it in g_rng_state
  if(g_source_type==2 && g_coherent_source==2) {
    sprintf(rng_file_out, "%s.0", g_rng_filename);
    if( init_rng_stat_file (g_seed, rng_file_out) != 0 ) {
      fprintf(stderr, "[invert_quda] Error, could not write rng status\n");
  } else if(g_source_type==3 || g_source_type==4) {
    /* NOTE(review): the message below reads "could initialize"; "could not
     * initialize" was presumably intended. */
    if( init_rng_state(g_seed, &g_rng_state) != 0 ) {
      fprintf(stderr, "[invert_quda] Error, could initialize rng state\n");

  // check the source momenta
  if(g_source_momentum_set) {
    source_momentum = (int*)malloc(3*sizeof(int));

    /* shift negative momentum components by the lattice extent */
    if(g_source_momentum[0]<0) g_source_momentum[0] += LX;
    if(g_source_momentum[1]<0) g_source_momentum[1] += LY;
    if(g_source_momentum[2]<0) g_source_momentum[2] += LZ;
    fprintf(stdout, "# [invert_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]);

    /* with -o: look up the orbit class of the momentum; the number of runs
     * is 1 for class 0 and (members of the class + 1) otherwise */
    if(full_orbit) {
      status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
      if(status != 0) {
        fprintf(stderr, "\n[invert_quda] Error while creating O_3-lists\n");
      source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]];
      source_momentum_no    = qlatt_count[source_momentum_class];
      source_momentum_runs  = source_momentum_class==0 ? 1 : source_momentum_no + 1;
      fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n",
          source_momentum_class, source_momentum_no, source_momentum_runs);

   * loop on spin-color-index
  /* isc runs over the inclusive range g_source_index[0..1] and is split into
   * spin = isc / n_c and colour = isc % n_c */
  for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++) {
    ispin = isc / n_c;
    icol  = isc % n_c;

    for(imom=0; imom<source_momentum_runs; imom++) {

       * set source momentum
      /* run 0 uses zero momentum (full orbit) or the input momentum; run
       * imom>0 decodes entry imom-1 of the orbit map into (qx,qy,qz) */
      if(g_source_momentum_set) {
        if(imom == 0) {
          if(full_orbit) {
            source_momentum[0] = 0;
            source_momentum[1] = 0;
            source_momentum[2] = 0;
          } else {
            source_momentum[0] = g_source_momentum[0];
            source_momentum[1] = g_source_momentum[1];
            source_momentum[2] = g_source_momentum[2];
        } else {
          source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY*LZ);
          source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY*LZ) ) / LZ;
          source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ;
        fprintf(stdout, "# [] run no. %d, source momentum (%d, %d, %d)\n", imom, source_momentum[0], source_momentum[1], source_momentum[2]);
       * prepare the source
      /* g_read_source == 0: build the source in g_spinor_field[0];
       * otherwise read it from file. Either way source_filename is set and
       * later serves as the stem of the propagator file name.
       * NOTE(review): break statements and default labels are not visible in
       * the two switch statements of this excerpt. */
      if(g_read_source == 0) {  // create source
        switch(g_source_type) {
          case 0:
            // point source
            // a single non-zero entry at g_source_location; with momentum set
            // it carries the phase exp(i*phase), otherwise it is 1
            fprintf(stdout, "# [invert_quda] Creating point source\n");
            for(ix=0;ix<24*VOLUME;ix++) g_spinor_field[0][ix] = 0.;
            if(have_source_flag) {
              if(g_source_momentum_set) {
                phase = 2*M_PI*( source_momentum[0]*lsl1/(double)LX + source_momentum[1]*lsl2/(double)LY + source_momentum[2]*lsl3/(double)LZ );
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)  ] = cos(phase);
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)+1] = sin(phase);
              } else {
                g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)  ] = 1.;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d",
                  filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol);
          case 2:
            // timeslice source
            // g_coherent_source: 1 = coherent source, 2 = timeslice depends on
            // isc with rng state chained through files, else fixed timeslice
            if(g_coherent_source==1) {
              fprintf(stdout, "# [invert_quda] Creating coherent timeslice source\n");
              status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_filename, NULL);
              if(status != 0) {
                fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
              timeslice = g_coherent_source_base;
            } else {
              if(g_coherent_source==2) {
                // previous output file becomes the input; the last index
                // writes to g_rng_filename itself
                strcpy(rng_file_in, rng_file_out);
                if(isc == g_source_index[1]) { strcpy(rng_file_out, g_rng_filename); }
                else                         { sprintf(rng_file_out, "%s.%d", g_rng_filename, isc+1); }
                timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global;
                fprintf(stdout, "# [invert_quda] Creating timeslice source\n");
                status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, rng_file_in, rng_file_out);
                if(status != 0) {
                  fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
              } else {
                fprintf(stdout, "# [invert_quda] Creating timeslice source\n");
                status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_filename, g_rng_filename);
                if(status != 0) {
                  fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
                timeslice = g_source_timeslice;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc);
          case 3:
            // timeslice sources for one-end trick (spin dilution)
            // the last argument is true only for the last spin component of
            // the last momentum run
            fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n");
            status = prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum, isc%n_s, g_rng_state, \
                ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 ) );
            if(status != 0) {
              fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
            // file index is offset by n_s when Jacobi smearing is switched on
            c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s;
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
          case 4:
            // timeslice sources for one-end trick (spin and color dilution )
            // same scheme as case 3 with n_s*n_c components instead of n_s
            fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n");
            status = prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum,\
                isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1)  && imom==source_momentum_runs-1 ) );
            if(status != 0) {
              fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status);
            c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c);
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, 
                  g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c);
            fprintf(stderr, "\nError, unrecognized source type\n");
      } else { // read source
        // file names follow the same patterns as above but use
        // filename_prefix2; only source types 0 and 2 can be read
        switch(g_source_type) {
          case 0:  // point source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \
                  filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else  {
              sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc);
            fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename);
            status = read_lime_spinor(g_spinor_field[0], source_filename, 0);
            if(status != 0) {
              fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename);
          case 2:  // timeslice source
            if(g_source_momentum_set) {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice,
                  isc, source_momentum[0], source_momentum[1], source_momentum[2]);
            } else {
              sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc);
            fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename);
            status = read_lime_spinor(g_spinor_field[0], source_filename, 0);
            if(status != 0) {
              fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename);
            fprintf(stderr, "[] Error, unrecognized source type for reading\n");
      }  // of if g_read_source
      //sprintf(filename, "%s.ascii", source_filename);
      //ofs = fopen(filename, "w");
      //printf_spinor_field(g_spinor_field[0], ofs);
      // optionally store the (unsmeared) source under source_filename
      if(g_write_source) {
        status = write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision);
        if(status != 0) {
          fprintf(stderr, "Error from write_propagator, status was %d\n", status);
      // smearing
      // N_Jacobi smearing steps on the source in g_spinor_field[0], with
      // g_spinor_field[1] passed as second field; the threaded variant takes
      // the step count, the serial variant is called once per step
      if(N_Jacobi > 0) {
  #ifdef OPENMP
        Jacobi_Smearing_Step_one_threads(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], N_Jacobi, kappa_Jacobi);
        for(c=0; c<N_Jacobi; c++) {
          Jacobi_Smearing_Step_one(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], kappa_Jacobi);
      // multiply with g2
      // field[1] = gamma(2) * field[0]; field[0] keeps the source
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix));
      // transcribe the spinor field to even-odd ordering with coordinates (x,y,z,t)
      // field[2] becomes the right-hand side handed to QUDA
      for(ix=0;ix<VOLUME;ix++) {
        iy = g_lexic2eot[ix];
        _fv_eq_fv(g_spinor_field[2]+_GSI(iy), g_spinor_field[1]+_GSI(ix));
       * perform the inversion
      // solution buffer field[1] is zeroed first; timing uses clock(), which
      // reports processor time rather than wall-clock time
      fprintf(stdout, "# [invert_quda] starting inversion\n");
      ratime = (double)clock() / CLOCKS_PER_SEC;
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_zero(g_spinor_field[1]+_GSI(ix) );
      invertQuda(g_spinor_field[1], g_spinor_field[2], &inv_param);
      retime = (double)clock() / CLOCKS_PER_SEC;
      fprintf(stdout, "# [invert_quda] inversion done in %e seconds\n", retime-ratime);
      fprintf(stdout, "# [invert_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n",
        inv_param.spinorGiB, gauge_param.gaugeGiB);
      // with kappa normalization the solution is rescaled by 2*kappa
      if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) {
        _2_kappa = 2. * g_kappa;
        for(ix=0;ix<VOLUME;ix++) {
          _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa );
      // transcribe the spinor field to lexicographical order with (t,x,y,z)
      // inverse of the reordering above: field[2](ix) = field[1](g_lexic2eot[ix])
      for(ix=0;ix<VOLUME;ix++) {
        iy = g_lexic2eot[ix];
        _fv_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[1]+_GSI(iy));
      // multiply with g2
      // final solution ends up in field[1]
      for(ix=0;ix<VOLUME;ix++) {
        _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[2]+_GSI(ix));
       * check residuum
      // field[2] = D*solution - source; norm is its squared norm, norm2 the
      // squared norm of the source in field[0]
      if(check_residuum) {
        // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg,
        //   which uses the tmLQCD conventions (same as in contractions)
        //   without explicit boundary conditions
        Q_Wilson_phi(g_spinor_field[2], g_spinor_field[1]);
        for(ix=0;ix<VOLUME;ix++) {
          _fv_mi_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[0]+_GSI(ix));
        spinor_scalar_product_re(&norm, g_spinor_field[2], g_spinor_field[2], VOLUME);
        spinor_scalar_product_re(&norm2, g_spinor_field[0], g_spinor_field[0], VOLUME);
        fprintf(stdout, "\n# [invert_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) );
       * write the solution 
      // propagator file name is "<source_filename>.inverted"
      sprintf(filename, "%s.inverted", source_filename);
      fprintf(stdout, "# [invert_quda] writing propagator to file %s\n", filename);
      status = write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision);
      if(status != 0) {
        fprintf(stderr, "Error from write_propagator, status was %d\n", status);
    }  // of loop on momenta

  }  // of isc

   * free the allocated memory, finalize 

  // finalize the QUDA library
  // NOTE(review): only the log message is visible here; the QUDA shutdown
  // call itself does not appear in this excerpt.
  fprintf(stdout, "# [invert_quda] finalizing quda\n");

  // NOTE(review): the individual spinor fields are freed, but no free of the
  // g_spinor_field pointer array or of the gauge fields is visible here.
  for(i=0; i<no_fields; i++) free(g_spinor_field[i]);

  // orbit tables exist only when a source momentum was set together with -o;
  // the body of the qlatt_map branch is not visible in this excerpt
  if(g_source_momentum_set && full_orbit) {
    finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep);
    if(qlatt_map != NULL) {
  if(source_momentum != NULL) free(source_momentum);

#ifdef MPI

  // process 0 prints the closing time stamp to both stdout and stderr
  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time));
コード例 #5
ファイル: mpi_manager.C プロジェクト: grisu48/Krylov
// Set up the 3D Cartesian MPI topology for this manager.
//
// nproc: number of processes requested along each of the DIM directions.
// mx:    grid extent per direction; it is checked against nproc and tested
//        for being a power of two.
//
// Visible effects: copies nproc into this->nproc, records ntasks and rank,
// creates the Cartesian communicator comm3d, fills coords, the six face
// neighbours (left/right, front/back, bottom/top), the Neighbour and AllRanks
// tables, and creates one sub-communicator per coordinate plane together with
// the rank of this process inside it.
//
// NOTE(review): this excerpt appears to be missing lines (closing braces and
// probably statements such as error exits and counter increments) -- confirm
// against the upstream file before relying on the control flow shown here.
void mpi_manager_3D::setup(NumArray<int> &nproc, NumArray<int> &mx) {
	// Save number of processors in each dimension
	for(int dir=0; dir<DIM; ++dir) {
		this->nproc[dir] = nproc[dir];

	// Determine the rank of the current task
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	// Get number of ranks from MPI
	int ntasks;
	MPI_Comm_size(MPI_COMM_WORLD, &ntasks);
	this->ntasks = ntasks;

	// Set the distribution of processes:
	// the requested process grid must use exactly all available ranks
	if(ntasks != nproc[0]*nproc[1]*nproc[2]){
		std::cerr << " Wrong number of processes " << std::endl;
		std::cout << ntasks << " " << nproc[0]*nproc[1]*nproc[2] << std::endl;

	if(rank==0) {
		std::cout << " Number of tasks: " << ntasks << std::endl;

	// Check if grid can be subdivided as desired:
	// a direction that is split (nproc > 1) needs at least one cell per process
	for(int dir = 0; dir < DIM; ++dir) {
		if(mx[dir] < nproc[dir] && nproc[dir] > 1) {
			if(rank == 0) {
				std::cerr << " Wrong grid topology for dimension ";
				std::cerr << dir << std::endl;
				std::cerr << "  mx[" << dir << "]:" << mx[dir] << std::endl;
				std::cerr << " nproc[" << dir << "]:" << nproc[dir] << std::endl;

	// Check if grid is a power of 2:
	// exponent is log2(mx[dir]); if it differs from its truncated integer value
	// by more than the eps tolerance, mx[dir] is not of the form 2^n
	double eps = 1.e-12;
	for(int dir = 0; dir < DIM; ++dir) {
		double exponent = log(mx[dir])/log(2.);
		int i_exponent = static_cast<int>(exponent+eps);

		if(exponent - i_exponent > 2.*eps) {
			if(rank == 0) {
				std::cerr << " Error: grid must be of the form mx = 2^n ";
				std::cerr << std::endl;
				std::cerr << " Exiting " << std::endl;

	// Grid is not periodic
	int periods[3] = {false, false, false};
	int reorder = false;
	// If all is okay: Create new communicator "comm3d"
	MPI_Cart_create(MPI_COMM_WORLD, DIM, nproc, periods, reorder, &comm3d);

	// Retrieve the cartesian topology (reported by rank 0 only)
	if (rank == 0) {
		int TopoType;
		std::cout << " Cart topology:  ";
		MPI_Topo_test(comm3d, &TopoType);
		switch (TopoType) {
			// NOTE(review): the case label for this first output (and any break
			// statements) is not present in this excerpt.
			std::cout << " MPI_UNDEFINED " << std::endl;
		case MPI_GRAPH     :
			std::cout << "MPI_GRAPH" << std::endl;
		case MPI_CART      :
			std::cout << "MPI_CART" << std::endl;
	//   Determine rank again for cartesian communicator -> overwrite rank
	MPI_Comm_rank(comm3d, &rank);

	// std::cout << " my rank: " << rank << std::endl;

	// Translate rank to coordinates
	MPI_Cart_coords(comm3d, rank, DIM, coords);

	// // Backwards translation
	// int TranslateRank;
	// MPI_Cart_rank(comm3d, coords, &TranslateRank);

	// Find neighbouring ranks
	// Syntax: comm3d, shift direction, displacement, source, destination
	MPI_Cart_shift(comm3d, 0, 1, &left , &right);
	MPI_Cart_shift(comm3d, 1, 1, &front, &back);
	MPI_Cart_shift(comm3d, 2, 1, &bottom, &top);

	// std::cout << " My rank " << rank << " " << left << " " << right << " " << front << " " << back << " " << bottom << " " << top << std::endl;
	if(rank==0) {
		std::cout << " nearby " << right << " " << back << " " << top << std::endl;

	// Determine ranks of neighbour processes:
	// NOTE(review): lbound is declared but only ubound is assigned in the lines
	// visible here; neither is read in this excerpt.
	int shiftcoord[DIM];
	int lbound[DIM],ubound[DIM];
	for(int dim=0;dim<DIM;dim++){
		ubound[dim]= 1;

	// dim0/dim1/dim2 are offsets in {-1,0,1}. Although the topology was created
	// non-periodic, the shifted coordinates are wrapped into [0, nproc) by hand
	// before being translated to a rank.
	for(int dim0=-1; dim0<=1; dim0++){
		shiftcoord[0] = (coords[0]+dim0)%nproc[0];
		if(shiftcoord[0] < 0) shiftcoord[0]+=nproc[0];
		for(int dim1=-1; dim1<=1; dim1++){
			shiftcoord[1] = (coords[1]+dim1)%nproc[1];
			if(shiftcoord[1] < 0) shiftcoord[1]+=nproc[1];
			for(int dim2=-1; dim2<=1; dim2++){
				shiftcoord[2] = (coords[2]+dim2)%nproc[2];
				if(shiftcoord[2] < 0) shiftcoord[2]+=nproc[2];
				MPI_Cart_rank(comm3d, shiftcoord,&Neighbour(dim0,dim1,dim2));
	// if(rank==1) {
	// 	for(int dim0=-1; dim0<=1; dim0++){
	// 		for(int dim1=-1; dim1<=1; dim1++){
	// 			for(int dim2=-1; dim2<=1; dim2++){
	// 				std::cout << " neighbour " << dim0 << " " << dim1 << " ";
	// 				std::cout << dim2 << " " << Neighbour(dim0, dim1, dim2);
	// 				std::cout << std::endl;
	// 			}
	// 		}
	// 	}
	// }

	// Determine absolute position of any rank:
	// AllRanks(i,j,k) receives the comm3d rank of the process at coordinates (i,j,k)
	for(int dim0=0; dim0<nproc[0]; ++dim0) {
		for(int dim1=0; dim1<nproc[1]; ++dim1) {
			for(int dim2=0; dim2<nproc[2]; ++dim2) {
				int coord[3] = {dim0, dim1, dim2};
				MPI_Cart_rank(comm3d, coord, &AllRanks(dim0, dim1, dim2));

	// if(rank==2) {
	// 	std::cout << " Neigh: " << rank << " "<<Neighbour(0,0,0) << " " << AllRanks(2,0,0) << std::endl;
	// }

	// Now make additional mpi groups relating to planes:

	int count(0);
	int num_xy = nproc[0]*nproc[1];
	int num_xz = nproc[0]*nproc[2];
	int num_yz = nproc[1]*nproc[2];
	// NOTE(review): the array lengths below are runtime values taken from nproc,
	// which presumably relies on a variable-length-array compiler extension --
	// confirm with the project's compiler settings.
	NumMatrix<int,1> x_ranks[nproc[0]];
	NumMatrix<int,1> y_ranks[nproc[1]];
	NumMatrix<int,1> z_ranks[nproc[2]];

	// Walk through z-axis -- xy plane
	// NOTE(review): no increment of count is visible in these three loops;
	// presumably it was on a line missing from this excerpt.
	for(int irz=0; irz<nproc[2]; irz++) {
		count = 0;
		z_ranks[irz].resize(Index::set(0), Index::set(num_xy));
		for(int irx=0; irx<nproc[0]; irx++) {
			for(int iry=0; iry<nproc[1]; iry++) {
				z_ranks[irz](count) = AllRanks(irx,iry,irz);

	// Walk through y-axis -- xz plane
	for(int iry=0; iry<nproc[1]; iry++) {
		count = 0;
		y_ranks[iry].resize(Index::set(0), Index::set(num_xz));
		for(int irx=0; irx<nproc[0]; irx++) {
			for(int irz=0; irz<nproc[2]; irz++) {
				y_ranks[iry](count) = AllRanks(irx,iry,irz);

	// Walk through x-axis -- yz plane
	for(int irx=0; irx<nproc[0]; irx++) {
		count = 0;
		x_ranks[irx].resize(Index::set(0), Index::set(num_yz));
		for(int iry=0; iry<nproc[1]; iry++) {
			for(int irz=0; irz<nproc[2]; irz++) {
				x_ranks[irx](count) = AllRanks(irx,iry,irz);

	// Build local communicator:
	// The three groups created here are only consumed by the commented-out
	// MPI_Comm_create / MPI_Group_rank calls further down; the active code uses
	// MPI_Cart_sub instead.
	MPI_Group group_all, group_constz, group_consty, group_constx;
	// Get standard group handle:
	MPI_Comm_group(comm3d, &group_all);

	// Divide tasks into groups based on z-position
	MPI_Group_incl(group_all, num_xy, z_ranks[coords[2]], &group_constz);

	// Divide tasks into groups based on y-position
	MPI_Group_incl(group_all, num_xz, y_ranks[coords[1]], &group_consty);

	// Divide tasks into groups based on x-position
	MPI_Group_incl(group_all, num_yz, x_ranks[coords[0]], &group_constx);

	// // Make corresponding communicators:
	// MPI_Comm_create(comm3d, group_constz, &comm_plane_xy); // const z
	// MPI_Comm_create(comm3d, group_consty, &comm_plane_xz); // const y
	// MPI_Comm_create(comm3d, group_constx, &comm_plane_yz); // const x
	// // Get corresponding rank
	// MPI_Group_rank (group_constz, &rank_plane_xy);
	// MPI_Group_rank (group_consty, &rank_plane_xz);
	// MPI_Group_rank (group_constx, &rank_plane_yz);

	// Plane communicators: remain_dims[d] = 1 keeps direction d in the
	// sub-communicator, 0 drops it.
	int remain_dims[3];
	// x-y plane:
	remain_dims[0] = 1;
	remain_dims[1] = 1;
	remain_dims[2] = 0;
	MPI_Cart_sub(comm3d, remain_dims, &comm_plane_xy);
	MPI_Comm_rank(comm_plane_xy, &rank_plane_xy);

	// x-z plane
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	remain_dims[2] = 1;
	MPI_Cart_sub(comm3d, remain_dims, &comm_plane_xz);
	MPI_Comm_rank(comm_plane_xz, &rank_plane_xz);

	// y-z plane
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	remain_dims[2] = 1;
	MPI_Cart_sub(comm3d, remain_dims, &comm_plane_yz);
	MPI_Comm_rank(comm_plane_yz, &rank_plane_yz);

コード例 #6
ファイル: spinor_fft.c プロジェクト: VincentDrach/tmLQCD
 * accumulates pieces of the spinor field on nodes with index 0 in the dimensions given in which
 * the collected data is returned
/*
 * Collect local spinor fields on the processes whose Cartesian coordinates
 * are 0 in the two dimensions listed in which[] (here dimensions 0 and 1).
 *
 * Those "root" processes post one non-blocking receive per contributing
 * process and wait for all of them; every other process takes the else
 * branch.
 *
 * NOTE(review): this excerpt is missing many lines (allocations, rank lookups,
 * the send side, the FFT itself, closing braces and the #else/#endif of the
 * guard below). The use of localSpinorField, collectionRank, field_collection
 * and membuff is not visible here -- consult the upstream file.
 */
void spinor_fft_reduce_2d(spinor *localSpinorField,int *collectionRank,spinor*** field_collection,spinor **membuff){
  /* this implementation is intended for four dimensional parallelisation */
#if (defined  PARALLELXYZT  && defined MPI && defined HAVE_FFTW)

  int sendRecvCoord[4];
  int i;
  /* process grid extent per direction, ordered t,x,y,z */
  int dims[]={g_nproc_t,g_nproc_x,g_nproc_y,g_nproc_z};

  /* logfile variables */
  char *logFilePrefix="Process";
  char logFileName[512];
  FILE *logFile;
  /* message tag used for the spinor transfers */
  const int MSG_LOCALDATA = 457;
  MPI_Status ierr;
  MPI_Datatype mpi_local_spinor;
  /* the two process-grid dimensions over which data is collected */
  const int which[]={0,1};


/*   int result; */

  /* datatype describing VOLUME consecutive field_point elements
   * NOTE(review): no MPI_Type_commit is visible in this excerpt */
  MPI_Type_contiguous(VOLUME, field_point, &mpi_local_spinor);


  if( g_proc_coords[which[0]] == 0 && g_proc_coords[which[1]] == 0 ){

      /* i am one of the nodes where data is accumulated */
      spinor **accu_field;
      spinor **fft_field;
      spinor *memory_buffer_accu_field;
      spinor *memory_buffer_fft_field;
      int recvRank;
      MPI_Request *requests;
      MPI_Status *status;
      int request_count=0;
      int num_requests;
      fftw_plan local_2d_fft_forward;


      /* calculate the number of reduced 2d volume accumulated in this node */
      /* number of spinor fields in local units */

      /* number of receive messages */

      /* reserve space for receive messages */

      fprintf(logFile,"reduction volume = %d\n",REDUCTIONVOLUME);

      /* allocate space for spinor field collection */

      /* receive from certain nodes pieces of the spinor field */
      /* loop over all process coordinates in the two collected dimensions,
       * skipping (0,0), i.e. this process itself */
      for(sendRecvCoord[which[0]] = 0 ; sendRecvCoord[which[0]]< dims[which[0]] ; sendRecvCoord[which[0]]++){
	for(sendRecvCoord[which[1]] = 0 ; sendRecvCoord[which[1]]< dims[which[1]] ; sendRecvCoord[which[1]]++){
	  if( sendRecvCoord[which[0]] != 0 || sendRecvCoord[which[1]]  != 0){


	    /* NOTE(review): the assignment of recvRank and the request/status
	     * arguments of this call are not present in this excerpt */
	    MPI_Irecv(accu_field[sendRecvCoord[which[0]]*dims[which[1]]+sendRecvCoord[which[1]] ] /* buffer */,
		     1, /* how many */
		     mpi_local_spinor, /* mpi data type */
		     recvRank, /* from whom i get it */
		     MSG_LOCALDATA, /* msg id */
		     g_cart_grid, /* communicator , status */


      /* wait until all request finished */
      MPI_Waitall(num_requests, requests, status);


      /* transpose in xp-t space */

      /* create fftw plan */

/*       assign(accu_field[0],fft_field[0],VOLUME*REDUCTIONVOLUME); */

      free_spinor_field_array(&memory_buffer_fft_field); memory_buffer_fft_field=NULL;

/*       free_spinor_field_array(&memory_buffer_accu_field); memory_buffer_accu_field=NULL; */
      free(requests); requests = NULL;
      free(status); status=NULL;

    } else {
      /* NOTE(review): presumably the sending side; its statements are not
       * present in this excerpt */
      int sendRank;
      MPI_Request request;
      MPI_Status status;


      /* coordinates of the "root" */







      /* NOTE(review): presumably the fallback when the #if guard above is not
       * satisfied; the #else/#endif lines are not present in this excerpt */
      fprintf(stderr,"Error: Please choose FOUR dimensional parallelization!!!\n");

コード例 #7
ファイル: cartorder.c プロジェクト: MartinLidh/tddc78
/*
 * Test driver for MPI Cartesian topology routines.
 *
 * Creates a Cartesian communicator over MPI_COMM_WORLD, duplicates it, and
 * checks that MPI_Topo_test, MPI_Cartdim_get, MPI_Cart_get, MPI_Cart_rank and
 * MPI_Cart_coords agree with each other. It then drops the first dimension
 * with MPI_Cart_sub and repeats the checks on the sub-communicator.
 * Mismatches are counted in errors and reported at the end.
 *
 * NOTE(review): this excerpt is missing lines (the opening brace of main,
 * closing braces, #endif lines, and the statements guarded by some of the
 * if tests) -- consult the upstream file for the exact control flow.
 */
int main( int argc, char **argv )
    int              rank, size, i;
    int              errors=0;
    int              dims[NUM_DIMS];
    int              periods[NUM_DIMS];
    int              coords[NUM_DIMS];
    int              new_coords[NUM_DIMS];
    int              reorder = 0;
    MPI_Comm         comm_temp, comm_cart, new_comm;
    int              topo_status;
    int              ndims;
    int              new_rank;
    int              remain_dims[NUM_DIMS];
    int              newnewrank;

    MPI_Init( &argc, &argv );

    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    /* Clear dims array and get dims for topology */
    /* zero entries let MPI_Dims_create choose the extent of every dimension */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Dims_create ( size, NUM_DIMS, dims );

    /* Make a new communicator with a topology */
    /* NOTE(review): the dimension count is the literal 2 here while the rest of
       the test uses NUM_DIMS; they agree only when NUM_DIMS == 2 */
    MPI_Cart_create ( MPI_COMM_WORLD, 2, dims, periods, reorder, &comm_temp );
    MPI_Comm_dup ( comm_temp, &comm_cart );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( comm_cart, &topo_status );
    if (topo_status != MPI_CART) errors++;

    /* How many dims do we have? */
    MPI_Cartdim_get( comm_cart, &ndims );
    if ( ndims != NUM_DIMS ) errors++;

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( comm_cart, NUM_DIMS, dims, periods, coords );

    /* Check that the coordinates are correct */
    /* row-major ordering: rank == coords[1] + coords[0] * dims[1] */
#if NUM_DIMS == 2
    if (rank != coords[1] + coords[0] * dims[1]) {
	fprintf( stderr, 
"Did not get expected coordinate (row major required by MPI standard 6.2)\n" );
    /* Does the mapping from coords to rank work? */
    MPI_Cart_rank ( comm_cart, coords, &new_rank );
    if ( new_rank != rank ) errors++;

    /* Does the mapping from rank to coords work */
    /* NOTE(review): the statement guarded by the if below is not present in
       this excerpt */
    MPI_Cart_coords ( comm_cart, rank, NUM_DIMS, new_coords );
    for (i=0;i<NUM_DIMS;i++) 
      if ( coords[i] != new_coords[i] ) 

    /* Let's shift in each dimension and see how it works!   */
    /* Because it's late and I'm tired, I'm not making this  */
    /* automatically test itself.                            */
    for (i=0;i<NUM_DIMS;i++) {
      int source, dest;
      MPI_Cart_shift(comm_cart, i, 1, &source, &dest);
#ifdef VERBOSE      
      printf ("[%d] Shifting %d in the %d dimension\n",rank,1,i);
      printf ("[%d]    source = %d  dest = %d\n",rank,source,dest); 

    /* Subdivide */
    /* drop dimension 0 and keep all others in the sub-communicator */
    remain_dims[0] = 0; 
    for (i=1; i<NUM_DIMS; i++) remain_dims[i] = 1;
    MPI_Cart_sub ( comm_cart, remain_dims, &new_comm );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( new_comm, &topo_status );
    if (topo_status != MPI_CART) errors++;

    /* How many dims do we have? */
    MPI_Cartdim_get( new_comm, &ndims );
    if ( ndims != NUM_DIMS-1 ) errors++;

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS-1;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( new_comm, ndims, dims, periods, coords );
    /* Does the mapping from coords to rank work? */
    MPI_Comm_rank ( new_comm, &newnewrank );
    MPI_Cart_rank ( new_comm, coords, &new_rank );
    if ( new_rank != newnewrank ) errors++;

    /* Does the mapping from rank to coords work */
    /* NOTE(review): the statement guarded by the if below is not present in
       this excerpt */
    MPI_Cart_coords ( new_comm, new_rank, NUM_DIMS -1, new_coords );
    for (i=0;i<NUM_DIMS-1;i++) 
      if ( coords[i] != new_coords[i] ) 

    /* We're at the end */
    MPI_Comm_free( &new_comm );
    MPI_Comm_free( &comm_temp );
    MPI_Comm_free( &comm_cart );
    Test_Waitforall( );
    /* the process returns 0 regardless of the error count; errors are only printed */
    if (errors) printf( "[%d] done with %d ERRORS!\n", rank,errors );
    return 0;
コード例 #8
ファイル: cartzero.c プロジェクト: jeffhammond/mpich
    Check that the MPI implementation properly handles zero-dimensional
    Cartesian communicators - the original standard implies that these
    should be consistent with higher dimensional topologies and thus
    these should work with any MPI implementation.  MPI 2.1 made this
    requirement explicit.
/*
 * Test driver for zero-dimensional Cartesian communicators.
 *
 * Creates a Cartesian communicator with ndims == 0 over MPI_COMM_WORLD and,
 * on the process that receives a non-null communicator, checks that the
 * topology query routines accept it: size 1, MPI_Cartdim_get reports 0,
 * MPI_Cart_rank reports rank 0, and MPI_Cart_sub yields another
 * zero-dimensional communicator.
 *
 * Returns MTestReturnValue(errs).
 *
 * NOTE(review): this excerpt is missing lines (the opening brace of main,
 * closing braces, and presumably the errs++ statements and cleanup calls) --
 * consult the upstream file for the exact control flow.
 */
int main(int argc, char *argv[])
    int errs = 0;
    int size, rank, ndims;
    MPI_Comm comm, newcomm;

    MTest_Init(&argc, &argv);

    /* Create a new cartesian communicator in a subset of the processes */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (size < 2) {
        fprintf(stderr, "This test needs at least 2 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);

    /* ndims == 0 with NULL dims/periods requests a zero-dimensional topology */
    MPI_Cart_create(MPI_COMM_WORLD, 0, NULL, NULL, 0, &comm);

    if (comm != MPI_COMM_NULL) {
        /* a zero-dimensional Cartesian communicator is expected to hold one process */
        int csize;
        MPI_Comm_size(comm, &csize);
        if (csize != 1) {
            fprintf(stderr, "Sizes is wrong in cart communicator.  Is %d, should be 1\n", csize);

        /* This function is not meaningful, but should not fail */
        MPI_Dims_create(1, 0, NULL);

        /* preset to -1 so that a call which leaves ndims untouched is detected */
        ndims = -1;
        MPI_Cartdim_get(comm, &ndims);
        if (ndims != 0) {
            fprintf(stderr, "MPI_Cartdim_get: ndims is %d, should be 0\n", ndims);

        /* this function should not fail */
        MPI_Cart_get(comm, 0, NULL, NULL, NULL);

        /* note: this overwrites the MPI_COMM_WORLD rank stored in rank */
        MPI_Cart_rank(comm, NULL, &rank);
        if (rank != 0) {
            fprintf(stderr, "MPI_Cart_rank: rank is %d, should be 0\n", rank);

        /* this function should not fail */
        MPI_Cart_coords(comm, 0, 0, NULL);

        MPI_Cart_sub(comm, NULL, &newcomm);
        ndims = -1;
        MPI_Cartdim_get(newcomm, &ndims);
        if (ndims != 0) {
            fprintf(stderr, "MPI_Cart_sub did not return zero-dimensional communicator\n");


    } else if (rank == 0) {
        /* rank 0 of MPI_COMM_WORLD is expected to be a member of the new communicator */
        fprintf(stderr, "Communicator returned is null!");


    return MTestReturnValue(errs);
コード例 #9
int main(int argc, char **argv) {

  const int n_c = 3;  // number of colors

  int c, i, j, mu, nu, ir, is, ia, imunu;
  int filename_set = 0;
  int dims[4]      = {0,0,0,0};
  int l_LX_at, l_LXstart_at;
  int source_location, have_source_flag = 0;
  int x0, x1, x2, x3, ix;
  int sx0, sx1, sx2, sx3;
  int isimag[4];
  int gperm[5][4], gperm2[4][4];
  int check_position_space_WI=0;
  int num_threads = 1, nthreads=-1, threadid=-1;
  int exitstatus;
  int write_ascii=0;
  int mms = 0, mass_id = -1;
  int outfile_prefix_set = 0;
  int source_proc_coords[4], source_proc_id = -1;
  int ud_single_file = 0;
  double gperm_sign[5][4], gperm2_sign[4][4];
  double *conn  = NULL;
  double *conn2 = NULL;
  double contact_term[8];
  double *work=NULL;
  int verbose = 0;
  int do_gt   = 0, status;
  char filename[100], contype[400], outfile_prefix[400];
  double ratime, retime;
  double plaq;
  double spinor1[24], spinor2[24], U_[18];
  double *gauge_trafo=(double*)NULL;
  double *phi=NULL, *chi=NULL;
  complex w;
  double Usourcebuff[72], *Usource[4];
  FILE *ofs;

#ifdef MPI
  int *status;

#ifdef MPI
  MPI_Init(&argc, &argv);

  while ((c = getopt(argc, argv, "swah?vgf:t:m:o:")) != -1) {
    switch (c) {
    case 'v':
      verbose = 1;
    case 'g':
      do_gt = 1;
    case 'f':
      strcpy(filename, optarg);
    case 'w':
      check_position_space_WI = 1;
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will check Ward identity in position space\n");
    case 't':
      num_threads = atoi(optarg);
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will use %d threads in spacetime loops\n", num_threads);
    case 'a':
      write_ascii = 1;
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will write data in ASCII format too\n");
    case 'm':
      mms = 1;
      mass_id = atoi(optarg);
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will read propagators in MMS format with mass id %d\n", mass_id);
    case 'o':
      strcpy(outfile_prefix, optarg);
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will use prefix %s for output filenames\n", outfile_prefix);
      outfile_prefix_set = 1;
    case 's':
      ud_single_file = 1;
      fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] will read up and down propagator from same file\n");
    case 'h':
    case '?':

  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] using global time stamp %s", ctime(&g_the_time));

   * set number of openmp threads
#ifdef OPENMP

  /* set the default values */
  if(filename_set==0) strcpy(filename, "cvc.input");
  fprintf(stdout, "# Reading input from file %s\n", filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stderr, "\n[avc_exact2_lowmem_xspace] T and L's must be set\n");
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stderr, "\n[avc_exact2_lowmem_xspace] kappa should be > 0.n");

  /* initialize MPI parameters */
  mpi_init(argc, argv);
#ifdef MPI
  if((status = (int*)calloc(g_nproc, sizeof(int))) == (int*)NULL) {
    MPI_Abort(MPI_COMM_WORLD, 1);

  dims[0]=T_global; dims[1]=LX; dims[2]=LY; dims[3]=LZ;
#ifndef MPI
  T            = T_global;
  Tstart       = 0;
  l_LX_at      = LX;
  l_LXstart_at = 0;
  fprintf(stdout, "# [%2d] parameters:\n"\
                  "# [%2d] T            = %3d\n"\
		  "# [%2d] Tstart       = %3d\n"\
		  "# [%2d] l_LX_at      = %3d\n"\
		  "# [%2d] l_LXstart_at = %3d\n",
		  g_cart_id, g_cart_id, T, g_cart_id, Tstart, g_cart_id, l_LX_at,
		  g_cart_id, l_LXstart_at);

#ifdef MPI
  if(T==0) {
    fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id);
    MPI_Abort(MPI_COMM_WORLD, 1);

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 1);


  alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND);
  if(!(strcmp(gaugefilename_prefix,"identity")==0)) {
    /* read the gauge field */
    sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf);
    if(g_cart_id==0) fprintf(stdout, "reading gauge field from file %s\n", filename);
  } else {
    /* initialize unit matrices */
    if(g_cart_id==0) fprintf(stdout, "\n# [avc_exact] initializing unit matrices\n");
    for(ix=0;ix<VOLUME;ix++) {
      _cm_eq_id( g_gauge_field + _GGI(ix, 0) );
      _cm_eq_id( g_gauge_field + _GGI(ix, 1) );
      _cm_eq_id( g_gauge_field + _GGI(ix, 2) );
      _cm_eq_id( g_gauge_field + _GGI(ix, 3) );
#ifdef MPI

  /* measure the plaquette */
  if(g_cart_id==0) fprintf(stdout, "measured plaquette value: %25.16e\n", plaq);
  sprintf(filename, "gauge.%.2d", g_cart_id);
  ofs = fopen(filename, "w");
  for(x0=0;x0<T;x0++) {
  for(x1=0;x1<LX;x1++) {
  for(x2=0;x2<LY;x2++) {
  for(x3=0;x3<LZ;x3++) {
    ix = g_ipt[x0][x1][x2][x3];
    for(mu=0;mu<4;mu++) {
      for(i=0;i<9;i++) {
         fprintf(ofs, "%8d%3d%3d%3d%3d%3d%3d%25.16e%25.16e\n", ix, x0+Tstart, x1+LXstart, x2+LYstart, x3, mu, i, g_gauge_field[_GGI(ix,mu)+2*i], g_gauge_field[_GGI(ix,mu)+2*i+1]);

  if(g_cart_id==0) fprintf(stdout, "\nWarning: forced exit\n");
#ifdef MPI
  MPI_Abort(MPI_COMM_WORLD, 255);

  /* allocate memory for the spinor fields */
  no_fields = 2;
  if(mms) no_fields++;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND);
  if(mms) {
    work = g_spinor_field[no_fields-1];

  /* allocate memory for the contractions */
  conn = (double*)calloc(2 * 16 * VOLUME, sizeof(double));
  if( conn==(double*)NULL ) {
    fprintf(stderr, "could not allocate memory for contr. fields\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 3);
#ifdef OPENMP
#pragma omp parallel for
  for(ix=0; ix<32*VOLUME; ix++) conn[ix] = 0.;

  conn2 = (double*)calloc(2 * 16 * VOLUME, sizeof(double));
  if( conn2 == NULL ) {
    fprintf(stderr, "could not allocate memory for contr. fields\n");
#ifdef MPI
    MPI_Abort(MPI_COMM_WORLD, 3);
#ifdef OPENMP
#pragma omp parallel for
  for(ix=0; ix<32*VOLUME; ix++) conn2[ix] = 0.;

   * determine source coordinates, find out, if source_location is in this process
#if (defined PARALLELTX) || (defined PARALLELTXY)
  sx0 = g_source_location / (LX_global*LY_global*LZ);
  sx1 = (g_source_location%(LX_global*LY_global*LZ)) / (LY_global*LZ);
  sx2 = (g_source_location%(LY_global*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
  source_proc_coords[0] = sx0 / T;
  source_proc_coords[1] = sx1 / LX;
  source_proc_coords[2] = sx2 / LY;
  source_proc_coords[3] = 0;
  MPI_Cart_rank(g_cart_grid, source_proc_coords, &source_proc_id);
  have_source_flag = (int)(g_cart_id == source_proc_id);
  if(have_source_flag==1) {
    fprintf(stdout, "\n# process %2d has source location\n", source_proc_id);
    fprintf(stdout, "\n# global source coordinates: (%3d,%3d,%3d,%3d)\n",  sx0, sx1, sx2, sx3);
    fprintf(stdout, "\n# source proc coordinates: (%3d,%3d,%3d,%3d)\n",  source_proc_coords[0],
        source_proc_coords[1], source_proc_coords[2], source_proc_coords[3]);
  sx0 = sx0 % T;
  sx1 = sx1 % LX;
  sx2 = sx2 % LY;
  sx3 = sx3 % LZ;
# else
  have_source_flag = (int)(g_source_location/(LX*LY*LZ)>=Tstart && g_source_location/(LX*LY*LZ)<(Tstart+T));
  if(have_source_flag==1) fprintf(stdout, "process %2d has source location\n", g_cart_id);
  sx0 = g_source_location/(LX*LY*LZ)-Tstart;
  sx1 = (g_source_location%(LX*LY*LZ)) / (LY*LZ);
  sx2 = (g_source_location%(LY*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
  if(have_source_flag==1) { 
    fprintf(stdout, "local source coordinates: (%3d,%3d,%3d,%3d)\n", sx0, sx1, sx2, sx3);
    source_location = g_ipt[sx0][sx1][sx2][sx3];
#ifdef MPI
#  if (defined PARALLELTX) || (defined PARALLELTXY)
  have_source_flag = source_proc_id;
  MPI_Bcast(Usourcebuff, 72, MPI_DOUBLE, have_source_flag, g_cart_grid);
#  else
  MPI_Gather(&have_source_flag, 1, MPI_INT, status, 1, MPI_INT, 0, g_cart_grid);
  if(g_cart_id==0) {
    for(mu=0; mu<g_nproc; mu++) fprintf(stdout, "status[%1d]=%d\n", mu,status[mu]);
  if(g_cart_id==0) {
    for(have_source_flag=0; status[have_source_flag]!=1; have_source_flag++);
    fprintf(stdout, "have_source_flag= %d\n", have_source_flag);
  MPI_Bcast(&have_source_flag, 1, MPI_INT, 0, g_cart_grid);
#  endif
  fprintf(stdout, "[%2d] have_source_flag = %d\n", g_cart_id, have_source_flag);
  have_source_flag = 0;

  if(g_cart_id==0) fprintf(stdout, "\nWarning: forced exit\n");
#ifdef MPI
  MPI_Abort(MPI_COMM_WORLD, 255);

#ifdef MPI
      ratime = MPI_Wtime();
      ratime = (double)clock() / CLOCKS_PER_SEC;
   *  initialize the Gamma matrices
  // gamma_5:
  gperm[4][0] = gamma_permutation[5][ 0] / 6;
  gperm[4][1] = gamma_permutation[5][ 6] / 6;
  gperm[4][2] = gamma_permutation[5][12] / 6;
  gperm[4][3] = gamma_permutation[5][18] / 6;
  gperm_sign[4][0] = gamma_sign[5][ 0];
  gperm_sign[4][1] = gamma_sign[5][ 6];
  gperm_sign[4][2] = gamma_sign[5][12];
  gperm_sign[4][3] = gamma_sign[5][18];
  // gamma_nu gamma_5
  for(nu=0;nu<4;nu++) {
    // permutation
    gperm[nu][0] = gamma_permutation[6+nu][ 0] / 6;
    gperm[nu][1] = gamma_permutation[6+nu][ 6] / 6;
    gperm[nu][2] = gamma_permutation[6+nu][12] / 6;
    gperm[nu][3] = gamma_permutation[6+nu][18] / 6;
    // is imaginary ?
    isimag[nu] = gamma_permutation[6+nu][0] % 2;
    // (overall) sign
    gperm_sign[nu][0] = gamma_sign[6+nu][ 0];
    gperm_sign[nu][1] = gamma_sign[6+nu][ 6];
    gperm_sign[nu][2] = gamma_sign[6+nu][12];
    gperm_sign[nu][3] = gamma_sign[6+nu][18];
    // write to stdout
    if(g_cart_id == 0) {
      fprintf(stdout, "# gamma_%d5 = (%f %d, %f %d, %f %d, %f %d)\n", nu,
          gperm_sign[nu][0], gperm[nu][0], gperm_sign[nu][1], gperm[nu][1], 
          gperm_sign[nu][2], gperm[nu][2], gperm_sign[nu][3], gperm[nu][3]);
  // gamma_nu
  for(nu=0;nu<4;nu++) {
    // permutation
    gperm2[nu][0] = gamma_permutation[nu][ 0] / 6;
    gperm2[nu][1] = gamma_permutation[nu][ 6] / 6;
    gperm2[nu][2] = gamma_permutation[nu][12] / 6;
    gperm2[nu][3] = gamma_permutation[nu][18] / 6;
    // (overall) sign
    gperm2_sign[nu][0] = gamma_sign[nu][ 0];
    gperm2_sign[nu][1] = gamma_sign[nu][ 6];
    gperm2_sign[nu][2] = gamma_sign[nu][12];
    gperm2_sign[nu][3] = gamma_sign[nu][18];
    // write to stdout
    if(g_cart_id == 0) {
    	fprintf(stdout, "# gamma_%d = (%f %d, %f %d, %f %d, %f %d)\n", nu,
        	gperm2_sign[nu][0], gperm2[nu][0], gperm2_sign[nu][1], gperm2[nu][1], 
        	gperm2_sign[nu][2], gperm2[nu][2], gperm2_sign[nu][3], gperm2[nu][3]);

   ** first contribution

   * loop on the Lorentz index nu at source 
for(ia=0; ia<n_c; ia++) {
  for(nu=0; nu<4; nu++) 
  //for(nu=0; nu<4; nu++) 
    // fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] 1st part, processing nu = %d ...\n", nu);

    for(ir=0; ir<4; ir++) {

      // read 1 up-type propagator color components for spinor index ir
	if(!mms) {
      	  get_filename(filename, 0, 3*ir+ia, 1);
          exitstatus = read_lime_spinor(g_spinor_field[0], filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
        } else {
          sprintf(filename, "%s.%.4d.00.%.2d.cgmms.%.2d.inverted", filename_prefix, Nconf, 3*ir+ia, mass_id);
          exitstatus = read_lime_spinor(work, filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
          Qf5(g_spinor_field[0], work, -g_mu);

      // read 1 dn-type propagator color components for spinor index gamma_perm ( ir )
        if(!mms) {
          if(ud_single_file) {
            get_filename(filename, 0, 3*gperm[nu][ir]+ia, 1);
            exitstatus = read_lime_spinor(g_spinor_field[1], filename, 1);
          } else {
            get_filename(filename, 0, 3*gperm[nu][ir]+ia, -1);
            exitstatus = read_lime_spinor(g_spinor_field[1], filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
        } else {
          sprintf(filename, "%s.%.4d.%.2d.%.2d.cgmms.%.2d.inverted", filename_prefix, Nconf, 4, 3*gperm[nu][ir]+ia, mass_id);
          exitstatus = read_lime_spinor(work, filename, 0);
          if(exitstatus != 0) {
            fprintf(stderr, "\n# [avc_exact2_lowmem_xspace] Error from read_lime_spinor\n");
          Qf5(g_spinor_field[1], work, g_mu);

        phi = g_spinor_field[0];
        chi = g_spinor_field[1];
        //fprintf(stdout, "\n# [nu5] spin index pair (%d, %d); col index %d\n", ir, gperm[nu][ir], ia);
        // 1) gamma_nu gamma_5 x U
        for(mu=0; mu<4; mu++) 
        //for(mu=0; mu<1; mu++) 

          imunu = 4*mu+nu;
#ifdef OPENMP
#pragma omp parallel for private(ix, spinor1, spinor2, U_, w)  shared(imunu, ia, nu, mu)
          for(ix=0; ix<VOLUME; ix++) {
            threadid = omp_get_thread_num();
            nthreads = omp_get_num_threads();
            fprintf(stdout, "[thread%d] number of threads = %d\n", threadid, nthreads);

            _cm_eq_cm_ti_co(U_, &g_gauge_field[_GGI(ix,mu)], &co_phase_up[mu]);

            _fv_eq_cm_ti_fv(spinor1, U_, phi+_GSI(g_iup[ix][mu]));
            _fv_eq_gamma_ti_fv(spinor2, mu, spinor1);
	    _fv_mi_eq_fv(spinor2, spinor1);
	    _fv_eq_gamma_ti_fv(spinor1, 5, spinor2);
	    _co_eq_fv_dag_ti_fv(&w, chi+_GSI(ix), spinor1);
            if(!isimag[nu]) {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.re;
              conn[_GWI(imunu,ix,VOLUME)+1] += gperm_sign[nu][ir] * w.im;
            } else {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.im;
              conn[_GWI(imunu,ix,VOLUME)+1] -= gperm_sign[nu][ir] * w.re;

          }  // of ix

#ifdef OPENMP
#pragma omp parallel for private(ix, spinor1, spinor2, U_, w)  shared(imunu, ia, nu, mu)
          for(ix=0; ix<VOLUME; ix++) {
            _cm_eq_cm_ti_co(U_, &g_gauge_field[_GGI(ix,mu)], &co_phase_up[mu]);

            _fv_eq_cm_dag_ti_fv(spinor1, U_, phi+_GSI(ix));
            _fv_eq_gamma_ti_fv(spinor2, mu, spinor1);
	    _fv_pl_eq_fv(spinor2, spinor1);
	    _fv_eq_gamma_ti_fv(spinor1, 5, spinor2);
	    _co_eq_fv_dag_ti_fv(&w, chi+_GSI(g_iup[ix][mu]), spinor1);
            if(!isimag[nu]) {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.re;
              conn[_GWI(imunu,ix,VOLUME)+1] += gperm_sign[nu][ir] * w.im;
            } else {
              conn[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.im;
              conn[_GWI(imunu,ix,VOLUME)+1] -= gperm_sign[nu][ir] * w.re;

          }  // of ix

          // contribution to local-local correlator
#ifdef OPENMP
#pragma omp parallel for private(ix, spinor1, spinor2, U_, w)  shared(imunu, ia, nu, mu)
          for(ix=0; ix<VOLUME; ix++) {
            _fv_eq_gamma_ti_fv(spinor2, mu, phi+_GSI(ix) );
	    _fv_eq_gamma_ti_fv(spinor1, 5, spinor2);
	    _co_eq_fv_dag_ti_fv(&w, chi+_GSI(ix), spinor1);
            if(!isimag[nu]) {
              conn2[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.re;
              conn2[_GWI(imunu,ix,VOLUME)+1] += gperm_sign[nu][ir] * w.im;
            } else {
              conn2[_GWI(imunu,ix,VOLUME)  ] += gperm_sign[nu][ir] * w.im;
              conn2[_GWI(imunu,ix,VOLUME)+1] -= gperm_sign[nu][ir] * w.re;

          }  // of ix

	} // of mu
    }  // of ir

  }  // of nu
}  // of ia loop on colors

  // normalisation of contractions
#ifdef OPENMP
#pragma omp parallel for
  for(ix=0; ix<32*VOLUME; ix++) conn[ix] *= -0.5;

#ifdef OPENMP
#pragma omp parallel for
  for(ix=0; ix<32*VOLUME; ix++) conn2[ix] *= -1.;

#ifdef MPI
      retime = MPI_Wtime();
      retime = (double)clock() / CLOCKS_PER_SEC;
  if(g_cart_id==0) fprintf(stdout, "contractions in %e seconds\n", retime-ratime);

  // save results
#ifdef MPI
  ratime = MPI_Wtime();
  ratime = (double)clock() / CLOCKS_PER_SEC;
  if(outfile_prefix_set) {
    sprintf(filename, "%s/cvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  } else {
    sprintf(filename, "cvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", Nconf, sx0, sx1, sx2, sx3);
  sprintf(contype, "cvc - lvc in position space, all 16 components");
  status = write_lime_contraction(conn, filename, 64, 16, contype, Nconf, 0);
  if(status != 0) {
    fprintf(stderr, "[] Error from write_lime_contractions, status was %d\n", status);

  if(outfile_prefix_set) {
    sprintf(filename, "%s/lvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  } else {
    sprintf(filename, "lvc_lvc_x.%.4d.t%.2dx%.2dy%.2dz%.2d", Nconf, sx0, sx1, sx2, sx3);
  sprintf(contype, "lvc - lvc in position space, all 16 components");
  status = write_lime_contraction(conn2, filename, 64, 16, contype, Nconf, 0);
  if(status != 0) {
    fprintf(stderr, "[] Error from write_lime_contractions, status was %d\n", status);

#ifndef MPI
  if(write_ascii) {
    if(outfile_prefix_set) {
      sprintf(filename, "%s/cvc_lvc_x.%.4d.ascii", outfile_prefix, Nconf);
    } else {
      sprintf(filename, "cvc_lvc_x.%.4d.ascii", Nconf);
    write_contraction(conn, NULL, filename, 16, 2, 0);

    if(outfile_prefix_set) {
      sprintf(filename, "%s/lvc_lvc_x.%.4d.ascii", outfile_prefix, Nconf);
    } else {
      sprintf(filename, "lvc_lvc_x.%.4d.ascii", Nconf);
    write_contraction(conn2, NULL, filename, 16, 2, 0);

#ifdef MPI
  retime = MPI_Wtime();
  retime = (double)clock() / CLOCKS_PER_SEC;
  if(g_cart_id==0) fprintf(stdout, "saved position space results in %e seconds\n", retime-ratime);

#ifndef MPI
  // check the Ward identity in position space
  if(check_position_space_WI) {
    sprintf(filename, "WI_X.%.4d", Nconf);
    ofs = fopen(filename,"w");
    fprintf(stdout, "\n# [avc_exact2_lowmem_xspace] checking Ward identity in position space ...\n");
    for(x0=0; x0<T;  x0++) {
    for(x1=0; x1<LX; x1++) {
    for(x2=0; x2<LY; x2++) {
    for(x3=0; x3<LZ; x3++) {
      fprintf(ofs, "# t=%2d x=%2d y=%2d z=%2d\n", x0, x1, x2, x3);
      for(nu=0; nu<4; nu++) {
        w.re = conn[_GWI(4*0+nu,ix,VOLUME)] + conn[_GWI(4*1+nu,ix,VOLUME)]
             + conn[_GWI(4*2+nu,ix,VOLUME)] + conn[_GWI(4*3+nu,ix,VOLUME)]
	     - conn[_GWI(4*0+nu,g_idn[ix][0],VOLUME)] - conn[_GWI(4*1+nu,g_idn[ix][1],VOLUME)]
	     - conn[_GWI(4*2+nu,g_idn[ix][2],VOLUME)] - conn[_GWI(4*3+nu,g_idn[ix][3],VOLUME)];

        w.im = conn[_GWI(4*0+nu,ix,VOLUME)+1] + conn[_GWI(4*1+nu,ix,VOLUME)+1]
            + conn[_GWI(4*2+nu,ix,VOLUME)+1] + conn[_GWI(4*3+nu,ix,VOLUME)+1]
	    - conn[_GWI(4*0+nu,g_idn[ix][0],VOLUME)+1] - conn[_GWI(4*1+nu,g_idn[ix][1],VOLUME)+1]
	    - conn[_GWI(4*2+nu,g_idn[ix][2],VOLUME)+1] - conn[_GWI(4*3+nu,g_idn[ix][3],VOLUME)+1];
        fprintf(ofs, "\t%3d%25.16e%25.16e\n", nu, w.re, w.im);

  /* free the allocated memory, finalize */
  for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
  if(conn  != NULL) free(conn);
  if(conn2 != NULL) free(conn2);
#ifdef MPI

  if(g_cart_id==0) {
    g_the_time = time(NULL);
    fprintf(stdout, "\n# [cvc_lvc_exact2_lowmem_xspace] %s# [cvc_lvc_exact2_lowmem_xspace] end of run\n", ctime(&g_the_time));
    fprintf(stderr, "\n# [cvc_lvc_exact2_lowmem_xspace] %s# [cvc_lvc_exact2_lowmem_xspace] end of run\n", ctime(&g_the_time));

