コード例 #1
0
ファイル: check-cg.c プロジェクト: usqcd-software/clover
/*
 * Driver for the CLOVER conjugate-gradient check.
 *
 * Command line (after QDP has consumed its own options):
 *     Lx ... seed kappa c_sw iter eps log?
 * i.e. NDIM lattice extents followed by six parameters.
 *
 * The program fills the gauge links U[] and the fermions F[] with gaussian
 * random values, builds the clover term, imports everything into CLOVER,
 * applies the D operator to F[0], runs the CG solver on the result and
 * prints the solver status, performance counters, the plaquette, the field
 * norms and two dot products.
 *
 * Returns 0 when the full sequence ran, 1 on a usage error or when any
 * CLOVER setup call reports failure.
 *
 * NOTE(review): the failure paths jump straight to QDP_finalize() without
 * releasing the QDP/CLOVER objects created so far.
 */
int
main(int argc, char *argv[])
{
    const char *msg;
    int status = 1;
    int mu, i;
    struct QOP_CLOVER_State *clover_state;
    QDP_Int *I_seed;
    int i_seed;
    QDP_RandomState *state;
    QLA_Real plaq;
    QLA_Real n[NELEMS(F)];
    struct QOP_CLOVER_Gauge *c_g;
    struct QOP_CLOVER_Fermion *c_f[NELEMS(F)];
    double kappa;
    double c_sw;
    double in_eps;
    int in_iter;
    int log_flag;
    double out_eps;
    int out_iter;
    int cg_status;
    double run_time;
    long long flops, sent, received;
    
    /* start QDP */
    QDP_initialize(&argc, &argv);

    /* program name + NDIM extents + seed, kappa, c_sw, iter, eps, log? */
    if (argc != 1 + NDIM + 6) {
        printf0("ERROR: usage: %s Lx ... seed kappa c_sw iter eps log?\n",
                argv[0]);
        goto end;
    }

    for (mu = 0; mu < NDIM; mu++) {
        lattice[mu] = atoi(argv[1 + mu]);
    }
    i_seed = atoi(argv[1 + NDIM]);
    kappa = atof(argv[2 + NDIM]);
    c_sw = atof(argv[3 + NDIM]);
    in_iter = atoi(argv[4 + NDIM]);
    in_eps = atof(argv[5 + NDIM]);
    
    /* any non-zero value of the last argument turns on full logging */
    log_flag = atoi(argv[6 + NDIM]) == 0? 0: QOP_CLOVER_LOG_EVERYTHING;

    /* set lattice size and create layout */
    QDP_set_latsize(NDIM, lattice);
    QDP_create_layout();

    /* record this node's place in the logical machine */
    primary = QMP_is_primary_node();
    self = QMP_get_node_number();
    get_vector(network, 1, QMP_get_logical_number_of_dimensions(),
               QMP_get_logical_dimensions());
    get_vector(node, 0, QMP_get_logical_number_of_dimensions(),
               QMP_get_logical_coordinates());
        
    printf0("network: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", network[i]);
    printf0("\n");

    printf0("node: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", node[i]);
    printf0("\n");

    printf0("kappa: %20.15f\n", kappa);
    printf0("c_sw:  %20.15f\n", c_sw);

    printf0("in_iter: %d\n", in_iter);
    printf0("in_eps: %15.2e\n", in_eps);

    /* allocate the gauge links, the clover term, the fermions and the
       random number state */
    create_Mvector(U, NELEMS(U));
    create_Mvector(C, NELEMS(C));
    create_Dvector(F, NELEMS(F));
    I_seed = QDP_create_I();
    QDP_I_eq_funci(I_seed, icoord, QDP_all);
    state = QDP_create_S();
    QDP_S_eq_seed_i_I(state, i_seed, I_seed, QDP_all);
    
    /* gaussian random gauge links and fermions */
    for (mu = 0; mu < NELEMS(U); mu++) {
        QDP_M_eq_gaussian_S(U[mu], state, QDP_all);
    }
    
    for (i = 0; i < NELEMS(F); i++) {
        QDP_D_eq_gaussian_S(F[i], state, QDP_all);
    }

    /* build the clovers */
    clover(C, U);

    /* initialize CLOVER */
    if (QOP_CLOVER_init(&clover_state, lattice, network, node, primary,
                        sublattice, NULL)) {
        printf0("CLOVER_init() failed\n");
        goto end;
    }

    /* c_f[0] is imported from F[0]; c_f[1..3] are only allocated */
    if (QOP_CLOVER_import_fermion(&c_f[0], clover_state, f_reader, F[0])) {
        printf0("CLOVER_import_fermion(0) failed\n");
        goto end;
    }

    if (QOP_CLOVER_allocate_fermion(&c_f[1], clover_state)) {
        printf0("CLOVER_allocate_fermion(1) failed\n");
        goto end;
    }

    if (QOP_CLOVER_allocate_fermion(&c_f[2], clover_state)) {
        printf0("CLOVER_allocate_fermion(2) failed\n");
        goto end;
    }

    if (QOP_CLOVER_allocate_fermion(&c_f[3], clover_state)) {
        printf0("CLOVER_allocate_fermion(3) failed\n");
        goto end;
    }

    if (QOP_CLOVER_import_gauge(&c_g, clover_state, kappa, c_sw,
                                u_reader, c_reader, NULL)) {
        /* printf0, like every other diagnostic in this function */
        printf0("CLOVER_import_gauge() failed\n");
        goto end;
    }

    /* Apply D to c_f[0] into c_f[2], then run CG with c_f[2] passed in both
       fermion input slots; the CG result goes to c_f[3].
       NOTE(review): presumably the two inputs are the initial guess and the
       right-hand side -- confirm against the QOP_CLOVER_D_CG prototype. */
    QOP_CLOVER_D_operator(c_f[2], c_g, c_f[0]);
    cg_status = QOP_CLOVER_D_CG(c_f[3], &out_iter, &out_eps,
                                c_f[2], c_g, c_f[2], in_iter, in_eps,
                                log_flag);

    /* fetch the error text and counters before exporting the result */
    msg = QOP_CLOVER_error(clover_state);

    QOP_CLOVER_performance(&run_time, &flops, &sent, &received, clover_state);

    QOP_CLOVER_export_fermion(f_writer, F[3], c_f[3]);

    printf0("CG status: %d\n", cg_status);
    printf0("CG error message: %s\n", msg? msg: "<NONE>");
    printf0("CG iter: %d\n", out_iter);
    printf0("CG eps: %20.10e\n", out_eps);
    printf0("CG performance: runtime %e sec\n", run_time);
    printf0("CG performance: flops  %.3e MFlop/s (%lld)\n",
            flops * 1e-6 / run_time, flops);
    printf0("CG performance: snd    %.3e MB/s (%lld)\n",
            sent * 1e-6 / run_time, sent);
    /* same layout as the snd line: rate first, raw byte count in parens */
    printf0("CG performance: rcv    %.3e MB/s (%lld)\n",
            received * 1e-6 / run_time, received);

    /* free CLOVER */
    QOP_CLOVER_free_gauge(&c_g);
    for (i = 0; i < NELEMS(c_f); i++)
        QOP_CLOVER_free_fermion(&c_f[i]);

    QOP_CLOVER_fini(&clover_state);

    /* Compute plaquette */
    plaq = plaquette(U);

    /* field norms */
    for (i = 0; i < NELEMS(F); i++)
        QDP_r_eq_norm2_D(&n[i], F[i], QDP_all);
        
    /* Display the values */
    printf0("plaquette = %g\n",
            plaq / (QDP_volume() * QDP_Nc * NDIM * (NDIM - 1) / 2 ));
    for (i = 0; i < NELEMS(F); i++)
        printf0(" |f|^2 [%d] = %20.10e\n", i, (double)(n[i]));

    /* Compute and display <f[1] f[0]> */
    show_dot("1|orig", F[1], F[0]);
    /* Compute and display <f[1] f[3]> */
    show_dot("1|solv", F[1], F[3]);

    QDP_destroy_S(state);
    QDP_destroy_I(I_seed);
    destroy_Mvector(U, NELEMS(U));
    destroy_Mvector(C, NELEMS(C));
    destroy_Dvector(F, NELEMS(F));

    status = 0;
end:
    /* shutdown QDP */
    printf0("end\n");
    QDP_finalize();
        
    return status;
}
コード例 #2
0
ファイル: layout_qdp.c プロジェクト: daschaich/KS_nHYP_FA
/*
 * Thin wrapper: forwards to QMP_get_logical_coordinates() and hands its
 * result back to the caller unchanged.
 */
const int *get_logical_coordinate(){
  const int *coords = QMP_get_logical_coordinates();

  return coords;
}
コード例 #3
0
ファイル: wfm_init.C プロジェクト: DeanHowarth/QUDA-CPS
/*
 * Set up this wfm object from the caller-supplied WilsonArg.
 *
 * Steps, in order:
 *   - copy the precision / BG/L flags, the instruction register number, the
 *     local lattice sizes and the per-direction local_comm flags from
 *     wilson_p;
 *   - compute the checkerboard volume and the per-direction boundary sizes,
 *     and refuse to run on sub-lattice shapes the checks below reject;
 *   - allocate the temporary spinor, the two-spinor workspace and the
 *     send/receive buffers;
 *   - build the pointer table and initialise the comms.
 *
 * Side effect on the argument: wilson_p->spinor_tmp is set to the newly
 * allocated temporary spinor.
 *
 * Every failure prints a message and terminates the process with exit();
 * no error is ever returned to the caller.
 */
void wfm::init(WilsonArg *wilson_p)  /* pointer to Wilson type structure    */
{
  int spinor_words;             /* size of the spinor field on the         */
                                /* sublattice checkerboard                 */

  int half_spinor_words;        /* size of the spin-projected "half_spinors*/
                                /* on the sublattice checkerboard including*/
                                /* the communications padding              */

  int slx;                          /* x-direction size of node sublattice */
  int sly;                          /* y-direction size of node sublattice */
  int slz;                          /* z-direction size of node sublattice */
  int slt;                          /* t-direction size of node sublattice */
  int mu;

  SloppyPrecision = wilson_p->SloppyPrecision;
  WFM_BGL             = wilson_p->WFM_BGL;


//  if ( isBoss() ) printf("wfm::init setting up BG/L MMU state\n");
  mmu_optimise();
  mmu_print();

//  CoreCount( wilson_p->CoreCount );
  CoreCount( 1 );

  /* padded half-spinor size depends on the BG/L flag */
  if ( WFM_BGL ) PAD_HALF_SPINOR_SIZE = 12;
  else  PAD_HALF_SPINOR_SIZE = 16;

  /* refuse the BG/L + multi-thread + sloppy-precision combination */
  if ( WFM_BGL && (nthread > 1) && SloppyPrecision ) { 
    if ( isBoss() ) printf("Bagel does not maintain L1 coherence in dual core + single precision mode on BlueGene\n");
    if ( isBoss() ) printf("Get on to IBM to give me access to SWOA MMU options, or even better a non-cache image of DRAM\n");
    if ( isBoss() ) printf("If they give me the tools, I'm happy to do the heroics of mainting sfw coherence\n");
    if ( isBoss() ) printf("Bagel insanity check exiting\n");
    exit(-1);
  }

  IR = wilson_p->instruction_reg_num;
/*--------------------------------------------------------------------------*/
/* Set sublattice direction sizes                                           */
/*--------------------------------------------------------------------------*/
  local_latt[0] =  wilson_p->local_latt[0];
  local_latt[1] =  wilson_p->local_latt[1];
  local_latt[2] =  wilson_p->local_latt[2];
  local_latt[3] =  wilson_p->local_latt[3];
  slx = local_latt[0];
  sly = local_latt[1];
  slz = local_latt[2];
  slt = local_latt[3];

#if (defined USE_COMMS_QMP) && (!defined UNIFORM_SEED_NO_COMMS)
  QMP_bool_t qmp_inited=QMP_is_initialized();
  if( !qmp_inited ) { 
    if ( isBoss() ) printf("QMP_not_initialized\n");
    exit(-1);
  }
  /* base_parity is the low bit of the sum over directions of
     (logical node coordinate * local lattice size) */
  const int *ncoor = QMP_get_logical_coordinates();
  base_parity =(ncoor[0]*local_latt[0] 
              + ncoor[1]*local_latt[1]
              + ncoor[2]*local_latt[2]
              + ncoor[3]*local_latt[3])&0x1;

#else
  base_parity = 0;
#endif


/*--------------------------------------------------------------------------*/
/* Set periodic wrap back or not                                            */
/*--------------------------------------------------------------------------*/
  local_comm[0] = wilson_p->local_comm[0];
  local_comm[1] = wilson_p->local_comm[1];
  local_comm[2] = wilson_p->local_comm[2];
  local_comm[3] = wilson_p->local_comm[3];
#ifdef  UNIFORM_SEED_NO_COMMS
  /* In this build every direction must have local_comm == 1; check each
     direction individually and report the offending one. */
  for(int i=0;i<4;i++)
    if(local_comm[i]!=1){
       fprintf(stderr,"wfm::local_comm[%d]=%d!\n",i,local_comm[i]);
       exit(-33);
    }
#endif



/*-----------------------------------------------------------------------*/
/* compute the subgrd volume of each chkbd ... at least two local dims   */
/* must be even for this code to be correct.                             */
/*-----------------------------------------------------------------------*/
  vol = (slx * sly * slz * slt)/2;
  
  /* nbound[mu]: half the number of sites on a face orthogonal to mu */
  nbound[0] = (sly * slz * slt)/2; 
  nbound[1] = (slx * slz * slt)/2;
  nbound[2] = (slx * sly * slt)/2;
  nbound[3] = (slx * sly * slz)/2;


  allbound  = nbound[0]
            + nbound[1]
            + nbound[2]
            + nbound[3];

  /* The integer divisions above must have been exact: each face, scaled
     back up, has to reproduce the full local volume. */
  if ( nbound[0] * slx * 2 != (slx*sly*slz*slt) ) {
    if ( isBoss() ) printf("wfm::init Even x logic bomb\n");
    exit(-1);
  }
  if ( nbound[1] * sly * 2 != (slx*sly*slz*slt) ) {
    if ( isBoss() ) printf("wfm::init Even y logic bomb\n");
    exit(-1);
  }
  if ( nbound[2] * slz * 2 != (slx*sly*slz*slt) ) {
    if ( isBoss() ) printf("wfm::init Even z logic bomb\n");
    exit(-1);
  }
  if ( nbound[3] * slt * 2 != (slx*sly*slz*slt) ) {
    if ( isBoss() ) printf("wfm::init Even t logic bomb\n");
    exit(-1);
  }

  /*------------------------------------------------------------------------*/
  /* Check shape                                                            */
  /*------------------------------------------------------------------------*/
  if ( (slx&1)  ) {
    if ( isBoss() ) printf("Bagel is refusing to run as x-sub latt is odd\n");
    exit(-1);
  }
  if ( (sly&1) &&(slz&1)&&(slt&1)  ) {
    if ( isBoss() ) printf("Bagel is refusing to run as y,z,t sub latts are all odd\n");
    exit(-1);
  }


/*--------------------------------------------------------------------------*/
/* Reserve memory for 1  temporary spinor (needed by mdagm)                 */
/*--------------------------------------------------------------------------*/
  spinor_words = SPINOR_SIZE * vol;

  /* room for 2 * spinor_words Floats */
  spinor_tmp = (Float *)ALLOC(spinor_words*sizeof(Float)*2);
//printf("wfm_init::spinor_tmp=%p\n",spinor_tmp);
//  VRB.Flow(cname,fname,"spinor_tmp=%p\n",spinor_tmp);
#ifdef USE_QALLOC
  // If we used QALLOC, and the ALLOC macro failed we can try 
  // qalloc but without the QFAST flag. Even tho the spinor_tmp is
  // not communicated we leave the QCOMMS bit on in case it puts 
  // spinor tmp into a better place in the memory map
  if(spinor_tmp == 0) {
     if ( isBoss() ) printf("BAGEL: Warning spinor_tmp has spilled out of Edram\n");
     spinor_tmp = (Float *) qalloc(QCOMMS,spinor_words*sizeof(Float)*2); 
  }
#endif  // USE QALLOC

  if(spinor_tmp == 0){
    if ( isBoss() ) printf("wfm::spinor_tmp allocate\n");
    exit(-1);
  }

//~~
//~~ twisted mass fermions:  sets WilsonArg.spinor_tmp to the
//~~ address of temporary spinor in wfm class
//~~    
  wilson_p->spinor_tmp = spinor_tmp;
//~~
/*--------------------------------------------------------------------------*/
/* Reserve memory for the 4 forward and 4 backward spin projected half      */ 
/* spinors.                                                                 */
/*--------------------------------------------------------------------------*/


  /*PAB 10/1/2001 */
  half_spinor_words = NMinusPlus * ND * PAD_HALF_SPINOR_SIZE * vol;

#ifndef USE_COMMS_QMP  
  two_spinor = (Float *)ALLOC(half_spinor_words*sizeof(Float));

#ifdef USE_QALLOC
  // If we are using QALLOC and the ALLOC macro failed we can still 
  // try to get slow memory. Leave on the QCOMMS bit for good memory map
  // placement
  if(two_spinor == 0) {
     if ( isBoss() ) printf("BAGEL : warning two spinors have spilled out of Edram\n");
     two_spinor = (Float *)qalloc(QCOMMS,half_spinor_words*sizeof(Float));
  }
#endif // USE_QALLOC

  if(two_spinor == 0){
    if ( isBoss() ) printf("wfm::two_spinor allocate\n");
    exit(-1);
  }

#else

  // Since two spinor is now communicated because of the Tface 
  // receive I have to allocate it in the style of QMP
  two_spinor_mem_t = QMP_allocate_aligned_memory(
                                             half_spinor_words*sizeof(Float),
                                             WFM_ALIGN_ARG,
                                             (QMP_MEM_COMMS|QMP_MEM_FAST));

  if( two_spinor_mem_t == 0x0 ) { 
    // Try slow allocation
    two_spinor_mem_t = QMP_allocate_aligned_memory(
                                              half_spinor_words*sizeof(Float),
                                              WFM_ALIGN_ARG,
                                              QMP_MEM_COMMS);

    if( two_spinor_mem_t == 0x0 ) { 
      if ( isBoss() ) printf("wfm_init::could not allocate two spinor_mem_t\n");
      exit(-1);
    }
  }
  two_spinor = (Float *)QMP_get_memory_pointer(two_spinor_mem_t);
  if (two_spinor == 0x0) { 
    if ( isBoss() ) printf("wfm::init QMP_get_memory_pointer returned NULL pointer from non NULL QMP_mem_t\n");
    exit(-1);
  } 
#endif

 /*--------------------------------------------------------------------------*/
 /* Reserve the receive and send buffers: one pair for each of the two pm    */
 /* values and each direction mu that has local_comm[mu] == 0.               */
 /*--------------------------------------------------------------------------*/
  for ( int pm = 0;pm<2;pm++ ) {
    for ( mu = 0 ; mu < 4 ; mu++)
    if (local_comm[mu]==0) {

      half_spinor_words = PAD_HALF_SPINOR_SIZE * nbound[mu];

      // These things are (potentially) communicated so need QMP Style 
      // allocation if using QMP
      //
      // NOTE(review): an older remark here said the buffers are allocated
      // in all directions regardless of whether we communicate in that dir;
      // the code as written only allocates when local_comm[mu]==0.
#ifndef USE_COMMS_QMP

      // Not using QMP
      recv_bufs[pm][mu] = (Float *)ALLOC(half_spinor_words*sizeof(Float));
#ifdef  USE_QALLOC

      // If ALLOC fails try slow memory but with QCOMMS bit still set
      if( recv_bufs[pm][mu] == 0x0 ) 
        recv_bufs[pm][mu] = (Float *)qalloc(QCOMMS, half_spinor_words*sizeof(Float));
#endif
      if(recv_bufs[pm][mu] == 0){
        if ( isBoss() ) printf("wfm::recv_bufs allocate\n");
        exit(-1);
      }

      send_bufs[pm][mu]=(Float *)SEND_ALLOC(half_spinor_words*sizeof(Float));
#ifdef USE_QALLOC

      // If SEND ALLOC macro fails try slow memory but with QNONCACHE bit
      // still set
      if( send_bufs[pm][mu] == 0 ) 
        send_bufs[pm][mu]=(Float *)qalloc(QNONCACHE, half_spinor_words*sizeof(Float));
#endif

      if(send_bufs[pm][mu] == 0){
        if ( isBoss() ) printf("wfm::send_bufs allocate\n");
        exit(-1);
      }
#else
      /* QMP memory allocation: A little involved */
      /* Must allocate "opaque" QMP_mem_t first and then get 
         aligned pointer out of it. It's either what is below or a 
         very complicated send alloc */

      /* Peter in the CPS allocs recv_bufs with ALLOC = QCOMMS|FAST */
      recv_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float),
                                                            WFM_ALIGN_ARG,
                                                            (QMP_MEM_COMMS|QMP_MEM_FAST));
      if( recv_bufs_mem_t[pm][mu] == 0x0 ) {
        // If QMP_allocate memory fails with FAST, try SLOW but keep COMMS
        recv_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float),
                                                              WFM_ALIGN_ARG,
                                                              QMP_MEM_COMMS);
        if( recv_bufs_mem_t[pm][mu] == 0x0 ) { 
          if ( isBoss() ) printf("wfm::init recv_bufs_mem_t[%d][%d]: QMP_allocate_aligned_memory returned NULL\n", pm, mu);
          exit(-1);
        }
      }

      /* Now get the aligned pointer */
      recv_bufs[pm][mu] =(Float *)QMP_get_memory_pointer(recv_bufs_mem_t[pm][mu]);

      if( recv_bufs[pm][mu] == 0x0 ) { 
        if ( isBoss() ) printf("wfm::init recv_bufs[%d][%d]: NULL aligned pointer in non NULL QMP_mem_t struct \n", pm, mu);
        exit(-1);
      }

      /* Now do the same for the send bufs */
      /* In CPS Peter allocates as SEND_ALLOC = QNONCACHE | QFAST */
      send_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float),
                                                            WFM_ALIGN_ARG,
                                                            (QMP_MEM_NONCACHE|QMP_MEM_FAST));
      if( send_bufs_mem_t[pm][mu] == 0x0 ) {
        // if allocator fails, try slow but still NONCACHE
        send_bufs_mem_t[pm][mu] = QMP_allocate_aligned_memory(half_spinor_words*sizeof(Float),
                                                              WFM_ALIGN_ARG,
                                                              QMP_MEM_NONCACHE);
        if( send_bufs_mem_t[pm][mu] == 0x0 ) {
          if ( isBoss() ) printf("wfm::init: send_bufs_mem_t[%d][%d]: QMP_allocate_aligned_memory returned NULL\n", pm, mu);
          exit(-1);
        }
      }

      /* Now get the aligned pointer */
      send_bufs[pm][mu] =(Float *)QMP_get_memory_pointer(send_bufs_mem_t[pm][mu]);
      if( send_bufs[pm][mu] == 0x0 ) {
        if ( isBoss() ) printf("wfm::init send_bufs[%d][%d]: NULL aligned pointer in non NULL QMP_mem_t struct \n", pm, mu);
        exit(-1);
      }

#endif

    }
  }



/*----------------------------------------------------------------------*/
/* Build the pointer table                                              */
/*----------------------------------------------------------------------*/
  pointers_init();
  
/*----------------------------------------------------------------------*/
/* Initialise the comms                                                 */
/*----------------------------------------------------------------------*/

  comm_init();

}
コード例 #4
0
ファイル: check-dxd.c プロジェクト: usqcd-software/clover
int
main(int argc, char *argv[])
{
    int status = 1;
    int mu, i;
    struct QOP_CLOVER_State *clover_state;
    QDP_Int *I_seed;
    int i_seed;
    QDP_RandomState *state;
    QLA_Real plaq;
    QLA_Real n[NELEMS(F)];
    struct QOP_CLOVER_Gauge *c_g;
    struct QOP_CLOVER_Fermion *c_f[NELEMS(F)];
    double kappa;
    double c_sw;

    /* start QDP */
    QDP_initialize(&argc, &argv);

    if (argc != 1 + NDIM + 3) {
        printf0("ERROR: usage: %s Lx ... seed kappa c_sw\n", argv[0]);
        goto end;
    }

    for (mu = 0; mu < NDIM; mu++) {
        lattice[mu] = atoi(argv[1 + mu]);
    }
    i_seed = atoi(argv[1 + NDIM]);
    kappa = atof(argv[2 + NDIM]);
    c_sw = atof(argv[3 + NDIM]);
    
    /* set lattice size and create layout */
    QDP_set_latsize(NDIM, lattice);
    QDP_create_layout();

    primary = QMP_is_primary_node();
    self = QMP_get_node_number();
    get_vector(network, 1, QMP_get_logical_number_of_dimensions(),
               QMP_get_logical_dimensions());
    get_vector(node, 0, QMP_get_logical_number_of_dimensions(),
               QMP_get_logical_coordinates());
        
    printf0("network: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", network[i]);
    printf0("\n");

    printf0("node: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", node[i]);
    printf0("\n");

    printf0("kappa: %20.15f\n", kappa);
    printf0("c_sw:  %20.15f\n", c_sw);

    /* allocate the gauge field */
    create_Mvector(U, NELEMS(U));
    create_Mvector(C, NELEMS(C));
    create_Dvector(F, NELEMS(F));
    I_seed = QDP_create_I();
    QDP_I_eq_funci(I_seed, icoord, QDP_all);
    state = QDP_create_S();
    QDP_S_eq_seed_i_I(state, i_seed, I_seed, QDP_all);
    
    for (mu = 0; mu < NELEMS(U); mu++) {
        QDP_M_eq_gaussian_S(U[mu], state, QDP_all);
    }
    
    for (i = 0; i < NELEMS(F); i++) {
        QDP_D_eq_gaussian_S(F[i], state, QDP_all);
    }

    /* build the clovers */
    clover(C, U);

    /* initialize CLOVER */
    if (QOP_CLOVER_init(&clover_state, lattice, network, node, primary,
                        sublattice, NULL)) {
        printf0("CLOVER_init() failed\n");
        goto end;
    }

    if (QOP_CLOVER_import_fermion(&c_f[0], clover_state, f_reader, F[0])) {
        printf0("CLOVER_import_fermion(0) failed\n");
        goto end;
    }

    if (QOP_CLOVER_import_fermion(&c_f[1], clover_state, f_reader, F[1])) {
        printf0("CLOVER_import_fermion(1) failed\n");
        goto end;
    }

    if (QOP_CLOVER_allocate_fermion(&c_f[2], clover_state)) {
        printf0("CLOVER_allocate_fermion(2) failed\n");
        goto end;
    }

    if (QOP_CLOVER_allocate_fermion(&c_f[3], clover_state)) {
        printf0("CLOVER_allocate_fermion(3) failed\n");
        goto end;
    }

    if (QOP_CLOVER_import_gauge(&c_g, clover_state, kappa, c_sw,
                                u_reader, c_reader, NULL)) {
        printf("CLOVER_import_gauge() failed\n");
        goto end;
    }

    QOP_CLOVER_D_operator(c_f[2], c_g, c_f[0]);
    QOP_CLOVER_export_fermion(f_writer, F[2], c_f[2]);

    QOP_CLOVER_D_operator_conjugated(c_f[3], c_g, c_f[1]);
    QOP_CLOVER_export_fermion(f_writer, F[3], c_f[3]);
    
    /* free CLOVER */
    QOP_CLOVER_free_gauge(&c_g);
    for (i = 0; i < NELEMS(c_f); i++)
        QOP_CLOVER_free_fermion(&c_f[i]);

    QOP_CLOVER_fini(&clover_state);

    /* Compute plaquette */
    plaq = plaquette(U);

    /* field norms */
    for (i = 0; i < NELEMS(F); i++)
        QDP_r_eq_norm2_D(&n[i], F[i], QDP_all);
        


    /* Display the values */
    printf0("plaquette = %g\n",
            plaq / (QDP_volume() * QDP_Nc * NDIM * (NDIM - 1) / 2 ));
    for (i = 0; i < NELEMS(F); i++)
        printf0(" |f|^2 [%d] = %20.10e\n", i, (double)(n[i]));

    /* Compute and display <f[1] f[2]> */
    show_dot("1|D0", F[1], F[2]);
    /* Compute and display <f[3] f[0]> */
    show_dot("X1|0", F[3], F[0]);

    QDP_destroy_S(state);
    QDP_destroy_I(I_seed);
    destroy_Mvector(U, NELEMS(U));
    destroy_Mvector(C, NELEMS(C));
    destroy_Dvector(F, NELEMS(F));

    status = 0;
end:
    /* shutdown QDP */
    printf0("end\n");
    QDP_finalize();
        
    return status;
}
コード例 #5
0
ファイル: localheat.c プロジェクト: usqcd-software/mdwf
/*
 * Entry point of the local "heater" test.
 *
 * Usage: localheat Lx Ly Lz Lt Ls time
 *
 * Starts QMP, fills the b5/c5 coefficient tables, reads the local lattice
 * extents and the requested run time from the command line, initialises
 * MDWF with a 1x1x1x1 network and runs do_run().
 *
 * Returns 0 when the test finished, 1 on any failure.
 */
int
main(int argc, char *argv[])
{
  struct QOP_MDWF_State *mdwf_state = NULL;
  struct QOP_MDWF_Parameters *mdwf_params = NULL;
  QMP_thread_level_t thread_level = QMP_THREAD_SINGLE;
  int exit_code = 1;
  int k;

  /* Bring up message passing first; nothing to tear down if this fails. */
  if (QMP_init_msg_passing(&argc, &argv, thread_level, &thread_level)
      != QMP_SUCCESS) {
    fprintf(stderr, "QMP_init() failed\n");
    return 1;
  }

  /* Coefficient tables, filled from the index alone. */
  k = 0;
  while (k < NELEM(b5)) {
    b5[k] = 0.1 * k * (NELEM(b5) - k);
    c5[k] = 0.1 * k * k * (NELEM(b5) - k);
    k++;
  }

  self = QMP_get_node_number();
  primary = QMP_is_primary_node();

  /* Program name plus six parameters. */
  if (argc != 7) {
    zprint("7 arguments expected, found %d", argc);
    zprint("usage: localheat Lx Ly Lz Lt Ls time");
    QMP_finalize_msg_passing();
    return 1;
  }

  /* Four directions with a trivial network: the lattice equals the local
     volume times one. */
  for (k = 0; k < 4; k++) {
    mynetwork[k] = 1;
    mylocal[k] = atoi(argv[k+1]);
    mylattice[k] = mylocal[k] * mynetwork[k];
  }

  /* Fifth extent and the run time come from the last two arguments. */
  mylattice[4] = atoi(argv[5]);
  mylocal[4] = mylattice[4];
  total_sec = atoi(argv[6]);

  zshowv4("network", mynetwork);
  zshowv5("local lattice", mylocal);
  zshowv5("lattice", mylattice);
  zprint("total requested runtime %.0f sec", total_sec);

#if 0
  if (QMP_declare_logical_topology(mynetwork, 4) != QMP_SUCCESS) {
    zprint("declare_logical_top failed");
    goto end;
  }

  getv(mynode, 0, QMP_get_logical_number_of_dimensions(),
       QMP_get_logical_coordinates());
#else
  /* No logical topology is declared: this node sits at the origin. */
  for (k = 0; k < 4; k++)
    mynode[k] = 0;
#endif

  if (QOP_MDWF_init(&mdwf_state,
                    mylattice, mynetwork, mynode, primary,
                    getsub, NULL) != 0) {
    zprint("MDWF_init() failed");
    goto end;
  }
  zprint("MDWF_init() done");

  if (QOP_MDWF_set_generic(&mdwf_params, mdwf_state, b5, c5, 0.123, 0.05)
      != 0) {
    zprint("MDW_set_generic() failed");
    goto end;
  }
  zprint("MDWF_set_generic() done");

  if (do_run(mdwf_state, mdwf_params) != 0) {
    zprint("float test failed");
    goto end;
  }

  QOP_MDWF_fini(&mdwf_state);

  zprint("Heater test finished");
  exit_code = 0;

 end:
  QMP_finalize_msg_passing();
  return exit_code;
}
コード例 #6
0
ファイル: QMP_perf.c プロジェクト: 6twirl9/qmp
/*
 * QMP point-to-point performance test driver.
 *
 * Initialises QMP, parses the command-line options into a perf_argv,
 * declares a logical topology derived from the node count (which must be a
 * power of two), and then runs whichever of the simultaneous-send,
 * ping-pong and one-way tests are selected in pargv.option, once per
 * message size from pargv.minsize to pargv.maxsize in steps of a factor
 * pargv.facsize.
 *
 * Returns 0 on completion and -1 when QMP initialisation or the topology
 * declaration fails; option-parsing and broadcast failures call exit(1),
 * and a bad node count or a failed allocation calls QMP_abort(1).
 */
int
main (int argc, char** argv)
{
  int             i, nc;
  QMP_status_t      status;
  int       **smem, **rmem;
  QMP_msgmem_t    *recvmem;
  QMP_msghandle_t *recvh;
  QMP_msgmem_t    *sendmem;
  QMP_msghandle_t *sendh;
  struct perf_argv pargv;
  QMP_thread_level_t req, prv;

  /** 
   * Simple point to point topology 
   */
  int dims[4] = {2,2,2,2};
  int ndims = 1;

  //if(QMP_get_node_number()==0)
  //printf("starting init\n"); fflush(stdout);
  req = QMP_THREAD_SINGLE;
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "QMP_init failed\n");
    return -1;
  }
  /* only node 0 prints; every node flushes */
  if(QMP_get_node_number()==0)
    printf("finished init\n");
  fflush(stdout);

  if (parse_options (argc, argv, &pargv) == -1) {
    if(QMP_get_node_number()==0)
      usage (argv[0]);
    exit (1);
  }

  /*
   * Factor the node count into powers of two.  The first maxdims factors of
   * two each add a dimension (of extent 2, from the initial dims[]); any
   * further factors double the extents of the dimensions in round-robin
   * order.  Anything left over means the node count is not a power of two.
   */
  {
    int maxdims = 4;
    int k=0;
    int nodes = QMP_get_number_of_nodes();
    ndims = 0;
    while( (nodes&1) == 0 ) {
      if(ndims<maxdims) ndims++;
      else {
        dims[k] *= 2;
        k++;
        if(k>=maxdims) k = 0;
      }
      nodes /= 2;
    }
    if(nodes != 1) {
      QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes());
      QMP_error(" must power of 2");
      QMP_abort(1);
    }
    pargv.ndims = ndims;
  }

  status = QMP_declare_logical_topology (dims, ndims);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "Cannot declare logical grid\n");
    return -1;
  }

  /* do a broadcast of parameter */
  if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) {
    QMP_printf ("Broadcast parameter failed\n");
    exit (1);
  }

  /* sender flag: low bit of (1 + sum of this node's logical coordinates) */
  {
    int k=1;
    const int *lc = QMP_get_logical_coordinates();
    for(i=0; i<ndims; i++) k += lc[i];
    pargv.sender = k&1;
  }

  QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ",
             argv[0], pargv.num_channels, pargv.verify, 
             pargv.option, pargv.size, pargv.loops, pargv.sender,
             strided_send, strided_recv, strided_array_send);
  fflush(stdout);


  /**
   * Create memory
   */
  nc = pargv.num_channels;
  smem = (int **)malloc(nc*sizeof (int *));
  rmem = (int **)malloc(nc*sizeof (int *));
  sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));
  recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));

  /* Stop here rather than hand NULL tables to the tests.  A zero-sized
     request may legitimately return NULL, so only non-empty requests are
     treated as failures. */
  if (nc > 0
      && (smem == NULL || rmem == NULL || sendh == NULL || recvh == NULL
          || (ndims > 0 && (sendmem == NULL || recvmem == NULL)))) {
    QMP_error("cannot allocate message tables for %d channels", nc);
    QMP_abort(1);
  }

  QMP_barrier();
  if(QMP_get_node_number()==0)
    printf("\n");
  fflush(stdout);

  /* Each test below temporarily narrows pargv.option to its own flag and
     restores the full option mask afterwards. */
  if(pargv.option & TEST_SIMUL) {
    int opts = pargv.option;
    pargv.option = TEST_SIMUL;
    if(QMP_get_node_number()==0)
      QMP_printf("starting simultaneous sends");
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_simultaneous_send (smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished simultaneous sends\n");
    fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_PINGPONG) {
    int opts = pargv.option;
    pargv.option = TEST_PINGPONG;
    if(QMP_get_node_number()==0)
      QMP_printf("starting ping pong sends");
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      if(pargv.verify)
        test_pingpong_verify(smem, rmem, sendh, recvh, &pargv);
      else
        test_pingpong(smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished ping pong sends\n");
    fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_ONEWAY) {
    int opts = pargv.option;
    pargv.option = TEST_ONEWAY;
    if(QMP_get_node_number()==0)
      QMP_printf("starting one way sends");
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_oneway (smem, rmem, sendh, recvh, &pargv);
      /* only the non-sender side checks its receive memory */
      if(!pargv.sender) check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished one way sends");
    fflush(stdout);
    pargv.option = opts;
  }


  /**
   * Free memory 
   */
  free (smem);
  free (rmem);

  free (sendh);
  free (recvh);

  free (sendmem);
  free (recvmem);

  QMP_finalize_msg_passing ();

  return 0;
}
コード例 #7
0
/**
 * Build the site table and the pointer offset table used by the dslash
 * scatter/gather kernels.
 *
 * The function
 *   1. sets the file-level subgrid_vol / subgrid_vol_cb counters,
 *   2. allocates and fills site_table (my index -> QDP index) plus a
 *      temporary inverse table (QDP index -> checkerboard, cb-linear index),
 *   3. runs three self-consistency checks that only print diagnostics,
 *   4. builds four temporary shift tables (DECOMP_SCATTER,
 *      DECOMP_HVV_SCATTER, RECONS_MVV_GATHER, RECONS_GATHER) while counting
 *      off-node neighbours in bound[][][],
 *   5. converts the shift tables into pointers stored in offset_table.
 *
 * @param bound      Zeroed here, then filled with the number of off-node
 *                   sites, indexed as bound[cb][table type][dir].
 * @param chi1       Base pointer used for on-node DECOMP_SCATTER and
 *                   RECONS_MVV_GATHER entries.
 * @param chi2       Base pointer used for on-node DECOMP_HVV_SCATTER and
 *                   RECONS_GATHER entries.
 * @param recv_bufs  Receive buffers; [0][n] is used by RECONS_MVV_GATHER,
 *                   [1][n] by RECONS_GATHER.
 * @param send_bufs  Send buffers; [0][n] is used by DECOMP_SCATTER,
 *                   [1][n] by DECOMP_HVV_SCATTER.
 * @param QDP_getSiteCoords       Callback: (node, linear site) -> coordinates.
 * @param QDP_getLinearSiteIndex  Callback: coordinates -> linear site index.
 * @param QDP_getNodeNumber       Callback: coordinates -> owning node number.
 *
 * Allocation failures and mismatched boundary counts are reported through
 * QMP_error() followed by QMP_abort(1).  The raw allocations xsite_table and
 * xoffset_table are not released in this function; the shift tables and the
 * inverse table are freed before returning.
 */
void make_shift_tables(int bound[2][4][4], halfspinor_array* chi1,
                       halfspinor_array* chi2,

                       halfspinor_array* recv_bufs[2][4],
                       halfspinor_array* send_bufs[2][4],

                       void (*QDP_getSiteCoords)(int coord[], int node, int linearsite),
                       int (*QDP_getLinearSiteIndex)(const int coord[]),

                       int (*QDP_getNodeNumber)(const int coord[]))
{
    /* NOTE(review): volatile on plain loop counters looks like a compiler
       workaround -- confirm before removing it. */
    volatile int dir,i;
    const int my_node = QMP_get_node_number();
    int coord[4];
    int gcoord[4];
    int gcoord2[4];

    int linear;
    int **shift_table;
    int x,y,z,t;
    int *subgrid_size = getSubgridSize();
    int mu;
    int offset;

    int cb;
    const int *node_coord  = QMP_get_logical_coordinates();
    int p;
    int site, index;

    InvTab4 *xinvtab;   /* raw (unaligned) allocation, freed at the end */
    InvTab4 *invtab;    /* 64-byte aligned view into xinvtab */

    int qdp_index;
    int my_index;
    int num;
    int offsite_found;

    /* Set up the subgrid volume: product of the subgrid extents */
    subgrid_vol = 1;
    for(i=0; i < getNumDim(); ++i) {
        subgrid_vol *= getSubgridSize()[i];
    }

    /* Checkerboard volume is half of the subgrid volume */
    subgrid_vol_cb = subgrid_vol / 2;

    /* Build the site table.  63 spare bytes are requested so that the
       pointer can be rounded up to the next 64-byte boundary. */
    xsite_table = (int *)malloc(sizeof(int)*subgrid_vol+63L);
    if(xsite_table == 0x0 ) {
        QMP_error("Couldnt allocate site table");
        QMP_abort(1);
    }

    /* Round up to a multiple of 64 */
    site_table = (int *)((((ptrdiff_t)(xsite_table))+63L)&(-64L));

    /* Temporary inverse table, aligned the same way */
    xinvtab = (InvTab4 *)malloc(sizeof(InvTab4)*subgrid_vol + 63L);
    if(xinvtab == 0x0 ) {
        /* Distinct message so this failure is not mistaken for the
           site table allocation above */
        QMP_error("Couldnt allocate inverse site table");
        QMP_abort(1);
    }
    invtab = (InvTab4 *)((((ptrdiff_t)(xinvtab))+63L)&(-64L));

    /* Inverse-function check:
       verify that QDP_getLinearSiteIndex() undoes QDP_getSiteCoords()
       and that myLinearSiteIndex4D() undoes mySiteCoords4D().
       Failures are only reported, not treated as fatal. */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* Linear site index */
            my_index = site + subgrid_vol_cb*p;
            QDP_getSiteCoords(gcoord, my_node, my_index);
            linear=QDP_getLinearSiteIndex(gcoord);

            if( linear != my_index ) {
                printf("P%d cb=%d site=%d : QDP_getSiteCoords not inverse of QDP_getLinearSiteIndex(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear);
            }

            mySiteCoords4D(gcoord, my_node, my_index);
            linear=myLinearSiteIndex4D(gcoord);

            if( linear != my_index ) {
                /* The message names the functions that are actually called */
                printf("P%d cb=%d site=%d : mySiteCoords4D not inverse of myLinearSiteIndex4D(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear);
            }
        }
    }


    /* Loop through all local sites.  For each p the x loop visits
       x-coordinates 2*x + p, so p = 0, 1 together cover the subgrid. */
    for(p=0; p < 2; p++) {
        for(t=0; t < subgrid_size[3]; t++) {
            for(z=0; z < subgrid_size[2]; z++) {
                for(y=0; y < subgrid_size[1]; y++) {
                    for(x=0; x < subgrid_size[0]/2; x++) {

                        coord[0] = 2*x + p;
                        coord[1] = y;
                        coord[2] = z;
                        coord[3] = t;

                        /* Make the coordinate global by adding this node's origin */
                        for(i=0; i < 4; i++) {
                            coord[i] += subgrid_size[i]*node_coord[i];
                        }

                        /* Index of the coordinate in the caller's (QDP) layout */
                        qdp_index = QDP_getLinearSiteIndex(coord);

                        /* Index of the coordinate in this module's layout */
                        my_index = myLinearSiteIndex4D(coord);
                        site_table[my_index] = qdp_index;

                        cb=parity(coord);
                        linear = my_index%subgrid_vol_cb;

                        /* Reverse lookup: QDP index -> (cb, index within cb) */
                        invtab[qdp_index].cb=cb;
                        invtab[qdp_index].linearcb=linear;
                    }
                }
            }
        }
    }

    /* Site table round-trip check:
       my index -> QDP index (site_table) -> coordinates -> my index.
       The final index must equal the starting one. */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* My local index */
            my_index = site + subgrid_vol_cb*p;

            /* Convert to QDP index */
            qdp_index = site_table[ my_index ];

            /* Switch QDP index to coordinates */
            QDP_getSiteCoords(gcoord, my_node,qdp_index);

            /* Convert back to my index */
            linear = myLinearSiteIndex4D(gcoord);

            /* Check the recovered index matches the original */
            if (linear != my_index) {
                printf("P%d The Circle is broken. My index=%d qdp_index=%d coords=%d,%d,%d,%d linear(=my_index?)=%d\n", my_node, my_index, qdp_index, gcoord[0],gcoord[1],gcoord[2],gcoord[3],linear);
            }
        }
    }


    /* Consistency check 2:
       convert my_index and the matching qdp_index both to coordinates;
       the two coordinate sets must agree. */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* My local index */
            my_index = site + subgrid_vol_cb*p;
            mySiteCoords4D(gcoord, my_node, my_index);

            qdp_index = site_table[ my_index ];
            QDP_getSiteCoords(gcoord2, my_node,qdp_index);

            for(mu=0 ; mu < 4; mu++) {
                if( gcoord2[mu] != gcoord[mu] ) {
                    printf("P%d: my_index=%d qdp_index=%d mySiteCoords=(%d,%d,%d,%d) QDPsiteCoords=(%d,%d,%d,%d)\n", my_node, my_index, qdp_index, gcoord[0], gcoord[1], gcoord[2], gcoord[3], gcoord2[0], gcoord2[1], gcoord2[2], gcoord2[3]);
                    /* The line above already shows all four components, so
                       report each mismatching site once and move on. */
                    break;
                }
            }

        }
    }

    /* Allocate the shift tables.
       There are 4 of them, indexed by table type:

         DECOMP_SCATTER      : decomp_scatter_index[mu][site]
         DECOMP_HVV_SCATTER  : decomp_hvv_scatter_index[mu][site]
         RECONS_MVV_GATHER   : recons_mvv_gather_index[mu][site]
         RECONS_GATHER       : recons_gather_index[mu][site]

       Each table is addressed as [dir + 4*site].
    */

    /* This 4 is for the 4 table types */
    if ((shift_table = (int **)malloc(4*sizeof(int*))) == 0 ) {
        QMP_error("init_wnxtsu3dslash: could not initialize shift_table");
        QMP_abort(1);

    }

    for(i=0; i < 4; i++) {
        /* This 4 is for the 4 comms directions */
        if ((shift_table[i] = (int *)malloc(4*subgrid_vol*sizeof(int))) == 0) {
            QMP_error("init_wnxtsu3dslash: could not initialize shift_table");
            QMP_abort(1);
        }
    }


    /* Initialize the boundary counters: bound[cb][type][dir] */
    for(cb=0; cb < 2; cb++) {
        for(dir=0; dir < 4; dir++) {
            bound[cb][0][dir] = 0;
            bound[cb][1][dir] = 0;
            bound[cb][2][dir] = 0;
            bound[cb][3][dir] = 0;
        }
    }


    for(cb=0; cb < 2; cb++) {
        for(site=0; site < subgrid_vol_cb; ++site) {

            index = cb*subgrid_vol_cb + site;

            /* Fetch site from site table */
            qdp_index = site_table[index];

            /* Get its coords */
            QDP_getSiteCoords(coord, my_node, qdp_index);

            /* Loop over directions building up shift tables */
            for(dir=0; dir < 4; dir++) {

                int fcoord[4], bcoord[4];
                int fnode, bnode;
                int blinear, flinear;

                /* Backward neighbour */
                offs(bcoord, coord, dir, -1);
                bnode   = QDP_getNodeNumber(bcoord);
                blinear = QDP_getLinearSiteIndex(bcoord);

                /* Forward neighbour */
                offs(fcoord, coord, dir, +1);
                fnode   = QDP_getNodeNumber(fcoord);
                flinear = QDP_getLinearSiteIndex(fcoord);

                /* Scatter:  decomp_{plus,minus} */
                /* Operation: a^F(shift(x,type=0),dir) <- decomp(psi(x),dir) */
                /* Send backwards - also called a receive from forward */
                if (bnode != my_node) {
                    /* Off node: encode as subgrid_vol_cb + running count of
                       the opposite checkerboard, then bump that count */
                    shift_table[DECOMP_SCATTER][dir+4*index]
                        = subgrid_vol_cb + bound[1-cb][DECOMP_SCATTER][dir];

                    bound[1-cb][DECOMP_SCATTER][dir]++;

                }
                else {
                    /* On node: store the neighbour's index within its
                       checkerboard, found through the reverse lookup */
                    shift_table[DECOMP_SCATTER][dir+4*index] =
                        invtab[blinear].linearcb;
                }


                /* Scatter:  decomp_hvv_{plus,minus} */
                /* Operation:  a^B(shift(x,type=1),dir) <- U^dag(x,dir)*decomp(psi(x),dir) */
                /* Send forwards - also called a receive from backward */
                if (fnode != my_node) {
                    /* Off node: same encoding as above, separate counter */
                    shift_table[DECOMP_HVV_SCATTER][dir+4*index]
                        = subgrid_vol_cb + bound[1-cb][DECOMP_HVV_SCATTER][dir];

                    bound[1-cb][DECOMP_HVV_SCATTER][dir]++;

                }
                else {
                    /* On node: forward neighbour's index within its checkerboard */
                    shift_table[DECOMP_HVV_SCATTER][dir+4*index]
                        = invtab[flinear].linearcb ;
                }


                /* Gather:  mvv_recons_{plus,minus} */
                /* Operation:  chi(x) <-  \sum_dir U(x,dir)*a^F(shift(x,type=2),dir) */
                /* Receive from forward */
                if (fnode != my_node) {
                    /* Off node: encode as 2*subgrid_vol_cb + running count
                       of this checkerboard, then bump that count */

                    shift_table[RECONS_MVV_GATHER][dir+4*index] =
                        2*subgrid_vol_cb + (bound[cb][RECONS_MVV_GATHER][dir]);

                    bound[cb][RECONS_MVV_GATHER][dir]++;

                }
                else {
                    /* On node: this is a gather after the shift, so the
                       index stored is that of the current site itself,
                       not of the neighbour */
                    shift_table[RECONS_MVV_GATHER][dir+4*index] =
                        invtab[qdp_index].linearcb ;
                }

                /* Gather:  recons_{plus,minus} */
                /* Operation:  chi(x) +=  \sum_dir recons(a^B(shift(x,type=3),dir),dir) */
                /* Receive from backward */
                if (bnode != my_node) {

                    shift_table[RECONS_GATHER][dir+4*index] =
                        2*subgrid_vol_cb + bound[cb][RECONS_GATHER][dir];

                    bound[cb][RECONS_GATHER][dir]++;

                }
                else {
                    /* On node: as for RECONS_MVV_GATHER, the current site's
                       own index is stored */

                    shift_table[RECONS_GATHER][dir+4*index] =
                        invtab[qdp_index].linearcb ;
                }
            }
        }
    }

    /* Sanity check - make sure the sending and receiving counters match */
    for(cb=0; cb < 2; cb++) {
        for(dir=0; dir < 4; dir++) {

            /* Both checkerboards must have the same number of boundary
               sites for every table type i */
            for(i = 0; i < 4; i++) {
                if (bound[1-cb][i][dir] != bound[cb][i][dir]) {

                    /* Report the table type that actually failed */
                    QMP_error("SSE Wilson dslash - make_shift_tables: type %d diff. cb send/recv counts do not match: %d %d",
                              i, bound[1-cb][i][dir],bound[cb][i][dir]);
                    QMP_abort(1);
                }
            }


        }
    }

    /* Now build the offset table of pointers into the half spinor storage.

       For every (table type, site, dir) the shift table holds either
         - an on-node index (below the off-node threshold), or
         - an off-node index: threshold + position in the comms buffer,
           where the threshold is subgrid_vol_cb for the two DECOMP tables
           and 2*subgrid_vol_cb for the two RECONS tables.

       On-node entries become   chiN + shift + subgrid_vol_cb*dir,
       i.e. dir is the slowest varying index and selects a block of
       subgrid_vol_cb half spinors inside chi1 / chi2.

       Off-node entries become  buf[k][num] + (shift - threshold),
       where num counts the directions, in increasing dir, that had at
       least one off-node site.

       NOTE(review): an earlier version of this comment described a
       per-direction stride of 3*subgrid_vol_cb (body plus two tails); the
       code below uses subgrid_vol_cb with separate send/recv buffers --
       confirm against the allocation of chi1/chi2.
    */

    /* 4 dims, 4 types, plus 63 bytes of slack for 64-byte alignment */
    xoffset_table = (halfspinor_array **)malloc(4*4*subgrid_vol*sizeof(halfspinor_array*)+63L);
    if( xoffset_table == 0 ) {
        QMP_error("init_wnxtsu3dslash: could not initialize offset_table[i]");
        QMP_abort(1);
    }
    /* Round up to a multiple of 64 */
    offset_table = (halfspinor_array**)((((ptrdiff_t)(xoffset_table)) + 63L) & (-64L));

    /* Walk through the shift_table and remap the offsets into actual
       pointers */

    /* DECOMP_SCATTER */
    num=0;
    for(dir =0; dir < Nd; dir++) {

        /* Loop through all the sites. Remap the offsets either to local
           arrays or to the send buffers */
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[DECOMP_SCATTER][dir+4*site];
            if( offset >= subgrid_vol_cb ) {
                /* Off-node site: its address is in the send-backward buffer */
                /* send to back index = recv from forward index = 0  */
                offsite_found++;
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] =
                    send_bufs[0][num]+(offset - subgrid_vol_cb);
            }
            else {
                /* On-node site: DECOMP_SCATTER is relative to chi1 */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] =
                    chi1+shift_table[DECOMP_SCATTER][dir+4*site]+subgrid_vol_cb*dir;
            }
        }

        if( offsite_found > 0 ) {
            /* This direction used a send buffer, so the next direction
               that communicates takes the next buffer */
            num++;
        }
    }

    /* DECOMP_HVV_SCATTER */
    /* Restart the buffer counter */
    num=0;
    for(dir =0; dir <Nd; dir++) {
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[DECOMP_HVV_SCATTER][dir+4*site];
            if( offset >= subgrid_vol_cb ) {
                /* Off-node site: its address is in the send-forward buffer */
                /* send to forward / receive from backward index = 1 */
                offsite_found++;

                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] =
                    send_bufs[1][num]+(offset - subgrid_vol_cb);
            }
            else {
                /* On-node site: DECOMP_HVV_SCATTER is relative to chi2 */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] =
                    chi2+shift_table[DECOMP_HVV_SCATTER][dir+4*site ]+subgrid_vol_cb*dir;
            }
        }
        if( offsite_found > 0 ) {
            num++;
        }
    }

    /* RECONS_MVV_GATHER */
    num=0;
    for(dir =0; dir <Nd; dir++) {
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[RECONS_MVV_GATHER][dir+4*site];
            if( offset >= 2*subgrid_vol_cb ) {
                /* Off-node site: its address is in the receive-from-forward buffer */
                /* recv from front index = send to back index = 0 */
                offsite_found++;
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] =
                    recv_bufs[0][num]+(offset - 2*subgrid_vol_cb);
            }
            else {
                /* On-node site: RECONS_MVV_GATHER is relative to chi1 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] =
                    chi1+shift_table[RECONS_MVV_GATHER][dir+4*site ]+subgrid_vol_cb*dir;
            }
        }
        if( offsite_found > 0 ) {
            num++;
        }
    }

    /* RECONS_GATHER */
    num=0;
    for(dir =0; dir <Nd; dir++) {
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[RECONS_GATHER][dir+4*site];
            if( offset >= 2*subgrid_vol_cb ) {
                /* Off-node site: its address is in the receive-from-backward buffer */
                /* receive from back = send to forward index = 1 */
                offsite_found++;
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER) ] =
                    recv_bufs[1][num]+(offset - 2*subgrid_vol_cb);
            }
            else {
                /* On-node site: RECONS_GATHER is relative to chi2 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER ) ] =
                    chi2+shift_table[RECONS_GATHER][dir+4*site ]+subgrid_vol_cb*dir;
            }
        }
        if( offsite_found > 0 ) {
            num++;
        }
    }



    /* Free the shift tables - from here on only offset_table is used */
    for(i=0; i < 4; i++) {
        free( (shift_table)[i] );
    }
    free( shift_table );

    /* The inverse table was only needed while building the shift tables */
    free( xinvtab );

}