int
main (int argc, char** argv)
{
  QMP_status_t status;
  int this_node;
  QMP_thread_level_t req, prv;

  /* Start QMP */
  req = QMP_THREAD_SINGLE;
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);

  if (status != QMP_SUCCESS) {
    QMP_error ("QMP_init failed: %s\n", QMP_error_string(status));
    QMP_abort(1);
  }

  /* Get my logical node number */
  this_node = QMP_get_node_number();

  /* Print the result */
  printf("%04d",this_node);

  /* Quit */
  QMP_finalize_msg_passing ();

  return 0;
}
Exemple #2
0
/* creates data array if necessary and destroys pointers */
void
QDP_switch_ptr_to_data(QDP_data_common_t *dc)
{
  ENTER;
  if(*(dc->data)==NULL) {
    //*(dc->data) = (char *) malloc(QDP_sites_on_node*dc->size);
    dc->qmpmem = QMP_allocate_aligned_memory( QDP_sites_on_node_L(dc->lat)*dc->size,
					      QDP_mem_align, QDP_mem_flags );
    if(!dc->qmpmem) {
      QMP_error("QDP error: can't allocate memory\n");
      QDP_abort(1);
    }
    *(dc->data) = QMP_get_memory_pointer(dc->qmpmem);
  } else {
    QDP_clear_valid_shift_dest(dc);
  }
  if(*(dc->ptr)!=NULL) {
    QDP_finish_shifts(dc);
    if(!dc->discarded) QDP_copy_ptr_to_data(dc);
    QDP_clear_shift_src(dc);
    free((void*)*(dc->ptr));
    *(dc->ptr) = NULL;
  }
  LEAVE;
}
Exemple #3
0
unsigned int sync() {
QMP_status_t sync_status = QMP_barrier(); 
if (sync_status != QMP_SUCCESS) {
      QMP_error("Error in QMP sync:%s\n", QMP_error_string(sync_status));
      return 0;
}
return 1;
}
Exemple #4
0
QMP_status_t
QMP_comm_binary_reduction_mpi(QMP_comm_t comm, void *lbuffer, size_t count, QMP_binary_func bfunc)
{
  QMP_status_t status = QMP_SUCCESS;
  ENTER;

  QMP_assert(qmp_user_bfunc==NULL);

  /* set up user binary reduction pointer */
  qmp_user_bfunc = bfunc;

  if(!op_inited) {
    status = MPI_Op_create(qmp_bfunc_mpi, 1, &bop);
    if (status != MPI_SUCCESS) {
      QMP_error ("Cannot create MPI operator for binary reduction.\n");
      goto leave;
    }
    op_inited = 1;
  }

  char *rbuffer;
  QMP_alloc(rbuffer, char, count);

  int err =
    MPI_Allreduce(lbuffer,rbuffer,count, MPI_BYTE, bop, comm->mpicomm);

  if(err != MPI_SUCCESS) status = err;
  else {
    memcpy (lbuffer, rbuffer, count);
    QMP_free(rbuffer);
  }

  /* signal end of the binary reduction session */
  qmp_user_bfunc = NULL;

 leave:
  LEAVE;
  return QMP_SUCCESS;
}
Exemple #5
0
CPS_START_NAMESPACE
#ifndef USE_QMP
#define QMP
#endif

void GlobalDataShift::Shift(int direction, int n_disp){
  if (n_disp==0) return;
 
  SCUDir s_dir,r_dir;
  int a_disp;
  void *send_p,*recv_p,*temp_p;
#ifndef USE_QMP
  if (n_disp>0){
   a_disp = n_disp;
   s_dir = gjp_scu_dir[i*2];
   r_dir = gjp_scu_dir[i*2+1];
  } else {
   a_disp = -n_disp;
   s_dir = gjp_scu_dir[i*2+1];
   r_dir = gjp_scu_dir[i*2];
  }
#else
//  int direction = i;
  int sflag;
  if (n_disp > 0)
    sflag = +1;
  else
    sflag = -1;
#endif

  send_p = addr;
  recv_p = temp_buf;

#ifndef USE_QMP
  SCUDirArgIR Send(send_p,s_dir,SCU_SEND,data_len);
  SCUDirArgIR Recv(recv_p,r_dir,SCU_REC,data_len);
#else
  QMP_msgmem_t msgmem[2];
  QMP_msghandle_t msghandle[2];
  QMP_status_t status;
  QMP_msghandle_t multiple;
#endif

//  sys_cacheflush(0);
  for(int i = 0;i<a_disp-1;i++){

#ifndef USE_QMP
    Send.StartTrans();Recv.StartTrans();
    Send.TransComplete();Recv.TransComplete();
#else
    msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len);
    msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len);
    msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0);
    msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0);
    multiple = QMP_declare_multiple(msghandle, 2);
    QMP_start(multiple);
    status = QMP_wait(multiple);
    if (status != QMP_SUCCESS)
      QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status));
    QMP_free_msghandle(multiple);
    QMP_free_msgmem(msgmem[0]);
    QMP_free_msgmem(msgmem[1]);
#endif
  
    temp_p = send_p;
    send_p = recv_p;
    recv_p = temp_p;

#ifndef USE_QMP
    Send.Addr(send_p);
    Recv.Addr(recv_p);
#endif
  }
#ifndef USE_QMP
  Send.StartTrans();Recv.StartTrans();
  Send.TransComplete();Recv.TransComplete();
#else
  msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len);
  msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len);
  msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0);
  msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0);
  multiple = QMP_declare_multiple(msghandle, 2);
  QMP_start(multiple);
  status = QMP_wait(multiple);
  if (status != QMP_SUCCESS)
    QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status));
  QMP_free_msghandle(multiple);
  QMP_free_msgmem(msgmem[0]);
  QMP_free_msgmem(msgmem[1]);
#endif
  
  if (recv_p != addr)
    memcpy(addr,recv_p,data_len);
}
Exemple #6
0
void PT::mat_cb_norm(int n, IFloat **mout, IFloat **min, const int *dir, int
parity, IFloat * gauge)
{
  //List of the different directions
  int wire[MAX_DIR];
  int i;
//  printf("PT::mat_cb_norm\n");

  QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vec_cb_norm", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vec_cb_norm", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t multiple;
  static int call_num = 0;
  int vlen = VECT_LEN;
  int vlen2 = VECT_LEN;

  call_num++;
  
  //Name our function
  char *fname="pt_mat_cb()";
  //  VRB.Func("",fname);
  
  //Set the transfer directions
  //If wire[i] is even, then we have communication in the negative direction
  //If wire[i] is odd, then we have communication in the positive direction
  for(i=0;i<n;i++)
    wire[i]=dir[i];

#ifdef PROFILE
  Float dtime  = - dclock();
#endif
  int non_local_dir=0;

//#pragma omp parallel default(shared)
{

  //If wire[i] is odd, then we have parallel transport in the
  //positive direction.  In this case, multiplication by the link matrix is
  //done before the field is transferred over to the adjacent node
  //
  //If we have transfer in the negative T direction (wire[i] = 6), then
  //we have to copy the appropriate fields to a send buffer
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      {
	if(wire[i]%2)
	  {
	    if(conjugated)
	      pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge);
	    else
	      pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge);
	  }
	else if((wire[i] == 6))
	  {
	    for(int j = 0; j < non_local_chi_cb[6];j++)
	      memcpy(snd_buf_t_cb + j*GAUGE_LEN,min[i] + 3 * *(Toffset[parity]+j)*3,GAUGE_LEN*sizeof(IFloat));
	  }
      }
    }

//#pragma omp barrier
//#pragma omp master 
{
  for(i=0;i<n;i++)
    if(!local[wire[i]/2])
    {
      //Calculate the starting address for the data to be sent
      IFloat *addr = min[i] + GAUGE_LEN * offset_cb[wire[i]];

      msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      
      //Initialize the msg_mem for sends
      if(wire[i]%2) 
	msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_cb[wire[i]/2], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      else if(wire[i] == 6)
	msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_t_cb, 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      else
	msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen_cb[wire[i]]), numblk_cb[wire[i]], (ptrdiff_t)(3*stride_cb[wire[i]]+3*blklen_cb[wire[i]]));
      
      msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
      msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

      non_local_dir++;

    }

  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);
    QMP_start(multiple);
  }
} //#pragma omp master {

  //Do local calculations
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if((wire[i]%2 && conjugated) || ((wire[i]%2 == 0) && (conjugated == 0)))
	pt_cmm_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge);
      else
	pt_cmm_dag_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge);
    }

//#pragma omp barrier
//#pragma omp master 
{
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*non_local_dir; i++)
      QMP_free_msgmem(msg_mem_p[i]);
    Free(msg_handle_p);
    Free(msg_mem_p);
  }
} //#pragma omp master {

  //If wire[i] is even, then we have transport in the negative direction
  //In this case, the vector field is multiplied by the SU(3) link matrix
  //after all communication is complete
  IFloat *fp0,*fp1;
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      	{
	  if(!(wire[i]%2))
	    {
	      if(conjugated)
		pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge);
	      else
		pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge);
	    }
	  //Otherwise we have parallel transport in the positive direction.
	  //In this case, the received data has already been pre-multiplied
	  //All we need to do is to put the transported field in the correct place
	  else
	    {
	      //int destination, source;
	      //Place the data in the receive buffer into the result vector
	      for(int s=0;s<non_local_chi_cb[wire[i]];s++)
		{
		  //source = uc_nl_cb[parity][wire[i]][s].src;
		  fp0 = (IFloat *)((long)rcv_buf[wire[i]]+3*uc_nl_cb[parity][wire[i]][s].src);
		  //destination = uc_nl_cb[parity][wire[i]][s].dest;
		  fp1 = (IFloat *)(mout[i]+3*uc_nl_cb[parity][wire[i]][s].dest);
		  memcpy(fp1,fp0,GAUGE_LEN*sizeof(IFloat));
		}
	    }
	}
    }

} //#pragma omp parallel
#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,99*vol*n,dtime);
#endif
//  ParTrans::PTflops +=99*n*vol;
}
Exemple #7
0
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){
    
  int wire[MAX_DIR];
  int i;
  QMP_msgmem_t msg_mem_p[2*MAX_DIR];
  QMP_msghandle_t msg_handle_p[2*MAX_DIR];
  QMP_msghandle_t multiple;
  static double setup=0.,qmp=0.,localt=0.,nonlocal=0.;
  static int call_num = 0;

  call_num++;
//  char *fname="pt_mat()";
//  VRB.Func("",fname);
//  if (call_num%100==1) printf("PT:mat()\n");

  
  for(i=0;i<n;i++) wire[i] = dir[i]; 
#ifdef PROFILE
  Float dtime2  = - dclock();
#endif

  double dtime = -dclock();

  int non_local_dir=0;
  
  for(i=0;i<n;i++)
  if (!local[wire[i]/2]) {
    //Calculate the address for transfer in a particular direction
    Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]);
    msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
    msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
    
    msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
    msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

    non_local_dir++;
  }
  if (call_num==1 && !QMP_get_node_number())
	printf("non_local_dir=%d\n",non_local_dir);

  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);
    QMP_start(multiple);
  }

  dtime += dclock();
  setup +=dtime;
  dtime = -dclock();
  int if_print = 0;
//  if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1;

#define USE_TEST2
#ifdef USE_TEST2
//assume nt > n!
    static char *cname="mat()";
#pragma omp parallel default(shared)
{
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  }
  int w_t = wire[n_t];
  ipoints = (local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Interleaving of local computation of matrix multiplication
  partrans_cmm_agg((uc_l[w_t]+offset*2),min[n_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
}
#else
{
  //Interleaving of local computation of matrix multiplication
#pragma omp parallel for default(shared)
  for(i=0;i<n;i++){
    partrans_cmm_agg(uc_l[wire[i]],min[i],mout[i],local_chi[wire[i]]/2);
  }
}
#endif

  dtime += dclock();
  localt +=dtime;
  dtime = -dclock();
//#pragma omp barrier
//#pragma omp master 
{
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*non_local_dir; i++)
      QMP_free_msgmem(msg_mem_p[i]);
//    Free(msg_handle_p);
//    Free(msg_mem_p);
  }
} //#pragma omp master {
  dtime += dclock();
  qmp +=dtime;
  dtime = -dclock();

  //Do non-local computations
#ifdef USE_TEST2
//assume nt > n!
#pragma omp parallel default(shared)
{
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  }
  int w_t = wire[n_t];
  ipoints = (non_local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Non-local computation
  if (ipoints>0)
  partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
}
#else
{
#pragma omp parallel for
  for(i=0;i<n;i++) 
  if (!local[wire[i]/2]) {
#ifdef USE_OMP
    if (call_num%10000==1 && !QMP_get_node_number() ) 
      printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i);
#endif
    partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2);
  }

}//#pragma omp parallel
#endif

  dtime += dclock();
  nonlocal +=dtime;

  if (call_num%100==0){
    static char *cname="mat()";
    if (!QMP_get_node_number() ) {
    print_flops("mat():local*100",0,localt);
    print_flops("mat():nonlocal*100",0,nonlocal);
    print_flops("mat():qmp*100",0,qmp);
    print_flops("mat():setup*100",0,setup);
    }
    localt=nonlocal=qmp=setup=0.;
  }


#ifdef PROFILE
  dtime2 +=dclock();
  print_flops("",fname,198*vol*n,dtime2);
#endif
//  ParTrans::PTflops +=198*n*vol;
}
Exemple #8
0
/*! 
  Computes sum[x] = vect2[x] vect[x + hop dir]^dagger
  where the sum is over n_vect vectors and the hop is in a forward direction.
*/
void PT::vvpd(IFloat **vect2, IFloat ***vect, int n_vect, const int *dir, int n_dir, int hop, IFloat **sum, int overwrite){

  char *fname = "pt_vvpd()";
#if 1
//  ERR.NotImplemented(cname,fname);
   QMP_error("%s""%s Not implemented\n");
#else
//  VRB.Func("",fname);
  int i, s, v;
  Float f = 2.0;
  int wire[MAX_DIR];
  for(i=0;i<n_dir;i++) wire[i] = dir[i]; // from (x,y,z,t) to (t,x,y,z)

  QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msgmem_t *msg_mem_p2 = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t* msg_handle_p2 = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t multiple;

  //Setup communciation
  int comms=0;
  for(i=0;i<n_dir;i++)
  if( !local[wire[i]/2]) {
    if ( size[wire[i]/2] <hop)
      fprintf(stderr, 
		"%s:size(%d) in direction %d is smaller than the hop(%d)\n",
		fname,size[wire[i]],wire[i],hop);

    comms++;
  }

  for(v=0; v<n_vect; v++){


    if (v%2==0) {
      comms=0;
      for(i=0;i<n_dir;i++)
        if( !local[wire[i]/2]){ 
	  msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));    
	  msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0);
	  msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]]));
	  msg_handle_p[2*comms+1] =  QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0);
          comms++;
        }

      // Start communication
      if(comms) {
	multiple = QMP_declare_multiple(msg_handle_p, 2*comms);
      }
      if (comms) {
	QMP_start(multiple);
	QMP_status_t qmp_complete_status = QMP_wait(multiple);
	if (qmp_complete_status != QMP_SUCCESS)
	  QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status));
	QMP_free_msghandle(multiple);
	for(int i = 0; i < 2*comms; i++) 
	  QMP_free_msgmem(msg_mem_p[i]);
      }

    } else {
      comms=0;
      for(i=0;i<n_dir;i++)
        if( !local[wire[i]/2]){ 
	  msg_mem_p2[2*comms] = QMP_declare_msgmem((void *)rcv_buf2[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));  
	  msg_handle_p2[2*comms] = QMP_declare_receive_relative(msg_mem_p2[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0);  
	  msg_mem_p2[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]]));
	  msg_handle_p2[2*comms+1] =  QMP_declare_send_relative(msg_mem_p2[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0);
          comms++;
	}

      // Start communication
      if(comms) {
	multiple = QMP_declare_multiple(msg_handle_p2, 2*comms);
      }
      if (comms) {
	QMP_start(multiple);
	QMP_status_t qmp_complete_status = QMP_wait(multiple);
	if (qmp_complete_status != QMP_SUCCESS)
	  QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status));
      QMP_free_msghandle(multiple);
      for(int i = 0; i < 2*comms; i++) 
	QMP_free_msgmem(msg_mem_p2[i]);
      }
    }

     
    

    // Perform non-local calculation for previous v
    if (v>0)
      if (v==1 && overwrite==1) {
	for(i=0; i<n_dir; i++)
	  if(non_local_chi[wire[i]]>0)
	    cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
			   src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
      } else if (v%2==1) {
	for(i=0; i<n_dir; i++) 
	  if(non_local_chi[wire[i]]>0)
	    cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
		      src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
      } else {
	for(i=0; i<n_dir; i++) 
	  if(non_local_chi[wire[i]]>0)
	    cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]],
		      src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
      }
    
    // Perform local calculation for current v
    if (v==0 && overwrite==1)
      {
	for(i=0; i<n_dir; i++)
	  if((vol-hop*non_local_chi[wire[i]])>0)
	    cross_over_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]);
      }
    else
      {
	for(i=0; i<n_dir; i++)
	  if((vol-hop*non_local_chi[wire[i]])>0)
	    cross_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]);
      }

  }


  if (v==1 && overwrite==1) {
    for(i=0; i<n_dir; i++) 
      if(non_local_chi[wire[i]]>0)
	cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
		       src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
  } else if (v%2==1) {
    for(i=0; i<n_dir; i++)
      if(non_local_chi[wire[i]]>0)
	cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
		  src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
  } else {
    for(i=0; i<n_dir; i++) 
      if(non_local_chi[wire[i]]>0)
	cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]],
		  src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
  }
#endif
  
  //  ParTrans::PTflops += 90*n_vect*n_dir*vol;
}
Exemple #9
0
//! u[x] = v[x+dir] for n_dir forward or backward directions dir.
void PT::shift_field(IFloat **v, const int *dir, int n_dir,
		     int hop, IFloat **u){
  
  int i, length;
  int wire[n_dir];
  for (i=0; i<n_dir;i++) wire[i] = dir[i];
#ifdef USE_QMP
  QMP_msgmem_t msg_mem_p[20];
  QMP_msghandle_t msg_handle_p[20];
  QMP_msghandle_t multiple;
#else
  SCUDirArgMulti SCUmulti;
  SCUDirArgIR *SCUarg_p[2*n_dir];
#endif
  
  
  int comms=0;
  for (i=0; i<n_dir; i++) 
  if (!local[wire[i]/2]){
#ifndef USE_QMP
    SCUarg_p[2*comms] = SCUarg_mat[hop-1][2*wire[i]];
    SCUarg_p[2*comms+1] = SCUarg_mat[hop-1][2*wire[i]+1];
    SCUarg_p[2*comms+1]->Addr((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)));
#else
    msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
    msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)), (size_t)(3*hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
    
    msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0);
    msg_handle_p[2*comms+1] = QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0);
#endif
   
    comms++;
  }

#ifndef USE_QMP
  if (comms) SCUmulti.Init(SCUarg_p,2*comms);
  if (comms) SCUmulti.SlowStartTrans();
#else
  if(comms) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*comms);
    QMP_start(multiple);
  }
#endif
  
//  SCUmulti.TransComplete();
  
  for (i=0; i<n_dir; i++) {
    length = vol-hop*non_local_chi[wire[i]];
    copy_matrix(u[i],v[i],&length,dest_l[hop-1][wire[i]],
		src_l[hop-1][wire[i]]);
  }
#ifndef USE_QMP
  if (comms) SCUmulti.TransComplete();
#else
  if(comms) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in shift_field: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*comms; i++)
      QMP_free_msgmem(msg_mem_p[i]);
  }
#endif


  for (i=0; i<n_dir; i++) {
    length = hop*non_local_chi[wire[i]];
    copy_matrix(u[i],(IFloat*)rcv_buf[wire[i]],&length,
		dest_nl[hop-1][wire[i]],src_nl[hop-1][wire[i]]);
  }
}
Exemple #10
0
void init_qmp(int * argc, char ***argv) {

#if 0
  printf("init_qmp(%d %p)\n",*argc,*argv);
  for(int i = 0; i<*argc;i++){
    printf("argv[%d](before)=%s\n",i,(*argv)[i]); 
  }
#endif

#if 0
   spi_init();
#endif
  
    QMP_thread_level_t prv;
#ifndef UNIFORM_SEED_NO_COMMS
    QMP_status_t init_status = QMP_init_msg_passing(argc, argv, QMP_THREAD_SINGLE, &prv);
    if (init_status) printf("QMP_init_msg_passing returned %d\n",init_status);
    peRank = QMP_get_node_number();
    peNum = QMP_get_number_of_nodes();
    if(!peRank)printf("QMP_init_msg_passing returned %d\n",init_status);

    if (init_status != QMP_SUCCESS) {
      QMP_error("%s\n",QMP_error_string(init_status));
    }

    // check QMP thread level
    // Added by Hantao
    if(peRank == 0) {
        switch(prv) {
        case QMP_THREAD_SINGLE:
            printf("QMP thread level = QMP_THREAD_SINGLE\n");
            break;
        case QMP_THREAD_FUNNELED:
            printf("QMP thread level = QMP_THREAD_FUNNELED\n");
            break;
        case QMP_THREAD_SERIALIZED:
            printf("QMP thread level = QMP_THREAD_SERIALIZED\n");
            break;
        case QMP_THREAD_MULTIPLE:
            printf("QMP thread level = QMP_THREAD_MULTIPLE\n");
            break;
        default:
            printf("QMP thread level = no idea what this is, boom!\n");
        }
    }

    //Check to make sure that this machine is a GRID machine
    //Exit if not GRID machine
    QMP_ictype qmp_type = QMP_get_msg_passing_type();

    //Get information about the allocated machine
    peNum = QMP_get_number_of_nodes();
    NDIM = QMP_get_allocated_number_of_dimensions();
    peGrid = QMP_get_allocated_dimensions();
    pePos = QMP_get_allocated_coordinates();

    if(peRank==0){
      for(int i = 0; i<*argc;i++){
        printf("argv[%d])(after)=%s\n",i,(*argv)[i]); 
      }
    }
#else
    QMP_status_t init_status = QMP_SUCCESS;
    peRank=0;
    peNum=1;
    NDIM=4;
#endif

//#if (TARGET == BGL) || (TARGET == BGP)
  if (NDIM>5){
    peNum = 1;
    for(int i = 0;i<5;i++)
	peNum *= peGrid[i];
    peRank = peRank % peNum;
  }
  int if_print=1;
  for(int i = 0;i<NDIM;i++)
  if (pePos[i]>=2) if_print=0;

  if (if_print){
      printf("Rank=%d Num=%d NDIM=%d\n",peRank,peNum,NDIM);
      printf("dim:");
      for(int i = 0;i<NDIM;i++)
        printf(" %d",peGrid[i]);
      printf("\n");
      printf("pos:");
      for(int i = 0;i<NDIM;i++)
        printf(" %d",pePos[i]);
      printf("\n");

#if 0
    int rc;
    BGLPersonality pers;
    rts_get_personality(&pers, sizeof(pers));
    printf("from personality: %d %d %d %d\n",pers.xCoord,pers.yCoord,pers.zCoord,rts_get_processor_id());
#endif
  }


//     printf("from personality:\n");

#if 0
    if ( (qmp_type!= QMP_GRID) && (qmp_type !=QMP_MESH)  ) {
      QMP_error("CPS on QMP only implemented for GRID or MESH, not (%d) machines\n",qmp_type);
    }
#endif

//     printf("QMP_declare_logical_topology(peGrid, NDIM)\n");
#ifndef UNIFORM_SEED_NO_COMMS
    //Declare the logical topology (Redundant for GRID machines)
    if (QMP_declare_logical_topology(peGrid, NDIM) != QMP_SUCCESS) {
      QMP_error("Node %d: Failed to declare logical topology\n",peRank);
      exit(-4);
    }
#endif
    initialized = true;
  printf("Rank=%d init_qmp() done\n",peRank);
    
  }
Exemple #11
0
int
main (int argc, char** argv)
{
  int             i, nc;
  QMP_status_t      status;
  int       **smem, **rmem;
  QMP_msgmem_t    *recvmem;
  QMP_msghandle_t *recvh;
  QMP_msgmem_t    *sendmem;
  QMP_msghandle_t *sendh;
  struct perf_argv pargv;
  QMP_thread_level_t req, prv;

  /** 
   * Simple point to point topology 
   */
  int dims[4] = {2,2,2,2};
  int ndims = 1;

  //if(QMP_get_node_number()==0)
  //printf("starting init\n"); fflush(stdout);
  req = QMP_THREAD_SINGLE;
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "QMP_init failed\n");
    return -1;
  }
  if(QMP_get_node_number()==0)
    printf("finished init\n"); fflush(stdout);

  if (parse_options (argc, argv, &pargv) == -1) {
    if(QMP_get_node_number()==0)
      usage (argv[0]);
    exit (1);
  }

  {
    int maxdims = 4;
    int k=0;
    int nodes = QMP_get_number_of_nodes();
    ndims = 0;
    while( (nodes&1) == 0 ) {
      if(ndims<maxdims) ndims++;
      else {
	dims[k] *= 2;
	k++;
	if(k>=maxdims) k = 0;
      }
      nodes /= 2;
    }
    if(nodes != 1) {
      QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes());
      QMP_error(" must power of 2");
      QMP_abort(1);
    }
    pargv.ndims = ndims;
  }

  status = QMP_declare_logical_topology (dims, ndims);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "Cannot declare logical grid\n");
    return -1;
  }

  /* do a broadcast of parameter */
  if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) {
    QMP_printf ("Broadcast parameter failed\n");
    exit (1);
  }

  {
    int k=1;
    const int *lc = QMP_get_logical_coordinates();
    for(i=0; i<ndims; i++) k += lc[i];
    pargv.sender = k&1;
  }

  QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ",
	     argv[0], pargv.num_channels, pargv.verify, 
	     pargv.option, pargv.size, pargv.loops, pargv.sender,
	     strided_send, strided_recv, strided_array_send);
  fflush(stdout);


  /**
   * Create memory
   */
  nc = pargv.num_channels;
  smem = (int **)malloc(nc*sizeof (int *));
  rmem = (int **)malloc(nc*sizeof (int *));
  sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));
  recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));

  QMP_barrier();
  if(QMP_get_node_number()==0) printf("\n"); fflush(stdout);
  if(pargv.option & TEST_SIMUL) {
    int opts = pargv.option;
    pargv.option = TEST_SIMUL;
    if(QMP_get_node_number()==0)
      QMP_printf("starting simultaneous sends"); fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_simultaneous_send (smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished simultaneous sends\n"); fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_PINGPONG) {
    int opts = pargv.option;
    pargv.option = TEST_PINGPONG;
    if(QMP_get_node_number()==0)
      QMP_printf("starting ping pong sends"); fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      if(pargv.verify)
	test_pingpong_verify(smem, rmem, sendh, recvh, &pargv);
      else
	test_pingpong(smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished ping pong sends\n"); fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_ONEWAY) {
    int opts = pargv.option;
    pargv.option = TEST_ONEWAY;
    if(QMP_get_node_number()==0)
      QMP_printf("starting one way sends"); fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_oneway (smem, rmem, sendh, recvh, &pargv);
      if(!pargv.sender) check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished one way sends"); fflush(stdout);
    pargv.option = opts;
  }


  /**
   * Free memory 
   */
  free (smem);
  free (rmem);

  free (sendh);
  free (recvh);

  free (sendmem);
  free (recvmem);

  QMP_finalize_msg_passing ();

  return 0;
}
void dwf_dslash_5_plus_slice(Vector *out, 
		       Vector *in, 
		       Float mass,
		       int dag, 
		       Dwf *dwf_lib_arg, 
		       int s_slice)
{
  int x;
  int s;

// Initializations
//------------------------------------------------------------------
#if 0
  int local_ls = GJP.SnodeSites(); 
  int s_nodes = GJP.Snodes();
  int s_node_coor = GJP.SnodeCoor();
  int vol_4d_cb = dwf_lib_arg->vol_4d / 2;
  int ls_stride = 24 * vol_4d_cb;
#endif

  IFloat *f_in;
  IFloat *f_out;
  IFloat *f_temp;
  IFloat *comm_buf = dwf_lib_arg->comm_buf;
  IFloat two_over_a5 = 2.0 * GJP.DwfA5Inv();
  IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv();

// [1 + gamma_5] term (if dag=1 [1 - gamma_5] term)
//
// out[s] = [1 + gamma_5] in[s-1]
//------------------------------------------------------------------

  if (s_slice<0 || s_slice >=local_ls)
  ERR.General("","dwf_dslash_5_plus_slice","s_slice=%d local_ls=%d!\n",s_slice,local_ls);
  
  if(s_slice>0 ){
    f_in  = (IFloat *) in;
    f_out = (IFloat *) out;
    f_in += (s_slice-1)*ls_stride;
    f_out += (s_slice)*ls_stride;
    if(dag == 1){
      f_in  =  f_in + 12;
      f_out = f_out + 12;
    }
    FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb);
  }


// [1 + gamma_5] for lower boundary term (if dag=1 [1 - gamma_5] term)
// If there's only one node along fifth direction, no communication
// is necessary; Otherwise data from adjacent node in minus direction
// will be needed.
// If the lower boundary is the s=0 term
// out[0] = - m_f * [1 + gamma_5] in[ls-1]
// else, out[s] = [1 + gamma_5] in[s-1]
//
//------------------------------------------------------------------

if (s_slice ==  0 ){
  f_in  = (IFloat *) in;  
  f_in = f_in + (local_ls-1)*ls_stride; 
  f_out = (IFloat *) out;
  
  if(dag == 1){
    f_in  =  f_in + 12;
    f_out = f_out + 12;
  }
  
  f_temp = f_in;
  if (s_nodes > 1 ) {
#ifdef USE_GETPLUS
    getMinusData(comm_buf, f_in, 24*vol_4d_cb, 4);
    f_temp = comm_buf;
#else
      QMP_status_t send_status = QMP_wait(msghandle_down[0]);
      if (send_status != QMP_SUCCESS) 
      QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status));
      QMP_status_t recv_status = QMP_wait(msghandle_down[1]);
    if (recv_status != QMP_SUCCESS)
      QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status));
     f_temp = rbuf_down; 
     if(dag == 1) f_temp = f_temp + 12;
#endif
  }
    if(s_node_coor == 0) { 
      FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb);
    } else {
      FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb);
    }
}


// [1 - gamma_5] term (if dag=1 [1 + gamma_5] term)
// 
// out[s] = [1 - gamma_5] in[s+1]
//------------------------------------------------------------------
if(s_slice > 0 ){
  f_in  = (IFloat *) in;
  f_out = (IFloat *) out;
  f_in += (s_slice)*ls_stride;
  f_out += (s_slice-1)*ls_stride;
  if(dag == 0){
    f_in  =  f_in + 12;
    f_out = f_out + 12;
  }
  FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb);
}


// [1 - gamma_5] for upper boundary term (if dag=1 [1 + gamma_5] term)
// If there's only one node along fifth direction, no communication
// is necessary; Otherwise data from adjacent node in minus direction
// will be needed.
// If the upper boundary is the s=ls term
// out[ls-1] = - m_f * [1 - gamma_5] in[0]
// else out[s] = [1 - gamma_5] in[s+1]
//
//------------------------------------------------------------------

if(s_slice == (local_ls-1) ){
  f_in  = (IFloat *) in;
  f_out = (IFloat *) out;

  if(dag == 0){
    f_in  =  f_in + 12;
    f_out = f_out + 12;
  }

  f_out = f_out + (local_ls-1)*ls_stride;
    f_temp = f_in;
  if (s_nodes > 1 ) {
#ifdef USE_GETPLUS
    getPlusData(comm_buf, f_in, 24*vol_4d_cb, 4);
    f_temp = comm_buf;
#else
      QMP_status_t send_status = QMP_wait(msghandle_up[0]);
      if (send_status != QMP_SUCCESS) 
      QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status));
      QMP_status_t recv_status = QMP_wait(msghandle_up[1]);
    if (recv_status != QMP_SUCCESS)
      QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status));
     f_temp = rbuf_up; 
     if(dag == 0) f_temp = f_temp + 12;
#endif
  }
    if(s_node_coor == s_nodes - 1) { 
      FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb);
    } else {
      FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb);
    }
}
//  DiracOp::CGflops+=2*2*vol_4d_cb*local_ls*12;


}
void make_shift_tables(int bound[2][4][4], halfspinor_array* chi1,
                       halfspinor_array* chi2,

                       halfspinor_array* recv_bufs[2][4],
                       halfspinor_array* send_bufs[2][4],

                       void (*QDP_getSiteCoords)(int coord[], int node, int linearsite),
                       int (*QDP_getLinearSiteIndex)(const int coord[]),

                       int (*QDP_getNodeNumber)(const int coord[]))
{
    volatile int dir,i;
    const int my_node = QMP_get_node_number();
    int coord[4];
    int gcoord[4];
    int gcoord2[4];

    int linear;
    int **shift_table;
    int x,y,z,t;
    int *subgrid_size = getSubgridSize();
    int mu;
    int offset;

    int cb;
    const int *node_coord  = QMP_get_logical_coordinates();
    int p;
    int site, index;

    InvTab4 *xinvtab;
    InvTab4 *invtab;

    int qdp_index;
    int my_index;
    int num;
    int offsite_found;

    /* Setup the subgrid volume for ever after */
    subgrid_vol = 1;
    for(i=0; i < getNumDim(); ++i) {
        subgrid_vol *= getSubgridSize()[i];
    }

    /* Get the checkerboard size for ever after */
    subgrid_vol_cb = subgrid_vol / 2;

    /* Now I want to build the site table */
    /* I want it cache line aligned? */
    xsite_table = (int *)malloc(sizeof(int)*subgrid_vol+63L);
    if(xsite_table == 0x0 ) {
        QMP_error("Couldnt allocate site table");
        QMP_abort(1);
    }

    site_table = (int *)((((ptrdiff_t)(xsite_table))+63L)&(-64L));

    xinvtab = (InvTab4 *)malloc(sizeof(InvTab4)*subgrid_vol + 63L);
    if(xinvtab == 0x0 ) {
        QMP_error("Couldnt allocate site table");
        QMP_abort(1);
    }
    invtab = (InvTab4 *)((((ptrdiff_t)(xinvtab))+63L)&(-64L));

    /* Inversity of functions check:
       Check that myLinearSiteIndex3D is in fact the inverse
       of mySiteCoords3D, and that QDP_getSiteCoords is the
       inverse of QDP_linearSiteIndex()
    */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* Linear site index */
            my_index = site + subgrid_vol_cb*p;
            QDP_getSiteCoords(gcoord, my_node, my_index);
            linear=QDP_getLinearSiteIndex(gcoord);

            if( linear != my_index ) {
                printf("P%d cb=%d site=%d : QDP_getSiteCoords not inverse of QDP_getLinearSiteIndex(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear);
            }

            mySiteCoords4D(gcoord, my_node, my_index);
            linear=myLinearSiteIndex4D(gcoord);

            if( linear != my_index ) {
                printf("P%d cb=%d site=%d : mySiteCoords3D not inverse of myLinearSiteIndex3D(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear);
            }
        }
    }


    /* Loop through sites - you can choose your path below */
    /* This is a checkerboarded order which is identical hopefully
       to QDP++'s rb2 subset when QDP++ is in a CB2 layout */
    for(p=0; p < 2; p++) {
        for(t=0; t < subgrid_size[3]; t++) {
            for(z=0; z < subgrid_size[2]; z++) {
                for(y=0; y < subgrid_size[1]; y++) {
                    for(x=0; x < subgrid_size[0]/2; x++) {

                        coord[0] = 2*x + p;
                        coord[1] = y;
                        coord[2] = z;
                        coord[3] = t;

                        /* Make global */
                        for(i=0; i < 4; i++) {
                            coord[i] += subgrid_size[i]*node_coord[i];
                        }

                        /* Index of coordinate -- NB this is not lexicographic
                           but takes into account checkerboarding in QDP++ */
                        qdp_index = QDP_getLinearSiteIndex(coord);

                        /* Index of coordinate in my layout. -- NB this is not lexicographic
                           but takes into account my 3D checkerbaording */
                        my_index = myLinearSiteIndex4D(coord);
                        site_table[my_index] = qdp_index;

                        cb=parity(coord);
                        linear = my_index%subgrid_vol_cb;

                        invtab[qdp_index].cb=cb;
                        invtab[qdp_index].linearcb=linear;
                    }
                }
            }
        }
    }

    /* Site table transitivity check:
       for each site, convert to index in cb3d, convert to qdp index
       convert qdp_index to coordinate
       convert coordinate to back index in cb3d
       Check that your cb3d at the end is the same as you
       started with */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* My local index */
            my_index = site + subgrid_vol_cb*p;

            /* Convert to QDP index */
            qdp_index = site_table[ my_index ];

            /* Switch QDP index to coordinates */
            QDP_getSiteCoords(gcoord, my_node,qdp_index);

            /* Convert back to cb3d index */
            linear = myLinearSiteIndex4D(gcoord);

            /* Check new cb,cbsite index matches the old cb index */
            if (linear != my_index) {
                printf("P%d The Circle is broken. My index=%d qdp_index=%d coords=%d,%d,%d,%d linear(=my_index?)=%d\n", my_node, my_index, qdp_index, gcoord[0],gcoord[1],gcoord[2],gcoord[3],linear);
            }
        }
    }


    /* Consistency check 2: Test mySiteCoords 3D
       for all 3d cb,cb3index convert to
       cb3d linear index (my_index)
       convert to qdp_index (lookup in site table)

       Now convert qdp_index and my_index both to
       coordinates. They should produce the same coordinates
    */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* My local index */
            my_index = site + subgrid_vol_cb*p;
            mySiteCoords4D(gcoord, my_node, my_index);

            qdp_index = site_table[ my_index ];
            QDP_getSiteCoords(gcoord2, my_node,qdp_index);

            for(mu=0 ; mu < 4; mu++) {
                if( gcoord2[mu] != gcoord[mu] ) {
                    printf("P%d: my_index=%d qdp_index=%d mySiteCoords=(%d,%d,%d,%d) QDPsiteCoords=(%d,%d,%d,%d)\n", my_node, my_index, qdp_index, gcoord[0], gcoord[1], gcoord[2], gcoord[3], gcoord2[0], gcoord2[1], gcoord2[2], gcoord2[3]);
                    continue;
                }
            }

        }
    }

    /* Allocate the shift table */
    /* The structure is as follows: There are 4 shift tables in order:

      [ Table 1 | Table 2 | Table 3 | Table 4 ]
      Table 1: decomp_scatter_index[mu][site]
      Table 2: decomp_hvv_scatter_index[mu][site]
      Table 3: recons_mvv_gather_index[mu][site]
      Table 4: recons_gather_index[mu][site]

    */

    /* This 4 is for the 4 tables: Table 1-4*/
    if ((shift_table = (int **)malloc(4*sizeof(int*))) == 0 ) {
        QMP_error("init_wnxtsu3dslash: could not initialize shift_table");
        QMP_abort(1);

    }

    for(i=0; i < 4; i++) {
        /* This 4 is for the 4 comms dierctions: */
        if ((shift_table[i] = (int *)malloc(4*subgrid_vol*sizeof(int))) == 0) {
            QMP_error("init_wnxtsu3dslash: could not initialize shift_table");
            QMP_abort(1);
        }
    }


    /* Initialize the boundary counters */
    for(cb=0; cb < 2; cb++) {
        for(dir=0; dir < 4; dir++) {
            bound[cb][0][dir] = 0;
            bound[cb][1][dir] = 0;
            bound[cb][2][dir] = 0;
            bound[cb][3][dir] = 0;
        }
    }


    for(cb=0; cb < 2; cb++) {
        for(site=0; site < subgrid_vol_cb; ++site) {

            index = cb*subgrid_vol_cb + site;

            /* Fetch site from site table */
            qdp_index = site_table[index];

            /* Get its coords */
            QDP_getSiteCoords(coord, my_node, qdp_index);

            /* Loop over directions building up shift tables */
            for(dir=0; dir < 4; dir++) {

                int fcoord[4], bcoord[4];
                int fnode, bnode;
                int blinear, flinear;

                /* Backwards displacement*/
                offs(bcoord, coord, dir, -1);
                bnode   = QDP_getNodeNumber(bcoord);
                blinear = QDP_getLinearSiteIndex(bcoord);

                /* Forward displacement */
                offs(fcoord, coord, dir, +1);
                fnode   = QDP_getNodeNumber(fcoord);
                flinear = QDP_getLinearSiteIndex(fcoord);

                /* Scatter:  decomp_{plus,minus} */
                /* Operation: a^F(shift(x,type=0),dir) <- decomp(psi(x),dir) */
                /* Send backwards - also called a receive from forward */
                if (bnode != my_node) {
                    /* Offnode */
                    /* Append to Tail 1, increase boundary count */
                    /* This is the correct code */
                    shift_table[DECOMP_SCATTER][dir+4*index]
                        = subgrid_vol_cb + bound[1-cb][DECOMP_SCATTER][dir];

                    bound[1-cb][DECOMP_SCATTER][dir]++;

                }
                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup */
                    shift_table[DECOMP_SCATTER][dir+4*index] =
                        invtab[blinear].linearcb;
                }


                /* Scatter:  decomp_hvv_{plus,minus} */
                /* Operation:  a^B(shift(x,type=1),dir) <- U^dag(x,dir)*decomp(psi(x),dir) */
                /* Send forwards - also called a receive from backward */
                if (fnode != my_node) {
                    /* Offnode */
                    /* Append to Tail 1, increase boundary count */
                    shift_table[DECOMP_HVV_SCATTER][dir+4*index]
                        = subgrid_vol_cb + bound[1-cb][DECOMP_HVV_SCATTER][dir];

                    bound[1-cb][DECOMP_HVV_SCATTER][dir]++;

                }
                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup */
                    shift_table[DECOMP_HVV_SCATTER][dir+4*index]           /* Onnode */
                        = invtab[flinear].linearcb ;
                }


                /* Gather:  mvv_recons_{plus,minus} */
                /* Operation:  chi(x) <-  \sum_dir U(x,dir)*a^F(shift(x,type=2),dir) */
                /* Receive from forward */
                if (fnode != my_node) {
                    /* Offnode */
                    /* Append to Tail 2, increase boundary count */

                    shift_table[RECONS_MVV_GATHER][dir+4*index] =
                        2*subgrid_vol_cb + (bound[cb][RECONS_MVV_GATHER][dir]);

                    bound[cb][RECONS_MVV_GATHER][dir]++;

                }
                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup. Note this is a recons post shift,
                       so the linear coordinate to invert is mine rather than the neighbours */
                    shift_table[RECONS_MVV_GATHER][dir+4*index] =
                        invtab[qdp_index].linearcb ;
                }

                /* Gather:  recons_{plus,minus} */
                /* Operation:  chi(x) +=  \sum_dir recons(a^B(shift(x,type=3),dir),dir) */
                /* Receive from backward */
                if (bnode != my_node) {

                    shift_table[RECONS_GATHER][dir+4*index] =
                        2*subgrid_vol_cb + bound[cb][RECONS_GATHER][dir];

                    bound[cb][RECONS_GATHER][dir]++;

                }
                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup. Note this is a recons post shift,
                       so the linear coordinate to invert is mine rather than the neighbours */

                    shift_table[RECONS_GATHER][dir+4*index] =
                        invtab[qdp_index].linearcb ;
                }
            }
        }
    }

    /* Sanity check - make sure the sending and receiving counters match */
    for(cb=0; cb < 2; cb++) {
        for(dir=0; dir < 4; dir++) {

            /* Sanity 1: Must have same number of boundary sites on each cb for
            a given operation */
            for(i = 0; i < 4; i++) {
                if (bound[1-cb][i][dir] != bound[cb][i][dir]) {

                    QMP_error("SSE Wilson dslash - make_shift_tables: type 0 diff. cb send/recv counts do not match: %d %d",
                              bound[1-cb][i][dir],bound[cb][i][dir]);
                    QMP_abort(1);
                }
            }


        }
    }

    /* Now I want to make the offset table into the half spinor temporaries */
    /* The half spinor temporaries will look like this:

       dir=0 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ]
       dir=1 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ]
       ...

       And each of these blocks of half spinors will be sized to vol_cb
       sites (ie half volume only).  The shift_table() for a given site and
       direction indexes into one of these lines. So the offset table essentially
       delineates which line one picks, by adding an offset of
       3*subgrid_vol_cb*dir
       To the shift. The result from offset table, can be used directly as a
       pointer displacement on the temporaries.

       Perhaps the best way to condsider this is to consider a value
       of shift_table[type][dir/site] that lands in the body. The
       shift table merely gives me a site index. But the data needs
       to be different for each direction for that site index. Hence
       we need to replicate the body, for each dir. The 3xsubgrid_vol_cb
       is just there to take care of the buffers.

       Or another way to think of it is that there is a 'body element' index
       specified by the shift table lookup, and that dir is just the slowest
       varying index.

    */

    /* 4 dims, 4 types, rest of the magic is to align the thingie */
    xoffset_table = (halfspinor_array **)malloc(4*4*subgrid_vol*sizeof(halfspinor_array*)+63L);
    if( xoffset_table == 0 ) {
        QMP_error("init_wnxtsu3dslash: could not initialize offset_table[i]");
        QMP_abort(1);
    }
    /* This is the bit what aligns straight from AMD Manual */
    offset_table = (halfspinor_array**)((((ptrdiff_t)(xoffset_table)) + 63L) & (-64L));

    /* Walk through the shift_table and remap the offsets into actual
       pointers */

    /* DECOMP_SCATTER */
    num=0;
    for(dir =0; dir < Nd; dir++) {

        /* Loop through all the sites. Remap the offsets either to local
           arrays or pointers */
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[DECOMP_SCATTER][dir+4*site];
            if( offset >= subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the send back buffer */
                /* send to back index = recv from forward index = 0  */
                offsite_found++;
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] =
                    send_bufs[0][num]+(offset - subgrid_vol_cb);
            }
            else {
                /* Guy is onsite: This is DECOMP_SCATTER so offset to chi1 */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] =
                    chi1+shift_table[DECOMP_SCATTER][dir+4*site]+subgrid_vol_cb*dir;
            }
        }

        if( offsite_found > 0 ) {
            /* If we found an offsite guy, next direction has to
            go into the next dir part of the send bufs */
            num++;
        }
    }

    /* DECOMP_HVV_SCATTER */
    /* Restart num-s */
    num=0;
    for(dir =0; dir <Nd; dir++) {
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[DECOMP_HVV_SCATTER][dir+4*site];
            if( offset >= subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the send forw buffer */
                /* send to forward / receive from backward index = 1 */
                offsite_found++;

                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] =
                    send_bufs[1][num]+(offset - subgrid_vol_cb);
            }
            else {
                /* Guy is onsite. This is DECOMP_HVV_SCATTER so offset to chi2 */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] =
                    chi2+shift_table[DECOMP_HVV_SCATTER][dir+4*site ]+subgrid_vol_cb*dir;
            }
        }
        if( offsite_found > 0 ) {
            num++;
        }
    }

    /* RECONS_MVV_GATHER */
    num=0;
    for(dir =0; dir <Nd; dir++) {
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[RECONS_MVV_GATHER][dir+4*site];
            if( offset >= 2*subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the recv from front buffer */
                /* recv_from front index = send to back index = 0 */
                offsite_found++;
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] =
                    recv_bufs[0][num]+(offset - 2*subgrid_vol_cb);
            }
            else {
                /* Guy is onsite */
                /* This is RECONS_MVV_GATHER so offset with respect to chi1 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] =
                    chi1+shift_table[RECONS_MVV_GATHER][dir+4*site ]+subgrid_vol_cb*dir;
            }
        }
        if( offsite_found > 0 ) {
            num++;
        }
    }

    /* RECONS_GATHER */
    num=0;
    for(dir =0; dir <Nd; dir++) {
        offsite_found=0;
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[RECONS_GATHER][dir+4*site];
            if( offset >= 2*subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the recv from back buffer */
                /* receive from back = send to forward index =  1*/
                offsite_found++;
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER) ] =
                    recv_bufs[1][num]+(offset - 2*subgrid_vol_cb);
            }
            else {
                /* Guy is onsite */
                /* This is RECONS_GATHER so offset with respect to chi2 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER ) ] =
                    chi2+shift_table[RECONS_GATHER][dir+4*site ]+subgrid_vol_cb*dir;
            }
        }
        if( offsite_found > 0 ) {
            num++;
        }
    }



    /* Free shift table - it is no longer needed. We deal solely with offsets */
    for(i=0; i < 4; i++) {
        free( (shift_table)[i] );
    }
    free( shift_table );

    free( xinvtab );

}