示例#1
0
文件: QMP_perf.c 项目: 6twirl9/qmp
/**
 * Time a bidirectional exchange: in every iteration all receive handles
 * are started, then all send handles, and then the sends and the
 * receives are waited on (in that order).
 *
 * @param smem   send buffers (not touched by this routine)
 * @param rmem   receive buffers (not touched by this routine)
 * @param sendh  one send handle per channel
 * @param recvh  one receive handle per channel
 * @param pargv  supplies num_channels, loops, size and ndims
 *
 * Failures of QMP_start/QMP_wait are only reported, never fatal.
 * Node 0 prints the bandwidth and the per-iteration time.
 */
static void
test_simultaneous_send (int** smem,
			int** rmem,
			QMP_msghandle_t* sendh,
			QMP_msghandle_t* recvh,
			struct perf_argv* pargv)
{
  const int channels   = pargv->num_channels;
  const int iterations = pargv->loops;
  const int words      = pargv->size;
  const int dims       = pargv->ndims;
  double t_begin, t_end;
  QMP_status_t status;
  int iter, ch;

  /* Line all nodes up before the clock starts. */
  QMP_barrier();
  t_begin = get_current_time ();

  for (iter = 0; iter < iterations; ++iter) {
    /* Post every receive first ... */
    for (ch = 0; ch < channels; ++ch) {
      status = QMP_start (recvh[ch]);
      if (status != QMP_SUCCESS) {
	QMP_printf ("Start receiving failed: %s\n", QMP_error_string(status));
      }
    }

    /* ... then fire every send. */
    for (ch = 0; ch < channels; ++ch) {
      status = QMP_start (sendh[ch]);
      if (status != QMP_SUCCESS) {
	QMP_printf ("Start sending failed: %s\n", QMP_error_string(status));
      }
    }

    /* Drain the sends before the receives. */
    for (ch = 0; ch < channels; ++ch) {
      if (QMP_wait (sendh[ch]) != QMP_SUCCESS) {
	QMP_printf ("Error in sending %d\n", ch );
      }
    }

    for (ch = 0; ch < channels; ++ch) {
      if (QMP_wait (recvh[ch]) != QMP_SUCCESS) {
	QMP_printf ("Error in receiving %d\n", ch);
      }
    }
  }

  t_end = get_current_time ();

  /* Only node 0 reports. */
  if (QMP_get_node_number() == 0) {
    /* NOTE(review): the original comment says the clock is in milliseconds;
       get_current_time is not visible here - confirm. */
    const double elapsed = (t_end - t_begin);
    const double bandwidth =
      words/(double)1000.0 * 4.0 * iterations * channels*dims/elapsed;

    QMP_printf ("Simultaneous send B/W for datasize %d is %g (MB/s)", words * 4, bandwidth);

    QMP_printf ("Time difference is %lf micro seconds", elapsed*1000.0/iterations);
  }

  QMP_barrier();
}
示例#2
0
/*
 * Overlap the neighbour exchange with computation:
 *   1. qx(boundary) prepares the data derived from a_y (and updates *flops);
 *   2. if xy->h_valid, the exchange on xy->handle is started;
 *   3. the first xy->body_size sites are computed without a receive buffer;
 *   4. if xy->h_valid, the exchange is waited on;
 *   5. the next xy->face_size sites are computed using xy->receive_buf.
 * *sent and *received are advanced by the per-call totals stored in xy.
 * The status codes of QMP_start/QMP_wait are not inspected.
 */
void
qx(op_CmB)(struct FermionX *r_x,
           struct eo_lattice *xy,
           const struct SUn *U,
           const struct FermionX *a_x,
           const struct FermionX *a_y,
           long long *flops,
           long long *sent,
           long long *received)
{
    qx(boundary)(xy,
                 qx(up_project_n),
                 qx(down_project_n),
                 U,
                 a_y,
                 flops);

    /* Kick off communication, if this lattice has a valid handle. */
    if (xy->h_valid) {
        QMP_start(xy->handle);
    }

    /* Interior sites need no remote data. */
    *flops += qx(do_CmB)(r_x,
                         0,
                         xy->body_size,
                         xy->neighbor,
                         U,
                         a_x,
                         a_y,
                         NULL);

    /* Remote data must have arrived before the face sites are handled. */
    if (xy->h_valid) {
        QMP_wait(xy->handle);
    }

    *flops += qx(do_CmB)(r_x,
                         xy->body_size,
                         xy->face_size,
                         xy->neighbor,
                         U,
                         a_x,
                         a_y,
                         xy->receive_buf);

    /* Traffic accounting. */
    *sent += xy->total_send;
    *received += xy->total_receive;
}
示例#3
0
int DML_get_bytes(char *buf, size_t size, int fromnode){
  QMP_msgmem_t mm;
  QMP_msghandle_t mh;

  mm = QMP_declare_msgmem(buf, size);
  mh = QMP_declare_receive_from(mm, fromnode, 0);
  QMP_start(mh);
  QMP_wait(mh);
  QMP_free_msghandle(mh);
  QMP_free_msgmem(mm);
  return 0;
}
示例#4
0
int DML_send_bytes(char *buf, size_t size, int tonode){
  QMP_msgmem_t mm;
  QMP_msghandle_t mh;

  mm = QMP_declare_msgmem(buf, size);
  mh = QMP_declare_send_to(mm, tonode, 0);
  QMP_start(mh);
  QMP_wait(mh);
  QMP_free_msghandle(mh);
  QMP_free_msgmem(mm);
  return 0;
}
示例#5
0
/**
 * Naive broadcast: the primary node sends `count` bytes of `send_buf`
 * to nodes 1 .. N-1 one after another; each of those nodes receives
 * from node 0 into its own `send_buf`.
 *
 * Any failure to start or to complete a transfer aborts the job
 * (previously a failed QMP_wait went unnoticed and left the buffer
 * unspecified on the receiver).
 *
 * NOTE(review): receives are posted from node 0 while sends are issued
 * by QMP_is_primary_node(); this assumes the primary node is node 0.
 *
 * @param send_buf  buffer holding the data on the primary node and
 *                  receiving it on every other node
 * @param count     number of bytes to transfer
 */
void
stupid_broadcast(void *send_buf, int count)
{
  int node;
  int num_nodes = QMP_get_number_of_nodes();
  int this_node = QMP_get_node_number();   /* loop invariant */
  QMP_msgmem_t request_msg = QMP_declare_msgmem(send_buf, count);
  QMP_msghandle_t request_mh;

  // Send to each node
  for(node=1; node < num_nodes; ++node)
  {
    if (this_node == node)
    {
      request_mh = QMP_declare_receive_from(request_msg, 0, 0);

      if (QMP_start(request_mh) != QMP_SUCCESS)
	QMP_abort_string(1, "recvFromWait failed\n");

      if (QMP_wait(request_mh) != QMP_SUCCESS)
	QMP_abort_string(1, "recvFromWait failed\n");

      QMP_free_msghandle(request_mh);
    }

    if (QMP_is_primary_node())
    {
      request_mh = QMP_declare_send_to(request_msg, node, 0);

      if (QMP_start(request_mh) != QMP_SUCCESS)
	QMP_abort_string(1, "sendToWait failed\n");

      if (QMP_wait(request_mh) != QMP_SUCCESS)
	QMP_abort_string(1, "sendToWait failed\n");

      QMP_free_msghandle(request_mh);
    }
  }

  QMP_free_msgmem(request_msg);
}
示例#6
0
// Project macro; presumably opens the CPS namespace (the matching end
// macro is not visible in this excerpt).
CPS_START_NAMESPACE
// NOTE(review): this defines the macro QMP when USE_QMP is *not* defined,
// while the code below tests USE_QMP and never QMP - confirm the intent.
#ifndef USE_QMP
#define QMP
#endif

/*!
  Shifts the data_len bytes at addr by |n_disp| nodes along `direction`,
  one nearest-neighbour hop at a time, ping-ponging between addr and
  temp_buf.  The sign of n_disp selects which neighbour is sent to.
  If the last hop landed in temp_buf the result is copied back to addr.

  Fixes relative to the previous version:
   - the hop count a_disp is now set for both builds (the USE_QMP build
     read it uninitialised);
   - the SCU branch indexes gjp_scu_dir with `direction` instead of an
     undeclared `i` (the stale comment "int direction = i" indicates the
     parameter was renamed);
   - the duplicated "last hop" transfer code is folded into the loop.

  \param direction  axis along which to shift
  \param n_disp     signed number of hops; 0 is a no-op
*/
void GlobalDataShift::Shift(int direction, int n_disp){
  if (n_disp==0) return;
 
  // Number of single-node hops to perform (always >= 1 here).
  const int a_disp = (n_disp > 0) ? n_disp : -n_disp;
  void *send_p,*recv_p,*temp_p;
#ifndef USE_QMP
  SCUDir s_dir,r_dir;
  if (n_disp>0){
   s_dir = gjp_scu_dir[direction*2];
   r_dir = gjp_scu_dir[direction*2+1];
  } else {
   s_dir = gjp_scu_dir[direction*2+1];
   r_dir = gjp_scu_dir[direction*2];
  }
#else
  // Relative send direction; the receive uses the opposite sign.
  const int sflag = (n_disp > 0) ? +1 : -1;
#endif

  send_p = addr;
  recv_p = temp_buf;

#ifndef USE_QMP
  SCUDirArgIR Send(send_p,s_dir,SCU_SEND,data_len);
  SCUDirArgIR Recv(recv_p,r_dir,SCU_REC,data_len);
#else
  QMP_msgmem_t msgmem[2];
  QMP_msghandle_t msghandle[2];
  QMP_status_t status;
  QMP_msghandle_t multiple;
#endif

//  sys_cacheflush(0);
  for(int hop = 0; hop < a_disp; hop++){

#ifndef USE_QMP
    Send.StartTrans();Recv.StartTrans();
    Send.TransComplete();Recv.TransComplete();
#else
    // One paired send/receive, declared, run and released per hop.
    msgmem[0] = QMP_declare_msgmem((void *)send_p, data_len);
    msgmem[1] = QMP_declare_msgmem((void *)recv_p, data_len);
    msghandle[0] = QMP_declare_send_relative(msgmem[0], direction, sflag, 0);
    msghandle[1] = QMP_declare_receive_relative(msgmem[1], direction, -sflag, 0);
    multiple = QMP_declare_multiple(msghandle, 2);
    QMP_start(multiple);
    status = QMP_wait(multiple);
    if (status != QMP_SUCCESS)
      QMP_error("Error in GlobalDataShift::Shift:%s\n", QMP_error_string(status));
    QMP_free_msghandle(multiple);
    QMP_free_msgmem(msgmem[0]);
    QMP_free_msgmem(msgmem[1]);
#endif

    // After the final hop the result stays in recv_p; no swap needed.
    if (hop == a_disp-1) break;

    // What was just received becomes the payload of the next hop.
    temp_p = send_p;
    send_p = recv_p;
    recv_p = temp_p;

#ifndef USE_QMP
    Send.Addr(send_p);
    Recv.Addr(recv_p);
#endif
  }
  
  if (recv_p != addr)
    memcpy(addr,recv_p,data_len);
}
示例#7
0
/*
 * Wait for the communication behind mh->handle to finish.
 * The QMP_wait status is handed to the QMP_CHECK macro (defined
 * elsewhere; presumably it reports or aborts on a non-success status).
 */
void comm_wait(MsgHandle *mh)
{
  QMP_CHECK( QMP_wait(mh->handle) ); 
}
示例#8
0
/*!
  Checkerboarded transport of n fields, min[i] -> mout[i], along the
  directions dir[i], overlapping communication with the local work.

  Per direction wire[i] = dir[i] (wire/2 is the axis, wire%2 the sign):
   - odd wire: the off-node part is multiplied into snd_buf_cb *before*
     it is sent; after the transfer it is only copied into mout[i];
   - wire == 6: the sites are first packed into snd_buf_t_cb;
   - other even wire: a strided region of min[i] is sent directly and
     multiplied after it has been received.
  Axes flagged in local[] involve no communication.

  The handle and message-memory arrays are now released on every call;
  previously they were freed only when at least one direction was
  non-local, leaking both allocations otherwise.

  \param n       number of fields / directions
  \param mout    output fields
  \param min     input fields
  \param dir     direction index for each field
  \param parity  checkerboard parity used to select the lookup tables
  \param gauge   gauge field passed through to the multiply kernels
*/
void PT::mat_cb_norm(int n, IFloat **mout, IFloat **min, const int *dir, int
parity, IFloat * gauge)
{
  //List of the different directions
  int wire[MAX_DIR];
  int i;
//  printf("PT::mat_cb_norm\n");

  QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vec_cb_norm", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vec_cb_norm", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t multiple;
  static int call_num = 0;
  int vlen = VECT_LEN;
  int vlen2 = VECT_LEN;

  call_num++;
  
  //Name our function
  char *fname="pt_mat_cb()";
  //  VRB.Func("",fname);
  
  //Set the transfer directions
  //If wire[i] is even, then we have communication in the negative direction
  //If wire[i] is odd, then we have communication in the positive direction
  for(i=0;i<n;i++)
    wire[i]=dir[i];

#ifdef PROFILE
  Float dtime  = - dclock();
#endif
  int non_local_dir=0;

//#pragma omp parallel default(shared)
{

  //If wire[i] is odd, then we have parallel transport in the
  //positive direction.  In this case, multiplication by the link matrix is
  //done before the field is transferred over to the adjacent node
  //
  //If we have transfer in the negative T direction (wire[i] = 6), then
  //we have to copy the appropriate fields to a send buffer
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      {
	if(wire[i]%2)
	  {
	    if(conjugated)
	      pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge);
	    else
	      pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge);
	  }
	else if((wire[i] == 6))
	  {
	    for(int j = 0; j < non_local_chi_cb[6];j++)
	      memcpy(snd_buf_t_cb + j*GAUGE_LEN,min[i] + 3 * *(Toffset[parity]+j)*3,GAUGE_LEN*sizeof(IFloat));
	  }
      }
    }

//#pragma omp barrier
//#pragma omp master 
{
  //Declare one receive (slot 2k) and one send (slot 2k+1) per non-local direction
  for(i=0;i<n;i++)
    if(!local[wire[i]/2])
    {
      //Calculate the starting address for the data to be sent
      IFloat *addr = min[i] + GAUGE_LEN * offset_cb[wire[i]];

      msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      
      //Initialize the msg_mem for sends
      if(wire[i]%2) 
	msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_cb[wire[i]/2], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      else if(wire[i] == 6)
	msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_t_cb, 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      else
	msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen_cb[wire[i]]), numblk_cb[wire[i]], (ptrdiff_t)(3*stride_cb[wire[i]]+3*blklen_cb[wire[i]]));
      
      msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
      msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

      non_local_dir++;

    }

  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);
    QMP_start(multiple);
  }
} //#pragma omp master {

  //Do local calculations
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if((wire[i]%2 && conjugated) || ((wire[i]%2 == 0) && (conjugated == 0)))
	pt_cmm_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge);
      else
	pt_cmm_dag_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge);
    }

//#pragma omp barrier
//#pragma omp master 
{
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*non_local_dir; i++)
      QMP_free_msgmem(msg_mem_p[i]);
  }
  //The arrays are allocated unconditionally above, so release them
  //unconditionally as well (they used to leak when non_local_dir == 0).
  Free(msg_handle_p);
  Free(msg_mem_p);
} //#pragma omp master {

  //If wire[i] is even, then we have transport in the negative direction
  //In this case, the vector field is multiplied by the SU(3) link matrix
  //after all communication is complete
  IFloat *fp0,*fp1;
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      	{
	  if(!(wire[i]%2))
	    {
	      if(conjugated)
		pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge);
	      else
		pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge);
	    }
	  //Otherwise we have parallel transport in the positive direction.
	  //In this case, the received data has already been pre-multiplied
	  //All we need to do is to put the transported field in the correct place
	  else
	    {
	      //int destination, source;
	      //Place the data in the receive buffer into the result vector
	      for(int s=0;s<non_local_chi_cb[wire[i]];s++)
		{
		  //source = uc_nl_cb[parity][wire[i]][s].src;
		  fp0 = (IFloat *)((long)rcv_buf[wire[i]]+3*uc_nl_cb[parity][wire[i]][s].src);
		  //destination = uc_nl_cb[parity][wire[i]][s].dest;
		  fp1 = (IFloat *)(mout[i]+3*uc_nl_cb[parity][wire[i]][s].dest);
		  memcpy(fp1,fp0,GAUGE_LEN*sizeof(IFloat));
		}
	    }
	}
    }

} //#pragma omp parallel
#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,99*vol*n,dtime);
#endif
//  ParTrans::PTflops +=99*n*vol;
}
示例#9
0
/*!
  Transport of n matrix fields, min[i] -> mout[i], along dir[i].

  Phases (each accumulated into a static timer, printed by node 0 and
  reset every 100 calls):
   1. setup:    declare and start one receive/send pair per non-local
                direction (into rcv_buf, from a strided region of min[i]);
   2. local:    partrans_cmm_agg on the on-node sites, split across
                OpenMP threads;
   3. qmp:      wait for the transfers and release the QMP objects;
   4. nonlocal: partrans_cmm_agg on the received data in rcv_buf.

  \param n     number of fields / directions
  \param mout  output fields
  \param min   input fields
  \param dir   direction index for each field (wire/2 = axis, wire%2 = sign)
*/
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){
    
  int wire[MAX_DIR];
  int i;
  QMP_msgmem_t msg_mem_p[2*MAX_DIR];
  QMP_msghandle_t msg_handle_p[2*MAX_DIR];
  QMP_msghandle_t multiple;
  // Timers and call counter persist across calls.
  static double setup=0.,qmp=0.,localt=0.,nonlocal=0.;
  static int call_num = 0;

  call_num++;
  // NOTE(review): fname is commented out here but still used in the
  // PROFILE block at the end of this function - confirm that it
  // resolves elsewhere, otherwise PROFILE builds will not compile.
//  char *fname="pt_mat()";
//  VRB.Func("",fname);
//  if (call_num%100==1) printf("PT:mat()\n");

  
  for(i=0;i<n;i++) wire[i] = dir[i]; 
#ifdef PROFILE
  Float dtime2  = - dclock();
#endif

  double dtime = -dclock();

  int non_local_dir=0;
  
  // Slot 2k holds the receive, slot 2k+1 the send of the k-th non-local direction.
  for(i=0;i<n;i++)
  if (!local[wire[i]/2]) {
    //Calculate the address for transfer in a particular direction
    Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]);
    msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
    msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
    
    msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
    msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

    non_local_dir++;
  }
  // One-off diagnostic on node 0.
  if (call_num==1 && !QMP_get_node_number())
	printf("non_local_dir=%d\n",non_local_dir);

  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);
    QMP_start(multiple);
  }

  dtime += dclock();
  setup +=dtime;
  dtime = -dclock();
  int if_print = 0;
//  if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1;

#define USE_TEST2
#ifdef USE_TEST2
//assume nt > n!
    static char *cname="mat()";
#pragma omp parallel default(shared)
{
  // Threads are divided into n groups of nt_dir threads; group n_t works
  // on direction wire[n_t] and thread i_t of the group takes a contiguous
  // slice of ipoints items starting at offset.  Surplus threads are
  // folded into the last group, and the last thread of a group takes
  // the remainder.
  // NOTE(review): if nt < n then nt_dir == 0 and iam/nt_dir divides by
  // zero - the "assume nt > n" comment above is not enforced.
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  }
  int w_t = wire[n_t];
  ipoints = (local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Interleaving of local computation of matrix multiplication
  partrans_cmm_agg((uc_l[w_t]+offset*2),min[n_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
}
#else
{
  //Interleaving of local computation of matrix multiplication
#pragma omp parallel for default(shared)
  for(i=0;i<n;i++){
    partrans_cmm_agg(uc_l[wire[i]],min[i],mout[i],local_chi[wire[i]]/2);
  }
}
#endif

  dtime += dclock();
  localt +=dtime;
  dtime = -dclock();
//#pragma omp barrier
//#pragma omp master 
{
  // Complete the exchange started above and release its QMP objects.
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*non_local_dir; i++)
      QMP_free_msgmem(msg_mem_p[i]);
//    Free(msg_handle_p);
//    Free(msg_mem_p);
  }
} //#pragma omp master {
  dtime += dclock();
  qmp +=dtime;
  dtime = -dclock();

  //Do non-local computations
#ifdef USE_TEST2
//assume nt > n!
#pragma omp parallel default(shared)
{
  // Same thread partitioning as in the local phase, applied to
  // non_local_chi and reading from rcv_buf.  Unlike the local phase,
  // threads with an empty slice skip the kernel call.
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  }
  int w_t = wire[n_t];
  ipoints = (non_local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Non-local computation
  if (ipoints>0)
  partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
}
#else
{
#pragma omp parallel for
  for(i=0;i<n;i++) 
  if (!local[wire[i]/2]) {
#ifdef USE_OMP
    if (call_num%10000==1 && !QMP_get_node_number() ) 
      printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i);
#endif
    partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2);
  }

}//#pragma omp parallel
#endif

  dtime += dclock();
  nonlocal +=dtime;

  // Every 100 calls node 0 prints the accumulated timers; all nodes reset them.
  if (call_num%100==0){
    static char *cname="mat()";
    if (!QMP_get_node_number() ) {
    print_flops("mat():local*100",0,localt);
    print_flops("mat():nonlocal*100",0,nonlocal);
    print_flops("mat():qmp*100",0,qmp);
    print_flops("mat():setup*100",0,setup);
    }
    localt=nonlocal=qmp=setup=0.;
  }


#ifdef PROFILE
  dtime2 +=dclock();
  print_flops("",fname,198*vol*n,dtime2);
#endif
//  ParTrans::PTflops +=198*n*vol;
}
示例#10
0
/*! 
  Computes sum[x] = vect2[x] vect[x + hop dir]^dagger
  where the sum is over n_vect vectors and the hop is in a forward direction.

  This QMP build does not implement the routine: the active (#if 1)
  branch only reports "Not implemented" through QMP_error.  The call
  used to pass a format with two %s conversions and no arguments, which
  is undefined behaviour; it now passes the function name.

  The disabled (#else) branch is the unported implementation and is kept
  unchanged for reference.
*/
void PT::vvpd(IFloat **vect2, IFloat ***vect, int n_vect, const int *dir, int n_dir, int hop, IFloat **sum, int overwrite){

  const char *fname = "pt_vvpd()";
#if 1
//  ERR.NotImplemented(cname,fname);
   QMP_error("%s Not implemented\n", fname);
#else
//  VRB.Func("",fname);
  int i, s, v;
  Float f = 2.0;
  int wire[MAX_DIR];
  for(i=0;i<n_dir;i++) wire[i] = dir[i]; // from (x,y,z,t) to (t,x,y,z)

  QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msgmem_t *msg_mem_p2 = (QMP_msgmem_t *)Alloc("","vvpd", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t* msg_handle_p2 = (QMP_msghandle_t *)Alloc("","vvpd", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t multiple;

  //Setup communciation
  int comms=0;
  for(i=0;i<n_dir;i++)
  if( !local[wire[i]/2]) {
    if ( size[wire[i]/2] <hop)
      fprintf(stderr, 
		"%s:size(%d) in direction %d is smaller than the hop(%d)\n",
		fname,size[wire[i]],wire[i],hop);

    comms++;
  }

  for(v=0; v<n_vect; v++){


    if (v%2==0) {
      comms=0;
      for(i=0;i<n_dir;i++)
        if( !local[wire[i]/2]){ 
	  msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));    
	  msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0);
	  msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]]));
	  msg_handle_p[2*comms+1] =  QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0);
          comms++;
        }

      // Start communication
      if(comms) {
	multiple = QMP_declare_multiple(msg_handle_p, 2*comms);
      }
      if (comms) {
	QMP_start(multiple);
	QMP_status_t qmp_complete_status = QMP_wait(multiple);
	if (qmp_complete_status != QMP_SUCCESS)
	  QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status));
	QMP_free_msghandle(multiple);
	for(int i = 0; i < 2*comms; i++) 
	  QMP_free_msgmem(msg_mem_p[i]);
      }

    } else {
      comms=0;
      for(i=0;i<n_dir;i++)
        if( !local[wire[i]/2]){ 
	  msg_mem_p2[2*comms] = QMP_declare_msgmem((void *)rcv_buf2[wire[i]], hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));  
	  msg_handle_p2[2*comms] = QMP_declare_receive_relative(msg_mem_p2[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0);  
	  msg_mem_p2[2*comms+1] = QMP_declare_strided_msgmem((void *)(vect[v][i]+VECT_LEN*set_offset(wire[i], hop)), (size_t)(hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(stride[wire[i]] + blklen[wire[i]]));
	  msg_handle_p2[2*comms+1] =  QMP_declare_send_relative(msg_mem_p2[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0);
          comms++;
	}

      // Start communication
      if(comms) {
	multiple = QMP_declare_multiple(msg_handle_p2, 2*comms);
      }
      if (comms) {
	QMP_start(multiple);
	QMP_status_t qmp_complete_status = QMP_wait(multiple);
	if (qmp_complete_status != QMP_SUCCESS)
	  QMP_error("Send failed in vvpd: %s\n", QMP_error_string(qmp_complete_status));
      QMP_free_msghandle(multiple);
      for(int i = 0; i < 2*comms; i++) 
	QMP_free_msgmem(msg_mem_p2[i]);
      }
    }

     
    

    // Perform non-local calculation for previous v
    if (v>0)
      if (v==1 && overwrite==1) {
	for(i=0; i<n_dir; i++)
	  if(non_local_chi[wire[i]]>0)
	    cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
			   src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
      } else if (v%2==1) {
	for(i=0; i<n_dir; i++) 
	  if(non_local_chi[wire[i]]>0)
	    cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
		      src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
      } else {
	for(i=0; i<n_dir; i++) 
	  if(non_local_chi[wire[i]]>0)
	    cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]],
		      src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
      }
    
    // Perform local calculation for current v
    if (v==0 && overwrite==1)
      {
	for(i=0; i<n_dir; i++)
	  if((vol-hop*non_local_chi[wire[i]])>0)
	    cross_over_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]);
      }
    else
      {
	for(i=0; i<n_dir; i++)
	  if((vol-hop*non_local_chi[wire[i]])>0)
	    cross_look(sum[i], &f, vect2[v], vect[v][i], vol-hop*non_local_chi[wire[i]], src_l[hop-1][wire[i]], dest_l[hop-1][wire[i]]);
      }

  }


  if (v==1 && overwrite==1) {
    for(i=0; i<n_dir; i++) 
      if(non_local_chi[wire[i]]>0)
	cross_over_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
		       src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
  } else if (v%2==1) {
    for(i=0; i<n_dir; i++)
      if(non_local_chi[wire[i]]>0)
	cross_lin(sum[i], &f, vect2[v-1],rcv_buf[wire[i]], hop*non_local_chi[wire[i]],
		  src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
  } else {
    for(i=0; i<n_dir; i++) 
      if(non_local_chi[wire[i]]>0)
	cross_lin(sum[i], &f,vect2[v-1],rcv_buf2[wire[i]], hop*non_local_chi[wire[i]],
		  src_nl[hop-1][wire[i]], dest_nl[hop-1][wire[i]]);
  }
#endif
  
  //  ParTrans::PTflops += 90*n_vect*n_dir*vol;
}
示例#11
0
//! u[x] = v[x+dir] for n_dir forward or backward directions dir.
/*!
  For every non-local direction a receive into rcv_buf and a strided
  send out of v[i] are started; the on-node part is copied with
  copy_matrix while the transfer is in flight, and the received part is
  copied after the transfer has completed.

  The QMP arrays are sized by 2*n_dir (one receive and one send slot per
  direction, as the SCU branch already does) instead of a fixed 20
  entries, which overflowed for n_dir > 10.

  \param v      input fields, one per direction
  \param dir    direction index for each field (wire/2 = axis, wire%2 = sign)
  \param n_dir  number of directions
  \param hop    shift distance; hop-1 selects the lookup tables
  \param u      output fields, one per direction
*/
void PT::shift_field(IFloat **v, const int *dir, int n_dir,
		     int hop, IFloat **u){
  
  int i, length;
  int wire[n_dir];
  for (i=0; i<n_dir;i++) wire[i] = dir[i];
#ifdef USE_QMP
  // Slots 2k / 2k+1 hold the receive / send of the k-th non-local direction.
  QMP_msgmem_t msg_mem_p[2*n_dir];
  QMP_msghandle_t msg_handle_p[2*n_dir];
  QMP_msghandle_t multiple;
#else
  SCUDirArgMulti SCUmulti;
  SCUDirArgIR *SCUarg_p[2*n_dir];
#endif
  
  
  int comms=0;
  for (i=0; i<n_dir; i++) 
  if (!local[wire[i]/2]){
#ifndef USE_QMP
    SCUarg_p[2*comms] = SCUarg_mat[hop-1][2*wire[i]];
    SCUarg_p[2*comms+1] = SCUarg_mat[hop-1][2*wire[i]+1];
    SCUarg_p[2*comms+1]->Addr((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)));
#else
    msg_mem_p[2*comms] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*hop*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
    msg_mem_p[2*comms+1] = QMP_declare_strided_msgmem((void *)(v[i]+GAUGE_LEN*set_offset(wire[i], hop)), (size_t)(3*hop*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
    
    msg_handle_p[2*comms] = QMP_declare_receive_relative(msg_mem_p[2*comms], wire[i]/2, 1-2*(wire[i]%2), 0);
    msg_handle_p[2*comms+1] = QMP_declare_send_relative(msg_mem_p[2*comms+1], wire[i]/2, 2*(wire[i]%2)-1, 0);
#endif
   
    comms++;
  }

#ifndef USE_QMP
  if (comms) SCUmulti.Init(SCUarg_p,2*comms);
  if (comms) SCUmulti.SlowStartTrans();
#else
  if(comms) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*comms);
    QMP_start(multiple);
  }
#endif
  
//  SCUmulti.TransComplete();
  
  // On-node part, overlapped with the transfer.
  for (i=0; i<n_dir; i++) {
    length = vol-hop*non_local_chi[wire[i]];
    copy_matrix(u[i],v[i],&length,dest_l[hop-1][wire[i]],
		src_l[hop-1][wire[i]]);
  }
#ifndef USE_QMP
  if (comms) SCUmulti.TransComplete();
#else
  if(comms) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in shift_field: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*comms; i++)
      QMP_free_msgmem(msg_mem_p[i]);
  }
#endif


  // Off-node part, taken from the receive buffers.
  for (i=0; i<n_dir; i++) {
    length = hop*non_local_chi[wire[i]];
    copy_matrix(u[i],(IFloat*)rcv_buf[wire[i]],&length,
		dest_nl[hop-1][wire[i]],src_nl[hop-1][wire[i]]);
  }
}
示例#12
0
文件: QMP_perf.c 项目: 6twirl9/qmp
/**
 * One-way blast test.
 *
 * A node with pargv->sender set repeatedly starts and waits on all of
 * its send handles; any other node does the same with its receive
 * handles.  Afterwards the sender side reports from node 0 and the
 * receiver side from node 1.  Every node passes three barriers in
 * total; only the position of the middle one relative to the report
 * differs between the two roles.
 *
 * @param smem   send buffers (not touched by this routine)
 * @param rmem   receive buffers (not touched by this routine)
 * @param sendh  one send handle per channel
 * @param recvh  one receive handle per channel
 * @param pargv  supplies num_channels, loops, size, sender and ndims
 */
static void
test_oneway (int** smem,
	     int** rmem,
	     QMP_msghandle_t* sendh,
	     QMP_msghandle_t* recvh,
	     struct perf_argv* pargv)
{
  const int channels      = pargv->num_channels;
  const int iterations    = pargv->loops;
  const int words         = pargv->size;
  const QMP_bool_t sender = pargv->sender;
  const int dims          = pargv->ndims;
  /* The role fixes which handle set is driven for the whole test. */
  QMP_msghandle_t* handles = sender ? sendh : recvh;
  double t_begin, t_end, elapsed, bandwidth;
  QMP_status_t status;
  int iter, ch;

  QMP_barrier();

  t_begin = get_current_time ();
  for (iter = 0; iter < iterations; ++iter) {
    for (ch = 0; ch < channels; ++ch) {
      status = QMP_start (handles[ch]);
      if (status != QMP_SUCCESS) {
	if (sender)
	  QMP_printf ("Start sending failed: %s\n", QMP_error_string(status));
	else
	  QMP_printf ("Start receiving failed: %s\n", QMP_error_string(status));
      }
    }

    for (ch = 0; ch < channels; ++ch) {
      if (QMP_wait (handles[ch]) != QMP_SUCCESS) {
	if (sender)
	  QMP_printf ("Error in sending %d\n", ch );
	else
	  QMP_printf ("Error in receiving %d\n", ch);
      }
    }
  }
  t_end = get_current_time (); /* In milli seconds (per the original note) */

  elapsed = (t_end - t_begin);
  bandwidth = words/(double)1000.0 * 4 * iterations*channels*dims/elapsed;

  /* Receivers synchronise before reporting ... */
  if (!sender)
    QMP_barrier();

  if (QMP_get_node_number() == (sender ? 0 : 1)) {
    QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", 
		words * 4, bandwidth);
    QMP_printf ("time is %lf micro seconds", elapsed*1000.0/iterations);
    fflush(stdout);
  }

  /* ... senders synchronise after reporting. */
  if (sender)
    QMP_barrier();

  QMP_barrier();
}
示例#13
0
文件: QMP_perf.c 项目: 6twirl9/qmp
/**
 * Ping-pong test with payload verification.
 *
 * In each iteration every channel's receive buffer is cleared and its
 * send buffer is filled with the pattern  iter + word*channel + nc*nc.
 * The sender then sends on all channels and afterwards receives; the
 * other side receives first and then sends.  Finally each node checks
 * that its receive buffers hold the same pattern and reports every
 * mismatching word.  All nodes print the bandwidth and half the
 * round-trip time at the end.
 *
 * @param smem   send buffers, one per channel, pargv->size ints each
 * @param rmem   receive buffers, one per channel, pargv->size ints each
 * @param sendh  one send handle per channel
 * @param recvh  one receive handle per channel
 * @param pargv  supplies num_channels, loops, size, sender and ndims
 */
static void
test_pingpong_verify (int** smem,
		      int** rmem,
		      QMP_msghandle_t* sendh,
		      QMP_msghandle_t* recvh,
		      struct perf_argv* pargv)
{
  const int channels      = pargv->num_channels;
  const int iterations    = pargv->loops;
  const int words         = pargv->size;
  const QMP_bool_t sender = pargv->sender;
  const int dims          = pargv->ndims;
  double t_begin, t_end, elapsed, bandwidth;
  QMP_status_t status;
  int iter, ch, w, phase;

  QMP_barrier();
  t_begin = get_current_time ();

  for (iter = 0; iter < iterations; ++iter) {
    /* Prepare the payload and wipe the landing area. */
    for (ch = 0; ch < channels; ++ch) {
      for (w = 0; w < words; ++w) {
	rmem[ch][w] = 0;
	smem[ch][w] = iter + w * ch + channels*channels;
      }
    }

    /* Two half-trips: the sender transmits in the first one and
       receives in the second one; its partner does the opposite. */
    for (phase = 0; phase < 2; ++phase) {
      const int sending = sender ? (phase == 0) : (phase == 1);
      QMP_msghandle_t* handles = sending ? sendh : recvh;

      for (ch = 0; ch < channels; ++ch) {
	status = QMP_start (handles[ch]);
	if (status != QMP_SUCCESS) {
	  if (sending)
	    QMP_printf ("Start sending failed: %s\n", QMP_error_string(status));
	  else
	    QMP_printf ("Start receiving failed: %s\n", QMP_error_string(status));
	}
      }

      for (ch = 0; ch < channels; ++ch) {
	if (QMP_wait (handles[ch]) != QMP_SUCCESS) {
	  if (sending)
	    QMP_printf ("Error in sending %d\n", ch );
	  else
	    QMP_printf ("Error in receiving %d\n", ch);
	}
      }
    }

    /* verify memory */
    for (ch = 0; ch < channels; ++ch) {
      for (w = 0; w < words; ++w) {
	if (rmem[ch][w] != iter + w * ch + channels* channels) {
	  QMP_printf ("Receiving memory error for memory %d %d %d\n", 
		  ch, w, rmem[ch][w]);
	}
      }
    }
  }

  t_end = get_current_time (); /* In milli seconds (per the original note) */

  elapsed = (t_end - t_begin);

  /* Factor 2: data travels both ways in every iteration. */
  bandwidth = 2 * words/(double)1000.0 * 4 * iterations*channels*dims/elapsed;

  QMP_printf ("Ping Pong Bandwidth for datasize %d is %g (MB/s)", 
	      words * 4, bandwidth);

  QMP_printf ("RTT/2 is %lf micro seconds", elapsed*1000.0/iterations/2);
}
void dwf_dslash_5_plus_slice(Vector *out, 
		       Vector *in, 
		       Float mass,
		       int dag, 
		       Dwf *dwf_lib_arg, 
		       int s_slice)
{
  int x;
  int s;

// Initializations
//------------------------------------------------------------------
#if 0
  int local_ls = GJP.SnodeSites(); 
  int s_nodes = GJP.Snodes();
  int s_node_coor = GJP.SnodeCoor();
  int vol_4d_cb = dwf_lib_arg->vol_4d / 2;
  int ls_stride = 24 * vol_4d_cb;
#endif

  IFloat *f_in;
  IFloat *f_out;
  IFloat *f_temp;
  IFloat *comm_buf = dwf_lib_arg->comm_buf;
  IFloat two_over_a5 = 2.0 * GJP.DwfA5Inv();
  IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv();

// [1 + gamma_5] term (if dag=1 [1 - gamma_5] term)
//
// out[s] = [1 + gamma_5] in[s-1]
//------------------------------------------------------------------

  if (s_slice<0 || s_slice >=local_ls)
  ERR.General("","dwf_dslash_5_plus_slice","s_slice=%d local_ls=%d!\n",s_slice,local_ls);
  
  if(s_slice>0 ){
    f_in  = (IFloat *) in;
    f_out = (IFloat *) out;
    f_in += (s_slice-1)*ls_stride;
    f_out += (s_slice)*ls_stride;
    if(dag == 1){
      f_in  =  f_in + 12;
      f_out = f_out + 12;
    }
    FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb);
  }


// [1 + gamma_5] for lower boundary term (if dag=1 [1 - gamma_5] term)
// If there's only one node along fifth direction, no communication
// is necessary; Otherwise data from adjacent node in minus direction
// will be needed.
// If the lower boundary is the s=0 term
// out[0] = - m_f * [1 + gamma_5] in[ls-1]
// else, out[s] = [1 + gamma_5] in[s-1]
//
//------------------------------------------------------------------

if (s_slice ==  0 ){
  f_in  = (IFloat *) in;  
  f_in = f_in + (local_ls-1)*ls_stride; 
  f_out = (IFloat *) out;
  
  if(dag == 1){
    f_in  =  f_in + 12;
    f_out = f_out + 12;
  }
  
  f_temp = f_in;
  if (s_nodes > 1 ) {
#ifdef USE_GETPLUS
    getMinusData(comm_buf, f_in, 24*vol_4d_cb, 4);
    f_temp = comm_buf;
#else
      QMP_status_t send_status = QMP_wait(msghandle_down[0]);
      if (send_status != QMP_SUCCESS) 
      QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status));
      QMP_status_t recv_status = QMP_wait(msghandle_down[1]);
    if (recv_status != QMP_SUCCESS)
      QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status));
     f_temp = rbuf_down; 
     if(dag == 1) f_temp = f_temp + 12;
#endif
  }
    if(s_node_coor == 0) { 
      FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb);
    } else {
      FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb);
    }
}


// [1 - gamma_5] term (if dag=1 [1 + gamma_5] term)
// 
// out[s] = [1 - gamma_5] in[s+1]
//------------------------------------------------------------------
if(s_slice > 0 ){
  f_in  = (IFloat *) in;
  f_out = (IFloat *) out;
  f_in += (s_slice)*ls_stride;
  f_out += (s_slice-1)*ls_stride;
  if(dag == 0){
    f_in  =  f_in + 12;
    f_out = f_out + 12;
  }
  FtV1pV2Skip_asm(f_out,&two_over_a5,f_in,f_out,vol_4d_cb);
}


// [1 - gamma_5] for upper boundary term (if dag=1 [1 + gamma_5] term)
// If there's only one node along the fifth direction, no communication
// is necessary; otherwise data from the adjacent node in the plus direction
// will be needed.
// If the upper boundary is the s=ls term
// out[ls-1] = - m_f * [1 - gamma_5] in[0]
// else out[s] = [1 - gamma_5] in[s+1]
//
//------------------------------------------------------------------

if(s_slice == (local_ls-1) ){
  f_in  = (IFloat *) in;
  f_out = (IFloat *) out;

  if(dag == 0){
    f_in  =  f_in + 12;
    f_out = f_out + 12;
  }

  f_out = f_out + (local_ls-1)*ls_stride;
    f_temp = f_in;
  if (s_nodes > 1 ) {
#ifdef USE_GETPLUS
    getPlusData(comm_buf, f_in, 24*vol_4d_cb, 4);
    f_temp = comm_buf;
#else
      QMP_status_t send_status = QMP_wait(msghandle_up[0]);
      if (send_status != QMP_SUCCESS) 
      QMP_error("Send failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(send_status));
      QMP_status_t recv_status = QMP_wait(msghandle_up[1]);
    if (recv_status != QMP_SUCCESS)
      QMP_error("Receive failed in dwf_dslash_5_plus_slice: %s\n", QMP_error_string(recv_status));
     f_temp = rbuf_up; 
     if(dag == 0) f_temp = f_temp + 12;
#endif
  }
    if(s_node_coor == s_nodes - 1) { 
      FtV1pV2Skip_asm(f_out,&neg_mass_two_over_a5,f_temp,f_out,vol_4d_cb);
    } else {
      FtV1pV2Skip_asm(f_out,&two_over_a5,f_temp,f_out,vol_4d_cb);
    }
}
//  DiracOp::CGflops+=2*2*vol_4d_cb*local_ls*12;


}
示例#15
0
void wfm_comm(){

  char *fname="wfm_comm()";
  const int group = 16;
  void *addr[group];
  size_t blksize[group];
  int  numblk[group];
  ptrdiff_t stride[group];
  int index;

  const int MAX_MSGHANDLE=20;
  if (wfm_max_numchunk/group+1 >MAX_MSGHANDLE)
  ERR.General("",fname,"wfm_max_numchunk(%d)/group+1 >MAX_MSGHANDLE",wfm_max_numchunk);

 static   QMP_msgmem_t send_mem[8][MAX_MSGHANDLE];
 static   QMP_msgmem_t recv_mem[8][MAX_MSGHANDLE];
 static   QMP_msghandle_t send_h[8][MAX_MSGHANDLE];
 static   QMP_msghandle_t recv_h[8][MAX_MSGHANDLE];
 static   int pir=0;

  static int wfm_blocks[8];
if (wilson_initted || !initted){
    VRB.Flow("",fname,"wilson_initted=%d initted=%d\n",wilson_initted,initted);
  for(int dir=0;dir<8;dir++) wfm_blocks[dir]=1;
  for(int ig=0; ig<1; ig++){
//    VRB.Flow("",fname,"ig=%d",ig);
    for(int dir=0;dir<8;dir++){
      int sign=1;
      if(dir>3) sign = -1;
//      VRB.Flow("",fname,"dir=%d",dir);
       int n_site=0;
           addr[n_site] = wfm_s_start[dir];
           blksize[n_site] = wfm_blklen[dir];
           numblk[n_site] = wfm_numblk[dir];
           stride[n_site] = wfm_stride[dir];
           n_site++;
       if (initted){
         QMP_free_msghandle(send_h[dir][ig]);
         QMP_free_msghandle(recv_h[dir][ig]);
         QMP_free_msgmem(send_mem[dir][ig]);
         QMP_free_msgmem(recv_mem[dir][ig]);
       }
       if(n_site>0){
         send_mem[dir][ig]  = QMP_declare_strided_array_msgmem(addr,blksize,numblk,stride,n_site);
         send_h[dir][ig] = QMP_declare_send_relative(send_mem[dir][ig],dir%4,sign,0);
         wfm_blocks[dir]=ig+1;
       }

       int r_site=0;
           addr[r_site] = wfm_r_start[dir];
           blksize[r_site] = wfm_blklen[dir];
           numblk[r_site] = wfm_numblk[dir];
           stride[r_site] = wfm_stride[dir];
           r_site++;
       if (n_site!=r_site)
         ERR.General("",fname,"n_site(%d)!=r_site(%d)\n",n_site,r_site);
//       VRB.Flow("",fname,"n_site=%d r_site=%d",n_site,r_site);
       if(r_site>0){
         recv_mem[dir][ig]  = QMP_declare_strided_array_msgmem(addr,blksize,numblk,stride,r_site);
         recv_h[dir][ig] = QMP_declare_receive_relative(recv_mem[dir][ig],dir%4,-sign,0);
       }


    } // dir
  } // ig
  for(int dir=0;dir<8;dir++) 
     VRB.Flow("",fname,"wfm_blocks[%d]=%d",dir,wfm_blocks[dir]);
  pir = CoorT()%2;
//  pir = 0;
  initted=1;
  wilson_initted=0;
} 

#if 0
   for(int dir=0;dir<8;dir++)
     for ( index=0; index <wfm_numchunk[dir];index++){
           Float *tmp_p = wfm_send_ad[dir+8*index];
           if ( (*tmp_p)*(*tmp_p) >0.0001)
           printf("Node %d: wfm_send_ad[%d][%d]=%e\n",UniqueID(),dir,index,*tmp_p); 
   }
#endif

  int dir_g=4;
  for(int ig=0; ig<1; ig++){
    for(index=0;index<8;index++){
           QMP_start(send_h[index][ig]); QMP_start(recv_h[index][ig]);
    }
    for(index=0;index<8;index++){
         QMP_wait(send_h[index][ig]); QMP_wait(recv_h[index][ig]);
    }
  }

#if 0
   for(int dir=0;dir<8;dir++)
     for ( index=0; index <wfm_numchunk[dir];index++){
           Float *tmp_p = wfm_recv_ad[dir+8*index];
           if ( (*tmp_p)*(*tmp_p) >1e-10)
           printf("Node %d: wfm_recv_ad[%d][%d]=%e\n",UniqueID(),dir,index,*tmp_p); 
   }
#endif
}