/*
 * Unique numeric SMP-node identifier.
 *
 * With BGL_GROUP_ON_NODEBOARD defined, the id is taken from bits 6..18
 * of mybgl.location.  Otherwise it is built from the pset number, the
 * number of nodes per pset and the rank inside the pset; in virtual
 * node mode that index is doubled and the processor id is added.
 */
long vt_pform_node_id() {
#ifdef BGL_GROUP_ON_NODEBOARD
  return ((mybgl.location >> 6) & 0x1fff);
#else
  /* not in virtual node mode: the linear node index is the id */
  if ( !BGLPersonality_virtualNodeMode(&mybgl) )
    return BGLPersonality_psetNum(&mybgl) *
           BGLPersonality_numNodesInPset(&mybgl) +
           BGLPersonality_rankInPset(&mybgl);

  /* virtual node mode: 2 * node index + processor id */
  return ( BGLPersonality_psetNum(&mybgl) *
           BGLPersonality_numNodesInPset(&mybgl) +
           BGLPersonality_rankInPset(&mybgl)) * 2
           + rts_get_processor_id();
#endif
}
/*
 * Unique string SMP-node identifier.
 *
 * With BGL_GROUP_ON_NODEBOARD defined, the string is produced by
 * bgl_getNodeidString(); otherwise it has the form
 * "node-XXX-YYY-ZZZ-P", built from the x/y/z coordinates in the
 * personality and the processor id.
 *
 * The returned pointer refers to a function-local static buffer that
 * is rewritten on every call; the caller must not free it.
 */
char* vt_pform_node_name() {
#ifdef BGL_GROUP_ON_NODEBOARD
  static char buf[BGLPERSONALITY_MAX_LOCATION];
  bgl_getNodeidString(&mybgl, buf);
  return buf;
#else
  static char node[128];
  unsigned x = BGLPersonality_xCoord(&mybgl);
  unsigned y = BGLPersonality_yCoord(&mybgl);
  unsigned z = BGLPersonality_zCoord(&mybgl);

  /* x, y, z are unsigned, so they are printed with %u; snprintf keeps
     the write inside the static buffer */
  snprintf(node, sizeof node, "node-%03u-%03u-%03u-%d",
           x, y, z, rts_get_processor_id());

  /* -- BGL internal location string
  static char buf[BGLPERSONALITY_MAX_LOCATION];
  BGLPersonality_getLocationString(&mybgl, buf);
  -- */
  return node;
#endif
}
Beispiel #3
0
//------------------------------------------------------------------
// Initializes the inter-core locks (ic_locks) of the 4 memory fifos
// to 0, indicating that the fifos are empty.
//
// The core whose processor id is 0 clears MFIFO_A and MFIFO_D, the
// other core clears MFIFO_B and MFIFO_C. Afterwards the caller
// passes the local application barrier.
//------------------------------------------------------------------
void BGLCPSTorusMFifo_Init (void)
{
  // Quad-aligned staging buffer handed to QuadMove. It is zeroed
  // completely so that no indeterminate bytes are copied.
  char ic_lock[16] __attribute__((aligned(BGL_QUAD_ALIGNSIZE))) = {0};

  // Get core id
  int pir = rts_get_processor_id();

  // Each core clears its own pair of fifos. This must be a comparison:
  // an assignment (pir = 0) is always false and would make both cores
  // take the else branch, leaving MFIFO_A and MFIFO_D untouched.
  if(pir == 0){
    QuadMove(&ic_lock,MFIFO_A,0);
    QuadMove(&ic_lock,MFIFO_D,0);
  } else {
    QuadMove(&ic_lock,MFIFO_B,0);
    QuadMove(&ic_lock,MFIFO_C,0);
  }

  // Local barrier
  BGL_Barrier_Pass(BGL_AppBarriers);
}
Beispiel #4
0
// Initializes the QMP message-passing layer and fills in the global
// machine description used by the rest of the code: peRank, peNum,
// NDIM, peGrid and pePos. On return `initialized` is set to true.
//
//   argc, argv : pointers to the program arguments; they are passed
//                through to QMP_init_msg_passing().
//
// When UNIFORM_SEED_NO_COMMS is defined no QMP call is made and a
// single-node setup (peRank=0, peNum=1, NDIM=4) is used instead.
//
// If declaring the logical topology fails, QMP_error() is called and
// the process exits with status -4.
void init_qmp(int * argc, char ***argv) {

#if 0
  printf("init_qmp(%d %p)\n",*argc,*argv);
  for(int i = 0; i<*argc;i++){
    printf("argv[%d](before)=%s\n",i,(*argv)[i]); 
  }
#endif

#if 0
   spi_init();
#endif
  
    // Thread level actually provided by QMP (output of the init call).
    QMP_thread_level_t prv;
#ifndef UNIFORM_SEED_NO_COMMS
    // Request single-threaded operation.
    QMP_status_t init_status = QMP_init_msg_passing(argc, argv, QMP_THREAD_SINGLE, &prv);
    if (init_status) printf("QMP_init_msg_passing returned %d\n",init_status);
    peRank = QMP_get_node_number();
    peNum = QMP_get_number_of_nodes();
    // NOTE(review): on rank 0 a non-zero status is printed twice
    // (above and here) -- confirm whether both prints are wanted.
    if(!peRank)printf("QMP_init_msg_passing returned %d\n",init_status);

    // A failed init is reported through QMP_error(); execution
    // continues afterwards unless QMP_error() itself terminates.
    if (init_status != QMP_SUCCESS) {
      QMP_error("%s\n",QMP_error_string(init_status));
    }

    // check QMP thread level
    // Added by Hantao
    if(peRank == 0) {
        switch(prv) {
        case QMP_THREAD_SINGLE:
            printf("QMP thread level = QMP_THREAD_SINGLE\n");
            break;
        case QMP_THREAD_FUNNELED:
            printf("QMP thread level = QMP_THREAD_FUNNELED\n");
            break;
        case QMP_THREAD_SERIALIZED:
            printf("QMP thread level = QMP_THREAD_SERIALIZED\n");
            break;
        case QMP_THREAD_MULTIPLE:
            printf("QMP thread level = QMP_THREAD_MULTIPLE\n");
            break;
        default:
            printf("QMP thread level = no idea what this is, boom!\n");
        }
    }

    //Check to make sure that this machine is a GRID machine
    //Exit if not GRID machine
    // NOTE(review): qmp_type is only consumed by the check further
    // down, which is currently disabled with #if 0.
    QMP_ictype qmp_type = QMP_get_msg_passing_type();

    //Get information about the allocated machine
    peNum = QMP_get_number_of_nodes();
    NDIM = QMP_get_allocated_number_of_dimensions();
    peGrid = QMP_get_allocated_dimensions();
    pePos = QMP_get_allocated_coordinates();

    // Rank 0 echoes the argument list as it looks after QMP init.
    if(peRank==0){
      for(int i = 0; i<*argc;i++){
        printf("argv[%d])(after)=%s\n",i,(*argv)[i]); 
      }
    }
#else
    // No communications: pretend to be a single node in 4 dimensions.
    // NOTE(review): peGrid and pePos are not assigned in this branch
    // but are read below -- presumably set elsewhere; confirm.
    QMP_status_t init_status = QMP_SUCCESS;
    peRank=0;
    peNum=1;
    NDIM=4;
#endif

//#if (TARGET == BGL) || (TARGET == BGP)
  // With more than 5 allocated dimensions, the node count is reduced
  // to the product of the first 5 grid extents and the rank is
  // wrapped into that range.
  if (NDIM>5){
    peNum = 1;
    for(int i = 0;i<5;i++)
	peNum *= peGrid[i];
    peRank = peRank % peNum;
  }
  // Only nodes whose coordinate is below 2 in every dimension print
  // the summary, which limits the amount of output.
  int if_print=1;
  for(int i = 0;i<NDIM;i++)
  if (pePos[i]>=2) if_print=0;

  if (if_print){
      printf("Rank=%d Num=%d NDIM=%d\n",peRank,peNum,NDIM);
      printf("dim:");
      for(int i = 0;i<NDIM;i++)
        printf(" %d",peGrid[i]);
      printf("\n");
      printf("pos:");
      for(int i = 0;i<NDIM;i++)
        printf(" %d",pePos[i]);
      printf("\n");

#if 0
    int rc;
    BGLPersonality pers;
    rts_get_personality(&pers, sizeof(pers));
    printf("from personality: %d %d %d %d\n",pers.xCoord,pers.yCoord,pers.zCoord,rts_get_processor_id());
#endif
  }


//     printf("from personality:\n");

#if 0
    if ( (qmp_type!= QMP_GRID) && (qmp_type !=QMP_MESH)  ) {
      QMP_error("CPS on QMP only implemented for GRID or MESH, not (%d) machines\n",qmp_type);
    }
#endif

//     printf("QMP_declare_logical_topology(peGrid, NDIM)\n");
#ifndef UNIFORM_SEED_NO_COMMS
    //Declare the logical topology (Redundant for GRID machines)
    if (QMP_declare_logical_topology(peGrid, NDIM) != QMP_SUCCESS) {
      QMP_error("Node %d: Failed to declare logical topology\n",peRank);
      exit(-4);
    }
#endif
    initialized = true;
  // Printed by every rank, not only rank 0.
  printf("Rank=%d init_qmp() done\n",peRank);
    
  }
Beispiel #5
0
//------------------------------------------------------------------
// Various initializations
// Here it is assumed that the TLBs for cores 0, 1 are set
// so that the addresses below translate appropriately.
//------------------------------------------------------------------
void BGLCPSVarious_Init (void)
{
  int pir;

  //----------------------------------------------------------------
  // get the core id
  //----------------------------------------------------------------
  pir = rts_get_processor_id();

  //----------------------------------------------------------------
  // Set the addresses where the sender status are stored
  //----------------------------------------------------------------
  if(pir == 0){
    stat_se_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[6] = MFIFO_C0_SEND_P;
    stat_se_ptr[7] = MFIFO_C0_SEND_M;
  } else {
    stat_se_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_se_ptr[6] = MFIFO_C1_SEND_P;
    stat_se_ptr[7] = MFIFO_C1_SEND_M;
  }

  //----------------------------------------------------------------
  // Set the addresses where the receiver status are stored
  //----------------------------------------------------------------
  if(pir == 0){
    stat_re_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[6] = MFIFO_C0_RECV_P;
    stat_re_ptr[7] = MFIFO_C0_RECV_M;
  } else {
    stat_re_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_STATUS0_OFFSET);
    stat_re_ptr[6] = MFIFO_C1_RECV_P;
    stat_re_ptr[7] = MFIFO_C1_RECV_M;
  }

  //----------------------------------------------------------------
  // Set the offset within the status quad-word value,
  // addressed in Bytes, where the 1 Byte status for a given
  // sender is located. The possible offsets are 0, 1, ...15.
  //----------------------------------------------------------------
  stat_se[0] = 7; // status of recv fifo 0
  stat_se[1] = 7; // status of recv fifo 1
  stat_se[2] = 8; // status of recv fifo 2
  stat_se[3] = 8; // status of recv fifo 3
  stat_se[4] = 9; // status of recv fifo 4
  stat_se[5] = 9; // status of recv fifo 5
  stat_se[6] = 0; // status of memory recv fifo 6
  stat_se[7] = 0; // status of memory recv fifo 7

  //----------------------------------------------------------------
  // Set the offset within the status quad-word value,
  // addressed in Bytes, where the 1 Byte status for a given
  // receiver is located. The possible offsets are 0, 1, ...15.
  //----------------------------------------------------------------
  stat_re[0] = 0; // status of recv fifo 0
  stat_re[1] = 1; // status of recv fifo 1
  stat_re[2] = 2; // status of recv fifo 2
  stat_re[3] = 3; // status of recv fifo 3
  stat_re[4] = 4; // status of recv fifo 4
  stat_re[5] = 5; // status of recv fifo 5
  stat_re[6] = 0; // status of memory recv fifo 6
  stat_re[7] = 0; // status of memory recv fifo 7


  //----------------------------------------------------------------
  // Set the sender fifo addresses.
  //----------------------------------------------------------------
  if(pir == 0){
    fifo_se_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_0_OFFSET);
    fifo_se_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_0_OFFSET);
    fifo_se_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_1_OFFSET);
    fifo_se_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_1_OFFSET);
    fifo_se_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_2_OFFSET);
    fifo_se_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_2_OFFSET);
    fifo_se_ptr[6] = MFIFO_C0_SEND_P + 1;
    fifo_se_ptr[7] = MFIFO_C0_SEND_M + 1;
  } else {
    fifo_se_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_0_OFFSET);
    fifo_se_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_0_OFFSET);
    fifo_se_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_1_OFFSET);
    fifo_se_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_1_OFFSET);
    fifo_se_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_2_OFFSET);
    fifo_se_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAIN_2_OFFSET);
    fifo_se_ptr[6] = MFIFO_C1_SEND_P + 1;
    fifo_se_ptr[7] = MFIFO_C1_SEND_M + 1;
  }

  //----------------------------------------------------------------
  // Set the receiver fifo addresses.
  //----------------------------------------------------------------
  if(pir == 0){
    fifo_re_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_0_OFFSET);
    fifo_re_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_1_OFFSET);
    fifo_re_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_2_OFFSET);
    fifo_re_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_3_OFFSET);
    fifo_re_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_4_OFFSET);
    fifo_re_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_5_OFFSET);
    fifo_re_ptr[6] = MFIFO_C0_RECV_P + 1;
    fifo_re_ptr[7] = MFIFO_C0_RECV_M + 1;
  } else {
    fifo_re_ptr[0] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_0_OFFSET);
    fifo_re_ptr[1] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_1_OFFSET);
    fifo_re_ptr[2] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_2_OFFSET);
    fifo_re_ptr[3] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_3_OFFSET);
    fifo_re_ptr[4] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_4_OFFSET);
    fifo_re_ptr[5] = (BGLQuad *) (BGL_MEM_TORUS_G0_BASE+BGL_MEM_TORUS_DATAOUT_5_OFFSET);
    fifo_re_ptr[6] = MFIFO_C1_RECV_P + 1;
    fifo_re_ptr[7] = MFIFO_C1_RECV_M + 1;
  }

  for(int i=0; i<8; i++){
    send_poll[i] = 0;
    recv_poll[i] = 0;
  }


}
Beispiel #6
0
//------------------------------------------------------------------
// Fills the buffer hdr_send_buf with 8x3=24 packet headers: one for
// each of x+, x-, y+, y-, z+, z-, t+, t- and, for each of those, one
// per packet size (32B, 128B, 256B). The headers for t+ and t- are
// filled with 0, since they are not used by the memory
// communications.
//
// For the six torus directions the matching hint bit is set and the
// destination is the nearest neighbor in that direction, wrapping
// around at the edges of the partition.
//
// Should be called once before any other routines in this file are
// used.
//------------------------------------------------------------------
void BGLCPSTorusPacketHeader_InitFill (void)
{
  // Size argument passed for the three header variants of a direction.
  static const int size_code[3] = { 0, 3, 7 };

  BGLPersonality pers;

  // Core id, then the personality of this node.
  const int pir = rts_get_processor_id();
  rts_get_personality(&pers, sizeof(pers));

  // Extent of the partition in each direction (counted from 1).
  const int Lx = pers.xSize;
  const int Ly = pers.ySize;
  const int Lz = pers.zSize;

  // Coordinates of this node (0 .. extent-1).
  const int x = pers.xCoord;
  const int y = pers.yCoord;
  const int z = pers.zCoord;

  // Nearest-neighbor coordinates with periodic wrap-around.
  const int x_plus  = (x == Lx-1) ? 0    : x+1;
  const int x_minus = (x == 0)    ? Lx-1 : x-1;
  const int y_plus  = (y == Ly-1) ? 0    : y+1;
  const int y_minus = (y == 0)    ? Ly-1 : y-1;
  const int z_plus  = (z == Lz-1) ? 0    : z+1;
  const int z_minus = (z == 0)    ? Lz-1 : z-1;

  // Destination coordinates, indexed by direction
  // (x+, x-, y+, y-, z+, z-).
  const int dest[6][3] = {
    { x_plus,  y,       z       },
    { x_minus, y,       z       },
    { x,       y_plus,  z       },
    { x,       y_minus, z       },
    { x,       y,       z_plus  },
    { x,       y,       z_minus },
  };

  // Torus directions: exactly one hint bit set, namely the one of the
  // direction the header belongs to.
  for(int dir=0; dir<6; dir++){
    for(int s=0; s<3; s++){
      BGLCPSTorusPacketHeader_Init(&(hdr_send_buf[dir][s]),
                                   (dir == 0) ? 1 : 0,
                                   (dir == 1) ? 1 : 0,
                                   (dir == 2) ? 1 : 0,
                                   (dir == 3) ? 1 : 0,
                                   (dir == 4) ? 1 : 0,
                                   (dir == 5) ? 1 : 0,
                                   dest[dir][0], dest[dir][1], dest[dir][2],
                                   pir, size_code[s]);
    }
  }

  // Dummy all-zero headers for t+ (6) and t- (7).
  for(int dir=6; dir<8; dir++){
    for(int s=0; s<3; s++){
      BGLCPSTorusPacketHeader_Init(&(hdr_send_buf[dir][s]), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    }
  }
}
Beispiel #7
0
//------------------------------------------------------------------
// The Wilson fermion communication routine.
//
// Runs two passes over the chunks, in groups of `group` chunks. In
// each pass the calling core (id from rts_get_processor_id(),
// presumably 0 or 1) sends on three of the directions 0..5 through
// the torus fifos and receives on the other three; directions 6 and
// 7 (labelled t below) go through the memory fifos. The second pass
// uses the directions of the first pass with send and receive
// swapped.
//
// Before every transfer the routine busy-waits on the one-byte fifo
// status read with QuadMove, counting the extra polls in send_poll[]
// and recv_poll[]. For a direction with grid_end[dir] == 1 nothing is
// sent or received; instead the first 12 IFloat of the corresponding
// receive buffer are set to 0.
//
// NOTE(review): the outer loops run wfm_max_numchunk/group times
// (integer division), so wfm_max_numchunk is presumably a multiple
// of group -- confirm.
//------------------------------------------------------------------
void wfm_comm()
{
  int i, k, dir, ig, ic, group, d;
  // Index tables, looked up with wfm_dir = bgl_cps_dir[dir]:
  // mu_se / mu_re select the slot in wfm_send_ad / wfm_recv_ad,
  // mu_nc selects the entry of wfm_numchunk.
  int mu_se[8];
  int mu_re[8];
  int mu_nc[8];
  BGLQuad *fifo;
  BGLQuad *qdata;
  // Quad-aligned landing area for the status quad-word.
  char stat[16]        __attribute__((aligned(BGL_QUAD_ALIGNSIZE)));
  int pir;
  int wfm_dir;

  //  register int u asm("r9") = 16;
  // NOTE(review): u is not referenced in this function's visible code;
  // presumably the *_SPINOR macros use it -- confirm.
  register int u = 16;

  // Get the core id 
  pir = rts_get_processor_id();

  mu_se[0] = 0;
  mu_se[1] = 4;
  mu_se[2] = 1;
  mu_se[3] = 5;
  mu_se[4] = 2;
  mu_se[5] = 6;
  mu_se[6] = 3;
  mu_se[7] = 7;

  mu_re[0] = 4;
  mu_re[1] = 0;
  mu_re[2] = 5;
  mu_re[3] = 1;
  mu_re[4] = 6;
  mu_re[5] = 2;
  mu_re[6] = 7;
  mu_re[7] = 3;

  mu_nc[0] = 0;
  mu_nc[1] = 4;
  mu_nc[2] = 1;
  mu_nc[3] = 5;
  mu_nc[4] = 2;
  mu_nc[5] = 6;
  mu_nc[6] = 3;
  mu_nc[7] = 7;

  // Number of chunks handled per send/receive round.
  group = 4;

  //------------------------------------------------------------------------
  // Send plus receive minus x y z
  //------------------------------------------------------------------------
  for(ig=0; ig<wfm_max_numchunk/group; ig++){

    //Send x y z
    //----------------------------------------------------------------------
    for(ic=0;ic<group;ic++){
      i = group*ig + ic;
      QuadMove(stat_se_ptr[0], &stat, 30);
      for(d=0; d<3; d++){
	// this core sends on directions pir, 2+pir, 4+pir
	dir = 2*d + pir;
	wfm_dir = bgl_cps_dir[dir];
	if( (i < wfm_numchunk[mu_nc[wfm_dir]]) && (grid_end[dir] != 1)   ){
	  // wait until the sender status byte drops below SEND_FIFO_LEVEL
	  QuadMove(stat_se_ptr[dir], &stat, 30);
	  while (1) {
	    if (stat[stat_se[dir]] < SEND_FIFO_LEVEL) { 
	      break;
	    }
	    send_poll[dir]++;
	    QuadMove(stat_se_ptr[dir], &stat, 30);
	  }
	  fifo = fifo_se_ptr[dir];
	  qdata = (BGLQuad *) wfm_send_ad[mu_se[wfm_dir]+8*i];
	  TORUS_SEND_SPINOR(dir, fifo, qdata);
	}
      }
    }

    //    printf("send xyz 0\n");


    //Send / Receive t
    //----------------------------------------------------------------------

    for(ic=0;ic<group;ic++){
      i = group*ig + ic;

      // Send t
      dir = 6;
      wfm_dir = bgl_cps_dir[dir];
      {
	if( (i < wfm_numchunk[mu_nc[wfm_dir]]) && (grid_end[dir] != 1)   ){
	  while (1) {
	    QuadMove(stat_se_ptr[dir], &stat, 30);
	    if (stat[stat_se[dir]] < SEND_FIFO_LEVEL) { 
	      break;
	    }
	    send_poll[dir]++;
	  }
	  fifo = fifo_se_ptr[dir];
	  qdata = (BGLQuad *) wfm_send_ad[mu_se[wfm_dir]+8*i];
	  MEM_SEND_SPINOR(dir, fifo, qdata);
	}
      }     
      
      // Receive t
      dir = 7;
      wfm_dir = bgl_cps_dir[dir];
      {
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] != 1)   ){
	  // wait until the receiver status byte exceeds RECV_FIFO_LEVEL
	  while (1) {
	    QuadMove(stat_re_ptr[dir], &stat, 30);
	    if (stat[stat_re[dir]] > RECV_FIFO_LEVEL) { 
	      break;
	    }
	    recv_poll[dir]++;
	  }
	  fifo = fifo_re_ptr[dir];
	  qdata = (BGLQuad *) wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  MEM_RECV_SPINOR(dir, fifo, qdata);
	}
	// grid end: nothing arrives, so zero the receive buffer instead
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] == 1)   ){
	  IFloat *data = wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  for(k=0; k<12; k++){
	    data[k] = 0;
	  }
	}
      }
    }


    //    printf("send receive t 0\n");


    // Recv x y z
    //----------------------------------------------------------------------
    for(ic=0;ic<group;ic++){
      i = group*ig + ic;
      QuadMove(stat_re_ptr[0], &stat, 30);
      for(d=0; d<3; d++){
	// receive on the directions this core did not send on
	dir = 2*d + (1+pir)%2;
	wfm_dir = bgl_cps_dir[dir];
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] != 1)   ){
          QuadMove(stat_re_ptr[dir], &stat, 30);
	  while (1) {
	    if (stat[stat_re[dir]] > RECV_FIFO_LEVEL) { 
	      break;
	    }
	    recv_poll[dir]++;
	    QuadMove(stat_re_ptr[dir], &stat, 30);
	  }
	  fifo = fifo_re_ptr[dir];
	  qdata = (BGLQuad *) wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  TORUS_RECV_SPINOR(dir, fifo, qdata);
	}
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] == 1)   ){
	  IFloat *data = wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  for(k=0; k<12; k++){
	    data[k] = 0;
	  }
	} 
      }
    }

    //    printf("recv xyz 0\n");
  }



  //------------------------------------------------------------------------
  // Send minus receive plus x,y,z
  //------------------------------------------------------------------------
  // Same structure as the first pass with the send and receive
  // directions exchanged.
  for(ig=0; ig<wfm_max_numchunk/group; ig++){

    //    time_4[ig] = BGLTimebase();
    //    poll_count_4[ig] = poll_count;
    
    //Send x y z
    //----------------------------------------------------------------------
    for(ic=0;ic<group;ic++){
      i = group*ig + ic;
      QuadMove(stat_se_ptr[0], &stat, 30);
      for(d=0; d<3; d++){
	dir = 2*d + (1+pir)%2;
	wfm_dir = bgl_cps_dir[dir];
	if( (i < wfm_numchunk[mu_nc[wfm_dir]]) && (grid_end[dir] != 1)   ){
	  QuadMove(stat_se_ptr[dir], &stat, 30);
	  while (1) {
	    if (stat[stat_se[dir]] < SEND_FIFO_LEVEL) { 
	      break;
	    }
	    send_poll[dir]++;
	    QuadMove(stat_se_ptr[dir], &stat, 30);
	  }
	  fifo = fifo_se_ptr[dir];
	  qdata = (BGLQuad *) wfm_send_ad[mu_se[wfm_dir]+8*i];
	  TORUS_SEND_SPINOR(dir, fifo, qdata);
	}
      }
    }

    //    printf("send xyz 1\n");
 
    //Send / Receive t
    //----------------------------------------------------------------------

    for(ic=0;ic<group;ic++){
      i = group*ig + ic;

      // Send t
      dir = 7;
      wfm_dir = bgl_cps_dir[dir];
      {
	if( (i < wfm_numchunk[mu_nc[wfm_dir]]) && (grid_end[dir] != 1)   ){
	  while (1) {
	    QuadMove(stat_se_ptr[dir], &stat, 30);
	    if (stat[stat_se[dir]] < SEND_FIFO_LEVEL) { 
	      break;
	    }
	    send_poll[dir]++;
	  }
	  fifo = fifo_se_ptr[dir];
	  qdata = (BGLQuad *) wfm_send_ad[mu_se[wfm_dir]+8*i];
	  MEM_SEND_SPINOR(dir, fifo, qdata);
	}
      }     
      
      // Receive t
      dir = 6;
      wfm_dir = bgl_cps_dir[dir];
      {
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] != 1)   ){
	  while (1) {
	    QuadMove(stat_re_ptr[dir], &stat, 30);
	    if (stat[stat_re[dir]] > RECV_FIFO_LEVEL) { 
	      break;
	    }
	    recv_poll[dir]++;
	  }
	  fifo = fifo_re_ptr[dir];
	  qdata = (BGLQuad *) wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  MEM_RECV_SPINOR(dir, fifo, qdata);
	}
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] == 1)   ){
	  IFloat *data = wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  for(k=0; k<12; k++){
	    data[k] = 0;
	  }
	}
      }
    }

    //    printf("send recv t 1\n");

    // Recv x y z
    //----------------------------------------------------------------------
    for(ic=0;ic<group;ic++){
      i = group*ig + ic;
      QuadMove(stat_re_ptr[0], &stat, 30);
      for(d=0; d<3; d++){
	dir = 2*d + pir;
	wfm_dir = bgl_cps_dir[dir];
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] != 1)   ){
	  QuadMove(stat_re_ptr[dir], &stat, 30);
	  while (1) {
	    if (stat[stat_re[dir]] > RECV_FIFO_LEVEL) { 
	      break;
	    }
	    recv_poll[dir]++;
	    QuadMove(stat_re_ptr[dir], &stat, 30);
	  }
	  fifo = fifo_re_ptr[dir];
	  qdata = (BGLQuad *) wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  TORUS_RECV_SPINOR(dir, fifo, qdata);
	}
	if(i < wfm_numchunk[mu_nc[wfm_dir]] && (grid_end[dir] == 1)   ){
	  IFloat *data = wfm_recv_ad[mu_re[wfm_dir]+8*i];
	  for(k=0; k<12; k++){
	    data[k] = 0;
	  }
	} 
      }
    }
    //    printf("recv xyz 1\n");

  }

}