static bool _XMP_tca_ring_buf_recv_nowait(tca_ring_buf_t *ring_buf, packet_t *packet)
{
  psn_t local_recv_psn = ring_buf->psn_pairs[1]; //1: local_recv

  //wait until (remote_send_psn > local_recv_psn) become true
  volatile psn_t* remote_send_psn_p = &(ring_buf->psn_pairs[2]); //2: remote_send
  if(*remote_send_psn_p <= local_recv_psn){
    return false;
  }    

  //copy recved data
  int buf_pos = local_recv_psn % RING_SIZE;
  *packet = ring_buf->recv_buffer[buf_pos];

  //fprintf(stderr, "recv: psn=%ull, tag=%d, data=%d\n", local_recv_psn, *tag, *data);

  //update local_recv_psn
  ring_buf->psn_pairs[1] = ++local_recv_psn;

  if(local_recv_psn - ring_buf->last_send_local_recv_psn >= (RING_SIZE/2)){
    _XMP_tca_lock();
    //send local psn pair
    const int psn_wait_slot = _wait_slot_offset + RING_SIZE;
    ring_buf->last_send_local_recv_psn = ring_buf->psn_pairs[1];
    TCA_CHECK(tcaDescSet(ring_buf->psn_desc, _dmac_channel));
    TCA_CHECK(tcaStartDMADesc(_dmac_channel));
    TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->local_psn_pairs_handle), psn_wait_slot, _psn_wait_tag));
    TCA_CHECK(tcaWaitDMAC(_dmac_channel)); //important
    _XMP_tca_unlock();
  }

  return true;
}
static void _XMP_tca_ring_buf_finalize(tca_ring_buf_t* ring_buf)
{
  //destroy and free my buffer handles
  /* for(int i = 0; i < RING_SIZE; i++){ */
  /*   TCA_CHECK(tcaDestroyHandle(&ring_buf->local_buffer_handles[i])); */
  /* } */
  TCA_CHECK(tcaDestroyHandle(&ring_buf->local_send_buffer_handle));
  TCA_CHECK(tcaDestroyHandle(&ring_buf->local_recv_buffer_handle));

  //free my buffer
  //_XMP_free(ring_buf->buffer);
  TCA_CHECK(tcaFree(ring_buf->send_buffer, tcaMemoryCPU));
  TCA_CHECK(tcaFree(ring_buf->recv_buffer, tcaMemoryCPU));
  TCA_CHECK(tcaFree(ring_buf->psn_pairs, tcaMemoryCPU));
}
void _XMP_create_TCA_handle(void *acc_addr, _XMP_array_t *adesc)
{
  if(adesc->set_handle)
    return;

  // 64KB align ?
  long tmp = ((long)acc_addr/65536)*65536;
  if(tmp != (long)acc_addr){
    _XMP_fatal("An array is not aligned at 64KB.");
    return;
  }

  size_t size = (size_t)(adesc->type_size * adesc->total_elmts);

#if 0
  printf("[%d] tcaCreateHandle size = %d addr=%p\n", _XMP_world_rank, size, acc_addr);
#endif
  tcaHandle tmp_handle;
  TCA_CHECK(tcaCreateHandle(&tmp_handle, acc_addr, size, tcaMemoryGPU));

  adesc->tca_handle = _XMP_alloc(sizeof(tcaHandle) * _XMP_world_size);
  MPI_Allgather(&tmp_handle, sizeof(tcaHandle), MPI_BYTE,
                adesc->tca_handle, sizeof(tcaHandle), MPI_BYTE, MPI_COMM_WORLD);

  adesc->set_handle = _XMP_N_INT_TRUE;
}
static void _XMP_tca_ring_buf_recv(tca_ring_buf_t *ring_buf, int *tag, int *data)
{
  psn_t local_recv_psn = ring_buf->psn_pairs[1]; //1: local_recv
  int buf_pos = local_recv_psn % RING_SIZE;

  _XMP_tca_lock();

#ifdef _USE_NOTIFY_TO_WAIT_PACKET
  //wait notify for recv_buf
  const int buf_wait_slot = _wait_slot_offset + buf_pos;
  TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->remote_psn_pairs_handle), buf_wait_slot, _packet_wait_tag));
#else
  //wait until (remote_send_psn > local_recv_psn) become true
  volatile psn_t* remote_send_psn_p = &(ring_buf->psn_pairs[2]); //2: remote_send
  while(*remote_send_psn_p <= local_recv_psn){
    ++spin_wait_count;
    _mm_pause();
  }
#endif

  //copy recved data
  *tag = ring_buf->recv_buffer[buf_pos].tag;
  *data = ring_buf->recv_buffer[buf_pos].data;

  //fprintf(stderr, "recv: psn=%ull, tag=%d, data=%d\n", local_recv_psn, *tag, *data);

  //update local_recv_psn
  ring_buf->psn_pairs[1] = ++local_recv_psn;

  if(local_recv_psn - ring_buf->last_send_local_recv_psn >= (RING_SIZE/2)){
    //send local psn pair
    const int psn_wait_slot = _wait_slot_offset + RING_SIZE;
    ring_buf->last_send_local_recv_psn = ring_buf->psn_pairs[1];
    TCA_CHECK(tcaDescSet(ring_buf->psn_desc, _dmac_channel));
    TCA_CHECK(tcaStartDMADesc(_dmac_channel));
    TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->local_psn_pairs_handle), psn_wait_slot, _psn_wait_tag));
    TCA_CHECK(tcaWaitDMAC(_dmac_channel)); //important
  }

  //printf("psns: %llu, %llu, %llu, %llu\n", ring_buf->psn_pairs[0],ring_buf->psn_pairs[1],ring_buf->psn_pairs[2],ring_buf->psn_pairs[3]);
  _XMP_tca_unlock();
}
static void _XMP_tca_ring_buf_send(tca_ring_buf_t *ring_buf, const int tag, const int data) //postの場合 tag=post_req, packet=post_tag
{
  volatile psn_t *r_recv_psn = &(ring_buf->psn_pairs[3]);
  const unsigned long long local_send_psn = ring_buf->psn_pairs[0]; //0: local_send

  //fprintf(stderr,"send start (tag=%d, data=%d, psn=%llu)\n", tag, data, local_send_psn);
  _XMP_tca_lock();

  //相手のring bufferが空くまで待機
  while(local_send_psn - *r_recv_psn >= RING_SIZE){
    const int psn_wait_slot = _wait_slot_offset + RING_SIZE;
    TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->remote_psn_pairs_handle), psn_wait_slot, _psn_wait_tag));
  }

  const int buf_pos = local_send_psn % RING_SIZE;
  const int buf_wait_slot = _wait_slot_offset + buf_pos;

  //send_bufferに値の書き込み
  ring_buf->send_buffer[buf_pos].tag = tag;
  ring_buf->send_buffer[buf_pos].data = data;
  //update local send psn
  ring_buf->psn_pairs[0] = local_send_psn + 1;
  ring_buf->last_send_local_recv_psn = ring_buf->psn_pairs[1];
  
  // set desc
  TCA_CHECK(tcaDescSet(ring_buf->buffer_desc[buf_pos], _dmac_channel));
  // start dmac
  TCA_CHECK(tcaStartDMADesc(_dmac_channel));

  // wait put locally
  TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->local_send_buffer_handle), buf_wait_slot, _packet_wait_tag));

  //wait dmac
  TCA_CHECK(tcaWaitDMAC(_dmac_channel)); //important

  //fprintf(stderr,"send end (tag=%d, data=%d, psn=%llu)\n", tag, data, local_send_psn);

  _XMP_tca_unlock();
}
/**
   Initialize TCA
*/
void _XMP_tca_initialize(int argc, char **argv)
{
  if(_XMP_world_rank == 0){
    fprintf(stderr, "TCA Library Version = %s\n", TCA_LIB_VERSION);
  }

  if(_XMP_world_size > 16)
    _XMP_fatal("TCA reflect has been not implemented in 16 more than nodes.");

  _XMP_tca_lock();
  /* TCA_CHECK(tcaInit()); */

  //this is probably unnecessary
  TCA_CHECK(tcaDMADescInt_Init()); // Initialize Descriptor (Internal Memory) Mode
  
  _XMP_tca_comm_init();
  create_comm_thread();
  _XMP_tca_unlock();
}
static void _XMP_tca_ring_buf_init(tca_ring_buf_t* ring_buf, int remote_rank)
{
  ring_buf->remote_rank = remote_rank;
  
  //alloc psn
  size_t psn_size = sizeof(psn_t);
  TCA_CHECK(tcaMalloc((void**)&ring_buf->psn_pairs, psn_size * 4, tcaMemoryCPU));
  //fprintf(stderr, "psn_pairs (%p)\n", ring_buf->psn_pairs);
  memset(ring_buf->psn_pairs, 0, psn_size * 4);

  //create psn handle
  TCA_CHECK(tcaCreateHandle(&(ring_buf->local_psn_pairs_handle), ring_buf->psn_pairs, psn_size * 4, tcaMemoryCPU));

  //exchange psn handle
  MPI_Sendrecv(&(ring_buf->local_psn_pairs_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       &(ring_buf->remote_psn_pairs_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  {
    tcaDesc* desc = tcaDescNew();
    const int psn_wait_slot = _wait_slot_offset + RING_SIZE;
    TCA_CHECK(tcaDescSetMemcpy(desc,
			       &(ring_buf->remote_psn_pairs_handle), psn_size * 2,
			       &(ring_buf->local_psn_pairs_handle), 0,
			       psn_size * 2,
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify,
			       psn_wait_slot,
			       _psn_wait_tag));
    ring_buf->psn_desc = desc;
  }

  //clear last_send_local_recv_psn
  ring_buf->last_send_local_recv_psn = 0;
  
  //alloc buffer
  size_t packet_size = sizeof(packet_t);
  TCA_CHECK(tcaMalloc((void**)&ring_buf->send_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  TCA_CHECK(tcaMalloc((void**)&ring_buf->recv_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  memset(ring_buf->send_buffer, 0, packet_size * RING_SIZE);
  memset(ring_buf->recv_buffer, 0, packet_size * RING_SIZE);
  
  //create buffer handles
  /* for(int i = 0; i < RING_SIZE; i++){ */
  /*   TCA_CHECK(tcaCreateHandle(&(ring_buf->local_buffer_handles[i]), &(ring_buf->buffer[i]), packet_size, tcaMemoryCPU)); */
  /* } */
  TCA_CHECK(tcaCreateHandle(&(ring_buf->local_send_buffer_handle), ring_buf->send_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  TCA_CHECK(tcaCreateHandle(&(ring_buf->local_recv_buffer_handle), ring_buf->recv_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  
  //exchange buffer handles
  MPI_Sendrecv(&(ring_buf->local_send_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       &(ring_buf->remote_send_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  MPI_Sendrecv(&(ring_buf->local_recv_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       &(ring_buf->remote_recv_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  for(int i = 0; i < RING_SIZE; i++){
    tcaDesc* desc = tcaDescNew();
    int buf_wait_slot = _wait_slot_offset + i;
    TCA_CHECK(tcaDescSetMemcpy(desc,
			       &(ring_buf->remote_recv_buffer_handle), packet_size * i,
			       &(ring_buf->local_send_buffer_handle), packet_size * i,
			       packet_size,
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal,
			       //tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify,
			       buf_wait_slot,
			       _packet_wait_tag));
    TCA_CHECK(tcaDescSetMemcpy(desc,
			       &(ring_buf->remote_psn_pairs_handle), psn_size * 2,
			       &(ring_buf->local_psn_pairs_handle), 0,
			       psn_size * 2,
#ifdef _USE_NOTIFY_TO_WAIT_PACKET
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify,
			       //tcaDMAUseInternal | tcaDMAUseNotifyInternal,
#else
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf,
#endif
			       buf_wait_slot,
			       _packet_wait_tag));
    ring_buf->buffer_desc[i] = desc;
  }
}