Пример #1
0
void _XMP_create_TCA_handle(void *acc_addr, _XMP_array_t *adesc)
{
  if(adesc->set_handle)
    return;

  // 64KB align ?
  long tmp = ((long)acc_addr/65536)*65536;
  if(tmp != (long)acc_addr){
    _XMP_fatal("An array is not aligned at 64KB.");
    return;
  }

  size_t size = (size_t)(adesc->type_size * adesc->total_elmts);

#if 0
  printf("[%d] tcaCreateHandle size = %d addr=%p\n", _XMP_world_rank, size, acc_addr);
#endif
  tcaHandle tmp_handle;
  TCA_CHECK(tcaCreateHandle(&tmp_handle, acc_addr, size, tcaMemoryGPU));

  adesc->tca_handle = _XMP_alloc(sizeof(tcaHandle) * _XMP_world_size);
  MPI_Allgather(&tmp_handle, sizeof(tcaHandle), MPI_BYTE,
                adesc->tca_handle, sizeof(tcaHandle), MPI_BYTE, MPI_COMM_WORLD);

  adesc->set_handle = _XMP_N_INT_TRUE;
}
static void _XMP_tca_ring_buf_init(tca_ring_buf_t* ring_buf, int remote_rank)
{
  ring_buf->remote_rank = remote_rank;
  
  //alloc psn
  size_t psn_size = sizeof(psn_t);
  TCA_CHECK(tcaMalloc((void**)&ring_buf->psn_pairs, psn_size * 4, tcaMemoryCPU));
  //fprintf(stderr, "psn_pairs (%p)\n", ring_buf->psn_pairs);
  memset(ring_buf->psn_pairs, 0, psn_size * 4);

  //create psn handle
  TCA_CHECK(tcaCreateHandle(&(ring_buf->local_psn_pairs_handle), ring_buf->psn_pairs, psn_size * 4, tcaMemoryCPU));

  //exchange psn handle
  MPI_Sendrecv(&(ring_buf->local_psn_pairs_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       &(ring_buf->remote_psn_pairs_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  {
    tcaDesc* desc = tcaDescNew();
    const int psn_wait_slot = _wait_slot_offset + RING_SIZE;
    TCA_CHECK(tcaDescSetMemcpy(desc,
			       &(ring_buf->remote_psn_pairs_handle), psn_size * 2,
			       &(ring_buf->local_psn_pairs_handle), 0,
			       psn_size * 2,
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify,
			       psn_wait_slot,
			       _psn_wait_tag));
    ring_buf->psn_desc = desc;
  }

  //clear last_send_local_recv_psn
  ring_buf->last_send_local_recv_psn = 0;
  
  //alloc buffer
  size_t packet_size = sizeof(packet_t);
  TCA_CHECK(tcaMalloc((void**)&ring_buf->send_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  TCA_CHECK(tcaMalloc((void**)&ring_buf->recv_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  memset(ring_buf->send_buffer, 0, packet_size * RING_SIZE);
  memset(ring_buf->recv_buffer, 0, packet_size * RING_SIZE);
  
  //create buffer handles
  /* for(int i = 0; i < RING_SIZE; i++){ */
  /*   TCA_CHECK(tcaCreateHandle(&(ring_buf->local_buffer_handles[i]), &(ring_buf->buffer[i]), packet_size, tcaMemoryCPU)); */
  /* } */
  TCA_CHECK(tcaCreateHandle(&(ring_buf->local_send_buffer_handle), ring_buf->send_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  TCA_CHECK(tcaCreateHandle(&(ring_buf->local_recv_buffer_handle), ring_buf->recv_buffer, packet_size * RING_SIZE, tcaMemoryCPU));
  
  //exchange buffer handles
  MPI_Sendrecv(&(ring_buf->local_send_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       &(ring_buf->remote_send_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  MPI_Sendrecv(&(ring_buf->local_recv_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       &(ring_buf->remote_recv_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  for(int i = 0; i < RING_SIZE; i++){
    tcaDesc* desc = tcaDescNew();
    int buf_wait_slot = _wait_slot_offset + i;
    TCA_CHECK(tcaDescSetMemcpy(desc,
			       &(ring_buf->remote_recv_buffer_handle), packet_size * i,
			       &(ring_buf->local_send_buffer_handle), packet_size * i,
			       packet_size,
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal,
			       //tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify,
			       buf_wait_slot,
			       _packet_wait_tag));
    TCA_CHECK(tcaDescSetMemcpy(desc,
			       &(ring_buf->remote_psn_pairs_handle), psn_size * 2,
			       &(ring_buf->local_psn_pairs_handle), 0,
			       psn_size * 2,
#ifdef _USE_NOTIFY_TO_WAIT_PACKET
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify,
			       //tcaDMAUseInternal | tcaDMAUseNotifyInternal,
#else
			       tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf,
#endif
			       buf_wait_slot,
			       _packet_wait_tag));
    ring_buf->buffer_desc[i] = desc;
  }
}