static bool _XMP_tca_ring_buf_recv_nowait(tca_ring_buf_t *ring_buf, packet_t *packet) { psn_t local_recv_psn = ring_buf->psn_pairs[1]; //1: local_recv //wait until (remote_send_psn > local_recv_psn) become true volatile psn_t* remote_send_psn_p = &(ring_buf->psn_pairs[2]); //2: remote_send if(*remote_send_psn_p <= local_recv_psn){ return false; } //copy recved data int buf_pos = local_recv_psn % RING_SIZE; *packet = ring_buf->recv_buffer[buf_pos]; //fprintf(stderr, "recv: psn=%ull, tag=%d, data=%d\n", local_recv_psn, *tag, *data); //update local_recv_psn ring_buf->psn_pairs[1] = ++local_recv_psn; if(local_recv_psn - ring_buf->last_send_local_recv_psn >= (RING_SIZE/2)){ _XMP_tca_lock(); //send local psn pair const int psn_wait_slot = _wait_slot_offset + RING_SIZE; ring_buf->last_send_local_recv_psn = ring_buf->psn_pairs[1]; TCA_CHECK(tcaDescSet(ring_buf->psn_desc, _dmac_channel)); TCA_CHECK(tcaStartDMADesc(_dmac_channel)); TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->local_psn_pairs_handle), psn_wait_slot, _psn_wait_tag)); TCA_CHECK(tcaWaitDMAC(_dmac_channel)); //important _XMP_tca_unlock(); } return true; }
static void _XMP_tca_ring_buf_finalize(tca_ring_buf_t* ring_buf) { //destroy and free my buffer handles /* for(int i = 0; i < RING_SIZE; i++){ */ /* TCA_CHECK(tcaDestroyHandle(&ring_buf->local_buffer_handles[i])); */ /* } */ TCA_CHECK(tcaDestroyHandle(&ring_buf->local_send_buffer_handle)); TCA_CHECK(tcaDestroyHandle(&ring_buf->local_recv_buffer_handle)); //free my buffer //_XMP_free(ring_buf->buffer); TCA_CHECK(tcaFree(ring_buf->send_buffer, tcaMemoryCPU)); TCA_CHECK(tcaFree(ring_buf->recv_buffer, tcaMemoryCPU)); TCA_CHECK(tcaFree(ring_buf->psn_pairs, tcaMemoryCPU)); }
void _XMP_create_TCA_handle(void *acc_addr, _XMP_array_t *adesc) { if(adesc->set_handle) return; // 64KB align ? long tmp = ((long)acc_addr/65536)*65536; if(tmp != (long)acc_addr){ _XMP_fatal("An array is not aligned at 64KB."); return; } size_t size = (size_t)(adesc->type_size * adesc->total_elmts); #if 0 printf("[%d] tcaCreateHandle size = %d addr=%p\n", _XMP_world_rank, size, acc_addr); #endif tcaHandle tmp_handle; TCA_CHECK(tcaCreateHandle(&tmp_handle, acc_addr, size, tcaMemoryGPU)); adesc->tca_handle = _XMP_alloc(sizeof(tcaHandle) * _XMP_world_size); MPI_Allgather(&tmp_handle, sizeof(tcaHandle), MPI_BYTE, adesc->tca_handle, sizeof(tcaHandle), MPI_BYTE, MPI_COMM_WORLD); adesc->set_handle = _XMP_N_INT_TRUE; }
static void _XMP_tca_ring_buf_recv(tca_ring_buf_t *ring_buf, int *tag, int *data) { psn_t local_recv_psn = ring_buf->psn_pairs[1]; //1: local_recv int buf_pos = local_recv_psn % RING_SIZE; _XMP_tca_lock(); #ifdef _USE_NOTIFY_TO_WAIT_PACKET //wait notify for recv_buf const int buf_wait_slot = _wait_slot_offset + buf_pos; TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->remote_psn_pairs_handle), buf_wait_slot, _packet_wait_tag)); #else //wait until (remote_send_psn > local_recv_psn) become true volatile psn_t* remote_send_psn_p = &(ring_buf->psn_pairs[2]); //2: remote_send while(*remote_send_psn_p <= local_recv_psn){ ++spin_wait_count; _mm_pause(); } #endif //copy recved data *tag = ring_buf->recv_buffer[buf_pos].tag; *data = ring_buf->recv_buffer[buf_pos].data; //fprintf(stderr, "recv: psn=%ull, tag=%d, data=%d\n", local_recv_psn, *tag, *data); //update local_recv_psn ring_buf->psn_pairs[1] = ++local_recv_psn; if(local_recv_psn - ring_buf->last_send_local_recv_psn >= (RING_SIZE/2)){ //send local psn pair const int psn_wait_slot = _wait_slot_offset + RING_SIZE; ring_buf->last_send_local_recv_psn = ring_buf->psn_pairs[1]; TCA_CHECK(tcaDescSet(ring_buf->psn_desc, _dmac_channel)); TCA_CHECK(tcaStartDMADesc(_dmac_channel)); TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->local_psn_pairs_handle), psn_wait_slot, _psn_wait_tag)); TCA_CHECK(tcaWaitDMAC(_dmac_channel)); //important } //printf("psns: %llu, %llu, %llu, %llu\n", ring_buf->psn_pairs[0],ring_buf->psn_pairs[1],ring_buf->psn_pairs[2],ring_buf->psn_pairs[3]); _XMP_tca_unlock(); }
static void _XMP_tca_ring_buf_send(tca_ring_buf_t *ring_buf, const int tag, const int data) //postの場合 tag=post_req, packet=post_tag { volatile psn_t *r_recv_psn = &(ring_buf->psn_pairs[3]); const unsigned long long local_send_psn = ring_buf->psn_pairs[0]; //0: local_send //fprintf(stderr,"send start (tag=%d, data=%d, psn=%llu)\n", tag, data, local_send_psn); _XMP_tca_lock(); //相手のring bufferが空くまで待機 while(local_send_psn - *r_recv_psn >= RING_SIZE){ const int psn_wait_slot = _wait_slot_offset + RING_SIZE; TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->remote_psn_pairs_handle), psn_wait_slot, _psn_wait_tag)); } const int buf_pos = local_send_psn % RING_SIZE; const int buf_wait_slot = _wait_slot_offset + buf_pos; //send_bufferに値の書き込み ring_buf->send_buffer[buf_pos].tag = tag; ring_buf->send_buffer[buf_pos].data = data; //update local send psn ring_buf->psn_pairs[0] = local_send_psn + 1; ring_buf->last_send_local_recv_psn = ring_buf->psn_pairs[1]; // set desc TCA_CHECK(tcaDescSet(ring_buf->buffer_desc[buf_pos], _dmac_channel)); // start dmac TCA_CHECK(tcaStartDMADesc(_dmac_channel)); // wait put locally TCA_CHECK(tcaWaitDMARecvDesc(&(ring_buf->local_send_buffer_handle), buf_wait_slot, _packet_wait_tag)); //wait dmac TCA_CHECK(tcaWaitDMAC(_dmac_channel)); //important //fprintf(stderr,"send end (tag=%d, data=%d, psn=%llu)\n", tag, data, local_send_psn); _XMP_tca_unlock(); }
/** Initialize TCA */ void _XMP_tca_initialize(int argc, char **argv) { if(_XMP_world_rank == 0){ fprintf(stderr, "TCA Library Version = %s\n", TCA_LIB_VERSION); } if(_XMP_world_size > 16) _XMP_fatal("TCA reflect has been not implemented in 16 more than nodes."); _XMP_tca_lock(); /* TCA_CHECK(tcaInit()); */ //this is probably unnecessary TCA_CHECK(tcaDMADescInt_Init()); // Initialize Descriptor (Internal Memory) Mode _XMP_tca_comm_init(); create_comm_thread(); _XMP_tca_unlock(); }
static void _XMP_tca_ring_buf_init(tca_ring_buf_t* ring_buf, int remote_rank) { ring_buf->remote_rank = remote_rank; //alloc psn size_t psn_size = sizeof(psn_t); TCA_CHECK(tcaMalloc((void**)&ring_buf->psn_pairs, psn_size * 4, tcaMemoryCPU)); //fprintf(stderr, "psn_pairs (%p)\n", ring_buf->psn_pairs); memset(ring_buf->psn_pairs, 0, psn_size * 4); //create psn handle TCA_CHECK(tcaCreateHandle(&(ring_buf->local_psn_pairs_handle), ring_buf->psn_pairs, psn_size * 4, tcaMemoryCPU)); //exchange psn handle MPI_Sendrecv(&(ring_buf->local_psn_pairs_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag, &(ring_buf->remote_psn_pairs_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); { tcaDesc* desc = tcaDescNew(); const int psn_wait_slot = _wait_slot_offset + RING_SIZE; TCA_CHECK(tcaDescSetMemcpy(desc, &(ring_buf->remote_psn_pairs_handle), psn_size * 2, &(ring_buf->local_psn_pairs_handle), 0, psn_size * 2, tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify, psn_wait_slot, _psn_wait_tag)); ring_buf->psn_desc = desc; } //clear last_send_local_recv_psn ring_buf->last_send_local_recv_psn = 0; //alloc buffer size_t packet_size = sizeof(packet_t); TCA_CHECK(tcaMalloc((void**)&ring_buf->send_buffer, packet_size * RING_SIZE, tcaMemoryCPU)); TCA_CHECK(tcaMalloc((void**)&ring_buf->recv_buffer, packet_size * RING_SIZE, tcaMemoryCPU)); memset(ring_buf->send_buffer, 0, packet_size * RING_SIZE); memset(ring_buf->recv_buffer, 0, packet_size * RING_SIZE); //create buffer handles /* for(int i = 0; i < RING_SIZE; i++){ */ /* TCA_CHECK(tcaCreateHandle(&(ring_buf->local_buffer_handles[i]), &(ring_buf->buffer[i]), packet_size, tcaMemoryCPU)); */ /* } */ TCA_CHECK(tcaCreateHandle(&(ring_buf->local_send_buffer_handle), ring_buf->send_buffer, packet_size * RING_SIZE, tcaMemoryCPU)); TCA_CHECK(tcaCreateHandle(&(ring_buf->local_recv_buffer_handle), ring_buf->recv_buffer, packet_size * RING_SIZE, tcaMemoryCPU)); //exchange buffer handles MPI_Sendrecv(&(ring_buf->local_send_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag, &(ring_buf->remote_send_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Sendrecv(&(ring_buf->local_recv_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag, &(ring_buf->remote_recv_buffer_handle), sizeof(tcaHandle), MPI_BYTE, remote_rank, _handle_sendrecv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); for(int i = 0; i < RING_SIZE; i++){ tcaDesc* desc = tcaDescNew(); int buf_wait_slot = _wait_slot_offset + i; TCA_CHECK(tcaDescSetMemcpy(desc, &(ring_buf->remote_recv_buffer_handle), packet_size * i, &(ring_buf->local_send_buffer_handle), packet_size * i, packet_size, tcaDMAUseInternal | tcaDMAUseNotifyInternal, //tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify, buf_wait_slot, _packet_wait_tag)); TCA_CHECK(tcaDescSetMemcpy(desc, &(ring_buf->remote_psn_pairs_handle), psn_size * 2, &(ring_buf->local_psn_pairs_handle), 0, psn_size * 2, #ifdef _USE_NOTIFY_TO_WAIT_PACKET tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf | tcaDMANotify, //tcaDMAUseInternal | tcaDMAUseNotifyInternal, #else tcaDMAUseInternal | tcaDMAUseNotifyInternal | tcaDMANotifySelf, #endif buf_wait_slot, _packet_wait_tag)); ring_buf->buffer_desc[i] = desc; } }