/* Device-level wait: drain up to *counter completions from the send
   completion queue of 'queue', respecting the timeout. */
gaspi_return_t
pgaspi_dev_wait (const gaspi_queue_id_t queue, int * counter,
                 const gaspi_timeout_t timeout_ms)
{
  int ne = 0, i;
  struct ibv_wc wc;

  const int nr = *counter;
  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  for (i = 0; i < nr; i++)
    {
      do
        {
          ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
          *counter -= ne;

          if (ne == 0)
            {
              const gaspi_cycles_t s1 = gaspi_get_cycles ();
              const gaspi_cycles_t tdelta = s1 - s0;

              const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
              if (ms > timeout_ms)
                {
                  return GASPI_TIMEOUT;
                }
            }
        }
      while (ne == 0);

      if ((ne < 0) || (wc.status != IBV_WC_SUCCESS))
        {
          gaspi_print_error ("Failed request to %lu. Queue %d might be broken %s",
                             wc.wr_id, queue, ibv_wc_status_str (wc.status));

          glb_gaspi_ctx.qp_state_vec[queue][wc.wr_id] = GASPI_STATE_CORRUPT;
          return GASPI_ERROR;
        }
    }

#ifdef GPI2_CUDA
  /* All completions drained: mark the CUDA events of this queue as reusable. */
  int j, k;
  for (k = 0; k < glb_gaspi_ctx.gpu_count; k++)
    {
      for (j = 0; j < GASPI_CUDA_EVENTS; j++)
        gpus[k].events[queue][j].ib_use = 0;
    }
#endif

  return GASPI_SUCCESS;
}
gaspi_return_t
pgaspi_time_ticks (gaspi_cycles_t * const ticks)
{
  gaspi_verify_null_ptr (ticks);

  *ticks = gaspi_get_cycles ();

  return GASPI_SUCCESS;
}
void
gaspi_stats_start_timer (enum gaspi_timer t)
{
  if (_timers[t].running)
    {
      return;
    }

  lock_gaspi (&gaspi_stats_lock);

  _timers[t].tstart = gaspi_get_cycles ();
  _timers[t].running = 1;

  unlock_gaspi (&gaspi_stats_lock);
}
gaspi_return_t
pgaspi_time_get (gaspi_time_t * const wtime)
{
  gaspi_verify_null_ptr (wtime);

  float cycles_to_msecs;

  if (!glb_gaspi_init)
    {
      const float cpu_mhz = gaspi_get_cpufreq ();
      cycles_to_msecs = 1.0f / (cpu_mhz * 1000.0f);
    }
  else
    {
      cycles_to_msecs = glb_gaspi_ctx.cycles_to_msecs;
    }

  const gaspi_cycles_t s1 = gaspi_get_cycles ();

  *wtime = (gaspi_time_t) (s1 * cycles_to_msecs);

  return GASPI_SUCCESS;
}
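/* A minimal usage sketch (not part of the sources above): timing a region
   with the public gaspi_time_get, which pgaspi_time_get implements. The
   work() call is a hypothetical placeholder; gaspi_time_t values are in
   milliseconds, as derived from cycles_to_msecs above. */
#include <GASPI.h>

static void
time_region (void)
{
  gaspi_time_t t0, t1;

  gaspi_time_get (&t0);
  /* work (); */
  gaspi_time_get (&t1);

  gaspi_printf ("region took %lu ms\n", (unsigned long) (t1 - t0));
}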
void
gaspi_stats_stop_timer (enum gaspi_timer t)
{
  gaspi_context_t const * const gctx = &glb_gaspi_ctx;

  if (!_timers[t].running)
    {
      return;
    }

  lock_gaspi (&gaspi_stats_lock);

  _timers[t].tend = gaspi_get_cycles ();
  _timers[t].ttotal += (_timers[t].tend - _timers[t].tstart);
  _timers[t].ttotal_ms = (float) _timers[t].ttotal * gctx->cycles_to_msecs;
  _timers[t].running = 0;

  _timers[GASPI_ALL_TIMER].ttotal += (_timers[t].tend - _timers[t].tstart);
  _timers[GASPI_ALL_TIMER].ttotal_ms =
    (float) _timers[GASPI_ALL_TIMER].ttotal * gctx->cycles_to_msecs;
  _timers[GASPI_ALL_TIMER].running = 0;

  unlock_gaspi (&gaspi_stats_lock);
}
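/* A minimal pairing sketch (not from the sources above): bracket a measured
   region with the start/stop calls. GASPI_WAITSOME_TIMER is a hypothetical
   enum gaspi_timer value used only for illustration; any real timer id
   other than GASPI_ALL_TIMER (the aggregate slot written by stop) works
   the same way. */
static void
timed_region (void)
{
  gaspi_stats_start_timer (GASPI_WAITSOME_TIMER); /* hypothetical timer id */

  /* ... code being measured ... */

  gaspi_stats_stop_timer (GASPI_WAITSOME_TIMER);
  /* _timers[GASPI_WAITSOME_TIMER].ttotal_ms now holds the accumulated time
     in milliseconds; the same delta was added to GASPI_ALL_TIMER. */
}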
gaspi_return_t
pgaspi_passive_send (const gaspi_segment_id_t segment_id_local,
                     const gaspi_offset_t offset_local,
                     const gaspi_rank_t rank, const gaspi_size_t size,
                     const gaspi_timeout_t timeout_ms)
{
#ifdef DEBUG
  if (glb_gaspi_ctx_ib.rrmd[segment_id_local] == NULL)
    {
      gaspi_printf ("Debug: Invalid local segment (gaspi_passive_send)\n");
      return GASPI_ERROR;
    }

  if (rank >= glb_gaspi_ctx.tnc)
    {
      gaspi_printf ("Debug: Invalid rank (gaspi_passive_send)\n");
      return GASPI_ERROR;
    }

  if (offset_local > glb_gaspi_ctx_ib.rrmd[segment_id_local][glb_gaspi_ctx.rank].size)
    {
      gaspi_printf ("Debug: Invalid offsets (gaspi_passive_send)\n");
      return GASPI_ERROR;
    }

  if (size < 1 || size > GASPI_MAX_TSIZE_P)
    {
      gaspi_printf ("Debug: Invalid size (gaspi_passive_send)\n");
      return GASPI_ERROR;
    }
#endif

  struct ibv_send_wr *bad_wr;
  struct ibv_sge slist;
  struct ibv_send_wr swr;
  struct ibv_wc wc_send;
  gaspi_cycles_t s0;

  if (lock_gaspi_tout (&glb_gaspi_ctx.lockPS, timeout_ms))
    return GASPI_TIMEOUT;

  /* One outstanding passive request per target rank, tracked as one bit. */
  const int byte_id = rank >> 3;
  const int bit_pos = rank - (byte_id * 8);
  const unsigned char bit_cmp = 1 << bit_pos;

  /* A request to this rank is still in flight: only poll for completion. */
  if (glb_gaspi_ctx_ib.ne_count_p[byte_id] & bit_cmp)
    goto checkL;

  slist.addr = (uintptr_t) (glb_gaspi_ctx_ib.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr
                            + NOTIFY_OFFSET + offset_local);
  slist.length = size;
  slist.lkey = glb_gaspi_ctx_ib.rrmd[segment_id_local][glb_gaspi_ctx.rank].mr->lkey;

  swr.sg_list = &slist;
  swr.num_sge = 1;
  swr.opcode = IBV_WR_SEND;
  swr.wr_id = rank;
  swr.send_flags = IBV_SEND_SIGNALED;
  swr.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpP[rank], &swr, &bad_wr))
    {
      glb_gaspi_ctx.qp_state_vec[GASPI_PASSIVE_QP][rank] = 1;
      unlock_gaspi (&glb_gaspi_ctx.lockPS);
      return GASPI_ERROR;
    }

  glb_gaspi_ctx_ib.ne_count_p[byte_id] |= bit_cmp;

 checkL:
  s0 = gaspi_get_cycles ();

  int ne = 0;
  do
    {
      ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqP, 1, &wc_send);

      if (ne == 0)
        {
          const gaspi_cycles_t s1 = gaspi_get_cycles ();
          const gaspi_cycles_t tdelta = s1 - s0;

          const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
          if (ms > timeout_ms)
            {
              unlock_gaspi (&glb_gaspi_ctx.lockPS);
              return GASPI_TIMEOUT;
            }
        }
    }
  while (ne == 0);

  if ((ne < 0) || (wc_send.status != IBV_WC_SUCCESS))
    {
      glb_gaspi_ctx.qp_state_vec[GASPI_PASSIVE_QP][wc_send.wr_id] = 1;
      unlock_gaspi (&glb_gaspi_ctx.lockPS);
      return GASPI_ERROR;
    }

  glb_gaspi_ctx_ib.ne_count_p[byte_id] &= (~bit_cmp);

  unlock_gaspi (&glb_gaspi_ctx.lockPS);
  return GASPI_SUCCESS;
}
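/* A minimal pairing sketch using the public API (assumes an initialized
   GASPI process and a segment 0 of sufficient size; rank and offset values
   are illustrative). A passive send completes only once the target has
   posted the matching passive receive, which also learns the sender's
   rank. */
#include <GASPI.h>

static void
passive_pair_example (const gaspi_rank_t partner, const int is_sender)
{
  const gaspi_size_t size = 64;

  if (is_sender)
    {
      gaspi_passive_send (0 /* segment */, 0 /* offset */, partner,
                          size, GASPI_BLOCK);
    }
  else
    {
      gaspi_rank_t sender;
      gaspi_passive_receive (0 /* segment */, 0 /* offset */, &sender,
                             size, GASPI_BLOCK);
    }
}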
gaspi_return_t
pgaspi_notify_waitsome (const gaspi_segment_id_t segment_id_local,
                        const gaspi_notification_id_t notification_begin,
                        const gaspi_number_t num,
                        gaspi_notification_id_t * const first_id,
                        const gaspi_timeout_t timeout_ms)
{
  gaspi_verify_init ("gaspi_notify_waitsome");
  gaspi_verify_segment (segment_id_local);
  gaspi_verify_null_ptr (glb_gaspi_ctx.rrmd[segment_id_local]);
  gaspi_verify_null_ptr (first_id);

#ifdef DEBUG
  if (num >= GASPI_MAX_NOTIFICATION)
    return GASPI_ERR_INV_NUM;
#endif

  volatile unsigned char *segPtr;
  int loop = 1;
  gaspi_notification_id_t n;

  if (num == 0)
    return GASPI_SUCCESS;

#ifdef GPI2_CUDA
  /* For GPU segments, notifications live in the pinned host mirror. */
  if (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId >= 0)
    {
      segPtr = (volatile unsigned char *)
        glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_addr;
    }
  else
#endif
    segPtr = (volatile unsigned char *)
      glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr;

  volatile unsigned int *p = (volatile unsigned int *) segPtr;

  if (timeout_ms == GASPI_BLOCK)
    {
      while (loop)
        {
          for (n = notification_begin; n < (notification_begin + num); n++)
            {
              if (p[n])
                {
                  *first_id = n;
                  return GASPI_SUCCESS;
                }
            }

          gaspi_delay ();
        }
    }
  else if (timeout_ms == GASPI_TEST)
    {
      for (n = notification_begin; n < (notification_begin + num); n++)
        {
          if (p[n])
            {
              *first_id = n;
              return GASPI_SUCCESS;
            }
        }

      return GASPI_TIMEOUT;
    }

  /* Finite timeout: poll until a notification arrives or time runs out. */
  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while (loop)
    {
      for (n = notification_begin; n < (notification_begin + num); n++)
        {
          if (p[n])
            {
              *first_id = n;
              loop = 0;
              break;
            }
        }

      const gaspi_cycles_t s1 = gaspi_get_cycles ();
      const gaspi_cycles_t tdelta = s1 - s0;

      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
      if (ms > timeout_ms)
        {
          return GASPI_TIMEOUT;
        }

      gaspi_delay ();
    }

  return GASPI_SUCCESS;
}
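/* The usual consumption pattern, as a sketch with the public API (segment
   id and notification range are illustrative): wait for one of 'num'
   notifications, then reset the slot before acting on the data, so the
   same id can be reused by the producer. */
#include <GASPI.h>

static void
consume_one_notification (void)
{
  const gaspi_segment_id_t seg = 0;
  gaspi_notification_id_t first;

  if (gaspi_notify_waitsome (seg, 0, 32, &first, GASPI_BLOCK) == GASPI_SUCCESS)
    {
      gaspi_notification_t old_val;

      /* Returns the delivered value and clears the slot atomically. */
      gaspi_notify_reset (seg, first, &old_val);

      /* ... process the data announced by notification 'first' ... */
    }
}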
gaspi_return_t
pgaspi_gpu_write_notify (const gaspi_segment_id_t segment_id_local,
                         const gaspi_offset_t offset_local,
                         const gaspi_rank_t rank,
                         const gaspi_segment_id_t segment_id_remote,
                         const gaspi_offset_t offset_remote,
                         const gaspi_size_t size,
                         const gaspi_notification_id_t notification_id,
                         const gaspi_notification_t notification_value,
                         const gaspi_queue_id_t queue,
                         const gaspi_timeout_t timeout_ms)
{
  /* Small transfers and host segments take the regular path. */
  if (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0
      || size <= GASPI_GPU_DIRECT_MAX)
    {
      return gaspi_write_notify (segment_id_local, offset_local, rank,
                                 segment_id_remote, offset_remote, size,
                                 notification_id, notification_value,
                                 queue, timeout_ms);
    }

  if (lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char *host_ptr = (char *) (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr
                             + NOTIFY_OFFSET + offset_local);
  char *device_ptr = (char *) (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr
                               + offset_local);

  gaspi_gpu *agpu = _gaspi_find_gpu (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if (!agpu)
    {
      gaspi_print_error ("No GPU found or not initialized (gaspi_init_GPUs).");
      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  int copy_size = 0;
  int gpu_offset = 0;
  int size_left = size;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while (size_left > 0)
    {
      int i;

      /* Stage the data through the pinned host buffer, one block per event. */
      for (i = 0; i < GASPI_CUDA_EVENTS; i++)
        {
          if (size_left > BLOCK_SIZE)
            copy_size = BLOCK_SIZE;
          else
            copy_size = size_left;

          if (cudaMemcpyAsync (host_ptr + gpu_offset, device_ptr + gpu_offset,
                               copy_size, cudaMemcpyDeviceToHost,
                               agpu->streams[queue]))
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_ERROR;
            }

          glb_gaspi_ctx.ne_count_c[queue]++;

          agpu->events[queue][i].segment_remote = segment_id_remote;
          agpu->events[queue][i].segment_local = segment_id_local;
          agpu->events[queue][i].size = copy_size;
          agpu->events[queue][i].rank = rank;
          agpu->events[queue][i].offset_local = offset_local + gpu_offset;
          agpu->events[queue][i].offset_remote = offset_remote + gpu_offset;
          agpu->events[queue][i].in_use = 1;

          cudaError_t err = cudaEventRecord (agpu->events[queue][i].event,
                                             agpu->streams[queue]);
          if (err != cudaSuccess)
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_ERROR;
            }

          /* Not beautiful, but without draining here the completion queue
             would soon overflow. */
          if (agpu->events[queue][i].ib_use)
            {
              struct ibv_wc wc;
              int ne;
              do
                {
                  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
                  glb_gaspi_ctx.ne_count_c[queue] -= ne;

                  if (ne == 0)
                    {
                      const gaspi_cycles_t s1 = gaspi_get_cycles ();
                      const gaspi_cycles_t tdelta = s1 - s0;

                      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
                      if (ms > timeout_ms)
                        {
                          unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                          return GASPI_TIMEOUT;
                        }
                    }
                }
              while (ne == 0);

              agpu->events[queue][i].ib_use = 0;
            }

          gpu_offset += copy_size;
          size_left -= copy_size;
          if (size_left == 0)
            break;
        }

      /* Post the RDMA write for each block as soon as its copy completes. */
      for (i = 0; i < GASPI_CUDA_EVENTS; i++)
        {
          cudaError_t error;
          if (agpu->events[queue][i].in_use == 1)
            {
              do
                {
                  error = cudaEventQuery (agpu->events[queue][i].event);
                  if (cudaSuccess == error)
                    {
                      if (_gaspi_event_send (&agpu->events[queue][i], queue))
                        {
                          unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                          return GASPI_ERROR;
                        }

                      agpu->events[queue][i].in_use = 0;
                    }
                  else if (error == cudaErrorNotReady)
                    {
                      const gaspi_cycles_t s1 = gaspi_get_cycles ();
                      const gaspi_cycles_t tdelta = s1 - s0;

                      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
                      if (ms > timeout_ms)
                        {
                          unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                          return GASPI_TIMEOUT;
                        }
                    }
                  else
                    {
                      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                      return GASPI_ERROR;
                    }
                }
              while (error != cudaSuccess);
            }
        }
    }

  /* All data is on the remote side: send the notification value. Slots are
     strided by the notification value type. */
  struct ibv_send_wr *bad_wr;
  struct ibv_sge slistN;
  struct ibv_send_wr swrN;

  slistN.addr = (uintptr_t) (glb_gaspi_ctx.nsrc.buf
                             + notification_id * sizeof (gaspi_notification_t));
  *((unsigned int *) slistN.addr) = notification_value;

  slistN.length = sizeof (gaspi_notification_t);
  slistN.lkey = ((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey;

  if (glb_gaspi_ctx.rrmd[segment_id_remote][rank].cudaDevId >= 0)
    {
      swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_addr
                                  + notification_id * sizeof (gaspi_notification_t));
      swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_rkey;
    }
  else
    {
      swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].addr
                                  + notification_id * sizeof (gaspi_notification_t));
      swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].rkey;
    }

  swrN.sg_list = &slistN;
  swrN.num_sge = 1;
  swrN.wr_id = rank;
  swrN.opcode = IBV_WR_RDMA_WRITE;
  swrN.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
  swrN.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpC[queue][rank], &swrN, &bad_wr))
    {
      glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  glb_gaspi_ctx.ne_count_c[queue]++;

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
  return GASPI_SUCCESS;
}
gaspi_return_t
pgaspi_gpu_write (const gaspi_segment_id_t segment_id_local,
                  const gaspi_offset_t offset_local,
                  const gaspi_rank_t rank,
                  const gaspi_segment_id_t segment_id_remote,
                  const gaspi_offset_t offset_remote,
                  const gaspi_size_t size,
                  const gaspi_queue_id_t queue,
                  const gaspi_timeout_t timeout_ms)
{
  /* Small transfers and host segments take the regular path. */
  if (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0
      || size <= GASPI_GPU_DIRECT_MAX)
    {
      return gaspi_write (segment_id_local, offset_local, rank,
                          segment_id_remote, offset_remote, size,
                          queue, timeout_ms);
    }

  if (lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char *host_ptr = (char *) (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr
                             + NOTIFY_OFFSET + offset_local);
  char *device_ptr = (char *) (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr
                               + offset_local);

  gaspi_gpu *agpu = _gaspi_find_gpu (glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if (!agpu)
    {
      gaspi_print_error ("No GPU found or not initialized (gaspi_init_GPUs).");
      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  int size_left = size;
  int copy_size = 0;
  int gpu_offset = 0;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while (size_left > 0)
    {
      int i;
      for (i = 0; i < GASPI_CUDA_EVENTS; i++)
        {
          if (size_left > BLOCK_SIZE)
            copy_size = BLOCK_SIZE;
          else
            copy_size = size_left;

          if (cudaMemcpyAsync (host_ptr + gpu_offset, device_ptr + gpu_offset,
                               copy_size, cudaMemcpyDeviceToHost,
                               agpu->streams[queue]))
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_ERROR;
            }

          glb_gaspi_ctx.ne_count_c[queue]++;

          agpu->events[queue][i].segment_remote = segment_id_remote;
          agpu->events[queue][i].segment_local = segment_id_local;
          agpu->events[queue][i].size = copy_size;
          agpu->events[queue][i].rank = rank;
          agpu->events[queue][i].offset_local = offset_local + gpu_offset;
          agpu->events[queue][i].offset_remote = offset_remote + gpu_offset;
          agpu->events[queue][i].in_use = 1;

          cudaError_t err = cudaEventRecord (agpu->events[queue][i].event,
                                             agpu->streams[queue]);
          if (err != cudaSuccess)
            {
              glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_ERROR;
            }

          gpu_offset += copy_size;
          size_left -= copy_size;
          if (size_left == 0)
            break;

          /* Drain pending completions to avoid overflowing the queue. */
          if (agpu->events[queue][i].ib_use)
            {
              struct ibv_wc wc;
              int ne;
              do
                {
                  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
                  glb_gaspi_ctx.ne_count_c[queue] -= ne;

                  if (ne == 0)
                    {
                      const gaspi_cycles_t s1 = gaspi_get_cycles ();
                      const gaspi_cycles_t tdelta = s1 - s0;

                      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
                      if (ms > timeout_ms)
                        {
                          unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                          return GASPI_TIMEOUT;
                        }
                    }
                }
              while (ne == 0);

              agpu->events[queue][i].ib_use = 0;
            }
        }

      /* Post the RDMA write for each block as soon as its copy completes. */
      for (i = 0; i < GASPI_CUDA_EVENTS; i++)
        {
          cudaError_t error;
          if (agpu->events[queue][i].in_use == 1)
            {
              do
                {
                  error = cudaEventQuery (agpu->events[queue][i].event);
                  if (cudaSuccess == error)
                    {
                      if (_gaspi_event_send (&agpu->events[queue][i], queue))
                        {
                          unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                          return GASPI_ERROR;
                        }

                      agpu->events[queue][i].in_use = 0;
                    }
                  else if (error == cudaErrorNotReady)
                    {
                      const gaspi_cycles_t s1 = gaspi_get_cycles ();
                      const gaspi_cycles_t tdelta = s1 - s0;

                      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
                      if (ms > timeout_ms)
                        {
                          unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                          return GASPI_TIMEOUT;
                        }
                    }
                  else
                    {
                      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
                      return GASPI_ERROR;
                    }
                }
              while (error != cudaSuccess);
            }
        }
    }

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
  return GASPI_SUCCESS;
}
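/* A minimal usage sketch (assumes a GASPI process initialized with GPU
   support and CUDA segments; the header name GASPI_GPU.h as well as all
   segment ids, offsets, sizes and ranks are illustrative assumptions).
   The buffered GPU write is flushed like any other write: gaspi_wait on
   the same queue waits until the staged copies' RDMA writes have
   completed locally. */
#include <GASPI.h>
#include <GASPI_GPU.h>

static void
gpu_transfer (const gaspi_rank_t target)
{
  const gaspi_queue_id_t q = 0;

  /* A large transfer, so the buffered device-to-host staging path above
     is taken rather than the gaspi_write fallback. */
  gaspi_gpu_write (0 /* local seg */, 0 /* local offset */, target,
                   0 /* remote seg */, 0 /* remote offset */,
                   4 * 1024 * 1024, q, GASPI_BLOCK);

  gaspi_wait (q, GASPI_BLOCK);
}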