unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	unsigned ret;
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	ret = async_channel->event.finished;
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
	return ret;
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
	unsigned success = 0;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = (*async_channel).event.cuda_event;

		cudaError_t cures = cudaEventQuery(event);
		success = (cures == cudaSuccess);
		if (success)
			cudaEventDestroy(event);
		else if (cures != cudaErrorNotReady)
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int event_status;
		cl_event opencl_event = (*async_channel).event.opencl_event;

		if (opencl_event == NULL)
			STARPU_ABORT();

		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS,
					    sizeof(event_status), &event_status, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		if (event_status < 0)
			STARPU_OPENCL_REPORT_ERROR(event_status);
		success = (event_status == CL_COMPLETE);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		success = starpu_disk_test_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}

	return success;
#endif /* !SIMGRID */
}
// same but for event
void GPUDataTransferer::SyncEvent(cudaEvent_t ev)
{
    auto rc = cudaEventQuery(ev);
    if (rc != cudaErrorNotReady)
    {
        // if Event is ready then no need to wait
        rc || "cudaEventQuery failed";
        return;
    }

    // we must wait
    cudaEventSynchronize(ev) || "cudaEventSynchronize failed";
}
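The `rc || "message"` idiom above only compiles because the surrounding codebase provides an overloaded `operator||` for `cudaError_t` that performs the error check. The helper below is a minimal sketch of what such an overload could look like; it is an illustrative assumption, not the project's actual definition.

// Hedged sketch (assumption, not the original helper): overloading operator|| on the
// cudaError_t enum turns `call() || "msg"` into a check that throws on failure.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

inline void operator||(cudaError_t rc, const char* msg)
{
    if (rc != cudaSuccess)
        throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(rc));
}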
bool AsyncCopier::pollEvent(Event* event) {
  auto result = cudaEventQuery(*event->event);
  switch (result) {
    case cudaSuccess:
      VLOG(2) << "Poll event " << *event->event << ": ready";
      return true;
    case cudaErrorNotReady:
      VLOG(2) << "Poll event " << *event->event << ": not ready";
      return false;
    default:
      throwCudaError(result, "cudaEventQuery");
  }
}
static bool areAnyCudaKernelsRunning() {
    cudaEvent_t event;
    cudaEventCreate(&event);
    cudaEventRecord(event, 0);
    bool running = cudaEventQuery(event) == cudaErrorNotReady;
    cudaEventDestroy(event);
    return running;
}
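The snippet above relies on the fact that an event recorded on a stream only becomes ready once all work enqueued before it has completed. The following self-contained sketch (illustrative only; `isStreamBusy` is an invented name, not from the snippet above) shows the same idea for an explicit stream.

// Hedged sketch: query whether a given stream still has pending work by recording a
// throw-away event on it and polling that event once.
#include <cuda_runtime.h>
#include <cstdio>

static bool isStreamBusy(cudaStream_t stream)
{
    cudaEvent_t event;
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming); // timing not needed, cheaper event
    cudaEventRecord(event, stream);
    bool busy = (cudaEventQuery(event) == cudaErrorNotReady);
    cudaEventDestroy(event);
    return busy;
}

int main()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    printf("stream busy: %s\n", isStreamBusy(stream) ? "yes" : "no");
    cudaStreamDestroy(stream);
    return 0;
}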
bool CudaEvent::isFinished()
{
    // avoid cuda driver calls if event is already finished
    if (finished)
        return true;

    assert(isRecorded);

    cudaError_t rc = cudaEventQuery(event);
    if (rc == cudaSuccess)
    {
        finished = true;
        return true;
    }
    else if (rc == cudaErrorNotReady)
        return false;
    else
        PMACC_PRINT_CUDA_ERROR_AND_THROW(rc, "Event query failed");
}
static UCS_F_ALWAYS_INLINE unsigned
uct_cuda_copy_progress_event_queue(ucs_queue_head_t *event_queue, unsigned max_events)
{
    unsigned count = 0;
    cudaError_t result = cudaSuccess;
    uct_cuda_copy_event_desc_t *cuda_event;
    ucs_queue_iter_t iter;

    ucs_queue_for_each_safe(cuda_event, iter, event_queue, queue) {
        result = cudaEventQuery(cuda_event->event);
        if (cudaSuccess != result) {
            break;
        }
        ucs_queue_del_iter(event_queue, iter);
        if (cuda_event->comp != NULL) {
            uct_invoke_completion(cuda_event->comp, UCS_OK);
        }
        ucs_trace_poll("CUDA Event Done :%p", cuda_event);
        ucs_mpool_put(cuda_event);
        count++;
        if (count >= max_events) {
            break;
        }
    }

    return count;
}
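The pattern above drains a FIFO of in-flight operations: completions are delivered in submission order, and the loop stops at the first event that is not ready yet. The generic re-implementation below uses standard C++ containers; `PendingOp` and `progressPendingOps` are invented names for illustration, not part of UCX.

// Hedged sketch of the same progress-queue pattern with std::deque.
#include <cuda_runtime.h>
#include <deque>
#include <functional>

struct PendingOp {
    cudaEvent_t event;
    std::function<void()> on_complete;
};

// Returns how many operations completed; the rest stay queued for the next call.
inline unsigned progressPendingOps(std::deque<PendingOp>& queue, unsigned max_events)
{
    unsigned count = 0;
    while (!queue.empty() && count < max_events) {
        cudaError_t rc = cudaEventQuery(queue.front().event);
        if (rc != cudaSuccess)
            break;                      // front not ready (or errored); try again later
        PendingOp op = std::move(queue.front());
        queue.pop_front();
        cudaEventDestroy(op.event);
        if (op.on_complete)
            op.on_complete();
        ++count;
    }
    return count;
}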
void blasx_gpu_dgemm(void *arg_data) {
    int i;
    //----------GPU Argument Prepare------------//
    struct gpu_thread_data *arg = (struct gpu_thread_data *) arg_data;
    const int GPU_id = arg->GPU_id;
    cuda_err = cudaSetDevice(GPU_id);
    assert(cuda_err == cudaSuccess);
    //matrix configuration
    reader_tracker addr_track[1300]; //CRITICAL
    int x = arg->mat_conf->x;
    int y = arg->mat_conf->y;
    int z = arg->mat_conf->z;
    double *A = (double*) arg->mat_conf->A;
    double *B = (double*) arg->mat_conf->B;
    double *C = (double*) arg->mat_conf->C;
    int lda = arg->mat_conf->lda;
    int ldb = arg->mat_conf->ldb;
    int ldc = arg->mat_conf->ldc;
    double beta = arg->mat_conf->beta;
    double alpha = arg->mat_conf->alpha;
    int nrowa = arg->mat_conf->nrowa;
    int nrowb = arg->mat_conf->nrowb;
    int nrowc = arg->mat_conf->nrowc;
    int ncola = arg->mat_conf->ncola;
    int ncolb = arg->mat_conf->ncolb;
    int ncolc = arg->mat_conf->ncolc;
    enum CBLAS_TRANSPOSE TransA = arg->mat_conf->TransA;
    enum CBLAS_TRANSPOSE TransB = arg->mat_conf->TransB;
    int block_dim = arg->mat_conf->block_dim;
    //GPU configuration
    const int GPUs = arg->GPUs;
    LRU_t **LRUs = arg->LRUs;
    cublasHandle_t handle = handles_DGEMM[GPU_id];
    queue_t *tasks_queue = arg->tasks_queue;
    //------------------------------------------//
    //hook C_dev
    double *C_dev[STREAMNUM*2];
    for (i = 0; i < STREAMNUM*2; i++) {
        C_dev[i] = C_dev_DGEMM[i+STREAMNUM*GPU_id*2];
    }
    cudaStream_t stream[STREAMNUM];
    cudaEvent_t task_event[STREAMNUM];
    for (i = 0; i < STREAMNUM; i++) {
        //hook event
        task_event[i] = event_DGEMM[i+GPU_id*STREAMNUM];
        //hook stream
        stream[i] = streams_DGEMM[i+GPU_id*STREAMNUM];
    }
#ifdef affinity
    //thread setup
    assert( blasx_set_affinity(GPU_id) == 0);
#endif
#ifdef thread_barrier
    pthread_barrier_t* barr = arg->barr;
    int rc = pthread_barrier_wait(barr);
    assert(!(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD));
#endif
#ifdef thread_profile
    printf("thread%d start@%f\n", GPU_id, get_cur_time());
#endif
    //------------------------------------------//
    //----------------GPU-START-----------------//
    int tasks_rs[STREAMNUM*2];        // mimic reservation station
    int tasks_rs_size[2] = { 0, 0 };  // always tracking the first unused
    int switcher = 0;
    int task_batch_counter = 0;
    int mem_cpy_counter = 0;
    while (tasks_queue->TAIL >= 0) {
        /*------RS------*/
        int rs_counter = 0;
        tasks_rs_size[switcher] = 0;
        for (rs_counter = 0; rs_counter < STREAMNUM; rs_counter++) {
            int task_id = dequeue(tasks_queue);
#ifdef task_tracker
            printf("==>GPU%d %d\n", GPU_id, task_id);
#endif
            if (task_id >= 0) {
                tasks_rs[tasks_rs_size[switcher]+STREAMNUM*switcher] = task_id;
                tasks_rs_size[switcher]++;
            }
        }
        /*--event_sync---*/
        while (cudaEventQuery(task_event[0]) != cudaSuccess);
        /*--reduce_reader--*/
        int addr_counter = 0;
        for (addr_counter = 0; addr_counter < mem_cpy_counter; addr_counter++) {
            void *key = addr_track[addr_counter].addr;
            int target_GPU_id = addr_track[addr_counter].GPU_id;
            int is_trans_done = addr_track[addr_counter].is_trans_done;
            rbt_node *n = rbt_find(key, &(LRUs[target_GPU_id]->hash_map));
            assert(n != NULL);
            if (is_trans_done == 0 && (target_GPU_id == GPU_id)) {
                assert(target_GPU_id == GPU_id);
                n->associated_LRU_elem->is_trans_done = 1;
            }
            atomic_reader_minus(n);
        }
        /*--kernel_exe---*/
        mem_cpy_counter = 0;
        int j = 0;
        for (j = 0; j <= z; j++) {
            for (rs_counter = 0; rs_counter < tasks_rs_size[switcher]; rs_counter++) {
                int current_stream = rs_counter;
                int current_task = tasks_rs[rs_counter+STREAMNUM*switcher];
                int prior_task = tasks_rs[rs_counter+(1-switcher)*STREAMNUM];
                cudaStream_t *curt_stream = &stream[current_stream];
                blasx_gpu_dgemm_kernel(j,
                                       nrowa, ncola,
                                       nrowb, ncolb,
                                       nrowc, ncolc,
                                       current_task, prior_task,
                                       TransA, TransB,
                                       A, B, C,
                                       lda, ldb, ldc,
                                       x, y, z,
                                       C_dev,
                                       curt_stream, &handle,
                                       current_stream,
                                       alpha, beta, block_dim,
                                       switcher, &task_batch_counter,
                                       LRUs, GPUs,
                                       &mem_cpy_counter,
                                       addr_track,
                                       GPU_id);
                if (j == z && rs_counter == tasks_rs_size[switcher]-1) {
                    /*--event_record--*/
                    cudaError_t err = cudaEventRecord(task_event[0], stream[0]);
                    if (err != cudaSuccess)
                        printf("event record fail\n");
                }
            }
        }
        switcher = 1 - switcher;
        task_batch_counter++;
    }
    //------------------------------------------//
    //---------------RESULT-HARVEST-------------//
    collect_final_result_dgemm(tasks_rs, tasks_rs_size, switcher, stream, C_dev,
                               block_dim, STREAMNUM, x, y, z, nrowc, ncolc, ldc, C);
    //------------------------------------------//
#ifdef thread_profile
    printf("thread%d end@%f\n", GPU_id, get_cur_time());
#endif
}
bool Event::isCompleted() const
{
    cudaError_t err = cudaEventQuery( m_event );
    DP_ASSERT( ( err == cudaSuccess ) || ( err == cudaErrorNotReady ) );
    return( err == cudaSuccess );
}
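A non-blocking `isCompleted()` like the one above is typically used to overlap CPU work with GPU work instead of blocking in `cudaEventSynchronize()`. The sketch below is a hedged usage example; `doCpuWorkUntilComplete` and its parameters are invented for illustration and only assume an `isCompleted()` member like the one shown.

// Hedged sketch: poll the event and do useful CPU work between polls.
#include <thread>
#include <chrono>

template <typename EventT, typename WorkFn>
void doCpuWorkUntilComplete(const EventT& ev, WorkFn cpuWork)
{
    while (!ev.isCompleted()) {
        if (!cpuWork())                                                     // false => no CPU work left
            std::this_thread::sleep_for(std::chrono::microseconds(50));     // back off, avoid a busy spin
    }
}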
void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category)
{
  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
  struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;

  if (timers->current != pb_TimerID_NONE) {
    if (!is_async(timers->current) ) {
      if (timers->current != category) {
        if (curr != NULL) {
          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
        } else {
          pb_StopTimer(&timers->timers[timers->current]);
        }
      } else {
        if (curr != NULL) {
          pb_StopTimer(&curr->timer);
        }
      }
    } else {
      insert_submarker(timers, label, category);
      if (!is_async(category)) { // if switching to async too, keep driver going
        pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }

  pb_Timestamp currentTime = get_time();

  /* The only cases we check for asynchronous task completion is
   * when an overlapping CPU operation completes, or the next
   * segment blocks on completion of previous async operations */
  if( asyncs_outstanding(timers) &&
      (!is_async(timers->current) || is_blocking(category) ) ) {

    struct pb_async_time_marker_list * last_event = get_last_async(timers);
    /* cudaSuccess if completed */
    cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker));

    if(is_blocking(category)) {
      /* Async operations completed after previous CPU operations:
       * overlapped time is the total CPU time since this set of async
       * operations were first issued */

      // timer to switch to is COPY or NONE
      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
      // it seems that the extra sync wall time isn't being recorded anywhere
      if(async_done != cudaSuccess)
        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed),
                        timers->async_begin, currentTime);

      /* Wait on async operation completion */
      cudaEventSynchronize(*((cudaEvent_t *)last_event->marker));
      pb_Timestamp total_async_time = record_async_times(timers);

      /* Async operations completed before previous CPU operations:
       * overlapped time is the total async time */
      // If it did finish, then accumulate all the async time that did happen into OVERLAP
      // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
      if(async_done == cudaSuccess)
        timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
    } else
    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
    // so something is deeper in stack
    if(async_done == cudaSuccess) {
      /* Async operations completed before previous CPU operations:
       * overlapped time is the total async time */
      timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
    }
    // else, this isn't blocking, so just check the next time around
  }

  subtimerlist = timers->sub_timer_list[category];
  struct pb_SubTimer *subtimer = NULL;

  if (label != NULL) {
    subtimer = subtimerlist->subtimer_list;
    while (subtimer != NULL) {
      if (strcmp(subtimer->label, label) == 0) {
        break;
      } else {
        subtimer = subtimer->next;
      }
    }
  }

  /* Start the new timer */
  if (category != pb_TimerID_NONE) {
    if(!is_async(category)) {
      if (subtimerlist != NULL) {
        subtimerlist->current = subtimer;
      }

      if (category != timers->current && subtimer != NULL) {
        pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
      } else if (subtimer != NULL) {
        pb_StartTimer(&subtimer->timer);
      } else {
        pb_StartTimer(&timers->timers[category]);
      }
    } else {
      if (subtimerlist != NULL) {
        subtimerlist->current = subtimer;
      }

      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
      if (!asyncs_outstanding(timers)) {
        /* No asyncs outstanding, insert a fresh async marker */
        insert_submarker(timers, label, category);
        timers->async_begin = currentTime;
      } else if(!is_async(timers->current)) {
        /* Previous asyncs still in flight, but a previous SwitchTo
         * already marked the end of the most recent async operation,
         * so we can rename that marker as the beginning of this async
         * operation */
        struct pb_async_time_marker_list * last_event = get_last_async(timers);
        last_event->timerID = category;
        last_event->label = label;
      } // else, marker for switchToThis was already inserted

      // toSwitchTo is already asynchronous, but if current/prev state is async too, then DRIVER is already running
      if (!is_async(timers->current)) {
        pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }

  timers->current = category;
}
void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
{
  /* Stop the currently running timer */
  if (timers->current != pb_TimerID_NONE) {
    struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
    struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;

    if (!is_async(timers->current) ) {
      if (timers->current != timer) {
        if (currSubTimer != NULL) {
          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
        } else {
          pb_StopTimer(&timers->timers[timers->current]);
        }
      } else {
        if (currSubTimer != NULL) {
          pb_StopTimer(&currSubTimer->timer);
        }
      }
    } else {
      insert_marker(timers, timer);
      if (!is_async(timer)) { // if switching to async too, keep driver going
        pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }

  pb_Timestamp currentTime = get_time();

  /* The only cases we check for asynchronous task completion is
   * when an overlapping CPU operation completes, or the next
   * segment blocks on completion of previous async operations */
  if( asyncs_outstanding(timers) &&
      (!is_async(timers->current) || is_blocking(timer) ) ) {

    struct pb_async_time_marker_list * last_event = get_last_async(timers);
    /* cudaSuccess if completed */
    cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker));

    if(is_blocking(timer)) {
      /* Async operations completed after previous CPU operations:
       * overlapped time is the total CPU time since this set of async
       * operations were first issued */
      // timer to switch to is COPY or NONE
      if(async_done != cudaSuccess)
        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed),
                        timers->async_begin, currentTime);

      /* Wait on async operation completion */
      cudaEventSynchronize(*((cudaEvent_t *)last_event->marker));
      pb_Timestamp total_async_time = record_async_times(timers);

      /* Async operations completed before previous CPU operations:
       * overlapped time is the total async time */
      if(async_done == cudaSuccess)
        timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
    } else
    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
    // so something is deeper in stack
    if(async_done == cudaSuccess) {
      /* Async operations completed before previous CPU operations:
       * overlapped time is the total async time */
      timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
    }
  }

  /* Start the new timer */
  if (timer != pb_TimerID_NONE) {
    if(!is_async(timer)) {
      pb_StartTimer(&timers->timers[timer]);
    } else {
      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
      if (!asyncs_outstanding(timers)) {
        /* No asyncs outstanding, insert a fresh async marker */
        insert_marker(timers, timer);
        timers->async_begin = currentTime;
      } else if(!is_async(timers->current)) {
        /* Previous asyncs still in flight, but a previous SwitchTo
         * already marked the end of the most recent async operation,
         * so we can rename that marker as the beginning of this async
         * operation */
        struct pb_async_time_marker_list * last_event = get_last_async(timers);
        last_event->label = NULL;
        last_event->timerID = timer;
      }

      if (!is_async(timers->current)) {
        pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }

  timers->current = timer;
}
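The two timer routines above boil down to one idea: bracket asynchronous GPU work with events, keep the CPU busy while the end event is not ready, and only read times once the events have completed. The stand-alone sketch below is a much-reduced, hedged illustration of that idea (all names and the memset stand-in are invented, not Parboil code).

// Hedged sketch: overlap CPU work with async GPU work and measure the GPU time with events.
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    void* buf = nullptr;
    cudaMalloc(&buf, 32 << 20);

    cudaEventRecord(start, 0);
    cudaMemsetAsync(buf, 0, 32 << 20, 0);     // stands in for the overlapped async work
    cudaEventRecord(stop, 0);

    unsigned cpuIterations = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        ++cpuIterations;                      // overlapped CPU work would go here

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);   // valid once both events have completed
    printf("GPU work: %.3f ms, overlapped CPU iterations: %u\n", ms, cpuIterations);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(buf);
    return 0;
}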
cudaError_t WINAPI wine_cudaEventQuery( cudaEvent_t event )
{
    WINE_TRACE("\n");
    return cudaEventQuery( event );
}
cudaError_t WINAPI wine_cudaLaunch(const char *entry)
{
    WINE_TRACE("%p\n", entry);

    if (QUEUE_MAX == numQueued) {
        cudaError_t evtErr;

        if (WINE_TRACE_ON(cuda)) {
            /* print out if event was recorded or not */
            WINE_TRACE("check event recorded %s\n", debug_cudaError(cudaEventQuery(event)));
        }

        /* wait for event */
        unsigned int sleepCount = 0;
        char * SLTIME = getenv("SLEEPTIME");
        if ( SLTIME == NULL ) {
            sleep = 300000;
        } else {
            sleep = atoi ( SLTIME );
        }
        while (cudaEventQuery(event) != cudaSuccess) {
            /* nanosleep() expects a struct timespec, not a plain count; this assumes the
             * configured value is a nanosecond delay */
            struct timespec delay = { 0, sleep };
            nanosleep(&delay, NULL);
            sleepCount++;
        }
        WINE_TRACE("slept %u times\n", sleepCount);
        WINE_TRACE("event recorded, continuing\n");

        /* record a new event and subtract HALF_QUEUE_MAX from numQueued */
        numQueued = HALF_QUEUE_MAX;
        evtErr = cudaEventRecord(event, 0);
        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    cudaError_t err = cudaLaunch(entry);

    if (!eventInitialized) {
        /* Create an event on the first cudaLaunch call. This is done here so the calling program
         * has a chance to select the GPU device with cudaSetDevice if desired. */
        cudaError_t evtErr = cudaEventCreate(&event);
        if (evtErr) {
            WINE_ERR("cudaEventCreate: %s\n", debug_cudaError(evtErr));
        }
        /* cudaEventCreate can return errors from previous asynchronous calls, so an error here does
         * not necessarily mean the event wasn't created. Assume it was created for now. */
        eventInitialized = TRUE;
        WINE_TRACE("created event %d\n", event);
    }

    /* record an event at HALF_QUEUE_MAX */
    if (HALF_QUEUE_MAX == ++numQueued) {
        cudaError_t evtErr = cudaEventRecord(event, 0); /* Assuming everything using stream 0 */
        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }
    return err;
}
gaspi_return_t
pgaspi_gpu_write_notify(const gaspi_segment_id_t segment_id_local,
                        const gaspi_offset_t offset_local,
                        const gaspi_rank_t rank,
                        const gaspi_segment_id_t segment_id_remote,
                        const gaspi_offset_t offset_remote,
                        const gaspi_size_t size,
                        const gaspi_notification_id_t notification_id,
                        const gaspi_notification_t notification_value,
                        const gaspi_queue_id_t queue,
                        const gaspi_timeout_t timeout_ms)
{
  if(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
     size <= GASPI_GPU_DIRECT_MAX )
  {
    return gaspi_write_notify(segment_id_local, offset_local, rank,
                              segment_id_remote, offset_remote, size,
                              notification_id, notification_value, queue, timeout_ms);
  }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char *host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local);
  char* device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local);

  gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
  {
    gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
    unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
    return GASPI_ERROR;
  }

  int copy_size = 0;
  int gpu_offset = 0;
  int size_left = size;
  int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
  {
    int i;
    for(i = 0; i < GASPI_CUDA_EVENTS; i++)
    {
      if(size_left > BLOCK_SIZE)
        copy_size = BLOCK_SIZE;
      else
        copy_size = size_left;

      if(cudaMemcpyAsync(host_ptr + gpu_offset, device_ptr + gpu_offset, copy_size,
                         cudaMemcpyDeviceToHost, agpu->streams[queue]))
      {
        unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
        return GASPI_ERROR;
      }

      glb_gaspi_ctx.ne_count_c[queue]++;

      agpu->events[queue][i].segment_remote = segment_id_remote;
      agpu->events[queue][i].segment_local = segment_id_local;
      agpu->events[queue][i].size = copy_size;
      agpu->events[queue][i].rank = rank;
      agpu->events[queue][i].offset_local = offset_local + gpu_offset;
      agpu->events[queue][i].offset_remote = offset_remote + gpu_offset;
      agpu->events[queue][i].in_use = 1;

      cudaError_t err = cudaEventRecord(agpu->events[queue][i].event, agpu->streams[queue]);
      if(err != cudaSuccess)
      {
        unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
        return GASPI_ERROR;
      }

      /* That's not beautiful at all; however, otherwise we soon have an overflow in the queue */
      if(agpu->events[queue][i].ib_use)
      {
        struct ibv_wc wc;
        int ne;
        do
        {
          ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
          glb_gaspi_ctx.ne_count_c[queue] -= ne;
          if (ne == 0)
          {
            const gaspi_cycles_t s1 = gaspi_get_cycles ();
            const gaspi_cycles_t tdelta = s1 - s0;
            const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
            if (ms > timeout_ms)
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_TIMEOUT;
            }
          }
        } while(ne == 0);

        agpu->events[queue][i].ib_use = 0;
      }

      gpu_offset += copy_size;
      size_left -= copy_size;
      if(size_left == 0)
        break;
    }

    for(i = 0; i < GASPI_CUDA_EVENTS; i++)
    {
      cudaError_t error;
      if (agpu->events[queue][i].in_use == 1 )
      {
        do
        {
          error = cudaEventQuery(agpu->events[queue][i].event );
          if( cudaSuccess == error )
          {
            if (_gaspi_event_send(&agpu->events[queue][i], queue) )
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_ERROR;
            }
            agpu->events[queue][i].in_use = 0;
          }
          else if(error == cudaErrorNotReady)
          {
            const gaspi_cycles_t s1 = gaspi_get_cycles ();
            const gaspi_cycles_t tdelta = s1 - s0;
            const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
            if (ms > timeout_ms)
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_TIMEOUT;
            }
          }
          else
          {
            unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
            return GASPI_ERROR;
          }
        } while(error != cudaSuccess);
      }
    }
  }

  struct ibv_send_wr *bad_wr;
  struct ibv_sge slistN;
  struct ibv_send_wr swrN;

  slistN.addr = (uintptr_t)(glb_gaspi_ctx.nsrc.buf + notification_id * sizeof(gaspi_notification_id_t));
  *((unsigned int *) slistN.addr) = notification_value;
  slistN.length = sizeof(gaspi_notification_id_t);
  slistN.lkey = ((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey;

  if((glb_gaspi_ctx.rrmd[segment_id_remote][rank].cudaDevId >= 0))
  {
    swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_addr + notification_id * sizeof(gaspi_notification_id_t));
    swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_rkey;
  }
  else
  {
    swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].addr + notification_id * sizeof(gaspi_notification_id_t));
    swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].rkey;
  }

  swrN.sg_list = &slistN;
  swrN.num_sge = 1;
  swrN.wr_id = rank;
  swrN.opcode = IBV_WR_RDMA_WRITE;
  swrN.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
  swrN.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpC[queue][rank], &swrN, &bad_wr))
  {
    glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
    unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
    return GASPI_ERROR;
  }

  glb_gaspi_ctx.ne_count_c[queue]++;

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
  return GASPI_SUCCESS;
}
gaspi_return_t
pgaspi_gpu_write(const gaspi_segment_id_t segment_id_local,
                 const gaspi_offset_t offset_local,
                 const gaspi_rank_t rank,
                 const gaspi_segment_id_t segment_id_remote,
                 const gaspi_offset_t offset_remote,
                 const gaspi_size_t size,
                 const gaspi_queue_id_t queue,
                 const gaspi_timeout_t timeout_ms)
{
  if( glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
      size <= GASPI_GPU_DIRECT_MAX )
  {
    return gaspi_write(segment_id_local, offset_local, rank,
                       segment_id_remote, offset_remote, size,
                       queue, timeout_ms);
  }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char* host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local);
  char* device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local);

  gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
  {
    gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
    return GASPI_ERROR;
  }

  int size_left = size;
  int copy_size = 0;
  int gpu_offset = 0;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
  {
    int i;
    for(i = 0; i < GASPI_CUDA_EVENTS; i++)
    {
      if(size_left > BLOCK_SIZE)
        copy_size = BLOCK_SIZE;
      else
        copy_size = size_left;

      if( cudaMemcpyAsync(host_ptr + gpu_offset, device_ptr + gpu_offset, copy_size,
                          cudaMemcpyDeviceToHost, agpu->streams[queue]))
      {
        unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
        return GASPI_ERROR;
      }

      glb_gaspi_ctx.ne_count_c[queue]++;

      agpu->events[queue][i].segment_remote = segment_id_remote;
      agpu->events[queue][i].segment_local = segment_id_local;
      agpu->events[queue][i].size = copy_size;
      agpu->events[queue][i].rank = rank;
      agpu->events[queue][i].offset_local = offset_local + gpu_offset;
      agpu->events[queue][i].offset_remote = offset_remote + gpu_offset;
      agpu->events[queue][i].in_use = 1;

      cudaError_t err = cudaEventRecord(agpu->events[queue][i].event, agpu->streams[queue]);
      if(err != cudaSuccess)
      {
        glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
        unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
        return GASPI_ERROR;
      }

      gpu_offset += copy_size;
      size_left -= copy_size;
      if(size_left == 0)
        break;

      if(agpu->events[queue][i].ib_use)
      {
        struct ibv_wc wc;
        int ne;
        do
        {
          ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
          glb_gaspi_ctx.ne_count_c[queue] -= ne;
          if (ne == 0)
          {
            const gaspi_cycles_t s1 = gaspi_get_cycles ();
            const gaspi_cycles_t tdelta = s1 - s0;
            const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
            if (ms > timeout_ms)
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_TIMEOUT;
            }
          }
        } while(ne == 0);

        agpu->events[queue][i].ib_use = 0;
      }
    }

    for(i = 0; i < GASPI_CUDA_EVENTS; i++)
    {
      cudaError_t error;
      if ( agpu->events[queue][i].in_use == 1 )
      {
        do
        {
          error = cudaEventQuery(agpu->events[queue][i].event );
          if( cudaSuccess == error )
          {
            if (_gaspi_event_send(&agpu->events[queue][i], queue))
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_ERROR;
            }
            agpu->events[queue][i].in_use = 0;
          }
          else if(error == cudaErrorNotReady)
          {
            const gaspi_cycles_t s1 = gaspi_get_cycles ();
            const gaspi_cycles_t tdelta = s1 - s0;
            const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
            if (ms > timeout_ms)
            {
              unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
              return GASPI_TIMEOUT;
            }
          }
          else
          {
            unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
            return GASPI_ERROR;
          }
        } while(error != cudaSuccess);
      }
    }
  }

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
  return GASPI_SUCCESS;
}
/**
 * check whether the event is finished
 *
 * @return true if event is finished else false
 */
bool isFinished() const
{
    assert(isValid);
    return cudaEventQuery(event) == cudaSuccess;
}
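To close the list, here is a minimal end-to-end example of the pattern all the snippets above share: record an event after asynchronous work on a stream and poll it with cudaEventQuery() until the work completes. It is illustrative only and not taken from any of the projects above.

// Hedged, self-contained sketch: async memset on a stream, event recorded after it, polled to completion.
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const size_t bytes = 64 << 20;
    void* buf = nullptr;
    cudaStream_t stream;
    cudaEvent_t done;

    cudaMalloc(&buf, bytes);
    cudaStreamCreate(&stream);
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    cudaMemsetAsync(buf, 0, bytes, stream);   // asynchronous work on the stream
    cudaEventRecord(done, stream);            // event becomes ready after the memset

    unsigned polls = 0;
    cudaError_t rc;
    while ((rc = cudaEventQuery(done)) == cudaErrorNotReady)
        ++polls;                              // a real program would do useful CPU work here

    printf("event %s after %u polls\n",
           rc == cudaSuccess ? "completed" : cudaGetErrorString(rc), polls);

    cudaEventDestroy(done);
    cudaStreamDestroy(stream);
    cudaFree(buf);
    return 0;
}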