Example #1
unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	unsigned ret;
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	ret = async_channel->event.finished;
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
	return ret;
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
	unsigned success = 0;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
	cudaError_t cures;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = async_channel->event.cuda_event;
		cures = cudaEventQuery(event);

		success = (cures == cudaSuccess);
		if (success)
			cudaEventDestroy(event);
		else if (cures != cudaErrorNotReady)
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int event_status;
		cl_event opencl_event = async_channel->event.opencl_event;
		if (opencl_event == NULL) STARPU_ABORT();
		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		if (event_status < 0)
			STARPU_OPENCL_REPORT_ERROR(event_status);
		success = (event_status == CL_COMPLETE);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		success = starpu_disk_test_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}

	return success;
#endif /* !SIMGRID */
}
Example #2
// Same as the previous Sync helper, but waits on an event instead
void GPUDataTransferer::SyncEvent(cudaEvent_t ev)
{
    auto rc = cudaEventQuery(ev);
    if (rc != cudaErrorNotReady)
    {
        // the event has already completed (or errored), so there is no need to wait;
        // operator|| is overloaded here to throw on a CUDA error (see sketch below)
        rc || "cudaEventQuery failed";
        return;
    }
    // we must wait
    cudaEventSynchronize(ev) || "cudaEventSynchronize failed";
}
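
A note on Example #2: the bare expression `rc || "cudaEventQuery failed"` only compiles because its codebase overloads operator|| to turn a failing cudaError_t into an exception. A minimal sketch of such a helper, assuming C++ exceptions (the real helper's name and behavior may differ):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Turns `someCudaCall() || "message"` into "throw on error, no-op on success".
// Overloading operator|| forfeits short-circuit evaluation, which is harmless
// here since the right-hand side is just a string literal.
static void operator||(cudaError_t rc, const char* msg)
{
    if (rc != cudaSuccess)
        throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(rc));
}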
Example #3
bool AsyncCopier::pollEvent(Event* event) {
  auto result = cudaEventQuery(*event->event);
  switch (result) {
  case cudaSuccess:
    VLOG(2) << "Poll event " << *event->event << ": ready";
    return true;
  case cudaErrorNotReady:
    VLOG(2) << "Poll event " << *event->event << ": not ready";
    return false;
  default:
    throwCudaError(result, "cudaEventQuery");
  }
}
Example #4
static bool areAnyCudaKernelsRunning()
{
    cudaEvent_t event;

    cudaEventCreate(&event);

    cudaEventRecord(event, 0);

    bool running = cudaEventQuery(event) == cudaErrorNotReady;

    cudaEventDestroy(event);

    return running;
}
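
The probe above leaves every CUDA call unchecked, so a failure in create/record would be misreported as "not running". A hedged variant with the calls checked (the error-handling policy here is illustrative, not from the original):

#include <cuda_runtime.h>
#include <cstdio>

// Same probe with error handling: an event recorded on stream 0 is still
// pending exactly when work queued ahead of it has not yet drained.
static bool areAnyCudaKernelsRunningChecked(void)
{
    cudaEvent_t event;
    if (cudaEventCreate(&event) != cudaSuccess)
        return false;  // cannot probe; report "not running"

    bool running = false;
    if (cudaEventRecord(event, 0) == cudaSuccess) {
        cudaError_t rc = cudaEventQuery(event);
        running = (rc == cudaErrorNotReady);
        if (rc != cudaSuccess && rc != cudaErrorNotReady)
            fprintf(stderr, "cudaEventQuery: %s\n", cudaGetErrorString(rc));
    }
    cudaEventDestroy(event);
    return running;
}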
Example #5
    bool CudaEvent::isFinished()
    {
        // avoid cuda driver calls if event is already finished
        if( finished )
            return true;
        assert( isRecorded );

        cudaError_t rc = cudaEventQuery(event);

        if(rc == cudaSuccess)
        {
            finished = true;
            return true;
        }
        else if(rc == cudaErrorNotReady)
            return false;
        else
            PMACC_PRINT_CUDA_ERROR_AND_THROW(rc, "Event query failed");
    }
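
Example #5 memoizes a successful query in `finished`, so later calls skip the driver entirely. The members it relies on suggest a class shape roughly like the following (a hypothetical reconstruction; PMacc's actual CudaEvent differs in detail):

#include <cuda_runtime.h>

// Hypothetical minimal shape of the class behind Example #5.
class CudaEvent
{
    cudaEvent_t event;       // underlying CUDA event
    bool isRecorded = false; // set once cudaEventRecord() has been issued
    bool finished = false;   // cached result of a successful cudaEventQuery
public:
    bool isFinished();       // body as in Example #5
};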
Example #6
static UCS_F_ALWAYS_INLINE unsigned
uct_cuda_copy_progress_event_queue(ucs_queue_head_t *event_queue, unsigned max_events)
{
    unsigned count = 0;
    cudaError_t result = cudaSuccess;
    uct_cuda_copy_event_desc_t *cuda_event;
    ucs_queue_iter_t iter;

    ucs_queue_for_each_safe(cuda_event, iter, event_queue, queue) {
        result = cudaEventQuery(cuda_event->event);
        if (cudaSuccess != result) {
            break;
        }
        ucs_queue_del_iter(event_queue, iter);
        if (cuda_event->comp != NULL) {
            uct_invoke_completion(cuda_event->comp, UCS_OK);
        }
        ucs_trace_poll("CUDA Event Done :%p", cuda_event);
        ucs_mpool_put(cuda_event);
        count++;
        if (count >= max_events) {
            break;
        }
    }

    return count;
}
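
For context, the fields accessed above imply an event descriptor along these lines, given as a hypothetical reconstruction rather than UCX's actual definition:

/* Hypothetical reconstruction of the descriptor type used above; the real
 * uct_cuda_copy_event_desc_t lives in UCX and may differ. */
typedef struct uct_cuda_copy_event_desc {
    cudaEvent_t       event; /* recorded on the copy stream after the transfer */
    uct_completion_t *comp;  /* optional user completion invoked when done */
    ucs_queue_elem_t  queue; /* link traversed by ucs_queue_for_each_safe */
} uct_cuda_copy_event_desc_t;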
Example #7
void blasx_gpu_dgemm(void *arg_data)
{
    int i;
    //----------GPU Argument Prepare------------//
    struct gpu_thread_data *arg = (struct gpu_thread_data *) arg_data;
    const int GPU_id = arg->GPU_id;
    cudaError_t cuda_err = cudaSetDevice(GPU_id);
    assert(cuda_err == cudaSuccess);
    //matrix configuration
    reader_tracker addr_track[1300]; //CRITICAL
    int x                       = arg->mat_conf->x;
    int y                       = arg->mat_conf->y;
    int z                       = arg->mat_conf->z;
    double *A                   = (double*) arg->mat_conf->A;
    double *B                   = (double*) arg->mat_conf->B;
    double *C                   = (double*) arg->mat_conf->C;
    int lda                     = arg->mat_conf->lda;
    int ldb                     = arg->mat_conf->ldb;
    int ldc                     = arg->mat_conf->ldc;
    double beta                 = arg->mat_conf->beta;
    double alpha                = arg->mat_conf->alpha;
    int nrowa                   = arg->mat_conf->nrowa;
    int nrowb                   = arg->mat_conf->nrowb;
    int nrowc                   = arg->mat_conf->nrowc;
    int ncola                   = arg->mat_conf->ncola;
    int ncolb                   = arg->mat_conf->ncolb;
    int ncolc                   = arg->mat_conf->ncolc;
    enum CBLAS_TRANSPOSE TransA = arg->mat_conf->TransA;
    enum CBLAS_TRANSPOSE TransB = arg->mat_conf->TransB;
    int block_dim               = arg->mat_conf->block_dim;
    //GPU configuration
    const int GPUs              = arg->GPUs;
    LRU_t   **LRUs              = arg->LRUs;
    cublasHandle_t  handle      = handles_DGEMM[GPU_id];
    queue_t *tasks_queue        = arg->tasks_queue;
    //------------------------------------------//
    //hook C_dev
    double *C_dev[STREAMNUM*2];
    for (i = 0; i < STREAMNUM*2; i++) {
        C_dev[i] = C_dev_DGEMM[i+STREAMNUM*GPU_id*2];
    }
    cudaStream_t stream[STREAMNUM];
    cudaEvent_t task_event[STREAMNUM];
    for (i = 0 ; i < STREAMNUM; i++) {
        //hook event
        task_event[i] = event_DGEMM[i+GPU_id*STREAMNUM];
        //hook stream
        stream[i]     = streams_DGEMM[i+GPU_id*STREAMNUM];
    }
    
#ifdef affinity
    //thread setup
    assert( blasx_set_affinity(GPU_id) == 0);
#endif
#ifdef thread_barrier
    pthread_barrier_t* barr     = arg->barr;
    int rc = pthread_barrier_wait(barr);
    assert(!(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD));
#endif
#ifdef thread_profile
    printf("thread%d start@%f\n", GPU_id, get_cur_time());
#endif
    //------------------------------------------//


    //----------------GPU-START-----------------//
    int tasks_rs[STREAMNUM*2];         // mimics a reservation station
    int tasks_rs_size[2] = { 0, 0 };   // always tracks the first unused slot
    int switcher = 0;
    int task_batch_counter = 0;
    int mem_cpy_counter = 0;

    while (tasks_queue->TAIL >= 0) {
        /*------RS------*/
        int rs_counter          = 0;
        tasks_rs_size[switcher] = 0;
        for (rs_counter = 0; rs_counter < STREAMNUM; rs_counter++) {
            int task_id = dequeue(tasks_queue);
#ifdef task_tracker
            printf("==>GPU%d %d\n", GPU_id, task_id);
#endif
            if (task_id >= 0) {
                tasks_rs[tasks_rs_size[switcher]+STREAMNUM*switcher] = task_id;
                tasks_rs_size[switcher]++;
            }
        }
        
        /*--event_sync: busy-wait until the marker event recorded for the
           previous batch completes (a blocking alternative is sketched
           after this example)--*/
        while (cudaEventQuery(task_event[0]) != cudaSuccess);
        
        /*--reduce_reader--*/
        int addr_counter = 0;
        for (addr_counter = 0; addr_counter < mem_cpy_counter; addr_counter++) {
            void *key          = addr_track[addr_counter].addr;
            int target_GPU_id  = addr_track[addr_counter].GPU_id;
            int is_trans_done  = addr_track[addr_counter].is_trans_done;
            rbt_node *n        = rbt_find(key, &(LRUs[target_GPU_id]->hash_map));
            assert(n != NULL);
            if (is_trans_done == 0 && (target_GPU_id == GPU_id)) {
                assert(target_GPU_id == GPU_id);
                n->associated_LRU_elem->is_trans_done = 1;
            }
            atomic_reader_minus(n);
        }

        /*--kernel_exe---*/
        mem_cpy_counter = 0;
        int j = 0;
        for(j = 0; j <= z; j++){
            for (rs_counter = 0; rs_counter < tasks_rs_size[switcher]; rs_counter++) {
                int current_stream   = rs_counter;
                int current_task   = tasks_rs[rs_counter+STREAMNUM*switcher];
                int prior_task     = tasks_rs[rs_counter+(1-switcher)*STREAMNUM];
                cudaStream_t *curt_stream = &stream[current_stream];
                blasx_gpu_dgemm_kernel(j,
                                       nrowa, ncola,
                                       nrowb, ncolb,
                                       nrowc, ncolc,
                                       current_task, prior_task,
                                       TransA, TransB,
                                       A, B, C,
                                       lda, ldb, ldc,
                                       x, y, z,
                                       C_dev,
                                       curt_stream, &handle,
                                       current_stream,
                                       alpha, beta, block_dim,
                                       switcher, &task_batch_counter,
                                       LRUs, GPUs,
                                       &mem_cpy_counter,
                                       addr_track,
                                       GPU_id);
                if ( j == z && rs_counter == tasks_rs_size[switcher]-1) {
                    /*--event_record--*/
                    cudaError_t err = cudaEventRecord(task_event[0], stream[0]);
                    if (err != cudaSuccess) fprintf(stderr, "cudaEventRecord failed: %s\n", cudaGetErrorString(err));
                }
            }
        }
        switcher = 1 - switcher;
        task_batch_counter++;
    }
    //------------------------------------------//

    //---------------RESULT-HARVEST-------------//
    collect_final_result_dgemm(tasks_rs, tasks_rs_size, switcher, stream, C_dev, block_dim, STREAMNUM, x, y, z, nrowc, ncolc, ldc, C);
    //------------------------------------------//
#ifdef thread_profile
    printf("thread%d end@%f\n", GPU_id, get_cur_time());
#endif
}
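
A note on the event_sync step in Example #7: `while (cudaEventQuery(...) != cudaSuccess);` pins a CPU core and loops forever if the query ever returns a real error. A blocking alternative, sketched under the assumption that stalling the worker thread is acceptable:

#include <cuda_runtime.h>
#include <cstdio>

// Blocking replacement for a `while (cudaEventQuery(ev) != cudaSuccess);`
// spin: waits inside the driver and surfaces errors instead of looping on them.
static bool waitForEvent(cudaEvent_t ev)
{
    cudaError_t rc = cudaEventSynchronize(ev);
    if (rc != cudaSuccess) {
        fprintf(stderr, "cudaEventSynchronize: %s\n", cudaGetErrorString(rc));
        return false;
    }
    return true;
}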
Example #8
 bool Event::isCompleted() const
 {
   cudaError_t err = cudaEventQuery( m_event );
   DP_ASSERT( ( err == cudaSuccess ) || ( err == cudaErrorNotReady ) );
   return( err == cudaSuccess );
 }
Example #9
void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) 
{
  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
  struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
  
  if (timers->current != pb_TimerID_NONE) {
    if (!is_async(timers->current) ) {
      if (timers->current != category) {
        if (curr != NULL) {
          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
        } else {
          pb_StopTimer(&timers->timers[timers->current]);
        }
      } else {
        if (curr != NULL) {
          pb_StopTimer(&curr->timer);
        }
      }
    } else {
      insert_submarker(timers, label, category);
      if (!is_async(category)) { // if switching to async too, keep driver going
        pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }

  pb_Timestamp currentTime = get_time();

  /* The only cases where we check for asynchronous task completion are
   * when an overlapping CPU operation completes, or when the next
   * segment blocks on completion of previous async operations. */
  if( asyncs_outstanding(timers) && 
      (!is_async(timers->current) || is_blocking(category) ) ) {

    struct pb_async_time_marker_list * last_event = get_last_async(timers);
    /* cudaSuccess if completed */
    cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker));

    if(is_blocking(category)) {
      /* Async operations completed after previous CPU operations: 
       * overlapped time is the total CPU time since this set of async 
       * operations were first issued */
       
      // timer to switch to is COPY or NONE 
      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
      // it seems that the extra sync wall time isn't being recorded anywhere
      if(async_done != cudaSuccess) 
        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), 
	                  timers->async_begin,currentTime);

      /* Wait on async operation completion */
      cudaEventSynchronize(*((cudaEvent_t *)last_event->marker));
      pb_Timestamp total_async_time = record_async_times(timers);

      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
       // If it did finish, then accumulate all the async time that did happen into OVERLAP
       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
      if(async_done == cudaSuccess)
        timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;

    } else 
    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
    // so something is deeper in stack
    if(async_done == cudaSuccess) {
      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
      timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
    }   
    // else, this isn't blocking, so just check the next time around
  }
  
  subtimerlist = timers->sub_timer_list[category];
  struct pb_SubTimer *subtimer = NULL;
  
  if (label != NULL && subtimerlist != NULL) {
    subtimer = subtimerlist->subtimer_list;
    while (subtimer != NULL) {
      if (strcmp(subtimer->label, label) == 0) {
        break;
      } else {
        subtimer = subtimer->next;
      }
    }
  }

  /* Start the new timer */
  if (category != pb_TimerID_NONE) {
    if(!is_async(category)) {
    
      if (subtimerlist != NULL) {
        subtimerlist->current = subtimer;
      }
    
      if (category != timers->current && subtimer != NULL) {
        pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
      } else if (subtimer != NULL) {
        pb_StartTimer(&subtimer->timer);
      } else {
        pb_StartTimer(&timers->timers[category]);
      }            
    } else {
      if (subtimerlist != NULL) {
        subtimerlist->current = subtimer;
      }
    
      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
      if (!asyncs_outstanding(timers)) {
        /* No asyncs outstanding, insert a fresh async marker */
        insert_submarker(timers, label, category);
        timers->async_begin = currentTime;
      } else if(!is_async(timers->current)) {
        /* Previous asyncs still in flight, but a previous SwitchTo
         * already marked the end of the most recent async operation, 
         * so we can rename that marker as the beginning of this async 
         * operation */
                  
        struct pb_async_time_marker_list * last_event = get_last_async(timers);
        last_event->timerID = category;
        last_event->label = label;
      } // else, marker for switchToThis was already inserted
      
      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
      if (!is_async(timers->current)) {
        pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }
  
  timers->current = category;  
}
Example #10
void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
{
  /* Stop the currently running timer */
  if (timers->current != pb_TimerID_NONE) {
    struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
    struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
  
    if (!is_async(timers->current) ) {
      if (timers->current != timer) {
        if (currSubTimer != NULL) {
          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
        } else {
          pb_StopTimer(&timers->timers[timers->current]);
        }
      } else {
        if (currSubTimer != NULL) {
          pb_StopTimer(&currSubTimer->timer);
        }
      }
    } else {
      insert_marker(timers, timer);
      if (!is_async(timer)) { // if switching to async too, keep driver going
        pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }
  
  pb_Timestamp currentTime = get_time();

  /* The only cases where we check for asynchronous task completion are
   * when an overlapping CPU operation completes, or when the next
   * segment blocks on completion of previous async operations. */
  if( asyncs_outstanding(timers) && 
      (!is_async(timers->current) || is_blocking(timer) ) ) {

    struct pb_async_time_marker_list * last_event = get_last_async(timers);
    /* cudaSuccess if completed */
    cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker));

    if(is_blocking(timer)) {
      /* Async operations completed after previous CPU operations: 
       * overlapped time is the total CPU time since this set of async 
       * operations were first issued */
       
      // timer to switch to is COPY or NONE 
      if(async_done != cudaSuccess) 
        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), 
	                  timers->async_begin,currentTime);

      /* Wait on async operation completion */
      cudaEventSynchronize(*((cudaEvent_t *)last_event->marker));
      pb_Timestamp total_async_time = record_async_times(timers);

      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
      if(async_done == cudaSuccess)
        timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;

    } else 
    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
    // so something is deeper in stack
    if(async_done == cudaSuccess) {
      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
      timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
    }   
  }

  /* Start the new timer */
  if (timer != pb_TimerID_NONE) {
    if(!is_async(timer)) {
      pb_StartTimer(&timers->timers[timer]);
    } else {
      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
      if (!asyncs_outstanding(timers)) {
        /* No asyncs outstanding, insert a fresh async marker */
      
        insert_marker(timers, timer);
        timers->async_begin = currentTime;
      } else if(!is_async(timers->current)) {
        /* Previous asyncs still in flight, but a previous SwitchTo
         * already marked the end of the most recent async operation, 
         * so we can rename that marker as the beginning of this async 
         * operation */
         
        struct pb_async_time_marker_list * last_event = get_last_async(timers);
        last_event->label = NULL;
        last_event->timerID = timer;
      }
      if (!is_async(timers->current)) {
        pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }
  timers->current = timer;

}
Example #11
0
cudaError_t WINAPI wine_cudaEventQuery( cudaEvent_t event ) {
    WINE_TRACE("\n");
    return cudaEventQuery( event );
}
Example #12
0
cudaError_t WINAPI wine_cudaLaunch(const char *entry) {
    WINE_TRACE("%p\n", entry);

    if (QUEUE_MAX == numQueued) {
        cudaError_t evtErr;

        if (WINE_TRACE_ON(cuda)) {
            /* print out if event was recorded or not */
            WINE_TRACE("check event recorded %s\n", debug_cudaError(cudaEventQuery(event)));
        }

        /* wait for event, polling with a configurable interval */
        unsigned int sleepCount = 0;
        long sleep_ns = 300000;  /* default poll interval, in nanoseconds */

        char *SLTIME = getenv("SLEEPTIME");
        if (SLTIME != NULL) {
            sleep_ns = atol(SLTIME);
        }

        struct timespec ts = { 0, sleep_ns };
        while (cudaEventQuery(event) != cudaSuccess) {
            nanosleep(&ts, NULL);
            sleepCount++;
        }

        WINE_TRACE("slept %u times\n", sleepCount);

        WINE_TRACE("event recorded, continuing\n");

        /* record a new event and subtract HALF_QUEUE_MAX from numQueued */
        numQueued = HALF_QUEUE_MAX;
        evtErr = cudaEventRecord(event, 0);

        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    cudaError_t err = cudaLaunch(entry);

    if (!eventInitialized) {
        /* Create an event on the first cudaLaunch call.  This is done here so the calling program
         * has a chance to select the GPU device with cudaSetDevice if desired. */
        cudaError_t evtErr = cudaEventCreate(&event);

        if (evtErr) {
            WINE_ERR("cudaEventCreate: %s\n", debug_cudaError(evtErr));
        }

        /* cudaEventCreate can return errors from previous asynchronous calls,
         * so an error here does not necessarily mean the event wasn't created.
         * Assume it was created for now. */
        eventInitialized = TRUE;
        WINE_TRACE("created event %d\n", event);
    }

    /* record an event at HALF_QUEUE_MAX */
    if (HALF_QUEUE_MAX == ++numQueued) {
        cudaError_t evtErr = cudaEventRecord(event, 0);  /* Assuming everything using stream 0 */

        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }

    return err;
}
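
Example #12 throttles kernel launches with a single marker event: record it halfway through the queue, and when the queue is full, wait on it so roughly half the backlog is known to have drained. The same high/low-watermark idea as a self-contained sketch (queue size, class name, and flags are illustrative):

#include <cuda_runtime.h>

// High/low-watermark launch throttle modeled on Example #12.
class LaunchThrottle {
public:
    explicit LaunchThrottle(unsigned queueMax = 64)
        : queueMax_(queueMax), numQueued_(0)
    {
        cudaEventCreateWithFlags(&marker_, cudaEventDisableTiming);
    }
    ~LaunchThrottle() { cudaEventDestroy(marker_); }

    // Call once per kernel launch on `stream`.
    void onLaunch(cudaStream_t stream)
    {
        if (++numQueued_ == queueMax_ / 2) {
            cudaEventRecord(marker_, stream);  // low watermark: drop a marker
        } else if (numQueued_ >= queueMax_) {  // high watermark: drain
            cudaEventSynchronize(marker_);     // the older half has completed
            numQueued_ = queueMax_ / 2;
            cudaEventRecord(marker_, stream);  // fresh marker for the next window
        }
    }

private:
    unsigned    queueMax_;
    unsigned    numQueued_;
    cudaEvent_t marker_;
};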
Example #13
gaspi_return_t
pgaspi_gpu_write_notify(const gaspi_segment_id_t segment_id_local,
			const gaspi_offset_t offset_local,
			const gaspi_rank_t rank,
			const gaspi_segment_id_t segment_id_remote,
			const gaspi_offset_t offset_remote,
			const gaspi_size_t size,
			const gaspi_notification_id_t notification_id,
			const gaspi_notification_t notification_value,
			const gaspi_queue_id_t queue,
			const gaspi_timeout_t timeout_ms)
{

  if(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
     size <= GASPI_GPU_DIRECT_MAX )
    {
      return gaspi_write_notify(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size,notification_id, notification_value, queue, timeout_ms);
    }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char *host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local);
  char *device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local);

  gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
    {
      gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  int copy_size = 0;
  int gpu_offset = 0;
  int size_left = size;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
    {
      int i;
      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  if(size_left > BLOCK_SIZE)
	    copy_size = BLOCK_SIZE;
	  else
	    copy_size = size_left;

	  if(cudaMemcpyAsync(host_ptr+gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue]))
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  glb_gaspi_ctx.ne_count_c[queue]++;

	  agpu->events[queue][i].segment_remote = segment_id_remote;
	  agpu->events[queue][i].segment_local = segment_id_local;
	  agpu->events[queue][i].size = copy_size;
	  agpu->events[queue][i].rank = rank;
	  agpu->events[queue][i].offset_local = offset_local+gpu_offset;
	  agpu->events[queue][i].offset_remote = offset_remote+gpu_offset;
	  agpu->events[queue][i].in_use  = 1;
	  cudaError_t err = cudaEventRecord(agpu->events[queue][i].event,agpu->streams[queue]);
	  if(err != cudaSuccess)
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }
	  /* That's not pretty, but without this drain we would soon overflow the queue */
	  if(agpu->events[queue][i].ib_use)
	    {
	      struct ibv_wc wc;
	      int ne;
	      do
		{
		  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
		  glb_gaspi_ctx.ne_count_c[queue] -= ne;
		  if (ne == 0)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }

		} while(ne == 0);
	      agpu->events[queue][i].ib_use = 0;
	    }

	  gpu_offset += copy_size;
	  size_left -= copy_size;
	  if(size_left == 0)
	    break;
	}

      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  cudaError_t error;
	  if (agpu->events[queue][i].in_use == 1 )
	    {
	      do
		{
		  error = cudaEventQuery(agpu->events[queue][i].event );
		  if( cudaSuccess == error )
		    {
		      if (_gaspi_event_send(&agpu->events[queue][i],queue) )
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_ERROR;
			}

		      agpu->events[queue][i].in_use  = 0;
		    }
		  else if(error == cudaErrorNotReady)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		  else
		    {
		      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
		      return GASPI_ERROR;
		    }
		} while(error != cudaSuccess);
	    }
	}
    }

  struct ibv_send_wr *bad_wr;
  struct ibv_sge slistN;
  struct ibv_send_wr swrN;

  slistN.addr = (uintptr_t)(glb_gaspi_ctx.nsrc.buf + notification_id * sizeof(gaspi_notification_id_t));

  *((unsigned int *) slistN.addr) = notification_value;

  slistN.length = sizeof(gaspi_notification_id_t);
  slistN.lkey =((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey;

  if((glb_gaspi_ctx.rrmd[segment_id_remote][rank].cudaDevId >= 0))
    {
      swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_addr + notification_id * sizeof(gaspi_notification_id_t));
      swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_rkey;
    }
  else
    {
      swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].addr + notification_id * sizeof(gaspi_notification_id_t));
      swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].rkey;
    }

  swrN.sg_list = &slistN;
  swrN.num_sge = 1;
  swrN.wr_id = rank;
  swrN.opcode = IBV_WR_RDMA_WRITE;
  swrN.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
  swrN.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpC[queue][rank], &swrN, &bad_wr))
    {
      glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  glb_gaspi_ctx.ne_count_c[queue]++;

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);

  return GASPI_SUCCESS;
}
Example #14
gaspi_return_t
pgaspi_gpu_write(const gaspi_segment_id_t segment_id_local,
		 const gaspi_offset_t offset_local,
		 const gaspi_rank_t rank,
		 const gaspi_segment_id_t segment_id_remote,
		 const gaspi_offset_t offset_remote,
		 const gaspi_size_t size,
		 const gaspi_queue_id_t queue,
		 const gaspi_timeout_t timeout_ms)
{
  if( glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
      size <= GASPI_GPU_DIRECT_MAX )
    {
      return gaspi_write(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size, queue, timeout_ms);
    }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char* host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local);
  char* device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local);

  gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
    {
      gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  int size_left = size;
  int copy_size = 0;
  int gpu_offset = 0;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
    {
      int i;
      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  if(size_left > BLOCK_SIZE)
	    copy_size = BLOCK_SIZE;
	  else
	    copy_size = size_left;

	  if( cudaMemcpyAsync(host_ptr + gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue]))
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  glb_gaspi_ctx.ne_count_c[queue]++;

	  agpu->events[queue][i].segment_remote = segment_id_remote;
	  agpu->events[queue][i].segment_local = segment_id_local;
	  agpu->events[queue][i].size = copy_size;
	  agpu->events[queue][i].rank = rank;
	  agpu->events[queue][i].offset_local = offset_local+gpu_offset;
	  agpu->events[queue][i].offset_remote = offset_remote+gpu_offset;
	  agpu->events[queue][i].in_use = 1;

	  cudaError_t err = cudaEventRecord(agpu->events[queue][i].event, agpu->streams[queue]);
	  if(err != cudaSuccess)
	    {
	      glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  gpu_offset += copy_size;
	  size_left -= copy_size;

	  if(size_left == 0)
	    break;

	  if(agpu->events[queue][i].ib_use)
	    {
	      struct ibv_wc wc;
	      int ne;
	      do
		{
		  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
		  glb_gaspi_ctx.ne_count_c[queue] -= ne;
		  if (ne == 0)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		} while(ne == 0);
	      agpu->events[queue][i].ib_use = 0;
	    }
	}

      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  cudaError_t error;
	  if ( agpu->events[queue][i].in_use == 1 )
	    {
	      do
		{
		  error = cudaEventQuery(agpu->events[queue][i].event );
		  if( cudaSuccess == error )
		    {
		      if (_gaspi_event_send(&agpu->events[queue][i],queue))
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_ERROR;
			}

		      agpu->events[queue][i].in_use = 0;
		    }
		  else if(error == cudaErrorNotReady)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		  else
		    {
		      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
		      return GASPI_ERROR;
		    }
		} while(error != cudaSuccess);
	    }
	}
    }

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);

  return GASPI_SUCCESS;
}
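
Stripped of the InfiniBand plumbing, Examples #13 and #14 share one core pattern: pipeline a large device-to-host copy through a fixed ring of events, recording one per chunk and reaping them before reuse. A sketch of just that pattern (the signature and polling policy are illustrative):

#include <cuda_runtime.h>
#include <stddef.h>

// Chunked D2H copy through a ring of pre-created events, after #13/#14.
static cudaError_t chunked_d2h(char *host, const char *dev, size_t size,
                               cudaStream_t stream, size_t block_size,
                               cudaEvent_t events[], int num_events)
{
    size_t off = 0;
    while (off < size) {
        int recorded = 0;
        for (int i = 0; i < num_events && off < size; i++, recorded++) {
            size_t n = (size - off < block_size) ? size - off : block_size;
            cudaError_t rc = cudaMemcpyAsync(host + off, dev + off, n,
                                             cudaMemcpyDeviceToHost, stream);
            if (rc != cudaSuccess) return rc;
            rc = cudaEventRecord(events[i], stream);
            if (rc != cudaSuccess) return rc;
            off += n;
        }
        /* Reap every event recorded this round before reusing the ring;
         * the originals add a timeout here instead of polling forever. */
        for (int i = 0; i < recorded; i++) {
            cudaError_t rc;
            while ((rc = cudaEventQuery(events[i])) == cudaErrorNotReady)
                ; /* spin; see Examples #13/#14 for a timeout-based loop */
            if (rc != cudaSuccess) return rc;
        }
    }
    return cudaSuccess;
}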
Example #15
 /**
  * check whether the event is finished
  *
  * @return true if event is finished else false
  */
 bool isFinished() const
 {
     assert(isValid);
     return cudaEventQuery(event) == cudaSuccess;
 }
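
Taken together, the examples converge on a three-way reading of cudaEventQuery: cudaSuccess means done, cudaErrorNotReady means still pending, and anything else is a real error. A small distillation that also bounds the wait, sketched with standard C++ timing rather than any of the libraries above:

#include <cuda_runtime.h>
#include <chrono>
#include <thread>

// Poll an event until it completes, fails, or a deadline passes. Returns
// cudaSuccess, the failing error code, or cudaErrorNotReady on timeout.
static cudaError_t pollEventWithTimeout(cudaEvent_t ev,
                                        std::chrono::milliseconds timeout)
{
    const auto deadline = std::chrono::steady_clock::now() + timeout;
    for (;;) {
        cudaError_t rc = cudaEventQuery(ev);
        if (rc != cudaErrorNotReady)
            return rc;                      // done, or a real error
        if (std::chrono::steady_clock::now() >= deadline)
            return cudaErrorNotReady;       // still pending at the deadline
        std::this_thread::yield();          // back off instead of hard-spinning
    }
}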