예제 #1
0
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
예제 #2
0
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
예제 #3
0
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
예제 #4
0
파일: plugin-nvptx.c 프로젝트: SuperLu/gcc
static void
nvptx_wait_async (int async1, int async2)
{
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

  event_gc (true);

  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);

  event_add (PTX_EVT_SYNC, e, NULL, 0);

  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}
예제 #5
0
파일: plugin-nvptx.c 프로젝트: SuperLu/gcc
static bool
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyDtoHAsync,
		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *)h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);

  return true;
}
예제 #6
0
파일: plugin-nvptx.c 프로젝트: SuperLu/gcc
static void
nvptx_wait (int async)
{
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

  event_gc (true);
}
예제 #7
0
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
   }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
예제 #8
0
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
예제 #9
0
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      dims[GOMP_DIM_GANG], 1, 1,
		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
예제 #10
0
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}