Ejemplo n.º 1
0
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res;
  void *p;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  res->freeblocks = NULL;
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    goto fail_cache;
  }
  res->extcopy_cache = cache_lru(64, 32, (cache_eq_fn)extcopy_eq,
                                 (cache_hash_fn)extcopy_hash,
                                 (cache_freek_fn)extcopy_free,
                                 (cache_freev_fn)cuda_freekernel);
  if (res->extcopy_cache == NULL) {
    goto fail_cache;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    goto fail_stream;
  }
  err = cuStreamCreate(&res->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS) {
    goto fail_mem_stream;
  }
  err = cuMemAllocHost(&p, 16);
  if (err != CUDA_SUCCESS) {
    goto fail_errbuf;
  }
  memset(p, 0, 16);
  /* Need to tag for new_gpudata */
  TAG_CTX(res);
  res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16);
  if (res->errbuf == NULL) {
    err = res->err;
    goto fail_end;
  }
  res->errbuf->flags |= CUDA_MAPPED_PTR;
  return res;
 fail_end:
  cuMemFreeHost(p);
 fail_errbuf:
  cuStreamDestroy(res->mem_s);
 fail_mem_stream:
  cuStreamDestroy(res->s);
 fail_stream:
  cache_destroy(res->extcopy_cache);
 fail_cache:
  free(res);
  return NULL;
}
Ejemplo n.º 2
0
InteropResource::~InteropResource()
{
    //CUDA_WARN(cuCtxPushCurrent(ctx)); //error invalid value
    if (res[0].cuRes)
        CUDA_WARN(cuGraphicsUnregisterResource(res[0].cuRes));
    if (res[1].cuRes)
        CUDA_WARN(cuGraphicsUnregisterResource(res[1].cuRes));
    if (res[0].stream)
        CUDA_WARN(cuStreamDestroy(res[0].stream));
    if (res[1].stream)
        CUDA_WARN(cuStreamDestroy(res[1].stream));

    // FIXME: we own the context. But why crash to destroy ctx? CUDA_ERROR_INVALID_VALUE
    //CUDA_ENSURE(cuCtxDestroy(ctx));
}
Ejemplo n.º 3
0
bool VideoDecoderCUDAPrivate::releaseCuda()
{
    available = false;
    if (!can_load)
        return true;
    if (dec) {
        cuvidDestroyDecoder(dec);
        dec = 0;
    }
    if (parser) {
        cuvidDestroyVideoParser(parser);
        parser = 0;
    }
    if (stream) {
        cuStreamDestroy(stream);
        stream = 0;
    }
    if (host_data) {
        cuMemFreeHost(host_data);
        host_data = 0;
        host_data_size = 0;
    }
    if (vid_ctx_lock) {
        cuvidCtxLockDestroy(vid_ctx_lock);
        vid_ctx_lock = 0;
    }
    if (cuctx) {
        checkCudaErrors(cuCtxDestroy(cuctx));
    }
    // TODO: dllapi unload
    return true;
}
Ejemplo n.º 4
0
static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  bool ret = true;
  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      ret &= map_fini (s);

      CUresult r = cuStreamDestroy (s->stream);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
	  ret = false;
	}
      free (s);
    }

  ret &= map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
  return ret;
}
Ejemplo n.º 5
0
	~CNvidiaNvencCodec()
	{
		if (m_NvidiaNvencCodecContext.GetSaveOutputToFile())
		{
			fclose(m_pOutputFile);
		}
#ifdef ASYNCHRONOUS
		CHECK_NVENC_STATUS(m_FunctionList.nvEncUnregisterAsyncEvent(m_pEncoder, &m_EventParameters));
#endif
		CHECK_NVENC_STATUS(m_FunctionList.nvEncUnmapInputResource(m_pEncoder, m_PictureParameters.inputBuffer));
		CHECK_NVENC_STATUS(m_FunctionList.nvEncUnregisterResource(m_pEncoder, m_pRegisteredResource));
		CHECK_NVENC_STATUS(m_FunctionList.nvEncDestroyBitstreamBuffer(m_pEncoder, m_pOutputBitstream));
		CHECK_NVENC_STATUS(m_FunctionList.nvEncDestroyEncoder(m_pEncoder));
		CHECK_CUDA_DRV_STATUS(cuStreamDestroy(m_Stream));
		if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
		{
			cudaFree(m_pPageLockedNv12Buffer);
		}
		else
		{
			cudaFree(m_pPageLockedRgb32Buffer);
			CHECK_CUDA_DRV_STATUS(cuMemFree(m_pRgb32Buffer));
		}
		CHECK_CUDA_DRV_STATUS(cuMemFree(m_pNv12Buffer));
		CHECK_CUDA_DRV_STATUS(cuCtxDestroy(m_Context));
		FreeModule(m_hNvEncodeAPI64);
	}
Ejemplo n.º 6
0
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
    cuMemFreeHost((void *)ctx->errbuf->ptr);
    deallocate(ctx->errbuf);

    cuStreamDestroy(ctx->s);

    /* Clear out the freelist */
    for (curr = ctx->freeblocks; curr != NULL; curr = next) {
      next = curr->next;
      cuMemFree(curr->ptr);
      deallocate(curr);
    }

    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_destroy(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
Ejemplo n.º 7
0
void GPUInterface::ResizeStreamCount(int newStreamCount) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n");
#endif
    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    SAFE_CUDA(cuCtxSynchronize());

    if (cudaStreams != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaStreams[i] != NULL)
                SAFE_CUDA(cuStreamDestroy(cudaStreams[i]));
        }
        free(cudaStreams);
    }

    if (cudaEvents != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaEvents[i] != NULL)
                SAFE_CUDA(cuEventDestroy(cudaEvents[i]));
        }
        free(cudaEvents);
    }

    if (newStreamCount == 1) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = newStreamCount;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::ResizeStreamCount\n");
#endif
}
	void CudaVideoRender::terminateCudaVideo(bool bDestroyContext)
	{
		if (m_pVideoParser)  delete m_pVideoParser;
		if (m_pVideoDecoder) delete m_pVideoDecoder;
		if (m_pVideoSource)  delete m_pVideoSource;
		if (m_pFrameQueue)   delete m_pFrameQueue;

		if (m_CtxLock) {
			cutilDrvSafeCallNoSync( cuvidCtxLockDestroy(m_CtxLock) );
		}
		if (m_cuContext && bDestroyContext)  {
			cutilDrvSafeCallNoSync( cuCtxDestroy(m_cuContext) );
			m_cuContext = NULL;
		}

		if (m_ReadbackSID)   cuStreamDestroy(m_ReadbackSID);
		if (m_KernelSID)     cuStreamDestroy(m_KernelSID);
	}
Ejemplo n.º 9
0
void
freeCudaResources(bool bDestroyContext)
{
    if (g_pVideoParser)
    {
        delete g_pVideoParser;
    }

    if (g_pVideoDecoder)
    {
        delete g_pVideoDecoder;
    }

    if (g_pVideoSource)
    {
        delete g_pVideoSource;
    }

    if (g_pFrameQueue)
    {
        delete g_pFrameQueue;
    }

    if (g_CtxLock)
    {
        checkCudaErrors(cuvidCtxLockDestroy(g_CtxLock));
    }

    if (g_oContext && bDestroyContext)
    {
        checkCudaErrors(cuCtxDestroy(g_oContext));
        g_oContext = NULL;
    }

    if (g_ReadbackSID)
    {
        cuStreamDestroy(g_ReadbackSID);
    }

    if (g_KernelSID)
    {
        cuStreamDestroy(g_KernelSID);
    }
}
Ejemplo n.º 10
0
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
Ejemplo n.º 11
0
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
Ejemplo n.º 12
0
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
      blas_ops->teardown(ctx);
    }
    cuStreamDestroy(ctx->s);
    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_free(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
Ejemplo n.º 13
0
void *cuda_make_ctx(CUcontext ctx, int flags) {
  int64_t v = 0;
  cuda_context *res;
  int e = 0;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    free(res);
    return NULL;
  }
  res->extcopy_cache = cache_alloc(64, 32);
  if (res->extcopy_cache == NULL) {
    free(res);
    return NULL;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    cache_free(res->extcopy_cache);
    free(res);
    return NULL;
  }
  TAG_CTX(res); /* Need to tag before cuda_alloc */
  res->errbuf = cuda_alloc(res, 8, &v, GA_BUFFER_INIT, &e);
  if (e != GA_NO_ERROR) {
    err = res->err;
    cache_free(res->extcopy_cache);
    cuStreamDestroy(res->s);
    free(res);
    return NULL;
  }
  res->refcnt--; /* Don't want to create a reference loop with the errbuf */
  return res;
}
Ejemplo n.º 14
0
bool VideoDecoderCUDAPrivate::releaseCuda()
{
    if (!can_load)
        return true;
    if (dec) {
        cuvidDestroyDecoder(dec);
        dec = 0;
    }
    if (parser) {
        cuvidDestroyVideoParser(parser);
        parser = 0;
    }
    if (stream) {
        cuStreamDestroy(stream);
        stream = 0;
    }
    cuvidCtxLockDestroy(vid_ctx_lock);
    if (cuctx) {
        checkCudaErrors(cuCtxDestroy(cuctx));
    }
    return true;
}
Ejemplo n.º 15
0
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
    ctx->refcnt = 2; /* Prevent recursive calls */
    cuda_free(ctx->errbuf);
    cuStreamDestroy(ctx->s);
    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_free(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
Ejemplo n.º 16
0
int main(int argc, char **argv)
{
  //data
  CUdeviceptr  d_data0   = 0;
  CUdeviceptr  d_data1   = 0;
  DataStruct *h_data0  = 0;
  DataStruct *h_data1  = 0;
  DataStruct h_data_reference0;
  DataStruct h_data_reference1;
  unsigned int memSize = sizeof(DataStruct);
  
  //device references
  CUcontext    hContext = 0;
  CUdevice     hDevice  = 0;
  CUmodule     hModule  = 0;
  CUstream     hStream  = 0;

  // Initialize the device and get a handle to the kernel
  CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream);
  
  // Allocate memory on host and device
  if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL)
    {
      std::cerr << "Could not allocate host memory" << std::endl;
      exit(-1);
    }
  status = cuMemAlloc(&d_data0, memSize);

  if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL)
    {
      std::cerr << "Could not allocate host memory" << std::endl;
      exit(-1);
    }
  status = cuMemAlloc(&d_data1, memSize);
  if (status != CUDA_SUCCESS)
    printf("ERROR: during cuMemAlloc\n");

  ///////////////////////////////////////////////////////////////////////////////
  //======================= test cases ========================================//
  ///////////////////////////////////////////////////////////////////////////////
  std::string name = "";
  unsigned int testnum=0;
  unsigned int passed=0;

  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  /////////////////////// Ralf ///////////////////////////////////////////////////
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;

  if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_constant"; /////////////////////
  setZero(h_data0,&h_data_reference0);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_constant(&h_data_reference0);                //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  ///////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_calculate"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = 3.2;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_calculate(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_parquetShader"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 1;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_parquetShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_dyn"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->u = h_data_reference0.u = 7;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_dyn(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simple"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = -4;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simple"; // Branch 2 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 8;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = -10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simplePHI(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop"; //////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 100;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_math"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 1.4;
  h_data0->i = h_data_reference0.i = 3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_math(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_signedOperands"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -7;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_signedOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

    
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_constantOperands"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -1.44;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
    
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop_semihard"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_semihard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->u = h_data_reference0.u = 3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 7;
  h_data0->u = h_data_reference0.u = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
 
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_binaryInst"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 5;
  h_data0->f = h_data_reference0.f = -121.23;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_binaryInst(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_selp"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = -15;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_selp(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_complicated"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data_reference0.s.s.f = h_data0->s.s.f = 3.11;
  h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32;
  h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_call"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_call(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*/////////////*/ name = "test_alloca"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = -3.23;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_alloca_complicated"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = 23.213;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_globalVariables"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_globalVariables(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_specialRegisters_x"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_specialRegisters_y"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_dualArgument"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  setZero(h_data1,&h_data_reference1);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize);   //run device function

  test_dualArgument(&h_data_reference0,&h_data_reference1);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  if(compareData(h_data1,&h_data_reference1))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_vector"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f;
  h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f;
  h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f;
  h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_vector(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_reg2Const"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  /*
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);
  */

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_reg2Const(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_constantMemory"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f;
  
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);

  setZero(h_data0,&h_data_reference0);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantMemory(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_sharedMemory"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2] = i;

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2+1] = -i;

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1);   //run device function

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i] = i;
  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i+32] = -i;
  //  runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1);                                 //run host reference

  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_lightShader"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  /*
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);
  */

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function

  /*
  test_lightShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;
  */

  ///////////////////////////////////////////////////////////////////////////////
  //======================= test cases END ====================================//
  ///////////////////////////////////////////////////////////////////////////////

  // Check the result
  std::cout << "\nPASSED " << passed << " tests" << std::endl;
  std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl;

  // Cleanup
  if (d_data0)
    {
      cuMemFree(d_data0);
      d_data0 = 0;
    }
  if (d_data1)
    {
      cuMemFree(d_data1);
      d_data1 = 0;
    }
  if (h_data0)
    {
      free(h_data0);
      h_data0 = 0;
    }
  if (h_data1)
    {
      free(h_data1);
      h_data1 = 0;
    }
  if (hModule)
    {
      cuModuleUnload(hModule);
      hModule = 0;
    }
  if (hStream)
    {
      cuStreamDestroy(hStream);
      hStream = 0;
    }
  if (hContext)
    {
      cuCtxDestroy(hContext);
      hContext = 0;
    }
  return 0;
}
Ejemplo n.º 17
0
/*
// Property Message
//
// API
//static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
          int numStep, int startIndex, int sideLength, int bx, int by,
          CvLSVMFeaturePyramid **maps)
// INPUT
// image
// step
// numStep
// startIndex
// sideLength
// bx
// by
// OUTPUT
// maps
// RESULT
// Error status
*/
static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
        int numStep, int startIndex, int sideLength, int bx, int by,
        CvLSVMFeaturePyramid **maps)
{
    CvLSVMFeatureMap **feature_maps;

    int i;
    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;
    float *scales;
    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;
    CUstream *streams;
    CUresult res;

    scales = (float *) malloc(sizeof(float) * (numStep));
    devs_img = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));
    feature_maps = (CvLSVMFeatureMap **) malloc(
            sizeof(CvLSVMFeatureMap *) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        scales[i] = 1.0f / powf(step, (float) i);
        width  = (int) (((float) image->width ) * scales[i] + 0.5);
        height = (int) (((float) image->height) * scales[i] + 0.5);
        numChannels = image->nChannels;
        sizeX = width  / sideLength;
        sizeY = height / sideLength;
        p  = NUM_SECTOR * 3;
        pp = NUM_SECTOR * 12;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;

        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height,
                numChannels);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX,
                newSizeY, pp);
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        CUDA_CHECK(res, "cuStreamCreate(stream)");
    }

    // excute main function
    resizeGPUStream(numStep, image, scales, devs_img, streams);

    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm,
            streams);

    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm,
            devs_map_pre_pca, streams);

    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps,
            streams);

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
        cuStreamDestroy(streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        (*maps)->pyramid[startIndex + i] = feature_maps[i];
    }/*for(i = 0; i < numStep; i++)*/

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_img[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);
    }

    free(scales);
    free(devs_img);
    free(devs_map_pre_norm);
    free(devs_map_pre_pca);
    free(streams);
    free(feature_maps);

    return LATENT_SVM_OK;
}
Ejemplo n.º 18
0
 static void dispose(CUstream stream) {
     cuda_check( cuStreamDestroy(stream) );
 }
Ejemplo n.º 19
0
int cuda_test_memcpy_async(unsigned int size)
{
	int i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUstream stream;
	CUdeviceptr data_addr;
	unsigned int *in, *out;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	unsigned long total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamCreate(&stream, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&out, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	for (i = 0; i < size / 4; i++) {
		in[i] = i+1;
		out[i] = 0;
	}

	gettimeofday(&tv_h2d_start, NULL);
	res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	for (i = 0; i < size / 4; i++) {
		if (in[i] != out[i]) {
			printf("in[%d] = %u, out[%d] = %u\n",
				   i, in[i], i, out[i]);
		}
	}

	res = cuMemFreeHost(out);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFreeHost(in);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamDestroy(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);

	return 0;

end:

	return -1;
}
Ejemplo n.º 20
0
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;
	
	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;
	
		//for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
                        p->result = cuCtxCreate(&p->ctx, p->flags, p->device);				
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
	                break;
		}
	        case CUCTXSYNCHRONIZE: {
		        p->result = cuCtxSynchronize();
	                break;
	        }
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
			char *binname = malloc((strlen((char *)elem.out_sg[1].iov_base)+strlen(getenv("QEMU_NFS_PATH")+1))*sizeof(char));
			if (!binname) {
				p->result = 0;
		                virtqueue_push(vq, &elem, 0);
				break;
			}
		        strcpy(binname, getenv("QEMU_NFS_PATH"));
		        strcat(binname, (char *)elem.out_sg[1].iov_base);
			//change current CUDA context
			//each CUDA contets has its own virtual memory space - isolation is ensured by switching contexes
                        if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
                                break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
                case CUMODULEGETGLOBAL: {
                        char *name = malloc(100*sizeof(char));
                        if (!name) {
                                p->result = 999;
                                break;
                        }
                        strcpy(name, (char *)elem.out_sg[1].iov_base);
                        p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name);
                        break;
                }
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;			
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
                case CUMEMALLOCPITCH: {
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		//large buffers are alocated in smaller chuncks in guest kernel space
		//gets each chunck seperately and copies it to device memory
	        case CUMEMCPYHTOD: {
			int i;
			size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
	                break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
                        size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
                        break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
                        break;		
		}
	        case CUMEMCPYDTOH: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
	        case CUMEMFREE: {
	                p->result = cuMemFree(p->dptr);
	                break;
	        }
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0';
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;	
		}
		case CULAUNCHKERNEL: {
			void **args = malloc(p->val1*sizeof(void *));
	                if (!args) {
				p->result = 9999;
	                        break;
        	        }
			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
			                p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result =  cuStreamCreate(&p->stream, 0);
			break;
		}		
                case CUSTREAMSYNCHRONIZE: {
                        p->result = cuStreamSynchronize(p->stream);
                        break;
                }
                case CUSTREAMQUERY: {
                        p->result = cuStreamQuery(p->stream);
                        break;
                }
		case CUSTREAMDESTROY: {
                        p->result = cuStreamDestroy(p->stream);
                        break;
                }

		default: 
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}
	//notify frontend - trigger virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
Ejemplo n.º 21
0
 void device_t<CUDA>::freeStream(stream s){
   OCCA_CUDA_CHECK("Device: freeStream",
                   cuStreamDestroy( *((CUstream*) s) ));
   delete (CUstream*) s;
 }