Code Example #1
File: saracufft.cpp  Project: sara-nl/Omnimatch
extern "C" void sarafft_init() {
  printf( "Cuda is about to be initialized!\n" );
  fflush ( stdout );
  char *OMPI_COMM_WORLD_LOCAL_RANK = getenv( "OMPI_COMM_WORLD_LOCAL_RANK" );
  if ( NULL == OMPI_COMM_WORLD_LOCAL_RANK ) {
    printf( "OMPI_COMM_WORLD_LOCAL_RANK not set!\n" );
    fflush ( stdout );
    exit( 80 );
  }
  int localRank = atoi( OMPI_COMM_WORLD_LOCAL_RANK );
  printf( "Local rank is %d\n", localRank );
  fflush ( stdout );
  if ( CUDA_SUCCESS != cuInit( 0 ) ) {
    printf( "cuInit failed!\n" );
    fflush ( stdout );
    exit( 81 );
  }
  CUdevice device;
  if ( CUDA_SUCCESS != cuDeviceGet( &device, localRank ) ) {
    printf( "cuDeviceGet failed!\n" );
    fflush ( stdout );
    exit( 82 );
  }
  if ( CUDA_SUCCESS != cuCtxCreate( &cuda_context, CU_CTX_SCHED_YIELD, device ) ) {
    printf( "cuCtxCreate failed!\n" );
    fflush ( stdout );
    exit( 83 );
  }
  printf( "Cuda was initialized successfully!\n" );
  fflush ( stdout );
}
Code Example #2
	void CudaVideoRender::setVideoFile(LPCWSTR a_pVideoFilename)
	{
		m_sFileName=a_pVideoFilename;

		//char* video_file=new char[MAX_PATH];
		//WideCharToMultiByte( CP_ACP, 0, a_pVideoFilename, -1, video_file, MAX_PATH, NULL, NULL );

		// Find out the video size 
		m_bIsProgressive = loadVideoSource(a_pVideoFilename, 
			m_nVideoWidth, m_nVideoHeight, 
			m_nWindowWidth, m_nWindowHeight );
		//delete video_file;

		// Initialize CUDA
		// TODO: don't initialize CUDA twice?
		cuInit(0);

		int bTCC = 0;
		// If we are using TCC driver, then always turn off interop
		if (bTCC) m_bInterop = false;

		// Initialize CUDA/D3D9 context and other video memory resources
		initCudaResources(m_bInterop, bTCC);

		m_pVideoSource->start();
	}
Code Example #3
void CudaModule::staticInit(void)
{
  if (s_inited) {
    return;
  }
  
  s_inited = true;
  s_available = false;

  checkError("cuInit", cuInit(0));
  s_available = true;
  
  s_device = selectDevice();
  printDeviceInfo(s_device);

  U32 flags = 0;
  flags |= CU_CTX_SCHED_SPIN; // use sync() if you want to yield
  
#if (CUDA_VERSION >= 2030)
  if (getDriverVersion() >= 23) 
  {
    // reduce launch overhead with large localmem
    flags |= CU_CTX_LMEM_RESIZE_TO_MAX; 
  }
#endif

  // OpenGL & window context must have been initialized !
  checkError("cuGLCtxCreate", cuGLCtxCreate( &s_context, flags, s_device));

  checkError("cuEventCreate", cuEventCreate(&s_startEvent, 0));
  checkError("cuEventCreate", cuEventCreate(&s_endEvent, 0));
}
Code Example #4
File: run_cuda_init.cpp  Project: FreeAlex/Halide
int main(int argc, char* argv[])
{
    cuInit(0);
    int devs = 0;
    cuDeviceGetCount(&devs);
    assert(devs > 0);
    CUdevice dev;
    CUresult status;
    CUcontext ctx = 0;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    {
        size_t f = 0, t = 0;
        CUresult r = cuMemGetInfo(&f, &t);
        fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t );
    }
    
    __init("\n");
 
    printf("\nPress any key to exit...");
    char c;
    scanf("%c", &c);
 
    return 0;
}
Code Example #5
File: mcopy.c  Project: Constellation/gdev-bench
int mcopy_gpu_init(struct device_info *device_info)
{
	char fname[256];
	CUresult res;

	/* printf("madd_gpu_init called.\n"); */

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) !=
	 CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
Code Example #6
File: device_cuda.cpp  Project: baysmith/blender
	CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
	{
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* initialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error_(result, "cuCtxCreate"))
			return;

		cuda_pop_context();
	}
Code Example #7
File: plugin-nvptx.c  Project: chinabin/gcc-tiny
static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
Code Example #8
File: gfree.c  Project: wbkifun/my_stuff
int main() {

	int ngpu;
	CUdevice cuDevice;
	CUcontext cuContext;
	cuInit(0);
	cuDeviceGetCount(&ngpu);
	//printf("ngpu = %d\n", ngpu);

	size_t *totals, *frees ;
	totals = (size_t *) calloc (ngpu, sizeof(size_t));
	frees = (size_t *) calloc (ngpu, sizeof(size_t));

	int tid;
	omp_set_num_threads(ngpu);
	#pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals)
	{
		tid = omp_get_thread_num();
		//printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid);
		cuDeviceGet(&cuDevice, tid);
		cuCtxCreate(&cuContext, 0, cuDevice);	/* second argument is the context flags, not the thread index */
		cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]);
	}

	printf ("\ttotal\t\tfree\t\tused\n");
	for(int i=0; i<ngpu; i++) {
		printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]);
	}

	return 0;
}
Code Example #9
File: util.c  Project: Constellation/gdev-bench
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
	CUresult res;
	CUdevice dev;

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuCtxCreate(pctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return res;
	}
	
	res = cuModuleLoad(pmod, f);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		cuCtxDestroy(*pctx);
		return res;
	}

	return CUDA_SUCCESS;
}
Code Example #10
File: plugin-nvptx.c  Project: chinabin/gcc-tiny
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
Code Example #11
File: osu_bw.c  Project: LLNL/mpi-tools
int
init_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
    int local_rank, dev_count;
    int dev_id = 0;
    char * str;

    if ((str = getenv("LOCAL_RANK")) != NULL) {
        cudaGetDeviceCount(&dev_count);
        local_rank = atoi(str);
        dev_id = local_rank % dev_count;
    }

    curesult = cuInit(0);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuDeviceGet(&cuDevice, dev_id);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuCtxCreate(&cuContext, 0, cuDevice);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }
#endif
    return 0;
}
Code Example #12
File: check_GPU.c  Project: Aand1/ROSCH
int main(void)
{
  /* initialize CUDA */
  CUresult res;
  res = cuInit(0);
  MY_CUDA_CHECK(res, "cuInit()");

  /* check whether any GPU is present */
  int device_num;
  res = cuDeviceGetCount(&device_num);
  MY_CUDA_CHECK(res, "cuDeviceGetCount()");

  if (device_num == 0) {        // no GPU is detected
    fprintf(stderr, "no CUDA capable GPU is detected...\n");
    exit(1);
  }

  printf("%d GPUs are detected\n", device_num);

  for (int i=0; i<device_num; i++)
    {
      /* get device handle of GPU No.i */
      CUdevice dev;
      res = cuDeviceGet(&dev, i);
      MY_CUDA_CHECK(res, "cuDeviceGet()");
      
      /* search compute capability of GPU No.i */
      int major=0, minor=0;
      res = cuDeviceComputeCapability(&major, &minor, dev);
      MY_CUDA_CHECK(res, "cuDeviceComputeCapability()");
     
      printf("GPU[%d] : actual compute capability is : %d%d\n", i, major, minor);
    }
}
Code Example #13
File: cuda.c  Project: mwh/grace-cuda
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
Code Example #14
File: device_server.cpp  Project: STEllAR-GROUP/hpxcl
device::device() {
	cuInit(0);
	cuDeviceGet(&cu_device, 0);
	checkCudaError("device::device Init");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	cudaGetDeviceProperties(&props, 0);
	checkCudaError("device::device Get properties");
	device_name = props.name;
}
Code Example #15
File: occaCUDA.cpp  Project: maxhutch/OCCA2
    void init(){
      if(!isNotInitialized)
        return;

      cuInit(0);

      isNotInitialized = false;
    }
Code Example #16
void Application::_initCUDA( int argc, char *argv[])
{
  if (CUDA_SUCCESS != cuInit(0))
  {
    fprintf( stderr, "Error: CUDA initialization has failed.\n");
    exit( EXIT_FAILURE );
  }
}
Code Example #17
int
init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
     char * str;
     int local_rank, dev_count;
     int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif   
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif   
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
Code Example #18
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
Code Example #19
File: cuInit.cpp  Project: xavigibert/ShearCuda
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  if (nrhs != 0)
    mexErrMsgTxt("Wrong number of arguments");

  CUresult err = cuInit(0);

  plhs[0] = mxCreateDoubleScalar(err);

}
Code Example #20
File: ov.c  Project: CPFL/gtraffic
/*
 * Initialization in order to use the kernel program 
 */
void
init_cuda(void){

  thread_num = (N <= 16) ? N : 16 ;  
  block_num = N / (thread_num*thread_num);
  if(N % (thread_num*thread_num) != 0) block_num++;
  
  res = cuInit(0);
  if(res != CUDA_SUCCESS){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuDeviceGet(&dev, 0);
  if(res != CUDA_SUCCESS){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuCtxCreate(&ctx, 0, dev);
  if(res != CUDA_SUCCESS){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(res != CUDA_SUCCESS){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(res != CUDA_SUCCESS){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }
  

  /* 
   * preparation for launching the kernel 
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* arbitrary shared-memory size */
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
Code Example #21
File: cuda.c  Project: mwh/grace-cuda
Object cuda_computeCapability(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    return alloc_Float64(major + minor / 10.0);
}
Code Example #22
File: mem_alloc.c  Project: Arella/rootbeer1
int main(int argc, char *argv[])
{
	char c;
	CUcontext ctx;
	CUdevice dev = 0;
	void *toSpace;
	int status;
	size_t free, total;
	CUdeviceptr ptr = (CUdeviceptr)NULL;
	int size;
	
	if(argc != 2){
		fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n");
		exit(1);
	}
	
	printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n");
	size = atoi(argv[1]);
	
	printf("\nTrying to allocate %iMB of memory on host and GPU\n",size);
	
	if(size <= 0){
		fprintf(stderr,"\nERROR: Memory must be greater than 0\n");
		exit(1);
	}
	
	status = cuInit(0);
	printf("Init status: %i\n",status); 

	status = cuCtxCreate(&ctx, 0, dev);
	printf("Context creation status: %i\n",status); 
	
	status = cuMemGetInfo(&free, &total);
	printf("Get memory info status: %i\n",status); 
	
	printf("\n%.1f/%.1f (Free/Total) MB\n", free/1024.0/1024.0, total/1024.0/1024.0);
	
	status = cuMemHostAlloc(&toSpace, size*1024*1024, 0); 
	printf("Host allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); 

	status = cuMemAlloc(&ptr, size*1024*1024);
	printf("GPU allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED");

	printf("\nPress any key to exit...");
	scanf("%c", &c);
	
	status = cuCtxDestroy(ctx);
	printf("Context destroy status: %i\n",status); 

	return 0;
}
Code Example #23
File: device_server.cpp  Project: STEllAR-GROUP/hpxcl
device::device(int device_id) {
	cuInit(0);
	checkCudaError("device::device Init");
	cuDeviceGet(&cu_device, device_id);
	checkCudaError("device::device Get device");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	this->set_device(device_id);

	cudaGetDeviceProperties(&props, device_id);
	checkCudaError("device::device Get properties ");

	this->device_name = props.name;
}
Code Example #24
File: cudadevice.cpp  Project: appleseedhq/appleseed
    Impl()
    {
        // Initialize CUDA.
        check_cuda_error(cuInit(0));

        int device_count;
        check_cuda_error(cuDeviceGetCount(&device_count));

        m_devices.reserve(static_cast<std::size_t>(device_count));
        m_contexts.resize(static_cast<std::size_t>(device_count), nullptr);

        for (int i = 0; i < device_count; ++i)
            m_devices.emplace_back(i);
    }
Code Example #25
File: magma_solve.cpp  Project: ORNL-Fusion/aorsa2d
int magma_solve ( int *dA_dim, int *lWork, double2 *A, int *ipiv, int *N ){

	// Check inputs
	//
	fprintf (stderr, "Using MAGMA solve\n" );
	fprintf (stderr, "	dA_dim: %i\n", *dA_dim );
	fprintf (stderr, "	N: %i\n", *N );
	fprintf (stderr, "	lWork: %i\n", *lWork );

	cuInit(0);
	cublasInit();
	printout_devices();

	cublasStatus status;

	double2 *d_A, *work;
	status = cublasAlloc ( *dA_dim, sizeof(double2), (void**)&d_A );

	if ( status != CUBLAS_STATUS_SUCCESS ){
			fprintf (stderr, "ERROR: device memory allocation error (d_A)\n" );
			fprintf (stderr, "ERROR: dA_dim: %i\n", dA_dim );
	}

	cudaError_t err;
	err = cudaMallocHost ( (void**)&work, *lWork * sizeof(double2) );

	if(err != cudaSuccess){
		fprintf (stderr, "ERROR: cudaMallocHost error (work)\n" );
	}

	int info[1];
	TimeStruct start, end;

	start = get_current_time ();
	magma_zgetrf ( N, N, A, N, ipiv, work, d_A, info );
	end = get_current_time ();

	double gpu_perf;
	gpu_perf = 4.*2.*(*N)*(*N)*(*N)/(3.*1000000*GetTimerValue(start,end));

	if ( info[0] != 0 ){
			fprintf (stderr, "ERROR: magma_zgetrf failed\n" );
	}

	printf("	GPU performance: %6.2f GFlop/s\n", gpu_perf);

	int stat = 0;
	return stat;

}
Code Example #26
File: cuda.c  Project: mwh/grace-cuda
Object cuda_deviceName(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);
    char name[100];
    cuDeviceGetName(name, 100, cuDevice);
    return alloc_String(name);
}
Code Example #27
File: cudaspawner.c  Project: jtarosky/wdp
int main(int argc,char **argv){
	unsigned total = 0;
	unsigned long zul;
	ctx marsh;

	if(argc != 3){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(getzul(argv[1],&zul)){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(getzul(argv[2],&marsh.s)){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(cuInit(0)){
		fprintf(stderr,"Couldn't initialize cuda\n");
		exit(EXIT_FAILURE);
	}
	if(cuDeviceGet(&marsh.dev,zul)){
		fprintf(stderr,"Couldn't get device %lu\n",zul);
		exit(EXIT_FAILURE);
	}
	while( (marsh.threadno = ++total) ){
		pthread_t tid;
		int err;

		if( (err = pthread_create(&tid,NULL,thread,&marsh)) ){
			fprintf(stderr,"Couldn't create thread %d (%s?)\n",
					total,strerror(err));
			exit(EXIT_SUCCESS);
		}
		pthread_mutex_lock(&lock);
		while(!thrdone && threadsmaintain){
			pthread_cond_wait(&cond,&lock);
		}
		thrdone = 0;
		if(!threadsmaintain){
			pthread_mutex_unlock(&lock);
			fprintf(stderr,"Thread %d exited with an error.\n",total);
			break;
		}
		pthread_mutex_unlock(&lock);
		printf("Created thread %d\n",total);
	}	
	exit(EXIT_SUCCESS);
}
Code Example #28
File: wbInit.cpp  Project: alexdalton/mingus
#include "cuda.h"
#include <wb.h>
#include <wbCUDA.h>

#define MB (1 << 20)
#ifndef WB_DEFAULT_HEAP_SIZE
const size_t WB_DEFAULT_HEAP_SIZE = (256 * MB);
#endif /* WB_DEFAULT_HEAP_SIZE */

static bool _initializedQ = wbFalse;

#ifndef _MSC_VER
__attribute__((__constructor__))
#endif /* _MSC_VER */
    void wb_init(void) {
  if (_initializedQ == wbTrue) {
    return;
  }

#ifdef WB_USE_CUDA
  cuInit(0);

  /* Select a random GPU */

  {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    srand(time(NULL));
    cudaSetDevice(rand() % deviceCount);
  }

  cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1*MB);
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, WB_DEFAULT_HEAP_SIZE);

  cudaDeviceSynchronize();

#endif /* WB_USE_CUDA */

#ifdef WB_USE_CUSTOM_MALLOC
  wbMemoryManager_new(WB_DEFAULT_HEAP_SIZE);
#endif /* WB_USE_CUSTOM_MALLOC */

#ifdef _MSC_VER
  QueryPerformanceFrequency((LARGE_INTEGER *)&_hrtime_frequency);
#endif /* _MSC_VER */

  _hrtime();

  _timer = wbTimer_new();
  _logger = wbLogger_new();
  _initializedQ = wbTrue;

  wbFile_init();

#ifdef WB_USE_SANDBOX
  wbSandbox_new();
#endif /* WB_USE_SANDBOX */

  solutionJSON = NULL;

  atexit(wb_atExit);
}
Code Example #29
File: maxindex.cpp  Project: CindyYang85/mgpu
int main(int argc, char** argv) {
	cuInit(0);
	
	DevicePtr device;
	CUresult result = CreateCuDevice(0, &device);

	ContextPtr context;
	result = CreateCuContext(device, 0, &context);

	std::auto_ptr<MaxIndexEngine> engine;
	result = CreateMaxIndexEngine("../../src/maxindex/maxindex.cubin", &engine);
	if(CUDA_SUCCESS != result) {
		printf("Could not create max index engine.\n");
		return 0;
	}

	// Search through 5 million elements.
	const int NumElements = 5000000;
	std::vector<float> data(NumElements);
	std::tr1::uniform_real<float> r(-1e9, 1e9);
	for(int i(0); i < NumElements; ++i)
		data[i] = r(mt19937);
	
	// Use CPU to find the max element and index.
	float maxX = -1e37f;
	int maxIndex = 0;

	for(int i(0); i < NumElements; ++i)
		if(data[i] > maxX) {
			maxX = data[i];
			maxIndex = i;
		}

	printf("CPU says max x = %f, max index = %d.\n", maxX, maxIndex);

	// Use GPU to find the max element and index.
	DeviceMemPtr deviceData;
	context->MemAlloc(data, &deviceData);

	result = FindGlobalMax(engine.get(), deviceData->Handle(), NumElements, 
		&maxX, &maxIndex);
	if(CUDA_SUCCESS != result) {
		printf("Failure running max index kernel.\n");
		return 0;
	}

	printf("GPU says max x = %f, max index = %d.\n", maxX, maxIndex);
}
Code Example #30
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}