Ejemplo n.º 1
1
extern "C" void sarafft_init() {
  printf( "Cuda is about to be initialized!\n" );
  fflush ( stdout );
  char *OMPI_COMM_WORLD_LOCAL_RANK = getenv( "OMPI_COMM_WORLD_LOCAL_RANK" );
  if ( NULL == OMPI_COMM_WORLD_LOCAL_RANK ) {
    printf( "OMPI_COMM_WORLD_LOCAL_RANK not set!\n" );
    fflush ( stdout );
    exit( 80 );
  }
  int localRank = atoi( OMPI_COMM_WORLD_LOCAL_RANK );
  printf( "Local rank is %d\n", localRank );
  fflush ( stdout );
  if ( CUDA_SUCCESS != cuInit( 0 ) ) {
    printf( "cuInit failed!\n" );
    fflush ( stdout );
    exit( 81 );
  }
  CUdevice device;
  if ( CUDA_SUCCESS != cuDeviceGet( &device, localRank ) ) {
    printf( "cuDeviceGet failed!\n" );
    fflush ( stdout );
    exit( 82 );
  }
  if ( CUDA_SUCCESS != cuCtxCreate( &cuda_context, CU_CTX_SCHED_YIELD, device ) ) {
    printf( "cuCtxCreate failed!\n" );
    fflush ( stdout );
    exit( 83 );
  }
  printf( "Cuda was initialized successfully!\n" );
  fflush ( stdout );
}
	void CudaVideoRender::setVideoFile(LPCWSTR a_pVideoFilename)
	{
		m_sFileName=a_pVideoFilename;

		//char* video_file=new char[MAX_PATH];
		//WideCharToMultiByte( CP_ACP, 0, a_pVideoFilename, -1, video_file, MAX_PATH, NULL, NULL );

		// Find out the video size 
		m_bIsProgressive = loadVideoSource(a_pVideoFilename, 
			m_nVideoWidth, m_nVideoHeight, 
			m_nWindowWidth, m_nWindowHeight );
		//delete video_file;

		// Initialize CUDA
		//TODO: dont init cuda twice??
		cuInit(0);

		int bTCC = 0;
		// If we are using TCC driver, then always turn off interop
		if (bTCC) m_bInterop = false;

		// Initialize CUDA/D3D9 context and other video memory resources
		initCudaResources(m_bInterop, bTCC);

		m_pVideoSource->start();
	}
Ejemplo n.º 3
0
void CudaModule::staticInit(void)
{
  if (s_inited) {
    return;
  }
  
  s_inited = true;
  s_available = false;

  checkError("cuInit", cuInit(0));
  s_available = true;
  
  s_device = selectDevice();
  printDeviceInfo(s_device);

  U32 flags = 0;
  flags |= CU_CTX_SCHED_SPIN; // use sync() if you want to yield
  
#if (CUDA_VERSION >= 2030)
  if (getDriverVersion() >= 23) 
  {
    // reduce launch overhead with large localmem
    flags |= CU_CTX_LMEM_RESIZE_TO_MAX; 
  }
#endif

  // OpenGL & window context must have been initialized !
  checkError("cuGLCtxCreate", cuGLCtxCreate( &s_context, flags, s_device));

  checkError("cuEventCreate", cuEventCreate(&s_startEvent, 0));
  checkError("cuEventCreate", cuEventCreate(&s_endEvent, 0));
}
Ejemplo n.º 4
0
int main(int argc, char* argv[])
{
    cuInit(0);
    int devs = 0;
    cuDeviceGetCount(&devs);
    assert(devs > 0);
    CUdevice dev;
    CUresult status;
    CUcontext ctx = 0;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    {
        size_t f = 0, t = 0;
        CUresult r = cuMemGetInfo(&f, &t);
        fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t );
    }
    
    __init("\n");
 
    printf("\nPress any key to exit...");
    char c;
    scanf("%c", &c);
 
    return 0;
}
Ejemplo n.º 5
0
int mcopy_gpu_init(struct device_info *device_info)
{
	char fname[256];
	CUresult res;

	/* printf("madd_gpu_init called.\n"); */

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) !=
	 CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
Ejemplo n.º 6
0
	CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
	{
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* intialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error_(result, "cuCtxCreate"))
			return;

		cuda_pop_context();
	}
Ejemplo n.º 7
0
static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
Ejemplo n.º 8
0
int main() {

	int ngpu;
	CUdevice cuDevice;
	CUcontext cuContext;
	cuInit(0);
	cuDeviceGetCount(&ngpu);
	//printf("ngpu = %d\n", ngpu);

	size_t *totals, *frees ;
	totals = (size_t *) calloc (ngpu, sizeof(size_t));
	frees = (size_t *) calloc (ngpu, sizeof(size_t));

	int tid;
	omp_set_num_threads(ngpu);
	#pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals)
	{
		tid = omp_get_thread_num();
		//printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid);
		cuDeviceGet(&cuDevice, tid);
		cuCtxCreate(&cuContext, tid, cuDevice);
		cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]);
	}

	printf ("\ttotal\t\tfree\t\tused\n");
	for(int i=0; i<ngpu; i++) {
		printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]);
	}

	return 0;
}
Ejemplo n.º 9
0
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
	CUresult res;
	CUdevice dev;

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuCtxCreate(pctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return res;
	}
	
	res = cuModuleLoad(pmod, f);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		cuCtxDestroy(*pctx);
		return res;
	}

	return CUDA_SUCCESS;
}
Ejemplo n.º 10
0
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r!= CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
Ejemplo n.º 11
0
int
init_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
    int local_rank, dev_count;
    int dev_id = 0;
    char * str;

    if ((str = getenv("LOCAL_RANK")) != NULL) {
        cudaGetDeviceCount(&dev_count);
        local_rank = atoi(str);
        dev_id = local_rank % dev_count;
    }

    curesult = cuInit(0);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuDeviceGet(&cuDevice, dev_id);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuCtxCreate(&cuContext, 0, cuDevice);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }
#endif
    return 0;
}
Ejemplo n.º 12
0
main()
{
  /* initialize CUDA */
  CUresult res;
  res = cuInit(0);
  MY_CUDA_CHECK(res, "cuInit()");

  /* check GPU is setted or not */
  int device_num;
  res = cuDeviceGetCount(&device_num);
  MY_CUDA_CHECK(res, "cuDeviceGetCount()");

  if (device_num == 0) {        // no GPU is detected
    fprintf(stderr, "no CUDA capable GPU is detected...\n");
    exit(1);
  }

  printf("%d GPUs are detected\n", device_num);

  for (int i=0; i<device_num; i++)
    {
      /* get device handle of GPU No.i */
      CUdevice dev;
      res = cuDeviceGet(&dev, i);
      MY_CUDA_CHECK(res, "cuDeviceGet()");
      
      /* search compute capability of GPU No.i */
      int major=0, minor=0;
      res = cuDeviceComputeCapability(&major, &minor, dev);
      MY_CUDA_CHECK(res, "cuDeviceComputeCapability()");
     
      printf("GPU[%d] : actual compute capability is : %d%d\n", i, major, minor);
    }
}
Ejemplo n.º 13
0
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
Ejemplo n.º 14
0
device::device() {
	cuInit(0);
	cuDeviceGet(&cu_device, 0);
	checkCudaError("device::device Init");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	device_name = props.name;
}
Ejemplo n.º 15
0
    void init(){
      if(!isNotInitialized)
        return;

      cuInit(0);

      isNotInitialized = false;
    }
Ejemplo n.º 16
0
void Application::_initCUDA( int argc, char *argv[])
{
  if (CUDA_SUCCESS != cuInit(0))
  {
    fprintf( stderr, "Error: CUDA initialization has failed.\n");
    exit( EXIT_FAILURE );
  }
}
Ejemplo n.º 17
0
int
init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
     char * str;
     int local_rank, dev_count;
     int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif   
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif   
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
Ejemplo n.º 18
0
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
Ejemplo n.º 19
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  if (nrhs != 0)
    mexErrMsgTxt("Wrong number of arguments");

  CUresult err = cuInit(0);

  plhs[0] = mxCreateDoubleScalar(err);

}
Ejemplo n.º 20
0
Archivo: ov.c Proyecto: CPFL/gtraffic
/*
 * Initializaiton in order to use kernel program 
 */
void
init_cuda(void){

  thread_num = (N <= 16) ? N : 16 ;  
  block_num = N / (thread_num*thread_num);
  if(N % (thread_num*thread_num) != 0) block_num++;
  
  res = cuInit(0);
  if(res != CUDA_SUCCESS){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuDeviceGet(&dev, 0);
  if(res != CUDA_SUCCESS){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuCtxCreate(&ctx, 0, dev);
  if(res != CUDA_SUCCESS){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(res != CUDA_SUCCESS){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(res != CUDA_SUCCESS){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }
  

  /* 
   * preparation for launch kernel 
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* just random */
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
Ejemplo n.º 21
0
Object cuda_computeCapability(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    return alloc_Float64(major + minor / 10.0);
}
Ejemplo n.º 22
0
int main(int argc, char *argv[])
{
	char c;
	CUcontext ctx;
	CUdevice dev = 0;
	void *toSpace;
	int status, free, total;
	CUdeviceptr ptr = (CUdeviceptr)NULL;
	int size;
	
	if(argc != 2){
		fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n");
		exit(1);
	}
	
	printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n");
	size = atoi(argv[1]);
	
	printf("\nTrying to allocate %iMB of memory on host and GPU\n",size);
	
	if(size <= 0){
		fprintf(stderr,"\nERROR: Memory must be greater than 0\n");
		exit(1);
	}
	
	status = cuInit(0);
	printf("Init status: %i\n",status); 

	status = cuCtxCreate(&ctx, 0, dev);
	printf("Context creation status: %i\n",status); 
	
	cuMemGetInfo(&free, &total);
	printf("Get memory info status: %i\n",status); 
	
	printf("\n%.1f/%.1f (Free/Total) MB\n", free/1024.0/1024.0, total/1024.0/1024.0);
	
	status = cuMemHostAlloc(&toSpace, size*1024*1024, 0); 
	printf("Host allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); 

	status = cuMemAlloc(&ptr, size*1024*1024);
	printf("GPU allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED");

	printf("\nPress any key to exit...");
	scanf("%c", &c);
	
	status = cuCtxDestroy(ctx);
	printf("Context destroy status: %i\n",status); 

	return 0;
}
Ejemplo n.º 23
0
device::device(int device_id) {
	cuInit(0);
	checkCudaError("device::device Init");
	cuDeviceGet(&cu_device, device_id);
	checkCudaError("device::device Get device");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	this->set_device(device_id);

	cudaGetDeviceProperties(&props, device_id);
	checkCudaError("device::device Get properties ");

	this->device_name = props.name;
}
Ejemplo n.º 24
0
    Impl()
    {
        // Initialize CUDA.
        check_cuda_error(cuInit(0));

        int device_count;
        check_cuda_error(cuDeviceGetCount(&device_count));

        m_devices.reserve(static_cast<std::size_t>(device_count));
        m_contexts.resize(static_cast<std::size_t>(device_count), nullptr);

        for (int i = 0; i < device_count; ++i)
            m_devices.emplace_back(i);
    }
Ejemplo n.º 25
0
int magma_solve ( int *dA_dim, int *lWork, double2 *A, int *ipiv, int *N ){

	// Check inputs
	//
	fprintf (stderr, "Using MAGMA solve\n" );
	fprintf (stderr, "	dA_dim: %i\n", *dA_dim );
	fprintf (stderr, "	N: %i\n", *N );
	fprintf (stderr, "	lWork: %i\n", *lWork );

	cuInit(0);
	cublasInit();
	printout_devices();

	cublasStatus status;

	double2 *d_A, *work;
	status = cublasAlloc ( *dA_dim, sizeof(double2), (void**)&d_A );

	if ( status != CUBLAS_STATUS_SUCCESS ){
			fprintf (stderr, "ERROR: device memory allocation error (d_A)\n" );
			fprintf (stderr, "ERROR: dA_dim: %i\n", dA_dim );
	}

	cudaError_t err;
	err = cudaMallocHost ( (void**)&work, *lWork * sizeof(double2) );

	if(err != cudaSuccess){
		fprintf (stderr, "ERROR: cudaMallocHost error (work)\n" );
	}

	int info[1];
	TimeStruct start, end;

	start = get_current_time ();
	magma_zgetrf ( N, N, A, N, ipiv, work, d_A, info );
	end = get_current_time ();

	double gpu_perf;
	gpu_perf = 4.*2.*(*N)*(*N)*(*N)/(3.*1000000*GetTimerValue(start,end));

	if ( info[0] != 0 ){
			fprintf (stderr, "ERROR: magma_zgetrf failed\n" );
	}

	printf("	GPU performance: %6.2f GFlop/s\n", gpu_perf);

	int stat = 0;
	return stat;

}
Ejemplo n.º 26
0
Object cuda_deviceName(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);
    char name[100];
    cuDeviceGetName(name, 100, cuDevice);
    return alloc_String(name);
}
Ejemplo n.º 27
0
int main(int argc,char **argv){
	unsigned total = 0;
	unsigned long zul;
	ctx marsh;

	if(argc != 3){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(getzul(argv[1],&zul)){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(getzul(argv[2],&marsh.s)){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(cuInit(0)){
		fprintf(stderr,"Couldn't initialize cuda\n");
		exit(EXIT_FAILURE);
	}
	if(cuDeviceGet(&marsh.dev,zul)){
		fprintf(stderr,"Couldn't get device %lu\n",zul);
		exit(EXIT_FAILURE);
	}
	while( (marsh.threadno = ++total) ){
		pthread_t tid;
		int err;

		if( (err = pthread_create(&tid,NULL,thread,&marsh)) ){
			fprintf(stderr,"Couldn't create thread %d (%s?)\n",
					total,strerror(err));
			exit(EXIT_SUCCESS);
		}
		pthread_mutex_lock(&lock);
		while(!thrdone && threadsmaintain){
			pthread_cond_wait(&cond,&lock);
		}
		thrdone = 0;
		if(!threadsmaintain){
			pthread_mutex_unlock(&lock);
			fprintf(stderr,"Thread %d exited with an error.\n",total);
			break;
		}
		pthread_mutex_unlock(&lock);
		printf("Created thread %d\n",total);
	}	
	exit(EXIT_SUCCESS);
}
Ejemplo n.º 28
0
#include "cuda.h"
#include <wb.h>
#include <wbCUDA.h>

#define MB (1 << 20)
#ifndef WB_DEFAULT_HEAP_SIZE
const size_t WB_DEFAULT_HEAP_SIZE = (256 * MB);
#endif /* WB_DEFAULT_HEAP_SIZE */

static bool _initializedQ = wbFalse;

#ifndef _MSC_VER
__attribute__((__constructor__))
#endif /* _MSC_VER */
    void wb_init(void) {
  if (_initializedQ == wbTrue) {
    return;
  }

#ifdef WB_USE_CUDA
  cuInit(0);

  /* Select a random GPU */

  {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    srand(time(NULL));
    cudaSetDevice(rand() % deviceCount);
  }

  cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1*MB);
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, WB_DEFAULT_HEAP_SIZE);

  cudaDeviceSynchronize();

#endif /* WB_USE_CUDA */

#ifdef WB_USE_CUSTOM_MALLOC
  wbMemoryManager_new(WB_DEFAULT_HEAP_SIZE);
#endif /* WB_USE_CUSTOM_MALLOC */

#ifdef _MSC_VER
  QueryPerformanceFrequency((LARGE_INTEGER *)&_hrtime_frequency);
#endif /* _MSC_VER */

  _hrtime();

  _timer = wbTimer_new();
  _logger = wbLogger_new();
  _initializedQ = wbTrue;

  wbFile_init();

#ifdef WB_USE_SANDBOX
  wbSandbox_new();
#endif /* WB_USE_SANDBOX */

  solutionJSON = NULL;

  atexit(wb_atExit);
}
Ejemplo n.º 29
0
int main(int argc, char** argv) {
	cuInit(0);
	
	DevicePtr device;
	CUresult result = CreateCuDevice(0, &device);

	ContextPtr context;
	result = CreateCuContext(device, 0, &context);

	std::auto_ptr<MaxIndexEngine> engine;
	result = CreateMaxIndexEngine("../../src/maxindex/maxindex.cubin", &engine);
	if(CUDA_SUCCESS != result) {
		printf("Could not create max index engine.\n");
		return 0;
	}

	// Search through 5 million elements.
	const int NumElements = 5000000;
	std::vector<float> data(NumElements);
	std::tr1::uniform_real<float> r(-1e9, 1e9);
	for(int i(0); i < NumElements; ++i)
		data[i] = r(mt19937);
	
	// Use CPU to find the max element and index.
	float maxX = -1e37f;
	int maxIndex = 0;

	for(int i(0); i < NumElements; ++i)
		if(data[i] > maxX) {
			maxX = data[i];
			maxIndex = i;
		}

	printf("CPU says max x = %f, max index = %d.\n", maxX, maxIndex);

	// Use GPU to find the max element and index.
	DeviceMemPtr deviceData;
	context->MemAlloc(data, &deviceData);

	result = FindGlobalMax(engine.get(), deviceData->Handle(), NumElements, 
		&maxX, &maxIndex);
	if(CUDA_SUCCESS != result) {
		printf("Failure running max index kernel.\n");
		return 0;
	}

	printf("GPU says max x = %f, max index = %d.\n", maxX, maxIndex);
}
Ejemplo n.º 30
0
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}