Esempio n. 1
1
extern "C" void sarafft_init() {
  printf( "Cuda is about to be initialized!\n" );
  fflush ( stdout );
  char *OMPI_COMM_WORLD_LOCAL_RANK = getenv( "OMPI_COMM_WORLD_LOCAL_RANK" );
  if ( NULL == OMPI_COMM_WORLD_LOCAL_RANK ) {
    printf( "OMPI_COMM_WORLD_LOCAL_RANK not set!\n" );
    fflush ( stdout );
    exit( 80 );
  }
  int localRank = atoi( OMPI_COMM_WORLD_LOCAL_RANK );
  printf( "Local rank is %d\n", localRank );
  fflush ( stdout );
  if ( CUDA_SUCCESS != cuInit( 0 ) ) {
    printf( "cuInit failed!\n" );
    fflush ( stdout );
    exit( 81 );
  }
  CUdevice device;
  if ( CUDA_SUCCESS != cuDeviceGet( &device, localRank ) ) {
    printf( "cuDeviceGet failed!\n" );
    fflush ( stdout );
    exit( 82 );
  }
  if ( CUDA_SUCCESS != cuCtxCreate( &cuda_context, CU_CTX_SCHED_YIELD, device ) ) {
    printf( "cuCtxCreate failed!\n" );
    fflush ( stdout );
    exit( 83 );
  }
  printf( "Cuda was initialized successfully!\n" );
  fflush ( stdout );
}
Esempio n. 2
0
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
	CUresult res;
	CUdevice dev;

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuCtxCreate(pctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return res;
	}
	
	res = cuModuleLoad(pmod, f);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		cuCtxDestroy(*pctx);
		return res;
	}

	return CUDA_SUCCESS;
}
Esempio n. 3
0
int main() {

	int ngpu;
	CUdevice cuDevice;
	CUcontext cuContext;
	cuInit(0);
	cuDeviceGetCount(&ngpu);
	//printf("ngpu = %d\n", ngpu);

	size_t *totals, *frees ;
	totals = (size_t *) calloc (ngpu, sizeof(size_t));
	frees = (size_t *) calloc (ngpu, sizeof(size_t));

	int tid;
	omp_set_num_threads(ngpu);
	#pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals)
	{
		tid = omp_get_thread_num();
		//printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid);
		cuDeviceGet(&cuDevice, tid);
		cuCtxCreate(&cuContext, tid, cuDevice);
		cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]);
	}

	printf ("\ttotal\t\tfree\t\tused\n");
	for(int i=0; i<ngpu; i++) {
		printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]);
	}

	return 0;
}
void GPUInterface::GetDeviceDescription(int deviceNumber,
                                        char* deviceDescription) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::GetDeviceDescription\n");
#endif

    CUdevice tmpCudaDevice;

    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

#if CUDA_VERSION >= 3020
    size_t totalGlobalMemory = 0;
#else
    unsigned int totalGlobalMemory = 0;
#endif
    int clockSpeed = 0;
    int mpCount = 0;
    int major = 0;
    int minor = 0;

    SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice));
    SAFE_CUDA(cuDeviceTotalMem(&totalGlobalMemory, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&clockSpeed, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&mpCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, tmpCudaDevice));

    sprintf(deviceDescription,
            "Global memory (MB): %d | Clock speed (Ghz): %1.2f | Number of cores: %d",
            int(totalGlobalMemory / 1024.0 / 1024.0 + 0.5),
            clockSpeed / 1000000.0,
            nGpuArchCoresPerSM[major] * mpCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::GetDeviceDescription\n");
#endif
}
Esempio n. 5
0
	CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
	{
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* intialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error_(result, "cuCtxCreate"))
			return;

		cuda_pop_context();
	}
Esempio n. 6
0
main()
{
  /* initialize CUDA */
  CUresult res;
  res = cuInit(0);
  MY_CUDA_CHECK(res, "cuInit()");

  /* check GPU is setted or not */
  int device_num;
  res = cuDeviceGetCount(&device_num);
  MY_CUDA_CHECK(res, "cuDeviceGetCount()");

  if (device_num == 0) {        // no GPU is detected
    fprintf(stderr, "no CUDA capable GPU is detected...\n");
    exit(1);
  }

  printf("%d GPUs are detected\n", device_num);

  for (int i=0; i<device_num; i++)
    {
      /* get device handle of GPU No.i */
      CUdevice dev;
      res = cuDeviceGet(&dev, i);
      MY_CUDA_CHECK(res, "cuDeviceGet()");
      
      /* search compute capability of GPU No.i */
      int major=0, minor=0;
      res = cuDeviceComputeCapability(&major, &minor, dev);
      MY_CUDA_CHECK(res, "cuDeviceComputeCapability()");
     
      printf("GPU[%d] : actual compute capability is : %d%d\n", i, major, minor);
    }
}
Esempio n. 7
0
void printout_devices( )
{
  int ndevices;
  cuDeviceGetCount( &ndevices );
  for( int idevice = 0; idevice < ndevices; idevice++ )
    {
      char name[200];
#if CUDA_VERSION > 3010 
      size_t totalMem;
#else
      unsigned int totalMem;
#endif

      int clock;
      CUdevice dev;

      cuDeviceGet( &dev, idevice );
      cuDeviceGetName( name, sizeof(name), dev );
      cuDeviceTotalMem( &totalMem, dev );
      cuDeviceGetAttribute( &clock,
                            CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev );
      printf( "device %d: %s, %.1f MHz clock, %.1f MB memory\n",
              idevice, name, clock/1000.f, totalMem/1024.f/1024.f );
    }
}
Esempio n. 8
0
int main(int argc, char* argv[])
{
    cuInit(0);
    int devs = 0;
    cuDeviceGetCount(&devs);
    assert(devs > 0);
    CUdevice dev;
    CUresult status;
    CUcontext ctx = 0;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    {
        size_t f = 0, t = 0;
        CUresult r = cuMemGetInfo(&f, &t);
        fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t );
    }
    
    __init("\n");
 
    printf("\nPress any key to exit...");
    char c;
    scanf("%c", &c);
 
    return 0;
}
Esempio n. 9
0
int
init_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
    int local_rank, dev_count;
    int dev_id = 0;
    char * str;

    if ((str = getenv("LOCAL_RANK")) != NULL) {
        cudaGetDeviceCount(&dev_count);
        local_rank = atoi(str);
        dev_id = local_rank % dev_count;
    }

    curesult = cuInit(0);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuDeviceGet(&cuDevice, dev_id);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuCtxCreate(&cuContext, 0, cuDevice);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }
#endif
    return 0;
}
Esempio n. 10
0
int mcopy_gpu_init(struct device_info *device_info)
{
	char fname[256];
	CUresult res;

	/* printf("madd_gpu_init called.\n"); */

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) !=
	 CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
Esempio n. 11
0
std::pair< std::vector<context>, std::vector<command_queue> >
queue_list(DevFilter &&filter, unsigned queue_flags = 0)
{
    cuda_check( do_init() );

    std::vector<context>       ctx;
    std::vector<command_queue> queue;

    int ndev;
    cuda_check( cuDeviceGetCount(&ndev) );

    for(int d = 0; d < ndev; ++d) {
        try {
            CUdevice dev;
            cuda_check( cuDeviceGet(&dev, d) );
            if (!filter(dev)) continue;

            context       c(dev);
            command_queue q(c, dev, queue_flags);

            ctx.push_back(c);
            queue.push_back(q);
        } catch(const error&) { }
    }

    return std::make_pair(ctx, queue);
}
Esempio n. 12
0
bool initCuda(CUcontext & cuContext)
{
    // Initialize Cuda
    CUresult cerr;
	int deviceCount;
	cudaGetDeviceCount(&deviceCount);

	if (deviceCount == 0)
	{
        fprintf(stderr, "Sorry, no CUDA device found");
        return false;
	}

	int selectedDevice = 0;
	if (selectedDevice >= deviceCount)
	{
        fprintf(stderr, "Choose device ID between 0 and %d\n", deviceCount-1);
        return false;
	}

	// Initialize the CUDA device 
    CUdevice cuDevice;
	cerr = cuDeviceGet(&cuDevice,selectedDevice);
	checkError(cerr);

	cerr = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST|CU_CTX_BLOCKING_SYNC, cuDevice);
	checkError(cerr);
}
Esempio n. 13
0
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
Esempio n. 14
0
device::device() {
	cuInit(0);
	cuDeviceGet(&cu_device, 0);
	checkCudaError("device::device Init");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	device_name = props.name;
}
Esempio n. 15
0
int get_suitable_block_num(int device,
			   int *max_block_num,
			   int *mp_num,
			   int word_size,
			   int thread_num,
			   int large_size)
{
#ifdef TODO
    cudaDeviceProp dev;
    CUdevice cuDevice;
    int max_thread_dev;
    int max_block, max_block_mem, max_block_dev;
    int major, minor, ver;
    //int regs, max_block_regs;

    ccudaGetDeviceProperties(&dev, device);
    cuDeviceGet(&cuDevice, device);
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    //cudaFuncGetAttributes()
#if 0
    if (word_size == 4) {
	regs = 14;
    } else {
	regs = 16;
    }
    max_block_regs = dev.regsPerBlock / (regs * thread_num);
#endif
    max_block_mem = dev.sharedMemPerBlock / (large_size * word_size + 16);
    if (major == 9999 && minor == 9999) {
	return -1;
    }
    ver = major * 100 + minor;
    if (ver <= 101) {
	max_thread_dev = 768;
    } else if (ver <= 103) {
	max_thread_dev = 1024;
    } else if (ver <= 200) {
	max_thread_dev = 1536;
    } else {
	max_thread_dev = 1536;
    }
    max_block_dev = max_thread_dev / thread_num;
    if (max_block_mem < max_block_dev) {
	max_block = max_block_mem;
    } else {
	max_block = max_block_dev;
    }
#if 0
    if (max_block_regs < max_block) {
	max_block = max_block_regs;
    }
#endif
    *max_block_num = max_block;
    *mp_num = dev.multiProcessorCount;
    return max_block * dev.multiProcessorCount;
#endif
    return 0;
}
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) {
    CUdevice tmpCudaDevice;
    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

    int major = 0;
    int minor = 0;
    SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice));
    return (major >= 2 || (major >= 1 && minor >= 3));
}
Esempio n. 17
0
int
init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
     char * str;
     int local_rank, dev_count;
     int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif   
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif   
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
Esempio n. 18
0
static void *cuda_init(int ord, int flags, int *ret) {
    CUdevice dev;
    cuda_context *res;
    static int init_done = 0;

    if (ord == -2) {
      CUcontext ctx;
      /* Grab the ambient context */
      err = cuCtxGetCurrent(&ctx);
      CHKFAIL(NULL);
      /* If somebody made a context, then the api is initialized */
      init_done = 1;
      res = cuda_make_ctx(ctx, DONTFREE);
      if (res == NULL) {
        FAIL(NULL, GA_IMPL_ERROR);
      }
      res->flags |= flags;
      return res;
    }

    if (!init_done) {
      err = cuInit(0);
      CHKFAIL(NULL);
      init_done = 1;
    }

    if (ord == -1) {
      int i, c;
      err = cuDeviceGetCount(&c);
      CHKFAIL(NULL);
      for (i = 0; i < c; i++) {
        err = cuDeviceGet(&dev, i);
        CHKFAIL(NULL);
        res = do_init(dev, flags, NULL);
        if (res != NULL)
          return res;
      }
      FAIL(NULL, GA_NODEV_ERROR);
    } else {
      err = cuDeviceGet(&dev, ord);
      CHKFAIL(NULL);
      return do_init(dev, flags, ret);
    }
}
Esempio n. 19
0
void getBestDevice(){
  int num_devices;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors;
  int max_multiprocessors = -1;
  int max_i = -1;
  
  status = cuDeviceGetCount(&num_devices);   
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGetCount\n");
  }
  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    if (CUDA_SUCCESS != status) 
    {
      printf("error in cuDeviceGet\n");
    }
    status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);    
    if (CUDA_SUCCESS != status) 
    {
      printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT\n");
    }
    if(curr_multiprocessors > max_multiprocessors)
    {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  status = cuDeviceGet(&cuDevice, max_i); 
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGetName\n");
  }
  status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);    
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X\n");
  }
  numMultiProcessors = max_multiprocessors;
}
Esempio n. 20
0
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
Esempio n. 21
0
/*===========================================================================*/
bool Device::create( const int ordinal )
{
    KVS_CU_CALL( cuDeviceGet( &m_handler, ordinal ) );
    if ( kvs::cuda::DriverAPI::HasError() ) return false;

    KVS_CU_CALL( cuDeviceGetProperties( &m_property, m_handler ) );
    if ( kvs::cuda::DriverAPI::HasError() ) return false;

    return true;
}
Esempio n. 22
0
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) {
	CUdevice tmpCudaDevice;
	SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

	int major = 0;
	int minor = 0;
    SAFE_CUDA(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, tmpCudaDevice));
	return (major >= 2 || (major >= 1 && minor >= 3));
}
Esempio n. 23
0
Handle<Value> CudaDevice::New(const Arguments& args) {
  HandleScope scope;

  int ordinal = args[0]->IntegerValue();
  CudaDevice *cu = new CudaDevice();
  cuDeviceGet(&(cu->m_device), ordinal);

  cu->Wrap(args.This());
  return args.This();
}
Esempio n. 24
0
File: ov.c Progetto: CPFL/gtraffic
/*
 * Initializaiton in order to use kernel program 
 */
void
init_cuda(void){

  thread_num = (N <= 16) ? N : 16 ;  
  block_num = N / (thread_num*thread_num);
  if(N % (thread_num*thread_num) != 0) block_num++;
  
  res = cuInit(0);
  if(res != CUDA_SUCCESS){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuDeviceGet(&dev, 0);
  if(res != CUDA_SUCCESS){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuCtxCreate(&ctx, 0, dev);
  if(res != CUDA_SUCCESS){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(res != CUDA_SUCCESS){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(res != CUDA_SUCCESS){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }
  

  /* 
   * preparation for launch kernel 
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* just random */
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
Esempio n. 25
0
void getBestDevice(JNIEnv *env){
  int num_devices;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors;
  int max_multiprocessors = -1;
  int max_i = -1;
  
  status = cuDeviceGetCount(&num_devices);
  CHECK_STATUS(env,"error in cuDeviceGetCount",status)
          
  if(num_devices == 0)
      throw_cuda_errror_exception(env,"0 Cuda Devices were found",0);
  
  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    CHECK_STATUS(env,"error in cuDeviceGet",status)
            
    status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);    
    CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)
            
    if(curr_multiprocessors > max_multiprocessors)
    {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  status = cuDeviceGet(&cuDevice, max_i); 
  CHECK_STATUS(env,"error in cuDeviceGet",status)
          
  status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);    
  CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)
          
  numMultiProcessors = max_multiprocessors;

}
Esempio n. 26
0
Object cuda_deviceName(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);
    char name[100];
    cuDeviceGetName(name, 100, cuDevice);
    return alloc_String(name);
}
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(bool verbose = false)
{
    CUdevice cuDevice;
    int devID = 0;

    // Otherwise pick the device with highest Gflops/s
    char name[100];
    devID = 0;
    CHECK_CUDA_CALL( cuDeviceGet(&cuDevice, devID)
                     , "Couldn't get the device");
    cuDeviceGetName(name, 100, cuDevice);

    if(verbose)
    {
        std::cout << "Using CUDA Device "
                  << devID << ": " << name
                  << std::endl;
    }

    cuDeviceGet(&cuDevice, devID);

    return cuDevice;
}
Esempio n. 28
0
device::device(int device_id) {
	cuInit(0);
	checkCudaError("device::device Init");
	cuDeviceGet(&cu_device, device_id);
	checkCudaError("device::device Get device");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	this->set_device(device_id);

	cudaGetDeviceProperties(&props, device_id);
	checkCudaError("device::device Get properties ");

	this->device_name = props.name;
}
Esempio n. 29
0
int main(int argc,char **argv){
	unsigned total = 0;
	unsigned long zul;
	ctx marsh;

	if(argc != 3){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(getzul(argv[1],&zul)){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(getzul(argv[2],&marsh.s)){
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	if(cuInit(0)){
		fprintf(stderr,"Couldn't initialize cuda\n");
		exit(EXIT_FAILURE);
	}
	if(cuDeviceGet(&marsh.dev,zul)){
		fprintf(stderr,"Couldn't get device %lu\n",zul);
		exit(EXIT_FAILURE);
	}
	while( (marsh.threadno = ++total) ){
		pthread_t tid;
		int err;

		if( (err = pthread_create(&tid,NULL,thread,&marsh)) ){
			fprintf(stderr,"Couldn't create thread %d (%s?)\n",
					total,strerror(err));
			exit(EXIT_SUCCESS);
		}
		pthread_mutex_lock(&lock);
		while(!thrdone && threadsmaintain){
			pthread_cond_wait(&cond,&lock);
		}
		thrdone = 0;
		if(!threadsmaintain){
			pthread_mutex_unlock(&lock);
			fprintf(stderr,"Thread %d exited with an error.\n",total);
			break;
		}
		pthread_mutex_unlock(&lock);
		printf("Created thread %d\n",total);
	}	
	exit(EXIT_SUCCESS);
}
Esempio n. 30
0
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}