Ejemplo n.º 1
1
extern "C" void sarafft_init() {
  printf( "Cuda is about to be initialized!\n" );
  fflush ( stdout );
  char *OMPI_COMM_WORLD_LOCAL_RANK = getenv( "OMPI_COMM_WORLD_LOCAL_RANK" );
  if ( NULL == OMPI_COMM_WORLD_LOCAL_RANK ) {
    printf( "OMPI_COMM_WORLD_LOCAL_RANK not set!\n" );
    fflush ( stdout );
    exit( 80 );
  }
  int localRank = atoi( OMPI_COMM_WORLD_LOCAL_RANK );
  printf( "Local rank is %d\n", localRank );
  fflush ( stdout );
  if ( CUDA_SUCCESS != cuInit( 0 ) ) {
    printf( "cuInit failed!\n" );
    fflush ( stdout );
    exit( 81 );
  }
  CUdevice device;
  if ( CUDA_SUCCESS != cuDeviceGet( &device, localRank ) ) {
    printf( "cuDeviceGet failed!\n" );
    fflush ( stdout );
    exit( 82 );
  }
  if ( CUDA_SUCCESS != cuCtxCreate( &cuda_context, CU_CTX_SCHED_YIELD, device ) ) {
    printf( "cuCtxCreate failed!\n" );
    fflush ( stdout );
    exit( 83 );
  }
  printf( "Cuda was initialized successfully!\n" );
  fflush ( stdout );
}
Ejemplo n.º 2
0
	CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
	{
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* intialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error_(result, "cuCtxCreate"))
			return;

		cuda_pop_context();
	}
Ejemplo n.º 3
0
static void *do_init(CUdevice dev, int flags, int *ret) {
    cuda_context *res;
    CUcontext ctx;
    unsigned int fl = CU_CTX_SCHED_AUTO;
    int i;

    CHKFAIL(NULL);
    if (flags & GA_CTX_SINGLE_THREAD)
      fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
      fl = CU_CTX_SCHED_YIELD;
    err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    CHKFAIL(NULL);
    if (i != 1)
      FAIL(NULL, GA_UNSUPPORTED_ERROR);
    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);
    res = cuda_make_ctx(ctx, 0);
    if (res == NULL) {
      cuCtxDestroy(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    res->flags |= flags;
    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);

    return res;
}
Ejemplo n.º 4
0
bool initCuda(CUcontext & cuContext)
{
    // Initialize Cuda
    CUresult cerr;
	int deviceCount;
	cudaGetDeviceCount(&deviceCount);

	if (deviceCount == 0)
	{
        fprintf(stderr, "Sorry, no CUDA device found");
        return false;
	}

	int selectedDevice = 0;
	if (selectedDevice >= deviceCount)
	{
        fprintf(stderr, "Choose device ID between 0 and %d\n", deviceCount-1);
        return false;
	}

	// Initialize the CUDA device 
    CUdevice cuDevice;
	cerr = cuDeviceGet(&cuDevice,selectedDevice);
	checkError(cerr);

	cerr = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST|CU_CTX_BLOCKING_SYNC, cuDevice);
	checkError(cerr);
}
Ejemplo n.º 5
0
int mcopy_gpu_init(struct device_info *device_info)
{
	char fname[256];
	CUresult res;

	/* printf("madd_gpu_init called.\n"); */

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) !=
	 CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
Ejemplo n.º 6
0
int main(int argc, char* argv[])
{
    cuInit(0);
    int devs = 0;
    cuDeviceGetCount(&devs);
    assert(devs > 0);
    CUdevice dev;
    CUresult status;
    CUcontext ctx = 0;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    {
        size_t f = 0, t = 0;
        CUresult r = cuMemGetInfo(&f, &t);
        fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t );
    }
    
    __init("\n");
 
    printf("\nPress any key to exit...");
    char c;
    scanf("%c", &c);
 
    return 0;
}
Ejemplo n.º 7
0
size_t initContext(JNIEnv * env, jint max_blocks_per_proc, jint max_threads_per_block)
{
  size_t to_space_size;
  int status;
  int deviceCount = 0;
  size_t f_mem;
  size_t t_mem;
  jint num_blocks;
  
  status = cuDeviceGetCount(&deviceCount);
  CHECK_STATUS_RTN(env,"error in cuDeviceGetCount",status, 0);

  getBestDevice(env);

  status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cuDevice);  
  CHECK_STATUS_RTN(env,"error in cuCtxCreate",status, 0)
  
  status = cuMemGetInfo (&f_mem, &t_mem);
  CHECK_STATUS_RTN(env,"error in cuMemGetInfo",status, 0)
  
  to_space_size = f_mem;

  //space for 100 types in the scene
  classMemSize = sizeof(jint)*100;
  
  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;
  
  gc_space_size = 1024;
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= gc_space_size;
  to_space_size -= classMemSize;
  
  return to_space_size;
}
Ejemplo n.º 8
0
int main() {

	int ngpu;
	CUdevice cuDevice;
	CUcontext cuContext;
	cuInit(0);
	cuDeviceGetCount(&ngpu);
	//printf("ngpu = %d\n", ngpu);

	size_t *totals, *frees ;
	totals = (size_t *) calloc (ngpu, sizeof(size_t));
	frees = (size_t *) calloc (ngpu, sizeof(size_t));

	int tid;
	omp_set_num_threads(ngpu);
	#pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals)
	{
		tid = omp_get_thread_num();
		//printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid);
		cuDeviceGet(&cuDevice, tid);
		cuCtxCreate(&cuContext, tid, cuDevice);
		cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]);
	}

	printf ("\ttotal\t\tfree\t\tused\n");
	for(int i=0; i<ngpu; i++) {
		printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]);
	}

	return 0;
}
Ejemplo n.º 9
0
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
	CUresult res;
	CUdevice dev;

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuCtxCreate(pctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return res;
	}
	
	res = cuModuleLoad(pmod, f);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		cuCtxDestroy(*pctx);
		return res;
	}

	return CUDA_SUCCESS;
}
Ejemplo n.º 10
0
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane)
{
    Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12");
    TexRes &r = res[plane];
    if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
        if (USE_STREAM) {
            CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
            CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
        }
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }
    // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d
    CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    r.texture = tex;
    r.w = w;
    r.h = h;
    r.H = H;
    return true;
}
Ejemplo n.º 11
0
int
init_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
    int local_rank, dev_count;
    int dev_id = 0;
    char * str;

    if ((str = getenv("LOCAL_RANK")) != NULL) {
        cudaGetDeviceCount(&dev_count);
        local_rank = atoi(str);
        dev_id = local_rank % dev_count;
    }

    curesult = cuInit(0);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuDeviceGet(&cuDevice, dev_id);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuCtxCreate(&cuContext, 0, cuDevice);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }
#endif
    return 0;
}
Ejemplo n.º 12
0
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
static CUresult
initCuda(CUcontext _cuContext, char* executablePath, CUfunction *mathop, 
	int argc, char** argv, const char* cubin_name, const char* kernel_name)
{
    CUdevice cuDevice;
    CUT_DEVICE_INIT_DRV(cuDevice, argc, argv);
    print_GetProperties(cuDevice);

    CUresult status = cuCtxCreate( &_cuContext, 0, cuDevice );
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf("(1) context creation successful\n");

    char* module_path = cutFindFilePath(cubin_name, executablePath);
    printf ("\t cubin:%s, path:%s, mmp_ptr:%lu\n", cubin_name, executablePath, module_path);
    if(module_path != NULL)
      printf ("\t cubin:%s, path:%s, module_path:%c%c%c%c\n", cubin_name, executablePath, *module_path, *(module_path+1), *(module_path+2), *(module_path+3));
    char* data_path = "./data/";
    size_t len_path = strlen(data_path);
    size_t len_fn = strlen(cubin_name);
    // printf ("Sizes: data:%lu, cubinname:%lu\n", len_path, len_fn);

    char* module_path_new = (char*)malloc(sizeof(char) * (len_path + len_fn));
    strcpy(module_path_new, data_path);
    strcat(module_path_new, cubin_name);
    strcat(module_path_new, "\0");
    if (module_path_new == 0) {
        status = CUDA_ERROR_NOT_FOUND;
        Error(_cuContext, status);
    }
    FILE *fp = fopen(module_path_new,"r");
    if( fp ) {
	printf("(2) cubin_File found in modulepath:%s\n", module_path_new);
        fclose(fp);
    } else {
	printf("(2) cubin file not exist: %s\n", module_path_new);
    }
    CUmodule cuModule;
    status = cuModuleLoad(&cuModule, module_path_new);
    cutFree(module_path_new);
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf ("(3) module Load successful\n");

    CUfunction cuFunction = 0;
    status = cuModuleGetFunction(&cuFunction, cuModule, kernel_name);
    if ( CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    }
    else printf ("(4) getFunction successful w/cuFunction\n");

    *mathop = cuFunction;

    return CUDA_SUCCESS;

}
Ejemplo n.º 14
0
int
init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
     char * str;
     int local_rank, dev_count;
     int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif   
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif   
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
Ejemplo n.º 15
0
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount,
                             long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cudaDevice));
    } else {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice));
    }


    if (kernelMap == NULL) {
        // kernels have not yet been initialized; do so now.  Hopefully, this only occurs once per library load.
        InitializeKernelMap();
    }

    int id = paddedStateCount;
    if (flags & BEAGLE_FLAG_PRECISION_DOUBLE) {
        id *= -1;
    }

    if (kernelMap->count(id) == 0) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }

    kernelResource = (*kernelMap)[id].copy();
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::SetDevice\n");
#endif

}
Ejemplo n.º 16
0
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
Ejemplo n.º 17
0
CUresult CreateCuContext(CuDevice* device, uint flags, ContextPtr* ppContext) {
	ContextPtr context(new CuContext(true));
	CUresult result = cuCtxCreate(&context->_h, flags, device->Handle());
	HANDLE_RESULT();

	context->_destroyOnDtor = true;
	context->_device = device;
	
	ppContext->swap(context);
	return CUDA_SUCCESS;
}
Ejemplo n.º 18
0
Archivo: ov.c Proyecto: CPFL/gtraffic
/*
 * Initializaiton in order to use kernel program 
 */
void
init_cuda(void){

  thread_num = (N <= 16) ? N : 16 ;  
  block_num = N / (thread_num*thread_num);
  if(N % (thread_num*thread_num) != 0) block_num++;
  
  res = cuInit(0);
  if(res != CUDA_SUCCESS){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuDeviceGet(&dev, 0);
  if(res != CUDA_SUCCESS){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuCtxCreate(&ctx, 0, dev);
  if(res != CUDA_SUCCESS){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(res != CUDA_SUCCESS){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(res != CUDA_SUCCESS){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }
  

  /* 
   * preparation for launch kernel 
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* just random */
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
Ejemplo n.º 19
0
int main(int argc, char *argv[])
{
	char c;
	CUcontext ctx;
	CUdevice dev = 0;
	void *toSpace;
	int status, free, total;
	CUdeviceptr ptr = (CUdeviceptr)NULL;
	int size;
	
	if(argc != 2){
		fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n");
		exit(1);
	}
	
	printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n");
	size = atoi(argv[1]);
	
	printf("\nTrying to allocate %iMB of memory on host and GPU\n",size);
	
	if(size <= 0){
		fprintf(stderr,"\nERROR: Memory must be greater than 0\n");
		exit(1);
	}
	
	status = cuInit(0);
	printf("Init status: %i\n",status); 

	status = cuCtxCreate(&ctx, 0, dev);
	printf("Context creation status: %i\n",status); 
	
	cuMemGetInfo(&free, &total);
	printf("Get memory info status: %i\n",status); 
	
	printf("\n%.1f/%.1f (Free/Total) MB\n", free/1024.0/1024.0, total/1024.0/1024.0);
	
	status = cuMemHostAlloc(&toSpace, size*1024*1024, 0); 
	printf("Host allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); 

	status = cuMemAlloc(&ptr, size*1024*1024);
	printf("GPU allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED");

	printf("\nPress any key to exit...");
	scanf("%c", &c);
	
	status = cuCtxDestroy(ctx);
	printf("Context destroy status: %i\n",status); 

	return 0;
}
Ejemplo n.º 20
0
SEXP
R_cuCtxCreate(SEXP r_flags, SEXP r_dev)
{
    SEXP r_ans = R_NilValue;
    CUcontext pctx;
    unsigned int flags = REAL(r_flags)[0];
    int dev = INTEGER(r_dev)[0];
    CUresult ans;
    ans = cuCtxCreate(& pctx,  flags,  dev);
    if(ans)
       return(R_cudaErrorInfo(ans));
    r_ans = R_createRef(pctx, "CUcontext") ;
    return(r_ans);
}
Ejemplo n.º 21
0
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}
Ejemplo n.º 22
0
static int
init_thread(CUcontext *pctx,CUdevice dev,size_t s){
	CUdeviceptr ptr;
	CUresult cerr;

	if( (cerr = cuCtxCreate(pctx,0,dev)) ){
		fprintf(stderr," Error (%d) creating CUDA context\n",cerr);
		return -1;
	}
	if(s){
		if( (cerr = cuMemAlloc(&ptr,s)) ){
			fprintf(stderr," Error (%d) allocating %zub\n",cerr,s);
			return -1;
		}
	}
	return 0;
}
Ejemplo n.º 23
0
static void *cuda_init(int ord, int flags, int *ret) {
    CUdevice dev;
    CUcontext ctx;
    cuda_context *res;
    static int init_done = 0;
    unsigned int fl = CU_CTX_SCHED_AUTO;

    if (ord == -1) {
      /* Grab the ambient context */
      err = cuCtxGetCurrent(&ctx);
      CHKFAIL(NULL);
      res = cuda_make_ctx(ctx, DONTFREE);
      if (res == NULL) {
        FAIL(NULL, GA_IMPL_ERROR);
      }
      res->flags |= flags;
      return res;
    }

    if (!init_done) {
      err = cuInit(0);
      CHKFAIL(NULL);
      init_done = 1;
    }
    err = cuDeviceGet(&dev, ord);
    CHKFAIL(NULL);
    if (flags & GA_CTX_SINGLE_THREAD)
      fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
      fl = CU_CTX_SCHED_YIELD;
    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);
    res = cuda_make_ctx(ctx, 0);
    res->flags |= flags;
    if (res == NULL) {
      cuCtxDestroy(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);
    return res;
}
Ejemplo n.º 24
0
  void device_t<CUDA>::setup(argInfoMap &aim){
    cuda::init();

    data = new CUDADeviceData_t;

    OCCA_EXTRACT_DATA(CUDA, Device);

    if(!aim.has("deviceID")){
      std::cout << "[CUDA] device not given [deviceID]\n";
      throw 1;
    }

    const int deviceID = aim.iGet("deviceID");

    OCCA_CUDA_CHECK("Device: Creating Device",
                    cuDeviceGet(&data_.device, deviceID));

    OCCA_CUDA_CHECK("Device: Creating Context",
                    cuCtxCreate(&data_.context, CU_CTX_SCHED_AUTO, data_.device));
  }
Ejemplo n.º 25
0
ERL_NIF_TERM pteracuda_nifs_new_context(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    CUdevice device;
    int deviceNum = 0;
    PCudaContextRef *ref = (PCudaContextRef *) enif_alloc_resource(pteracuda_context_resource, sizeof(PCudaContextRef));
    if (!ref) {
        return OOM_ERROR;
    }
    if (argc == 1 && !enif_get_int(env, argv[0], &deviceNum)) {
        return enif_make_badarg(env);
    }
    if (cuDeviceGet(&device, deviceNum) == CUDA_SUCCESS &&
        cuCtxCreate(&(ref->ctx), CU_CTX_SCHED_AUTO, device) == CUDA_SUCCESS) {
        ref->destroyed = false;
        ERL_NIF_TERM result = enif_make_resource(env, ref);
        enif_release_resource(ref);
        return enif_make_tuple2(env, ATOM_OK, result);
    }
    else {
        return ATOM_ERROR;
    }
}
Ejemplo n.º 26
0
CUresult initialize(int device, CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUstream *phStream)
{
  // Initialize the device and create the context
  cuInit(0);
  cuDeviceGet(phDevice, device);
  CUresult status = cuCtxCreate(phContext, 0, *phDevice);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not create context\n"; exit(0);}

    status = cuModuleLoad(phModule, "PTXTestFunctions.o.ptx");
  
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not load .ptx module: " << status << "\n"; exit(0);}

  // Create stream
  status = cuStreamCreate(phStream, 0);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during stream creation\n"); exit(0);}

  return status;
}
Ejemplo n.º 27
0
int main(int argc, char ** argv)
{
	int dev_count = 0;

	CUdevice   device;
	CUcontext  context;
	CUmodule   module;
	CUfunction function;

	cuInit(0);

	cuDeviceGetCount(&dev_count);

	if (dev_count < 1) return -1;

	cuDeviceGet( &device, 0 );
	cuCtxCreate( &context, 0, device );
	
	cuModuleLoad( &module, "hello.cuda_runtime.ptx" );
	cuModuleGetFunction( &function, module, "_Z6kernelPf" );

	int N = 512;
	CUdeviceptr pData;
	cuMemAlloc( &pData, N * sizeof(float) );
	cuFuncSetBlockShape( function, N, 1, 1 );
	cuParamSeti( function, 0, pData );
	cuParamSetSize( function, 4 );

	cuLaunchGrid( function, 1, 1 );

	float * pHostData = new float[N];

	cuMemcpyDtoH( pHostData, pData, N * sizeof( float) );

	cuMemFree( pData );

	delete [] pHostData;

	return 0;
}
Ejemplo n.º 28
0
__attribute__((constructor)) void
initTrace()
{
	//get the arguments from the environment variables
	int deviceId, sampRate;
	
    CUcontext cuCtx;
	deviceId = atoi(getenv("GPU_DEVICE_ID"));
    cuInit(0);
	cuCtxCreate(&cuCtx,0,deviceId);
	CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
	//CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH));

	CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
	CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL));
	CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE));
	CUpti_ActivityPCSamplingConfig config;
	sampRate=atoi(getenv("PC_SAMPLING_RATE"));
	config.samplingPeriod= sampRate;
	CUPTI_CALL(cuptiActivityConfigurePCSampling(cuCtx, &config));
}
Ejemplo n.º 29
0
int madd_gpu_init(struct device_info *device_info)
{
	char fname[256];
	CUresult res;

	/* printf("madd_gpu_init called.\n"); */

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) !=
	 CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	/* binary files are located in the same directory as the source code */
	if ((res = cuModuleLoad(&device_info->module, MODULE_FILE_NAME)) !=
	 CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}

	if ((res = cuModuleGetFunction(&device_info->kernel, 
	 device_info->module, KERNEL_NAME)) != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}

	return 0;
}
Ejemplo n.º 30
0
bool VideoDecoderCUDAPrivate::initCuda()
{
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        available = false;
        qWarning("cuInit(0) faile (%d)", result);
        return false;
    }
    cudev = GetMaxGflopsGraphicsDeviceId();

    int clockRate;
    cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cudev);
    char devname[256];
    cuDeviceGetName(devname, 256, cudev);
    description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000);

    //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate
    checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO?
    CUcontext cuCurrent = NULL;
    result = cuCtxPopCurrent(&cuCurrent);
    if (result != CUDA_SUCCESS) {
        qWarning("cuCtxPopCurrent: %d\n", result);
        return false;
    }
    checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx));
    {
        AutoCtxLock lock(this, vid_ctx_lock);
        Q_UNUSED(lock);
        //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING)
        checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT
        //require compute capability >= 1.1
        //flag: Reserved for future use, must be 0
        //cuStreamAddCallback(stream, CUstreamCallback, this, 0);
    }
    return true;
}