unsigned int GPUInterface::GetAvailableMemory() {
#if CUDA_VERSION >= 3020
    size_t availableMem = 0;
    size_t totalMem = 0;
    SAFE_CUPP(cuMemGetInfo(&availableMem, &totalMem));
#else
    unsigned int availableMem = 0;
    unsigned int totalMem = 0;
    SAFE_CUPP(cuMemGetInfo(&availableMem, &totalMem));
#endif
    return availableMem;
}
Exemple #2
0
int main() {

	int ngpu;
	CUdevice cuDevice;
	CUcontext cuContext;
	cuInit(0);
	cuDeviceGetCount(&ngpu);
	//printf("ngpu = %d\n", ngpu);

	size_t *totals, *frees ;
	totals = (size_t *) calloc (ngpu, sizeof(size_t));
	frees = (size_t *) calloc (ngpu, sizeof(size_t));

	int tid;
	omp_set_num_threads(ngpu);
	#pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals)
	{
		tid = omp_get_thread_num();
		//printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid);
		cuDeviceGet(&cuDevice, tid);
		cuCtxCreate(&cuContext, tid, cuDevice);
		cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]);
	}

	printf ("\ttotal\t\tfree\t\tused\n");
	for(int i=0; i<ngpu; i++) {
		printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]);
	}

	return 0;
}
Exemple #3
0
size_t initContext(JNIEnv * env, jint max_blocks_per_proc, jint max_threads_per_block)
{
  size_t to_space_size;
  int status;
  int deviceCount = 0;
  size_t f_mem;
  size_t t_mem;
  jint num_blocks;
  
  status = cuDeviceGetCount(&deviceCount);
  CHECK_STATUS_RTN(env,"error in cuDeviceGetCount",status, 0);

  getBestDevice(env);

  status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cuDevice);  
  CHECK_STATUS_RTN(env,"error in cuCtxCreate",status, 0)
  
  status = cuMemGetInfo (&f_mem, &t_mem);
  CHECK_STATUS_RTN(env,"error in cuMemGetInfo",status, 0)
  
  to_space_size = f_mem;

  //space for 100 types in the scene
  classMemSize = sizeof(jint)*100;
  
  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;
  
  gc_space_size = 1024;
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= gc_space_size;
  to_space_size -= classMemSize;
  
  return to_space_size;
}
Exemple #4
0
candidate*
immaculate_conception (CUZMEM_CONTEXT ctx)
{
    unsigned long long DNA;
    unsigned int loc, gpu_mem_free, gpu_mem_total, gpu_mem_req;
    unsigned int creating = 1;
    cuzmem_plan* entry = ctx->plan;
    candidate* c = (candidate*)malloc (sizeof(candidate));

    cuMemGetInfo (&gpu_mem_free, &gpu_mem_total);

    while (creating) {
        c->DNA = rand();
        c->DNA = c->DNA << 32;
        c->DNA = c->DNA + rand();
        c->DNA &= generate_mask(ctx->num_knobs);

        // gpu memory utilization
        gpu_mem_req = 0;
        entry = ctx->plan;
        while (entry != NULL) {
            if (entry->gold_member) {
                loc = (c->DNA >> entry->id) & 0x0001;
                gpu_mem_req += entry->size * loc;
            }
            entry = entry->next;
        }

        // check constraint
        if (gpu_mem_req > gpu_mem_free * MIN_GPU_MEM) {
            creating = 0;
        }
    }
Exemple #5
0
int main(int argc, char* argv[])
{
    cuInit(0);
    int devs = 0;
    cuDeviceGetCount(&devs);
    assert(devs > 0);
    CUdevice dev;
    CUresult status;
    CUcontext ctx = 0;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    {
        size_t f = 0, t = 0;
        CUresult r = cuMemGetInfo(&f, &t);
        fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t );
    }
    
    __init("\n");
 
    printf("\nPress any key to exit...");
    char c;
    scanf("%c", &c);
 
    return 0;
}
/* ========================================================================== */
int sci_gpuDeviceMemInfo(char *fname)
{
    #ifdef WITH_CUDA
    if(isGpuInit())
    {
        if (useCuda())
        {
            size_t free = 0, total = 0;
            cuMemGetInfo(&free,&total);
            double freeMem = (double)free;

            createScalarDouble(pvApiCtx, Rhs + 1, freeMem);
        }
        else
        {
            double zero = 0.;
            createScalarDouble(pvApiCtx, Rhs + 1, zero);
            sciprint("not implemented with OpenCL.\n");
        }

        LhsVar(1) = Rhs + 1;
        PutLhsVar();
    }
    else
    {
        Scierror(999,"%s","gpu is not initialised. Please launch gpuInit() before use this function.\n");
    }

    #else
        sciprint("not implemented with OpenCL.\n");
    #endif
    return 0;
}
Exemple #7
0
size_t swanMemAvailable( void ) {
	size_t free, total;
	try_init();

	cuMemGetInfo( &free, &total );
	
	return free;
}
Exemple #8
0
void checkCUDAmemory(char* t) {

	//cudaDeviceSynchronize();
	size_t free, total;
	cuMemGetInfo(&free, &total);
	fprintf(stderr,"%s mem %ld total %ld\n", t, free / 1024 / 1024, total / 1024 / 1024);

}
/**
 * Returns the memory usage statistics.
 *
 * @param usedMem receives the current used memory.
 * @param totalMem receives the total memory of the device.
 */
void getMemoryUsage(size_t* usedMem, size_t* totalMem) {
	size_t myFreeMem;
	size_t myTotalMem;
	cudaFree(0); // Ensures the CUDA context creation. Otherwise cuMemGetInfo (driver API) will return zero.
	cuMemGetInfo(&myFreeMem, &myTotalMem);
	if (usedMem != NULL) {
		*usedMem = myTotalMem-myFreeMem;
	}
	if (totalMem != NULL) {
		*totalMem = myTotalMem;
	}
}
S64 CudaModule::getMemoryUsed(void)
{
  staticInit();

  if (!s_available) {
    return 0;
  }

  size_t free = 0;
  size_t total = 0;
  cuMemGetInfo(&free, &total);
  return total - free;
}
Exemple #11
0
int main(int argc, char *argv[])
{
	char c;
	CUcontext ctx;
	CUdevice dev = 0;
	void *toSpace;
	int status, free, total;
	CUdeviceptr ptr = (CUdeviceptr)NULL;
	int size;
	
	if(argc != 2){
		fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n");
		exit(1);
	}
	
	printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n");
	size = atoi(argv[1]);
	
	printf("\nTrying to allocate %iMB of memory on host and GPU\n",size);
	
	if(size <= 0){
		fprintf(stderr,"\nERROR: Memory must be greater than 0\n");
		exit(1);
	}
	
	status = cuInit(0);
	printf("Init status: %i\n",status); 

	status = cuCtxCreate(&ctx, 0, dev);
	printf("Context creation status: %i\n",status); 
	
	cuMemGetInfo(&free, &total);
	printf("Get memory info status: %i\n",status); 
	
	printf("\n%.1f/%.1f (Free/Total) MB\n", free/1024.0/1024.0, total/1024.0/1024.0);
	
	status = cuMemHostAlloc(&toSpace, size*1024*1024, 0); 
	printf("Host allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); 

	status = cuMemAlloc(&ptr, size*1024*1024);
	printf("GPU allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED");

	printf("\nPress any key to exit...");
	scanf("%c", &c);
	
	status = cuCtxDestroy(ctx);
	printf("Context destroy status: %i\n",status); 

	return 0;
}
Exemple #12
0
/*===========================================================================*/
void Device::update()
{
    kvs::cuda::DriverAPI::Context context( *this );

#if defined( cuMemGetInfo )
    /* In this case, the function 'cuMemGetInfo' is defined to 'cuMemGetInfo_v2'
     * as "#define cuMemGetInfo cuMemGetInfo_v2". And then, the function
     * 'cuMemGetInfo_v2' is defined as follows:
     * CUresult cuMemGetInfo_v2( size_t * free, size_t * total )
     */
    KVS_CU_CALL( cuMemGetInfo( &m_free_memory, &m_total_memory ) );
#else
    /* The function 'cuMemGetInfo' is defined as follows:
     * CUresult cuMemGetInfo( unsigned int * free, unsigned int * total )
     * Therefore, the temporary parameters defined as unsigned int are used
     * to obtain the memory information.
     */
    unsigned int free_memory = 0;
    unsigned int total_memory = 0;
    KVS_CU_CALL( cuMemGetInfo( &free_memory, &total_memory ) );
    m_free_memory = static_cast<size_t>( free_memory );
    m_total_memory = static_cast<size_t>( total_memory );
#endif
}
Exemple #13
0
SEXP
R_auto_cuMemGetInfo()
{
    SEXP r_ans = R_NilValue;
    size_t free;
    size_t total;
    CUresult ans;
    ans = cuMemGetInfo(& free, & total);
    if(ans)
       return(R_cudaErrorInfo(ans));
    PROTECT(r_ans = NEW_LIST(2));
    SEXP r_names;
    PROTECT(r_names = NEW_CHARACTER(2));
    SET_VECTOR_ELT(r_ans, 0, ScalarReal(free));
    SET_VECTOR_ELT(r_ans, 1, ScalarReal(total));
    SET_STRING_ELT(r_names, 0, mkChar("free"));
    SET_STRING_ELT(r_names, 1, mkChar("total"));
    SET_NAMES(r_ans, r_names);
    UNPROTECT(2);
    return(r_ans);
}
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i,
                                  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
Exemple #15
0
void WaterPlaneCUDA::configure(Vector upperLeft, Vector lowerRight, float dampFactor, float resolution)
{

	cudaSetDevice(cutGetMaxGflopsDeviceId());
	cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());
	
	timeSinceLast = timePassed = 0;

	unsigned int free, total;
	int gpuCount, i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;

	cuInit(0);

	cuDeviceGetCount(&gpuCount);
	printf("Detected %d GPU\n",gpuCount);

	for (i=0; i<gpuCount; i++)
	{
		cuDeviceGet(&dev,i);
		cuCtxCreate(&ctx, 0, dev);
		res = cuMemGetInfo(&free, &total);
		if(res != CUDA_SUCCESS)
			printf("!!!! cuMemGetInfo failed! (status = %x)", res);
		printf("^^^^ Device: %d\n",i);
		printf("^^^^ Free : %lu bytes (%lu KB) (%lu MB)\n", free, inKB(free), inMB(free));
		printf("^^^^ Total: %lu bytes (%lu KB) (%lu MB)\n", total, inKB(total), inMB(total));
		printf("^^^^ %f%% free, %f%% used\n", 100.0*free/(double)total, 100.0*(total - free)/(double)total);
		cuCtxDetach(ctx);
	}


	this->stepSize = 1.0f/resolution;

	this->resolutionFactor = resolution;

	//reale Z - Achse ist x - Achse der WaterPlaneCUDA
	this->sizeX = (unsigned int) abs(upperLeft.z - lowerRight.z);

	//reale X -Achse ist y- Achse der WaterPlaneCUDA
	this->sizeY = (unsigned int) abs(upperLeft.x - lowerRight.x);

	//Anzahl der Netzpunkte in X -Richtung
	this->pointsX = (unsigned int)(sizeX * resolution);

	//Anzahl der Netzpunkte in Y -Richtung
	pointsY = (unsigned int)(sizeY * resolution);

	uLeft = upperLeft;

	lRight = lowerRight;

	//Der "Meeresspiegel"
	baseHeight = lRight.y;

	//Das Höhenfeld der WaterPlaneCUDA
	waveMap = NULL;

	initBuffer();

	gpu_newVertices = new float3[pointsX*pointsY];
	gpu_oldVertices = new float3[pointsX*pointsY];
	gpu_normals = new float3[pointsX*pointsY];

	for (int i=0;i<pointsX*pointsY;i++) {
		gpu_newVertices[i]=make_float3(0,0,0);
		gpu_oldVertices[i]=make_float3(0,0,0);
		gpu_normals[i]=make_float3(0,1.0,0);
	}
	cutilSafeCall(cudaMalloc((void**)&gpu_newVertices,pointsX*pointsY*sizeof(float3)));
	cutilSafeCall(cudaMalloc((void**)&gpu_oldVertices,pointsX*pointsY*sizeof(float3)));
	cutilSafeCall(cudaMalloc((void**)&gpu_normals,pointsX*pointsY*sizeof(float3)));

	drawMesh();
}
Exemple #16
0
void
pocl_cuda_init (cl_device_id device, const char *parameters)
{
  CUresult result;

  result = cuInit (0);
  CUDA_CHECK (result, "cuInit");

  if (device->data)
    return;

  pocl_cuda_device_data_t *data = malloc (sizeof (pocl_cuda_device_data_t));
  result = cuDeviceGet (&data->device, 0);
  CUDA_CHECK (result, "cuDeviceGet");

  // Get specific device name
  device->long_name = device->short_name = malloc (256 * sizeof (char));
  cuDeviceGetName (device->long_name, 256, data->device);

  // Get other device properties
  cuDeviceGetAttribute ((int *)&device->max_work_group_size,
                        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                        data->device);
  cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 0),
                        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, data->device);
  cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 1),
                        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, data->device);
  cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 2),
                        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, data->device);
  cuDeviceGetAttribute (
      (int *)&device->local_mem_size,
      CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, data->device);
  cuDeviceGetAttribute ((int *)&device->max_compute_units,
                        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                        data->device);
  cuDeviceGetAttribute ((int *)&device->max_clock_frequency,
                        CU_DEVICE_ATTRIBUTE_CLOCK_RATE, data->device);
  cuDeviceGetAttribute ((int *)&device->error_correction_support,
                        CU_DEVICE_ATTRIBUTE_ECC_ENABLED, data->device);
  cuDeviceGetAttribute ((int *)&device->host_unified_memory,
                        CU_DEVICE_ATTRIBUTE_INTEGRATED, data->device);
  cuDeviceGetAttribute ((int *)&device->max_constant_buffer_size,
                        CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                        data->device);

  device->preferred_vector_width_char = 1;
  device->preferred_vector_width_short = 1;
  device->preferred_vector_width_int = 1;
  device->preferred_vector_width_long = 1;
  device->preferred_vector_width_float = 1;
  device->preferred_vector_width_double = 1;
  device->preferred_vector_width_half = 0;
  device->native_vector_width_char = 1;
  device->native_vector_width_short = 1;
  device->native_vector_width_int = 1;
  device->native_vector_width_long = 1;
  device->native_vector_width_float = 1;
  device->native_vector_width_double = 1;
  device->native_vector_width_half = 0;

  device->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                             | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                             | CL_FP_DENORM;
  device->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                             | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                             | CL_FP_DENORM;

  device->local_mem_type = CL_LOCAL;
  device->host_unified_memory = 0;

  // Get GPU architecture name
  int sm_maj, sm_min;
  cuDeviceGetAttribute (&sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                        data->device);
  cuDeviceGetAttribute (&sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                        data->device);
  char *gpu_arch = malloc (16 * sizeof (char));
  snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min);
  device->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch);
  POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", device->llvm_cpu);

  // Create context
  result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device);
  CUDA_CHECK (result, "cuCtxCreate");

  // Get global memory size
  size_t memfree, memtotal;
  result = cuMemGetInfo (&memfree, &memtotal);
  device->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024);
  device->global_mem_size = memtotal;

  device->data = data;
}
Exemple #17
0
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    setup
 * Signature: ()V
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_setup
  (JNIEnv *env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block, jint free_space)
{
  int status;
  jint num_blocks;
  int deviceCount = 0;
  size_t f_mem;
  size_t t_mem;
  size_t to_space_size;
  //size_t free_space = 1530L*1024L*1024L;
  textureMemSize = 1;
  
  status = cuInit(0);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuInit\n");
  }
  
  status = cuDeviceGetCount(&deviceCount);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGet\n");
  }

  getBestDevice();
  
  status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cuDevice);  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuCtxCreate %d\n", status);
  }
  
  // ddb - not using this as this returns the total memory not the free memory
  //to_space_size = memSize();
  
  cuMemGetInfo(&f_mem, &t_mem);
  
  to_space_size = f_mem;
  
  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;
  
#if DEBUG

  printf("Memory: %i(MB)/%i(MB) (Free/Total)\n",f_mem/1024/1024, t_mem/1024/1024);
  
  printf("num_blocks = %i\n",num_blocks);
  printf("numMultiProcessors = %i\n",numMultiProcessors);
  printf("max_threads_per_block = %i\n",max_threads_per_block);
  printf("max_blocks_per_proc = %i\n",max_blocks_per_proc);
  fflush(stdout);
#endif
  
  gc_space_size = 1024;
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= gc_space_size;
  to_space_size -= free_space;
  //to_space_size -= textureMemSize;
  bufferSize = to_space_size;

  status = cuMemHostAlloc(&toSpace, to_space_size, 0);  
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "toSpace memory allocation failed", status);
    return;
  }
  
  status = cuMemAlloc(&gpuToSpace, to_space_size);
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuToSpace memory allocation failed", status);
    return;
  }
  
/*
  status = cuMemHostAlloc(&textureMemory, textureMemSize, 0);  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuMemHostAlloc textureMemory %d\n", status);
  }

  status = cuMemAlloc(&gpuTexture, textureMemSize);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuMemAlloc gpuTexture %d\n", status);
  }
*/
  status = cuMemHostAlloc(&handlesMemory, num_blocks * sizeof(jlong), CU_MEMHOSTALLOC_WRITECOMBINED); 
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "handlesMemory memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuHandlesMemory, num_blocks * sizeof(jlong)); 
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuHandlesMemory memory allocation failed", status);
    return;
  }

  status = cuMemHostAlloc(&exceptionsMemory, num_blocks * sizeof(jlong), 0); 
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "exceptionsMemory memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuExceptionsMemory, num_blocks * sizeof(jlong)); 
 
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuExceptionsMemory memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gcInfoSpace, gc_space_size);  
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gcInfoSpace memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuHeapEndPtr, 8);
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuHeapEndPtr memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuBufferSize, 8);
  
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuBufferSize memory allocation failed", status);
    return;
  }

  thisRefClass = (*env)->GetObjectClass(env, this_ref);
  setLongField(env, this_ref, "m_ToSpaceAddr", (jlong) toSpace);
  setLongField(env, this_ref, "m_GpuToSpaceAddr", (jlong) gpuToSpace);
  setLongField(env, this_ref, "m_TextureAddr", (jlong) textureMemory);
  setLongField(env, this_ref, "m_GpuTextureAddr", (jlong) gpuTexture);
  setLongField(env, this_ref, "m_HandlesAddr", (jlong) handlesMemory);
  setLongField(env, this_ref, "m_GpuHandlesAddr", (jlong) gpuHandlesMemory);
  setLongField(env, this_ref, "m_ExceptionsHandlesAddr", (jlong) exceptionsMemory);
  setLongField(env, this_ref, "m_GpuExceptionsHandlesAddr", (jlong) gpuExceptionsMemory);
  setLongField(env, this_ref, "m_ToSpaceSize", (jlong) bufferSize);
  setLongField(env, this_ref, "m_MaxGridDim", (jlong) maxGridDim);
  setLongField(env, this_ref, "m_NumMultiProcessors", (jlong) numMultiProcessors);
}
Exemple #18
0
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device,  maxT, maxG);
	int nb_devices;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	//CUcontext gl_ctx;
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	int major, minor;
	enum cudaError_enum cuda_error; 


	cuDeviceGetCount (&nb_devices);

	if ((Int_val(i)) > nb_devices)
		raise_constant(*caml_named_value("no_cuda_device")) ;


	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	general_info = caml_alloc (9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));

	Store_field(general_info,0, copy_string(infoStr));//
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));

	Store_field(general_info,1, Val_int(infoUInt));//
	Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));//
	Store_field(general_info,3, Val_int(dev_infos.clockRate));//
	Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info,5, Val_int(infoInt));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info,6, Val_bool(infoInt));//
	Store_field(general_info,7, i);
	CUDA_CHECK_CALL(cuCtxCreate	(&ctx,
			CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST,
			dev));
	spoc_ctx = malloc(sizeof(spoc_cl_context));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info,8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));


	cuda_info = caml_alloc(1, 0); //0 -> Cuda
	specific_info = caml_alloc(18, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info,0, Val_int(major));//
	Store_field(specific_info,1, Val_int(minor));//
	Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));//
	Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));//
	Store_field(specific_info,4, Val_int(dev_infos.memPitch));//
	Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));//

	maxT = caml_alloc(3, 0);
	Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));//
	Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));//
	Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));//
	Store_field(specific_info,6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));//
	Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));//
	Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));//
	Store_field(specific_info,7, maxG);

	Store_field(specific_info,8, Val_int(dev_infos.textureAlign));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info,9, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info,10, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info,11, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info,12, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info,13, Val_int(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info,14, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info,15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info,16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));

	Store_field(cuda_info, 0, specific_info);
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	{spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
	infoUInt -= (32*1024*1024);

	Store_field(device, 2, (value)gcInfo);


	{cuda_event_list* events = NULL;
	Store_field(device, 3, (value)events);



	CAMLreturn(device);}}
}
Exemple #19
0
JNIEXPORT jobject JNICALL Java_org_trifort_rootbeer_runtime_CUDARuntime_loadGpuDevices
  (JNIEnv * env, jobject this_ref)
{
  int i;
  int status;
  int num_devices;
  CUdevice device;
  CUcontext context;
  
  jclass array_list_class;
  jmethodID array_list_init;
  jmethodID array_list_add;
  jobject ret;

  jclass gpu_device_class;
  jmethodID gpu_device_init;
  jobject gpu_device;

  int major_version;
  int minor_version;
  char device_name[4096];
  size_t free_mem;
  size_t total_mem;
  int registers_per_block;
  int warp_size;
  int pitch;
  int threads_per_block;
  int shared_mem_per_block;
  int clock_rate;
  int mem_clock_rate;
  int const_mem;
  int integrated;
  int threads_per_multiprocessor;
  int multiprocessor_count;
  int max_block_dim_x;
  int max_block_dim_y;
  int max_block_dim_z;
  int max_grid_dim_x;
  int max_grid_dim_y;
  int max_grid_dim_z;

  array_list_class = (*env)->FindClass(env, "java/util/ArrayList");
  array_list_init = (*env)->GetMethodID(env, array_list_class, "<init>", "()V");
  array_list_add = (*env)->GetMethodID(env, array_list_class, "add", "(Ljava/lang/Object;)Z");

  ret = (*env)->NewObject(env, array_list_class, array_list_init);

  gpu_device_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/GpuDevice");
  gpu_device_init = (*env)->GetStaticMethodID(env, gpu_device_class, "newCudaDevice", 
    "(IIILjava/lang/String;JJIIIIIIIIZIIIIIIII)Lorg/trifort/rootbeer/runtime/GpuDevice;");

  status = cuInit(0);
  if(status != CUDA_SUCCESS){
    return ret;
  }
  
  cuDeviceGetCount(&num_devices);

  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&device, i);
    if(status != CUDA_SUCCESS){
      continue;
    }

    cuDeviceComputeCapability(&major_version, &minor_version, device);
    cuDeviceGetName(device_name, 4096, device);
    cuCtxCreate(&context, CU_CTX_MAP_HOST, device);
    cuMemGetInfo(&free_mem, &total_mem);
    cuCtxDestroy(context);
    cuDeviceGetAttribute(&registers_per_block, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device);
    cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
    cuDeviceGetAttribute(&pitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, device);
    cuDeviceGetAttribute(&threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
    cuDeviceGetAttribute(&shared_mem_per_block, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device);
    cuDeviceGetAttribute(&clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
    cuDeviceGetAttribute(&mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device);
    cuDeviceGetAttribute(&const_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, device);
    cuDeviceGetAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, device);
    cuDeviceGetAttribute(&threads_per_multiprocessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device);
    cuDeviceGetAttribute(&multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
    cuDeviceGetAttribute(&max_block_dim_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
    cuDeviceGetAttribute(&max_block_dim_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device);
    cuDeviceGetAttribute(&max_block_dim_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device);
    cuDeviceGetAttribute(&max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
    cuDeviceGetAttribute(&max_grid_dim_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
    cuDeviceGetAttribute(&max_grid_dim_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);

    gpu_device = (*env)->CallObjectMethod(env, gpu_device_class, gpu_device_init,
      i, major_version, minor_version, (*env)->NewStringUTF(env, device_name), 
      (jlong) free_mem, (jlong) total_mem, registers_per_block, warp_size, pitch, threads_per_block, 
      shared_mem_per_block, clock_rate, mem_clock_rate, const_mem, integrated, 
      threads_per_multiprocessor, multiprocessor_count, max_block_dim_x, 
      max_block_dim_y, max_block_dim_z, max_grid_dim_x, max_grid_dim_y, 
      max_grid_dim_z);
    (*env)->CallBooleanMethod(env, ret, array_list_add, gpu_device);
  }

  return ret;
}
/**
 * initialization after the GL context is set
 */
void GLCUPixelBuffer::init(int w, int h)
{
  width = w+BLOCKSZ-(w%BLOCKSZ);
  height = h+BLOCKSZ-(h%BLOCKSZ);

  Q_ASSERT(pixelBuffer==0);
  int size_tex_data = sizeof(GLubyte) * width * height * 4;

  /*
   * there will be in and out here,
   * take care to the usage pattern.
   * the blockop we will ll be pointing at
   * will show the desired level of op on the framebuffer
   * side note : working on ultra high resolution,
   * will require a downsampling before rendering
   */
  size_t free, total;
  CUDCHK(cuMemGetInfo(&free,&total)) 	;
  qDebug() << "GPU memory : free="<<free<< " /"<< total;
  pixelBuffer = new QGLBuffer(QGLBuffer::PixelUnpackBuffer);
  pixelBuffer->setUsagePattern(QGLBuffer::StreamDraw);
  pixelBuffer->create();

  pixelBuffer->bind();
  pixelBuffer->allocate(size_tex_data);
  pixelBuffer->release();

  // cudaGLRegisterBufferObject( pixelBuffer->bufferId() ) ); // deprecated as of cuda 3.0
  CUDCHK ( cuGraphicsGLRegisterBuffer(&cudaResource, pixelBuffer->bufferId(),
                                      CU_GRAPHICS_REGISTER_FLAGS_NONE
                                      //CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD
                                      ) );
                              //cudaGraphicsRegisterFlagsWriteDiscard) );

  /*
   * create textures
   * delete texture object if necessary
   * for reallocating tex mem, e.g. at different size
   */
  deleteTexture();
  textureID = new GLuint[1]; // Generate a texture id
  glGenTextures(1, textureID);
  glBindTexture( GL_TEXTURE_2D, textureID[0]);
  // Allocate the texture memory. The last parameter is NULL since we only
  // want to allocate memory, not initialize it
  glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0,
            GL_BGRA,GL_UNSIGNED_BYTE, NULL);

  // Must set the filter mode, GL_LINEAR enables interpolation when scaling
  /*glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);*/

  /* Note: GL_TEXTURE_RECTANGLE_ARB may be used instead of
   GL_TEXTURE_2D for improved performance if linear interpolation is
   not desired. Replace GL_LINEAR with GL_NEAREST in the
   glTexParameteri() call
  */
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  glBindTexture(GL_TEXTURE_2D, 0);
  //CUDCHK(cuMemGetInfo(&free,&total)) 	;
  //qDebug() << "memory : free="<<free<< " /"<< total;
}
Exemple #21
0
void initDevice(JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block, jlong free_space)
{          
  int status;
  jint num_blocks;
  int deviceCount = 0;
  size_t f_mem;
  size_t t_mem;
  size_t to_space_size;
  textureMemSize = 1;

  status = cuDeviceGetCount(&deviceCount);
  CHECK_STATUS(env,"error in cuDeviceGetCount",status)

  getBestDevice(env);
  
  status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cuDevice);  
  CHECK_STATUS(env,"error in cuCtxCreate",status)
  
  status = cuMemGetInfo (&f_mem, &t_mem);
  CHECK_STATUS(env,"error in cuMemGetInfo",status)
          
  to_space_size = f_mem;
  
  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;
  
#if DEBUG

  printf("Memory: %i(MB)/%i(MB) (Free/Total)\n",f_mem/1024/1024, t_mem/1024/1024);
  
  printf("num_blocks = %i\n",num_blocks);
  printf("numMultiProcessors = %i\n",numMultiProcessors);
  printf("max_threads_per_block = %i\n",max_threads_per_block);
  printf("max_blocks_per_proc = %i\n",max_blocks_per_proc);
  fflush(stdout);
#endif

  //space for 100 types in the scene
  classMemSize = sizeof(jint)*100;
  
  gc_space_size = 1024;
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= gc_space_size;
  to_space_size -= free_space;
  to_space_size -= classMemSize;

  //to_space_size -= textureMemSize;
  bufferSize = to_space_size;

  status = cuMemHostAlloc(&toSpace, to_space_size, 0);  
  CHECK_STATUS(env,"toSpace memory allocation failed",status)
    
  status = cuMemAlloc(&gpuToSpace, to_space_size);
  CHECK_STATUS(env,"gpuToSpace memory allocation failed",status)
    
  status = cuMemAlloc(&gpuClassMemory, classMemSize);
  CHECK_STATUS(env,"gpuClassMemory memory allocation failed",status)
  
/*
  status = cuMemHostAlloc(&textureMemory, textureMemSize, 0);  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuMemHostAlloc textureMemory %d\n", status);
  }

  status = cuMemAlloc(&gpuTexture, textureMemSize);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuMemAlloc gpuTexture %d\n", status);
  }
*/

  status = cuMemHostAlloc(&handlesMemory, num_blocks * sizeof(jlong), CU_MEMHOSTALLOC_WRITECOMBINED); 
  CHECK_STATUS(env,"handlesMemory memory allocation failed",status)

  status = cuMemAlloc(&gpuHandlesMemory, num_blocks * sizeof(jlong)); 
  CHECK_STATUS(env,"gpuHandlesMemory memory allocation failed",status)

  status = cuMemHostAlloc(&exceptionsMemory, num_blocks * sizeof(jlong), 0); 
  CHECK_STATUS(env,"exceptionsMemory memory allocation failed",status)

  status = cuMemAlloc(&gpuExceptionsMemory, num_blocks * sizeof(jlong)); 
  CHECK_STATUS(env,"gpuExceptionsMemory memory allocation failed",status)

  status = cuMemAlloc(&gcInfoSpace, gc_space_size);  
  CHECK_STATUS(env,"gcInfoSpace memory allocation failed",status)

  status = cuMemAlloc(&gpuHeapEndPtr, 8);
  CHECK_STATUS(env,"gpuHeapEndPtr memory allocation failed",status)

  status = cuMemAlloc(&gpuBufferSize, 8);
  CHECK_STATUS(env,"gpuBufferSize memory allocation failed",status)

  thisRefClass = (*env)->GetObjectClass(env, this_ref);
  setLongField(env, this_ref, "m_ToSpaceAddr", (jlong) toSpace);
  setLongField(env, this_ref, "m_GpuToSpaceAddr", (jlong) gpuToSpace);
  setLongField(env, this_ref, "m_TextureAddr", (jlong) textureMemory);
  setLongField(env, this_ref, "m_GpuTextureAddr", (jlong) gpuTexture);
  setLongField(env, this_ref, "m_HandlesAddr", (jlong) handlesMemory);
  setLongField(env, this_ref, "m_GpuHandlesAddr", (jlong) gpuHandlesMemory);
  setLongField(env, this_ref, "m_ExceptionsHandlesAddr", (jlong) exceptionsMemory);
  setLongField(env, this_ref, "m_GpuExceptionsHandlesAddr", (jlong) gpuExceptionsMemory);
  setLongField(env, this_ref, "m_ToSpaceSize", (jlong) bufferSize);
  setLongField(env, this_ref, "m_MaxGridDim", (jlong) maxGridDim);
  setLongField(env, this_ref, "m_NumMultiProcessors", (jlong) numMultiProcessors);
}
Exemple #22
0
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    printDeviceInfo
 * Signature: ()V
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_printDeviceInfo
  (JNIEnv *env, jclass cls)
{
    int i, a=0, b=0, status;
    int num_devices = 0;
    char str[1024];
    size_t free_mem, total_mem;
 
    status = cuInit(0);
    CHECK_STATUS(env,"error in cuInit",status)
    
    cuDeviceGetCount(&num_devices);
    printf("%d cuda gpus found\n", num_devices);
 
    for (i = 0; i < num_devices; ++i)
    {
        CUdevice dev;
        status = cuDeviceGet(&dev, i);
        CHECK_STATUS(env,"error in cuDeviceGet",status)

        status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, dev);
        CHECK_STATUS(env,"error in cuCtxCreate",status)
                
        printf("\nGPU:%d\n", i);
        
        if(cuDeviceComputeCapability(&a, &b, dev) == CUDA_SUCCESS)
            printf("Version:                       %i.%i\n", a, b);
        
        if(cuDeviceGetName(str,1024,dev) == CUDA_SUCCESS)
            printf("Name:                          %s\n", str);
        
        if(cuMemGetInfo(&free_mem, &total_mem) == CUDA_SUCCESS){
          #if (defined linux || defined __APPLE_CC__)
            printf("Total global memory:           %zu/%zu (Free/Total) MBytes\n", free_mem/1024/1024, total_mem/1024/1024);
          #else
            printf("Total global memory:           %Iu/%Iu (Free/Total) MBytes\n", free_mem/1024/1024, total_mem/1024/1024);
          #endif
        }
        
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,dev) == CUDA_SUCCESS)
            printf("Total registers per block:     %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_WARP_SIZE,dev) == CUDA_SUCCESS)
            printf("Warp size:                     %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_PITCH,dev) == CUDA_SUCCESS)
            printf("Maximum memory pitch:          %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,dev) == CUDA_SUCCESS)
            printf("Maximum threads per block:     %i\n", a);        
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,dev) == CUDA_SUCCESS)
            printf("Total shared memory per block  %.2f KB\n", a/1024.0);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,dev) == CUDA_SUCCESS)
            printf("Clock rate:                    %.2f MHz\n",  a/1000000.0);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,dev) == CUDA_SUCCESS)
            printf("Memory Clock rate:             %.2f\n",  a/1000000.0);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,dev) == CUDA_SUCCESS)
            printf("Total constant memory:         %.2f MB\n",  a/1024.0/1024.0);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_INTEGRATED,dev) == CUDA_SUCCESS)
            printf("Integrated:                    %i\n",  a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,dev) == CUDA_SUCCESS)
            printf("Max threads per multiprocessor:%i\n",  a);    
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,dev) == CUDA_SUCCESS)
            printf("Number of multiprocessors:     %i\n",  a);    
      
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,dev) == CUDA_SUCCESS)
            printf("Maximum dimension x of block:  %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,dev) == CUDA_SUCCESS)
            printf("Maximum dimension y of block:  %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,dev) == CUDA_SUCCESS)
            printf("Maximum dimension z of block:  %i\n", a);
        
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,dev) == CUDA_SUCCESS)
            printf("Maximum dimension x of grid:   %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,dev) == CUDA_SUCCESS)
            printf("Maximum dimension y of grid:   %i\n", a);
        if(cuDeviceGetAttribute(&a, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,dev) == CUDA_SUCCESS)
            printf("Maximum dimension z of grid:   %i\n", a);
			
		cuCtxDestroy(cuContext);
    } 
}
Exemple #23
0
IMT_MRGui::IMT_MRGui(QWidget *parent) :
    QMainWindow(parent),
    ui(new Ui::IMT_MRGui)
{
  // Initialize the first GPU
  GPU0_.allocateGPU(0);
  GPU0_.printInformation(std::cout);

  size_t free;
  size_t total;
  cuMemGetInfo(&free, &total);
  std::cout << "free memory: " << free / 1024 / 1024 << "mb, total memory: " << total / 1024 / 1024 << "mb" << std::endl;

  ui->setupUi(this);

  read_siemens = 0;
  fftobj_ = new agile::FFT<TType>();
  kspacefovobj_ = new agile::KSpaceFOV<TType>();

  open_irgnpara_window = new IRGN_para(); // Be sure to destroy you window somewhere
  open_irgnpara_window->hide();

  _pathsetting = new PathSetting(); // Be sure to destroy you window somewhere
  _pathsetting->hide();

  _specialsetting = new SpecialSetting(); // Be sure to destroy you window somewhere
  _specialsetting->hide();


  setStatusBar();
  setTitleText("");

  future = new QFuture<void>;
  watcher = new QFutureWatcher<void>;
  _cycleread_thread = new CycleRead;
  _cycleread_thread->set_max_acq_time(_specialsetting->get_autoload_timeout());


  _infotext="";

  QObject::connect(this,SIGNAL(sendInfo(QString, bool)),this,SLOT(set_Info(QString, bool)));
  QObject::connect(this,SIGNAL(sendWarning(QString)),this,SLOT(set_Warning(QString)));
  QObject::connect(this,SIGNAL(sendSaveFile(QString,QString)),this,SLOT(set_SaveFile(QString,QString)),Qt::BlockingQueuedConnection);

  QObject::connect(_cycleread_thread, SIGNAL(send_filenames(QStringList, QStringList)),
          this, SLOT(startauto(QStringList, QStringList)));

  _act_index = 0;
  _automatic_on = false;
  _write_file_index=0;

  ui->cb_savingtype->setCurrentIndex(2);
  ui->pb_automatic->setText("Auto Off");

  _postprocess = new agile::PostProcess<TType, TType_real>();


  QLabel* imageLabel = new QLabel(this);
  imageLabel->setBackgroundRole(QPalette::Base);
  imageLabel->setSizePolicy(QSizePolicy::Ignored, QSizePolicy::Ignored);
  imageLabel->setScaledContents(true);
  QImage image(":/images/wappen.png");
  imageLabel->setPixmap(QPixmap::fromImage(image));
  imageLabel->resize(imageLabel->pixmap()->size());
  imageLabel->setGeometry((this->width() - imageLabel->sizeHint().width()), 40, imageLabel->sizeHint().width(), imageLabel->sizeHint().height());
  imageLabel->show();
}