/*
 * Picks the CUDA device with the largest multiprocessor count and stores the
 * result in file-level state.
 *
 * Every device reported by cuDeviceGetCount is queried for
 * CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. The comparison is strict, so when
 * several devices tie the one with the lowest ordinal is kept.
 *
 * On success this writes:
 *   cuDevice           - handle of the chosen device
 *   maxGridDim         - its CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X value
 *   numMultiProcessors - its multiprocessor count
 *
 * env is forwarded to CHECK_STATUS and throw_cuda_errror_exception for error
 * reporting. Both are defined outside this block; they presumably raise a
 * Java exception on failure -- confirm against their definitions.
 */
void getBestDevice(JNIEnv *env){
  int num_devices;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors;
  int max_multiprocessors = -1;
  int max_i = -1;

  status = cuDeviceGetCount(&num_devices);
  CHECK_STATUS(env,"error in cuDeviceGetCount",status)

  if(num_devices == 0){
    throw_cuda_errror_exception(env,"0 Cuda Devices were found",0);
    /* Stop here: max_i is still -1, so continuing would call cuDeviceGet
       with an invalid ordinal while an error has already been reported. */
    return;
  }

  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    CHECK_STATUS(env,"error in cuDeviceGet",status)

    status = cuDeviceGetAttribute(&curr_multiprocessors,
      CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);
    CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)

    if(curr_multiprocessors > max_multiprocessors) {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  /* Re-fetch the winning device directly into the shared handle. */
  status = cuDeviceGet(&cuDevice, max_i);
  CHECK_STATUS(env,"error in cuDeviceGet",status)

  status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);
  CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)

  numMultiProcessors = max_multiprocessors;
}
/* * Class: edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2 * Method: findReserveMem * Signature: ()I */ JNIEXPORT jlong JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_findReserveMem (JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block) { size_t to_space_size; size_t temp_size; int status; int deviceCount = 0; jlong prev_i; jlong i; size_t f_mem; size_t t_mem; jint num_blocks; status = cuInit(0); CHECK_STATUS(env,"error in cuInit",status) printf("automatically determining CUDA reserve space...\n"); to_space_size = initContext(env, max_blocks_per_proc, max_threads_per_block); //space for 100 types in the scene classMemSize = sizeof(jint)*100; num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc; gc_space_size = 1024; to_space_size -= (num_blocks * sizeof(jlong)); to_space_size -= (num_blocks * sizeof(jlong)); to_space_size -= gc_space_size; to_space_size -= classMemSize; for(i = 1024L*1024L; i < to_space_size; i += 100L*1024L*1024L){ temp_size = to_space_size - i; printf("attempting allocation with temp_size: %lu to_space_size: %lu i: %ld\n", temp_size, to_space_size, i); status = cuMemHostAlloc(&toSpace, temp_size, 0); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gpuToSpace, temp_size); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gpuClassMemory, classMemSize); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemHostAlloc(&handlesMemory, num_blocks * sizeof(jlong), CU_MEMHOSTALLOC_WRITECOMBINED); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gpuHandlesMemory, num_blocks * sizeof(jlong)); 
if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemHostAlloc(&exceptionsMemory, num_blocks * sizeof(jlong), 0); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gpuExceptionsMemory, num_blocks * sizeof(jlong)); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gcInfoSpace, gc_space_size); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gpuHeapEndPtr, 8); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } status = cuMemAlloc(&gpuBufferSize, 8); if(status != CUDA_SUCCESS){ cuCtxDestroy(cuContext); initContext(env, max_blocks_per_proc, max_threads_per_block); continue; } //done, free everything cuMemFree(gpuToSpace); cuMemFree(gpuClassMemory); cuMemFree(gpuHandlesMemory); cuMemFree(gpuExceptionsMemory); cuMemFree(gcInfoSpace); cuMemFree(gpuHeapEndPtr); cuMemFree(gpuBufferSize); cuMemFreeHost(toSpace); cuMemFreeHost(handlesMemory); cuMemFreeHost(exceptionsMemory); return i; } throw_cuda_errror_exception(env, "unable to find enough space using CUDA", 0); return 0; }
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    setup
 * Signature: (III)V
 *
 * Initialises the CUDA driver, selects a device through getBestDevice,
 * creates a context with CU_CTX_MAP_HOST and allocates the runtime buffers.
 *
 * The to-space size is the free memory reported by cuMemGetInfo minus two
 * handle tables (num_blocks jlongs each), the GC info area and free_space.
 * The resulting size is stored in bufferSize.
 *
 * Parameters:
 *   max_blocks_per_proc, max_threads_per_block - multiplied with
 *       numMultiProcessors to size the handles and exceptions tables.
 *   free_space - number of bytes to leave unallocated.
 *
 * On success the addresses and sizes are published to long fields on this_ref
 * via setLongField (m_ToSpaceAddr, m_GpuToSpaceAddr, m_TextureAddr,
 * m_GpuTextureAddr, m_HandlesAddr, m_GpuHandlesAddr, m_ExceptionsHandlesAddr,
 * m_GpuExceptionsHandlesAddr, m_ToSpaceSize, m_MaxGridDim,
 * m_NumMultiProcessors).
 *
 * On any failure throw_cuda_errror_exception is called and the function
 * returns immediately.
 * NOTE(review): buffers and the context created before the failing step are
 * not released here -- confirm whether a teardown elsewhere covers that.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_setup
  (JNIEnv *env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block, jint free_space)
{
  int status;
  jint num_blocks;
  size_t f_mem;
  size_t t_mem;
  size_t to_space_size;
  size_t handles_bytes;
  size_t overhead;

  textureMemSize = 1;

  status = cuInit(0);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "error in cuInit", status);
    return;
  }

  /* getBestDevice counts and enumerates the devices itself. */
  getBestDevice(env);
  if ((*env)->ExceptionCheck(env)) {
    /* Device selection already reported an error; do not continue with an
       unset cuDevice. */
    return;
  }

  status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cuDevice);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "error in cuCtxCreate", status);
    return;
  }

  // ddb - not using this as this returns the total memory not the free memory
  //to_space_size = memSize();
  status = cuMemGetInfo(&f_mem, &t_mem);
  if (CUDA_SUCCESS != status) {
    /* Without this check f_mem would be read uninitialised below. */
    throw_cuda_errror_exception(env, "error in cuMemGetInfo", status);
    return;
  }

  to_space_size = f_mem;

  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;

#if DEBUG
  printf("Memory: %lu(MB)/%lu(MB) (Free/Total)\n",
    (unsigned long) (f_mem/1024/1024), (unsigned long) (t_mem/1024/1024));
  printf("num_blocks = %i\n",num_blocks);
  printf("numMultiProcessors = %i\n",numMultiProcessors);
  printf("max_threads_per_block = %i\n",max_threads_per_block);
  printf("max_blocks_per_proc = %i\n",max_blocks_per_proc);
  fflush(stdout);
#endif

  gc_space_size = 1024;

  /* One table for handles and one for exceptions. */
  handles_bytes = num_blocks * sizeof(jlong);

  /* to_space_size is unsigned, so make sure the deductions fit before
     subtracting; otherwise the size would wrap to a huge value. */
  overhead = 2 * handles_bytes + gc_space_size;
  if (free_space > 0) {
    overhead += (size_t) free_space;
  }
  if (to_space_size <= overhead) {
    throw_cuda_errror_exception(env, "not enough free GPU memory for the requested reserve", 0);
    return;
  }

  to_space_size -= handles_bytes;
  to_space_size -= handles_bytes;
  to_space_size -= gc_space_size;
  to_space_size -= free_space;
  //to_space_size -= textureMemSize;

  bufferSize = to_space_size;

  status = cuMemHostAlloc(&toSpace, to_space_size, 0);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "toSpace memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuToSpace, to_space_size);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuToSpace memory allocation failed", status);
    return;
  }

  /* Texture memory allocation (textureMemory / gpuTexture) is disabled, so
     the m_TextureAddr and m_GpuTextureAddr fields below receive whatever
     those variables currently hold. */

  status = cuMemHostAlloc(&handlesMemory, handles_bytes, CU_MEMHOSTALLOC_WRITECOMBINED);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "handlesMemory memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuHandlesMemory, handles_bytes);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuHandlesMemory memory allocation failed", status);
    return;
  }

  status = cuMemHostAlloc(&exceptionsMemory, handles_bytes, 0);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "exceptionsMemory memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuExceptionsMemory, handles_bytes);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuExceptionsMemory memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gcInfoSpace, gc_space_size);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gcInfoSpace memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuHeapEndPtr, 8);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuHeapEndPtr memory allocation failed", status);
    return;
  }

  status = cuMemAlloc(&gpuBufferSize, 8);
  if (CUDA_SUCCESS != status) {
    throw_cuda_errror_exception(env, "gpuBufferSize memory allocation failed", status);
    return;
  }

  thisRefClass = (*env)->GetObjectClass(env, this_ref);

  /* Publish addresses and sizes to the Java object. */
  setLongField(env, this_ref, "m_ToSpaceAddr", (jlong) toSpace);
  setLongField(env, this_ref, "m_GpuToSpaceAddr", (jlong) gpuToSpace);
  setLongField(env, this_ref, "m_TextureAddr", (jlong) textureMemory);
  setLongField(env, this_ref, "m_GpuTextureAddr", (jlong) gpuTexture);
  setLongField(env, this_ref, "m_HandlesAddr", (jlong) handlesMemory);
  setLongField(env, this_ref, "m_GpuHandlesAddr", (jlong) gpuHandlesMemory);
  setLongField(env, this_ref, "m_ExceptionsHandlesAddr", (jlong) exceptionsMemory);
  setLongField(env, this_ref, "m_GpuExceptionsHandlesAddr", (jlong) gpuExceptionsMemory);
  setLongField(env, this_ref, "m_ToSpaceSize", (jlong) bufferSize);
  setLongField(env, this_ref, "m_MaxGridDim", (jlong) maxGridDim);
  setLongField(env, this_ref, "m_NumMultiProcessors", (jlong) numMultiProcessors);
}