// Launches `kernel` on a 2D grid of 2D thread blocks.
//
//   kernel    - function handle to launch; a null handle is reported via fail().
//   blockSize - threads per block in x and y (the z extent is fixed to 1).
//   gridSize  - blocks per grid in x and y.
//   async     - request cuLaunchGridAsync; honoured only when that entry
//               point is available (non-NULL).
//   stream    - stream handed to cuLaunchGridAsync; not used by the
//               synchronous cuLaunchGrid path.
//
// Before launching, the cache preference selected by s_preferL1 is applied
// (CUDA_VERSION >= 3000 only, and only when cuFuncSetCacheConfig is
// available), then updateGlobals() and updateTexRefs(kernel) are called.
// Every driver call result is passed through checkError().
void CudaModule::launchKernel(CUfunction kernel, const Vec2i& blockSize, const Vec2i& gridSize, bool async, CUstream stream)
{
    if (!kernel)
    {
        fail("CudaModule: No kernel specified!");
    }

#if (CUDA_VERSION >= 3000)
    // The entry point is compared against NULL before use, so a missing
    // cuFuncSetCacheConfig simply skips the cache configuration step.
    if (NULL != cuFuncSetCacheConfig)
    {
        CUfunc_cache cachePreference = CU_FUNC_CACHE_PREFER_SHARED;
        if (s_preferL1)
        {
            cachePreference = CU_FUNC_CACHE_PREFER_L1;
        }
        checkError("cuFuncSetCacheConfig", cuFuncSetCacheConfig(kernel, cachePreference));
    }
#endif

    updateGlobals();
    updateTexRefs(kernel);

    checkError("cuFuncSetBlockShape", cuFuncSetBlockShape(kernel, blockSize.x, blockSize.y, 1));

    // Fall back to the synchronous launch when async was not requested or
    // the asynchronous entry point is unavailable.
    const bool launchAsync = async && (NULL != cuLaunchGridAsync);
    if (!launchAsync)
    {
        checkError("cuLaunchGrid", cuLaunchGrid(kernel, gridSize.x, gridSize.y));
    }
    else
    {
        checkError("cuLaunchGridAsync", cuLaunchGridAsync(kernel, gridSize.x, gridSize.y, stream));
    }
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: (JLjava/lang/String;I)V
 *
 * Loads a CUDA module from the file named by `filename`, looks up the kernel
 * entry point "_Z5entryPcS_PiPxS1_S0_S0_i", requests the L1-preferring cache
 * configuration and fills in the kernel's parameter block: seven
 * CUdeviceptr-sized values followed by one int (num_blocks).
 *
 * Parameters:
 *   heap_end_ptr - stored into the file-level heapEndPtr.
 *   filename     - Java string holding the path passed to cuModuleLoad.
 *   num_blocks   - written as the last (int) kernel parameter.
 *
 * The results are left in the file-level cuModule / cuFunction variables.
 * Every driver call result is handed to CHECK_STATUS together with a message.
 * NOTE(review): CHECK_STATUS is defined outside this block; it presumably
 * raises a Java exception and leaves the function on failure - confirm.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jstring filename, jint num_blocks){

  int offset;
  CUresult status;
  const char * native_filename;  /* GetStringUTFChars returns a const buffer */

  heapEndPtr = heap_end_ptr;

  native_filename = (*env)->GetStringUTFChars(env, filename, 0);
  if(native_filename == NULL){
    /* GetStringUTFChars failed; per the JNI spec an exception is already
       pending, so just return and let the Java side observe it. */
    return;
  }

  status = cuModuleLoad(&cuModule, native_filename);
  /* Release the string before checking the status so the buffer is not
     leaked if CHECK_STATUS leaves the function on error. */
  (*env)->ReleaseStringUTFChars(env, filename, native_filename);
  CHECK_STATUS(env, "error in cuModuleLoad", status);

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* Total parameter block: 7 device pointers + 1 int. */
  status = cuParamSetSize(cuFunction, (7 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  /* Parameters are written back to back; `offset` tracks the byte position
     of the next parameter inside the block. */
  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuClassMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuClassMemory",status)
  offset += sizeof(CUdeviceptr);

  /* Last parameter: the int block count. */
  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 * NOTE(review): the generated signature above does not match the parameter
 * list below (jlong, jobject, jint, jint, jint) - regenerate the header.
 *
 * Loads a CUDA module from an in-memory image assembled by
 * readCubinFileFromBuffers(env, buffers, size, total_size), looks up the
 * kernel entry point "_Z5entryPcS_PiPxS1_S0_i", requests the L1-preferring
 * cache configuration and fills in the kernel's parameter block: six
 * CUdeviceptr-sized values followed by one int (num_blocks).
 *
 * Parameters:
 *   heap_end_ptr - stored into the file-level heapEndPtr.
 *   buffers, size, total_size - forwarded unchanged to
 *                  readCubinFileFromBuffers (defined outside this block).
 *   num_blocks   - written as the last (int) kernel parameter.
 *
 * The results are left in the file-level cuModule / cuFunction variables.
 * Every driver call result is handed to CHECK_STATUS together with a message.
 * NOTE(review): CHECK_STATUS is defined outside this block; it presumably
 * raises a Java exception and leaves the function on failure - confirm.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers, jint size,
   jint total_size, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;

  heapEndPtr = heap_end_ptr;

  //void * cubin_file = readCubinFile("code_file.cubin");
  cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size);
  status = cuModuleLoadData(&cuModule, cubin_file);
  /* The image is no longer needed once cuModuleLoadData has returned; free
     it before checking the status so it is not leaked if CHECK_STATUS leaves
     the function on error. */
  free(cubin_file);
  CHECK_STATUS(env,"error in cuModuleLoad",status)

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* Total parameter block: 6 device pointers + 1 int. */
  status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  /* Parameters are written back to back; `offset` tracks the byte position
     of the next parameter inside the block. */
  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  /* Last parameter: the int block count. */
  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
}
/*
 * R-callable wrapper around cuFuncSetCacheConfig.
 *
 *   r_hfunc  - R object from which getRReference() extracts the CUfunction.
 *   r_config - R vector whose first INTEGER element is cast to CUfunc_cache.
 *
 * Returns the CUresult of the driver call, converted to an R value by
 * Renum_convert_CUresult().
 */
SEXP
R_auto_cuFuncSetCacheConfig(SEXP r_hfunc, SEXP r_config)
{
    CUfunction fun = (CUfunction) getRReference(r_hfunc);
    CUfunc_cache cachePref = (CUfunc_cache) INTEGER(r_config)[0];

    CUresult status = cuFuncSetCacheConfig(fun, cachePref);

    return Renum_convert_CUresult(status);
}
/*
 * Looks up the kernel named `kernel`, marshals its arguments into the
 * function's parameter block and launches it with cuLaunchGridAsync on the
 * NULL stream.
 *
 *   kernel - kernel name; searched first in the state.func_names cache and
 *            then in every module in state.mods.
 *   grid   - grid dimensions; grid.z must be 1.
 *   block  - thread block dimensions.
 *   shmem  - value passed to cuFuncSetSharedSize.
 *   flags  - not used by this function.
 *   ptrs   - ptrs[i] points at the value of the i-th kernel argument.
 *   types  - types[i] is the SWAN_* tag of the i-th argument; the list is
 *            terminated by SWAN_END.
 *
 * The launch is silently skipped when any grid or block dimension is 0.
 * An unknown kernel, grid.z != 1, or an unhandled argument type is reported
 * through error(); a failed launch prints a diagnostic and exits with -99.
 *
 * NOTE(review): DEBLOCK is a macro defined outside this block; it presumably
 * expands to a `case` that aligns `offset` and copies one by-value argument
 * with cuParamSetv - confirm against its definition.
 */
void swanRunKernelAsync( const char *kernel, block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types ) {
	// suppress launch of kernel if any of the launch dims are 0
	if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; }

	CUfunction f = NULL;
	int i;
	int offset = 0;
	CUresult err;
	int type;
	int idx=0;

	try_init();

	// find the kernel: first in the cache of previously resolved names
	for( i=0; i < state.num_funcs; i++ ) {
		if( !strcmp( state.func_names[i], kernel ) ) {
			f = state.funcs[i];
			break;
		}
	}

	// ... then in each loaded module
	if( f == NULL ) {
		for( i=0; i < state.num_mods; i++ ) {
			err = cuModuleGetFunction( &f, state.mods[i], kernel );
			if( err != CUDA_SUCCESS ) {
				// do not trust the handle after a failed lookup
				f = NULL;
				continue;
			}
			if( f != NULL ) {
				// found a kernel. store it for future use.
				// The cache is only extended when every allocation succeeds;
				// on allocation failure the kernel is still launched, it is
				// just not remembered.
				int j = state.num_funcs;
				CUfunction *funcs = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * (j + 1) );
				if( funcs != NULL ) {
					state.funcs = funcs;
					char **names = (char**) realloc( state.func_names, sizeof(char*) * (j + 1) );
					if( names != NULL ) {
						state.func_names = names;
						char *name_copy = (char*) malloc( strlen(kernel) + 1 );
						if( name_copy != NULL ) {
							strcpy( name_copy, kernel );
							state.funcs[j]      = f;
							state.func_names[j] = name_copy;
							state.num_funcs     = j + 1;
						}
					}
				}
				break;
			}
		}
	}

	if( f== NULL ) {
		fprintf(stderr, "Error running kernel [%s] : \n", kernel );
		error( "No kernel found" );
	}

	if( grid.z != 1 ) {
		printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z );
		error( "grid.z needs to be 1" );
	}

	//printf("Running kernel [%s]\n", kernel );

	// Walk the (type, pointer) argument list until the SWAN_END sentinel,
	// appending each argument to the parameter block at `offset`.
	type = types[idx];
	while( type != SWAN_END ) {
		void *ptr = ptrs[idx];
		switch( type ) {
//			DEBLOCK( SWAN_uchar, uchar, 1 );
			DEBLOCK( SWAN_uchar2, uchar2, 2 );
			DEBLOCK( SWAN_uchar3, uchar3, 1 );
			DEBLOCK( SWAN_uchar4, uchar4, 4 );
			DEBLOCK( SWAN_char , int, 1 );
//			DEBLOCK( SWAN_char1 , char1, 1 );
			DEBLOCK( SWAN_char2 , char2, 2 );
			DEBLOCK( SWAN_char3 , char3, 1 );
			DEBLOCK( SWAN_char4 , char4, 4 );
			DEBLOCK( SWAN_int, int, 4 );
//			DEBLOCK( SWAN_int1, int1, 4 );
			DEBLOCK( SWAN_int2, int2, 8 );
			DEBLOCK( SWAN_int3, int3, 4 );
			DEBLOCK( SWAN_int4, int4, 16 );
//			DEBLOCK( SWAN_float, double, 4 );
//			DEBLOCK( SWAN_float1, float1, 4 );
			DEBLOCK( SWAN_float2, float2, 8 );
			DEBLOCK( SWAN_float3, float3, 4 );
			DEBLOCK( SWAN_float4, float4, 16 );

			DEBLOCK( SWAN_uint, uint, 4 );
			DEBLOCK( SWAN_uint2, uint2, 8 );
			DEBLOCK( SWAN_uint3, uint3, 4 );
			DEBLOCK( SWAN_uint4, uint4, 16 );
			DEBLOCK( SWAN_float, float, 4 );

//#define DEBLOCK(swan_type,type,OFFSET)
#if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4
			case SWAN_PTR:
				{
					// pointer argument passed at native pointer width
					//printf("PTR as NATIVE\n");
					ALIGN_UP( offset, (sizeof(void*)));
					CU_SAFE_CALL_NO_SYNC( cuParamSetv( f, offset, ptr, sizeof(void*) ) );
					offset += sizeof(void*);
				}
				break;
#else
			case SWAN_PTR:
				{
					// pointer argument passed at int width
					//printf("PTR as INT\n");
					ALIGN_UP( offset, (sizeof(int)));
					CU_SAFE_CALL_NO_SYNC( cuParamSetv( f, offset, ptr, sizeof(int) ) );
					offset += sizeof(int);
				}
				break;
#endif
			default:
				printf("%d\n", type );
				error("Parameter type not handled\n");
		}
		idx++;
		type = types[idx];
	}

	//printf("Launching kernel [%s] [%X] with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z );
	//printf(" TOTAL OFFSET %d\n", offset );

	// `offset` now equals the total size of the parameter block
	CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) );

#if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4
	// Result intentionally not checked, as in the original code.
	cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd
#endif

	err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ;

	if( err != CUDA_SUCCESS ) {
		fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z );
		assert(0);
		exit(-99);
	}

	//printf("Kernel completed\n" );
}