Example #1
CUresult loadCUDAModules()
{
    CUmodule cuModule_;
    checkCudaErrors(cuModuleLoad(&cuModule_, "videoPP64.ptx"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelNV12toARGB, cuModule_, "NV12ToARGBdrvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBtoNV12, cuModule_, "ARGBToNv12drvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBpostprocess, cuModule_, "ARGBpostprocess"));
    return CUDA_SUCCESS;
}
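A minimal sketch of the checkCudaErrors helper that Example #1 assumes; the CUDA samples ship a similar macro, but this version (print the error name, then abort) is only an illustration:

// Hypothetical checkCudaErrors: abort on any non-success CUresult.
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define checkCudaErrors(call)                                              \
    do {                                                                   \
        CUresult _status = (call);                                         \
        if (_status != CUDA_SUCCESS) {                                     \
            const char *_name = NULL;                                      \
            cuGetErrorName(_status, &_name);                               \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                    _name ? _name : "UNKNOWN", __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)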
Example #2
CUfunction CudaModule::getKernel(const std::string& name, int paramSize)
{
  CUfunction kernel = NULL;
  cuModuleGetFunction(&kernel, m_module, name.c_str());
  
  if (!kernel) {
    std::string funcName(std::string("__globfunc_") + name);    
    cuModuleGetFunction( &kernel, m_module, funcName.c_str() );
  }
  
  if (kernel) {
    checkError( "cuParamSetSize", cuParamSetSize(kernel, paramSize));
  }
  return kernel;
}
Example #3
/* int is 64-bit for some reason... */
CUresult bpnn_adjust_weights_launch
(CUmodule mod, CUdeviceptr delta, long hid, CUdeviceptr ly, long in,          
 CUdeviceptr w, CUdeviceptr oldw) 
{
	int bdx, bdy, gdx, gdy;
	void* param[] = {&delta, &hid, &ly, &in, &w, &oldw};
	CUfunction f;
	CUresult res;

	bdx = 16;
	bdy = 16;
	gdx = 1;
	gdy = num_blocks;

	/* get functions. */
	res = cuModuleGetFunction(&f, mod, "_Z24bpnn_adjust_weights_cudaPfiS_iS_S_");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction(adjust_weights) failed: res = %u\n", res);
		return res;
	}

	res = cuLaunchKernel(f, gdx, gdy, 1, bdx, bdy, 1, 0, 0, (void**) param, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(adjust_weights) failed: res = %u\n", res);
		return res;
	}

	return CUDA_SUCCESS;
}
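Examples #3 and #7 look kernels up by their C++-mangled names ("_Z24bpnn_adjust_weights_cudaPfiS_iS_S_"). A sketch of the device-side alternative, assuming the .cu source can be edited: declaring the kernel extern "C" keeps the symbol unmangled, so the host code can pass the plain name to cuModuleGetFunction (the body below is a placeholder, not the actual Rodinia kernel):

// Sketch: with extern "C" the symbol stays unmangled, so the host side can call
//   cuModuleGetFunction(&f, mod, "bpnn_adjust_weights_cuda");
// instead of passing the mangled "_Z24bpnn_adjust_weights_cudaPfiS_iS_S_" string.
extern "C" __global__ void bpnn_adjust_weights_cuda(float *delta, int hid,
                                                    float *ly, int in,
                                                    float *w, float *oldw)
{
    // placeholder body; the real kernel lives in the Rodinia backprop sources
}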
Example #4
CUresult CreateCuFunction(const char* name, CuModule* module, int3 blockShape, 
	FunctionPtr* ppFunction) {
	CUfunction func;
	CUresult result = cuModuleGetFunction(&func, module->Handle(), name);
	if(CUDA_SUCCESS != result) return result;

	FunctionPtr f(new CuFunction);

	CuFuncAttr& attr = f->_attributes;
	cuFuncGetAttribute(&attr.maxThreadsPerBlock,
		CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func);
	cuFuncGetAttribute(&attr.sharedSizeBytes, 
		CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func);
	cuFuncGetAttribute(&attr.constSizeBytes, 
		CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, func);
	cuFuncGetAttribute(&attr.localSizeBytes, 
		CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, func);
	cuFuncGetAttribute(&attr.numRegs, 
		CU_FUNC_ATTRIBUTE_NUM_REGS, func);
	cuFuncGetAttribute(&attr.ptxVersion, 
		CU_FUNC_ATTRIBUTE_PTX_VERSION, func);
	cuFuncGetAttribute(&attr.binaryVersion, 
		CU_FUNC_ATTRIBUTE_BINARY_VERSION, func);
	f->_function = func;
	f->_module = module;
	f->_functionName = name;
	f->_blockShape = blockShape;
	ppFunction->swap(f);
	return CUDA_SUCCESS;
}
Example #5
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel(.param .u64 kernel_param_0) {\n"
	".reg .s32 	%r<2>;\n"
	".reg .s64 	%rd<3>;\n"
	"bra 	BB1_2;\n"
	"ld.param.u64 	%rd1, [kernel_param_0];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"mov.u32 	%r1, 5;\n"
	"st.global.u32 	[%rd2], %r1;\n"
	"BB1_2: ret;\n"
	"}\n";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));
	void * params[] = {&devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
	assert(hostValue == 10);
	std::cout << hostValue << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
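Example #5 assumes an init_test() helper and a cu_assert macro that are not shown; a minimal sketch of what they might look like, assuming init_test only initializes the driver API and makes a context current on device 0:

#include <cuda.h>
#include <cassert>

// Assumed helper: fail the test on any non-success CUresult.
#define cu_assert(expr) assert((expr) == CUDA_SUCCESS)

// Assumed setup: initialize the driver API and create a context on device 0.
static void init_test() {
    CUdevice dev;
    CUcontext ctx;
    cu_assert(cuInit(0));
    cu_assert(cuDeviceGet(&dev, 0));
    cu_assert(cuCtxCreate(&ctx, 0, dev));
}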
Example #6
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    CUresult err;
    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, state_ptr);
    CUmodule mod = ((module_state*)state_ptr)->module;
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)",
                             _get_error_name(err));
        return err;
    }

    err = cuLaunchKernel(f,
                         blocksX,  blocksY,  blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         NULL, // stream
                         args,
                         NULL);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n",
                             _get_error_name(err));
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif
    return 0;
}
Example #7
CUresult bpnn_layerforward_launch
(CUmodule mod, CUdeviceptr input_cuda, CUdeviceptr output_hidden_cuda,
 CUdeviceptr input_hidden_cuda, CUdeviceptr hidden_partial_sum,
 int in, int hid) 
{
	int bdx, bdy, gdx, gdy;
	void* param[] = {&input_cuda, &output_hidden_cuda, &input_hidden_cuda,
					 &hidden_partial_sum, &in, &hid};
	CUfunction f;
	CUresult res;

	bdx = 16;
	bdy = 16;
	gdx = 1;
	gdy = num_blocks;

	/* get functions. */
	res = cuModuleGetFunction(&f, mod, "_Z22bpnn_layerforward_CUDAPfS_S_S_ii");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction(layerforward) failed: res = %u\n", res);
		return res;
	}

	res = cuLaunchKernel(f, gdx, gdy, 1, bdx, bdy, 1, 0, 0, (void**) param, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(layerforward) failed: res = %u\n", res);
		return res;
	}

	return CUDA_SUCCESS;
}
Example #8
File: cuda.c  Project: mwh/grace-cuda
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
Example #9
static CUresult
initCuda(CUcontext _cuContext, char* executablePath, CUfunction *mathop, 
	int argc, char** argv, const char* cubin_name, const char* kernel_name)
{
    CUdevice cuDevice;
    CUT_DEVICE_INIT_DRV(cuDevice, argc, argv);
    print_GetProperties(cuDevice);

    CUresult status = cuCtxCreate( &_cuContext, 0, cuDevice );
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf("(1) context creation successful\n");

    char* module_path = cutFindFilePath(cubin_name, executablePath);
    printf ("\t cubin:%s, path:%s, mmp_ptr:%lu\n", cubin_name, executablePath, module_path);
    if(module_path != NULL)
      printf ("\t cubin:%s, path:%s, module_path:%c%c%c%c\n", cubin_name, executablePath, *module_path, *(module_path+1), *(module_path+2), *(module_path+3));
    char* data_path = "./data/";
    size_t len_path = strlen(data_path);
    size_t len_fn = strlen(cubin_name);
    // printf ("Sizes: data:%lu, cubinname:%lu\n", len_path, len_fn);

    /* +1 for the terminating NUL; check the allocation before using it */
    char* module_path_new = (char*)malloc(sizeof(char) * (len_path + len_fn + 1));
    if (module_path_new == NULL) {
        status = CUDA_ERROR_NOT_FOUND;
        Error(_cuContext, status);
    }
    strcpy(module_path_new, data_path);
    strcat(module_path_new, cubin_name);
    FILE *fp = fopen(module_path_new,"r");
    if( fp ) {
	printf("(2) cubin_File found in modulepath:%s\n", module_path_new);
        fclose(fp);
    } else {
	printf("(2) cubin file not exist: %s\n", module_path_new);
    }
    CUmodule cuModule;
    status = cuModuleLoad(&cuModule, module_path_new);
    cutFree(module_path_new);
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf ("(3) module Load successful\n");

    CUfunction cuFunction = 0;
    status = cuModuleGetFunction(&cuFunction, cuModule, kernel_name);
    if ( CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    }
    else printf ("(4) getFunction successful w/cuFunction\n");

    *mathop = cuFunction;

    return CUDA_SUCCESS;

}
Example #10
/*
 * This function loads the PTX file ptxPath and extracts the kernel kName
 * into phKernel.
 * @param phModule    Output module handle
 * @param phKernel    Output kernel handle
 * @param ptxPath     ptx file name
 * @param kName       kernel name
 */
void ptxJIT(CUmodule *phModule, CUfunction *phKernel, const char *ptxPath, const char *kName)
{
  CUlinkState cuLinkState;
  CUjit_option options[6];
  void *optionVals[6];
  float walltime;
  char error_log[8192], info_log[8192];
  unsigned int logSize = 8192;
  void *cuOut;
  size_t outSize;
  int myErr = 0;

  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void *) &walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void *) info_log;
  // Pass the size of the info buffer
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void *) (long)logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void *) error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void *) (long) logSize;
  // Make the linker verbose
  options[5] = CU_JIT_LOG_VERBOSE;
  optionVals[5] = (void *) 1;

  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(6,options, optionVals, &cuLinkState));

  // Load the ptx from the file
  myErr = cuLinkAddFile(cuLinkState, CU_JIT_INPUT_PTX, ptxPath, 0, 0, 0);
  if (myErr != CUDA_SUCCESS){
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
  }

  // Complete the linker step
  checkCudaErrors(cuLinkComplete(cuLinkState, &cuOut, &outSize));

  // Linker walltime and info_log were requested in options above.
  printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime, info_log);

  // Load resulting cuBin into module
  checkCudaErrors(cuModuleLoadData(phModule, cuOut));

  // Locate the kernel entry point
  checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, kName));

  // Destroy the linker invocation
  checkCudaErrors(cuLinkDestroy(cuLinkState));
}
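A short usage sketch for ptxJIT above: JIT-link a PTX file, fetch the kernel, then launch it with cuLaunchKernel. The file name "myKernel.ptx", the entry name "myKernel", the single-pointer argument, and the launch geometry are placeholders:

// Hypothetical caller of ptxJIT (names and launch geometry are placeholders).
CUmodule   hModule = 0;
CUfunction hKernel = 0;
ptxJIT(&hModule, &hKernel, "myKernel.ptx", "myKernel");

CUdeviceptr d_buf;
checkCudaErrors(cuMemAlloc(&d_buf, 256 * sizeof(float)));

void *args[] = { &d_buf };
checkCudaErrors(cuLaunchKernel(hKernel,
                               1, 1, 1,        // grid dimensions
                               256, 1, 1,      // block dimensions
                               0, NULL,        // shared memory, stream
                               args, NULL));
checkCudaErrors(cuCtxSynchronize());
checkCudaErrors(cuMemFree(d_buf));
checkCudaErrors(cuModuleUnload(hModule));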
Example #11
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
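Example #11 loads its module from an imageBytes blob that is not shown; presumably it is a PTX or cubin image compiled into the program. A sketch of one possible definition, assuming a trivial PTX kernel named "kernel" that copies one float from in to out (matching the two launch parameters), written in the same style as Example #5:

// Assumed module image: a tiny PTX kernel copying a single float (illustration only).
static const char *imageBytes =
    ".version 4.2\n"
    ".target sm_20\n"
    ".address_size 64\n"
    ".visible .entry kernel(.param .u64 p_in, .param .u64 p_out) {\n"
    "  .reg .f32 %f<2>;\n"
    "  .reg .u64 %rd<5>;\n"
    "  ld.param.u64 %rd1, [p_in];\n"
    "  ld.param.u64 %rd2, [p_out];\n"
    "  cvta.to.global.u64 %rd3, %rd1;\n"
    "  cvta.to.global.u64 %rd4, %rd2;\n"
    "  ld.global.f32 %f1, [%rd3];\n"
    "  st.global.f32 %f1, [%rd4];\n"
    "  ret;\n"
    "}\n";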
Example #12
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
Example #13
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jstring filename, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;
  char * native_filename;
  heapEndPtr = heap_end_ptr;
  
  native_filename = (*env)->GetStringUTFChars(env, filename, 0);
  status = cuModuleLoad(&cuModule, native_filename);
  CHECK_STATUS(env, "error in cuModuleLoad", status);
  (*env)->ReleaseStringUTFChars(env, filename, native_filename);

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_S0_i"); 
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  status = cuParamSetSize(cuFunction, (7 * sizeof(CUdeviceptr) + sizeof(int))); 
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr); 

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuClassMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuClassMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(cuFunction, offset, num_blocks); 
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
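Examples #13 and #18 set kernel arguments with the pre-CUDA 4.0 execution-control API (cuParamSetSize / cuParamSetv / cuParamSeti). A hedged sketch of the same argument setup with the current cuLaunchKernel interface, mirroring Example #13's argument list; the grid and block dimensions are placeholders, since the original launch call is not part of this listing:

  // Sketch: equivalent argument passing with cuLaunchKernel instead of cuParamSet*.
  // Each entry of the kernel-params array points at one argument; sizes and
  // offsets are derived from the kernel's signature automatically.
  void *args[] = {
      &gcInfoSpace, &gpuToSpace, &gpuHandlesMemory, &gpuHeapEndPtr,
      &gpuBufferSize, &gpuExceptionsMemory, &gpuClassMemory, &num_blocks
  };
  status = cuLaunchKernel(cuFunction,
                          num_blocks, 1, 1,   /* grid dims (placeholder) */
                          256, 1, 1,          /* block dims (placeholder) */
                          0,                  /* dynamic shared memory bytes */
                          0,                  /* default stream */
                          args, NULL);
  CHECK_STATUS(env,"error in cuLaunchKernel",status)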
Example #14
 /// Constructor. Extracts a backend::kernel instance from backend::program.
 kernel(const command_queue &queue,
        const program &P,
        const std::string &name,
        std::function<size_t(size_t)> smem
        )
     : ctx(queue.context()), P(P), smem(0)
 {
     cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
     config(queue, smem);
 }
Example #15
 /// Constructor. Creates a backend::kernel instance from source.
 kernel(const command_queue &queue,
        const std::string &src, const std::string &name,
        std::function<size_t(size_t)> smem,
        const std::string &options = ""
        )
     : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0)
 {
     cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
     config(queue, smem);
 }
Example #16
CAMLprim value spoc_cuda_debug_compile(value moduleSrc, value function_name, value gi){
	CAMLparam3(moduleSrc, function_name, gi);
	CUmodule module;
	CUfunction *kernel;
	char* functionN;
	char *ptx_source;
	const unsigned int jitNumOptions = 4;

	CUjit_option jitOptions[4];
	void *jitOptVals[4];
	int jitLogBufferSize;
	char *jitLogBuffer;
	int jitRegCount = 32;

	BLOCKING_CUDA_GET_CONTEXT;

	kernel = malloc(sizeof(CUfunction));
	functionN = String_val(function_name);
	ptx_source = String_val(moduleSrc);

	// set up size of compilation log buffer
	jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	jitLogBufferSize = 1024;
	jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

	// set up pointer to the compilation log buffer
	jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
	jitLogBuffer = malloc(sizeof(char)*jitLogBufferSize);
	jitOptVals[1] = jitLogBuffer;

	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[2] = CU_JIT_MAX_REGISTERS;
	jitOptVals[2] = (void *)(size_t)jitRegCount;

	jitOptions[3] = CU_JIT_TARGET_FROM_CUCONTEXT;
	//CU_JIT_TARGET;
//	jitOptVals[3] =  (void*)(uintptr_t)CU_TARGET_COMPUTE_10;


	cuda_error = (cuModuleLoadDataEx(&module, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals));
	if (cuda_error)
	  {
	    fprintf (stderr,"%s\n", jitLogBuffer);
	    fflush (stderr);
	  }
	cuda_error = (cuModuleGetFunction(kernel, module, functionN));
	if (cuda_error)
	  {
	    fprintf (stderr, "%s\n", jitLogBuffer);
	    fflush (stderr);
	  }
	BLOCKING_CUDA_RESTORE_CONTEXT;
	free(jitLogBuffer);
	CAMLreturn((value) kernel);
}
Example #17
File: ov.c  Project: CPFL/gtraffic
/*
 * Initialization in order to use the kernel program
 */
void
init_cuda(void){

  thread_num = (N <= 16) ? N : 16 ;  
  block_num = N / (thread_num*thread_num);
  if(N % (thread_num*thread_num) != 0) block_num++;
  
  res = cuInit(0);
  if(res != CUDA_SUCCESS){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuDeviceGet(&dev, 0);
  if(res != CUDA_SUCCESS){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuCtxCreate(&ctx, 0, dev);
  if(res != CUDA_SUCCESS){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(res != CUDA_SUCCESS){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(res != CUDA_SUCCESS){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }
  

  /* 
   * preparation for the kernel launch
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* just random */
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
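cuFuncSetSharedSize and cuFuncSetBlockShape above belong to the deprecated execution-control API; with cuLaunchKernel the block shape and dynamic shared-memory size are given at launch time instead. A minimal sketch under that assumption (and assuming, for illustration only, a kernel that takes no parameters):

  /*
   * Sketch: pass block shape and dynamic shared memory at launch time rather
   * than via cuFuncSetBlockShape / cuFuncSetSharedSize.
   */
  res = cuLaunchKernel(function,
                       block_num, 1, 1,            /* grid dimensions */
                       thread_num, thread_num, 1,  /* block dimensions */
                       0x40,                       /* dynamic shared memory bytes */
                       0,                          /* default stream */
                       NULL, NULL);                /* no kernel parameters (assumption) */
  if(res != CUDA_SUCCESS){
    printf("cuLaunchKernel() failed: res = %s\n", conv(res));
    exit(1);
  }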
Example #18
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers, jint size, 
   jint total_size, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;
  heapEndPtr = heap_end_ptr;
  
  //void * cubin_file = readCubinFile("code_file.cubin");
  cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size);
  status = cuModuleLoadData(&cuModule, cubin_file);
  CHECK_STATUS(env,"error in cuModuleLoad",status)
  
  free(cubin_file);

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i"); 
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int))); 
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr); 

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(cuFunction, offset, num_blocks); 
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
Example #19
        /// Constructor. Extracts a backend::kernel instance from backend::program.
        kernel(const command_queue &queue,
               const program &P,
               const std::string &name,
               size_t smem_per_thread = 0
               )
            : ctx(queue.context()), P(P), smem(0)
        {
            cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );

            config(queue,
                    [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
        }
Example #20
CUresult loadAndRunDualTestFunction(CUmodule *phModule, std::string name, CUdeviceptr &d_data0, 
				CUdeviceptr &d_data1, 
				DataStruct *h_data0, 
				DataStruct *h_data1, 
				unsigned int memSize, 
                                int thread_x=1,int thread_y=1,int thread_z=1,
                                int block_x=1, int block_y=1, int block_z=1)
{
  //  std::cout << "  Start Loading" << std::endl;

  // load data the to device
  cuMemcpyHtoD(d_data0, h_data0, memSize);         
  cuMemcpyHtoD(d_data1, h_data1, memSize);         

  // Locate the kernel entry point
  CUfunction phKernel = 0;
  CUresult status = cuModuleGetFunction(&phKernel, *phModule, name.data());
   if (status != CUDA_SUCCESS)
     {printf("ERROR: could not load function\n");}
    
  // Set the kernel parameters
  status = cuFuncSetBlockShape(phKernel, thread_x, thread_y, thread_z);
   if (status != CUDA_SUCCESS)
     {printf("ERROR: during setBlockShape\n");}

  int paramOffset = 0, size=0;

  size = sizeof(CUdeviceptr);
  status = cuParamSetv(phKernel, paramOffset, &d_data0, size);
  paramOffset += size;
  status = cuParamSetv(phKernel, paramOffset, &d_data1, size);
  paramOffset += size;



  status = cuParamSetSize(phKernel, paramOffset);
   if (status != CUDA_SUCCESS)
     {printf("ERROR: during cuParamSetv\n");}
    
  // Launch the kernel
  status = cuLaunchGrid(phKernel, block_x, block_y);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during grid launch\n");}

  //  std::cout << "  launched CUDA kernel!!" << std::endl;
  
  // Copy the result back to the host
  status = cuMemcpyDtoH(h_data0, d_data0, memSize);
  status = cuMemcpyDtoH(h_data1, d_data1, memSize);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during MemcpyDtoH\n");}
  return status;
}
Example #21
CUresult compute_tran_temp
(CUmodule mod, CUdeviceptr MatrixPower, CUdeviceptr MatrixTemp[2], 
 int col, int row, int total_iterations, int num_iterations, int blockCols, 
 int blockRows, int borderCols, int borderRows) 
{
	int gdx = blockCols;
	int gdy = blockRows;
	int bdx = BLOCK_SIZE;
	int bdy = BLOCK_SIZE;
	
	float grid_height = chip_height / row;
	float grid_width = chip_width / col;
	
	float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
	float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
	float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
	float Rz = t_chip / (K_SI * grid_height * grid_width);
	
	float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
	float step = PRECISION / max_slope;
	float t;
	float time_elapsed;
	time_elapsed=0.001;
	
	int src = 1, dst = 0;
	CUfunction f;
	CUresult res;

	res = cuModuleGetFunction(&f, mod, "_Z14calculate_tempiPfS_S_iiiiffffff");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction failed: res = %u\n", res);
		return 0;
	}
	
	for (t = 0; t < total_iterations; t+=num_iterations) {
		int it = MIN(num_iterations, total_iterations-t);
		int temp = src;
		src = dst;
		dst = temp;
		void *param[] = {&it, &MatrixPower, &MatrixTemp[src], &MatrixTemp[dst],
						 &col, &row, &borderCols, &borderRows, &Cap,
						 &Rx, &Ry, &Rz, &step, &time_elapsed};
		res = cuLaunchKernel(f, gdx, gdy, 1, bdx, bdy, 1, 0xc00, 0, 
							 (void**) param, NULL);
		if (res != CUDA_SUCCESS) {
			printf("cuLaunchKernel(euclid) failed: res = %u\n", res);
			return 0;
		}
	}

	return dst;
}
Example #22
GpuCompilationContext::GpuCompilationContext(const void* image,
                                             const std::string& kernel_name,
                                             const int device_id,
                                             const void* cuda_mgr,
                                             unsigned int num_options,
                                             CUjit_option* options,
                                             void** option_vals)
    : module_(nullptr), kernel_(nullptr), device_id_(device_id), cuda_mgr_(cuda_mgr) {
  static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_)->setContext(device_id_);
  checkCudaErrors(cuModuleLoadDataEx(&module_, image, num_options, options, option_vals));
  CHECK(module_);
  checkCudaErrors(cuModuleGetFunction(&kernel_, module_, kernel_name.c_str()));
}
Example #23
        /// Constructor. Creates a backend::kernel instance from source.
        kernel(const command_queue &queue,
               const std::string &src,
               const std::string &name,
               size_t smem_per_thread = 0,
               const std::string &options = ""
               )
            : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0)
        {
            cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );

            config(queue,
                    [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
        }
Example #24
static CUfunction __get_kernel(void *user_context, const char* entry_name)
{
    CUfunction f;

    #ifdef DEBUG
    char msg[256];
    snprintf(msg, 256, "get_kernel %s (t=%lld)", entry_name, (long long)halide_current_time_ns(user_context) );
    #endif

    // Get kernel function ptr
    TIME_CALL( cuModuleGetFunction(&f, __mod, entry_name), msg );

    return f;
}
Example #25
SEXP R_auto_cuModuleGetFunction(SEXP r_hfunc, SEXP r_hmod, SEXP r_name)
{
    SEXP r_ans = R_NilValue;
    CUfunction * hfunc = GET_REF(r_hfunc, CUfunction );
    CUmodule hmod = (CUmodule) getRReference(r_hmod);
    const char * name = CHAR(STRING_ELT(r_name, 0));
    
    CUresult ans;
    ans = cuModuleGetFunction(hfunc, hmod, name);
    
    r_ans = Renum_convert_CUresult(ans) ;
    
    return(r_ans);
}
Example #26
  kernel_t<CUDA>* kernel_t<CUDA>::buildFromBinary(const std::string &filename,
                                                 const std::string &functionName_){
    OCCA_EXTRACT_DATA(CUDA, Kernel);

    functionName = functionName_;

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                    cuModuleLoad(&data_.module, filename.c_str()));

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                    cuModuleGetFunction(&data_.function, data_.module, functionName.c_str()));

    return this;
  }
Example #27
  kernel_t<CUDA>* kernel_t<CUDA>::loadFromLibrary(const char *cache,
                                                  const std::string &functionName_){
    OCCA_EXTRACT_DATA(CUDA, Kernel);

    functionName = functionName_;

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                    cuModuleLoadData(&data_.module, cache));

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                    cuModuleGetFunction(&data_.function, data_.module, functionName.c_str()));

    return this;
  }
Example #28
static CUfunction __get_kernel(const char* entry_name)
{
    CUfunction f;

    #ifndef NDEBUG
    char msg[256];
    snprintf(msg, 256, "get_kernel %s (t=%d)", entry_name, halide_current_time() );
    #endif

    // Get kernel function ptr
    TIME_CALL( cuModuleGetFunction(&f, __mod, entry_name), msg );

    return f;
}
Example #29
GPUFunction GPUInterface::GetFunction(const char* functionName) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::GetFunction\n");
#endif

    GPUFunction cudaFunction;

    SAFE_CUPP(cuModuleGetFunction(&cudaFunction, cudaModule, functionName));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::GetFunction\n");
#endif

    return cudaFunction;
}
Example #30
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}